diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d019d937e36b9f90f4ef838fc8f01c17d1746ad
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b9dbd6da26402b76b495ee26389fdafeb63b201
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc24f1c4f1696ebc0908b6cc043db79a5c52222c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..483bf08614088b6cd4d06b5974e0cc7788c4a248
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/aot_compile_types.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8224610d66fd1c5004dfc7eadc82e27ee50ab21
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbd1ab5c9577c92db3aaafe816e32b77bec83b09
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6174d5a00087c1655e8a73a1f66723bb5ea92cd9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/codegen.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bda9742a3b5977d27386d273a27da388828814d5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0c2043734d21e9627a895dfc5e3bbb22619d446
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/comptime.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9be1ba6f1270c476be449f498261329b3735b33e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/create_parameter_op.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3bd69ca253be1166c547063ccc1e7c27070590
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/dce_extra_outputs.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..378840fbffe2837421c1c2ec4369a2d66e85a224
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/decorators.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9111a0d62d5ea0f01e737607491357ba1bc76bbe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..192d0e77aa3f30d9d109c4f05571fea9f3f48655
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/distributed.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8133a950f518dae3f2bbf10e80d143006666ca0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/exc.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9809387cacd51acd44bea3c77f2144d21476a8af
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d4adcbcfb4db85d82f9cd093430755405b8ac72
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_break_hints.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bf4dae6626ef438ba2b578c9dbc36f4a6b182b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/graph_region_tracker.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e630df369770af988af38199677d09f08e09ebe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/hooks.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f22e785e6e2d126d2e596e9ecbd0d9ce0bff6a2b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/metrics_context.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bca1967136deed7a85f9362845a92fac83e0b712
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25a00e7e65e45c0659fb28025ab8e06823b3a878
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/pgo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8effb2515a01e1602d83267b5c6e39a46ba2032
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/precompile_context.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..361ace1626099e497a41dbe61ee4df54c14f19e3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/profiler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae40dc16ff7c3204adca1ec979b1c674448a6367
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6bf882ffa83a508d059f4af8c5b4d6e42706afb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb96db69290ba932955aad575c20279229f9442d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d26020cb49577f2e62772e3f93c846d2da62fa8e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/source.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3307040faddcfe47c9035ac2039323681b3a65ed
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd392de1bb4e3db5fdb00f824484130d2c517a37
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/__pycache__/types.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d308b2b9c0e1ffe51b6dd11dcca0aad86a4b4cce
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56a22011447d2c103fb3614ef1d6e6e7ae1f973a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0b8fd1274ee33ddad10f03f118dd8088745cbfc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36d491f75a164f1e7635a82ebf77f3512a5b7f36
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5a73c9fe94823366198b03f23fcb556cbebdb76
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5497c473c1cdf7c00db59ea77cb331eadbfc7853
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5943b219427149b89e50685aa25d6b39096743bd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e09f0b408a98b42e6e7ff3783db082542c5ee34
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e09c989ca10d3dc5cbd33381310d9819a671787
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/repro/__pycache__/aoti.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5483b9d95ee87c3a349ec943b3cf6136600001c3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e71e5ef93742375e00e8b276f18e3993b269227d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ba84a84251db6229c38b5f2c48b233fe594fbb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__init__.py
@@ -0,0 +1,41 @@
+import types
+
+from .modules import *  # noqa: F403
+from .modules.fused import _FusedModule  # noqa: F403
+
+
+# # Subpackages
+# from . import qat  # noqa: F403
+# from . import quantized  # noqa: F403
+
+__all__ = [
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBn3d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBnReLU3d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "LinearReLU",
+    "BNReLU2d",
+    "BNReLU3d",
+    "LinearBn1d",
+    "LinearLeakyReLU",
+    "LinearTanh",
+    "ConvAdd2d",
+    "ConvAddReLU2d",
+]
+
+
+# Lazy attribute hook as per https://peps.python.org/pep-0562/: subpackages are
+# exposed to the end-user on demand, avoiding cyclic imports between them.
+# NOTE(review): ``__all__`` in this file lists class names, while the commented-out
+# imports above name subpackages (``qat``, ``quantized``) -- confirm the intent.
+def __getattr__(name: str) -> types.ModuleType:
+    """Lazily import submodule ``name`` if listed in ``__all__``; else AttributeError."""
+    if name not in __all__:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    from importlib import import_module
+    return import_module("." + name, __name__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74572b52e820de65e63ae05585bfe1f41c34ded9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..132137b7357378fe29ef9a63310a554725aea86a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__init__.py
@@ -0,0 +1,41 @@
+from .fused import (  # noqa: F401
+    _FusedModule,
+    BNReLU2d,
+    BNReLU3d,
+    ConvAdd2d,
+    ConvAddReLU2d,
+    ConvBn1d,
+    ConvBn2d,
+    ConvBn3d,
+    ConvBnReLU1d,
+    ConvBnReLU2d,
+    ConvBnReLU3d,
+    ConvReLU1d,
+    ConvReLU2d,
+    ConvReLU3d,
+    LinearBn1d,
+    LinearLeakyReLU,
+    LinearReLU,
+    LinearTanh,
+)
+
+
+__all__ = [
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBn3d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBnReLU3d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "LinearReLU",
+    "BNReLU2d",
+    "BNReLU3d",
+    "LinearBn1d",
+    "LinearLeakyReLU",
+    "LinearTanh",
+    "ConvAdd2d",
+    "ConvAddReLU2d",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a82f8dab4a219e2ec35a172cd1a33815874becef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5acc7bfc26ef4b76adbdaa7f0aaad074f1dd5b5d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..d189e3d92447da930ba487034b58c623e2e7a4ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/modules/fused.py
@@ -0,0 +1,289 @@
+# mypy: allow-untyped-defs
+import torch
+from torch.nn import (
+    BatchNorm1d,
+    BatchNorm2d,
+    BatchNorm3d,
+    Conv1d,
+    Conv2d,
+    Conv3d,
+    Linear,
+    ReLU,
+)
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+
+__all__ = [
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "LinearReLU",
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBn3d",
+    "ConvBnReLU3d",
+    "BNReLU2d",
+    "BNReLU3d",
+    "LinearBn1d",
+    "LinearLeakyReLU",
+    "LinearTanh",
+    "ConvAdd2d",
+    "ConvAddReLU2d",
+]
+
+
+# Common base class by which intrinsic modules used in quantization are identified.
+class _FusedModule(torch.nn.Sequential):
+    """Marker subclass of ``torch.nn.Sequential``; it defines no members of its own."""
+
+
+class ConvReLU1d(_FusedModule):
+    r"""Sequential container that holds a Conv1d module followed by a ReLU module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv1d), (relu, ReLU))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, relu)
+
+
+class ConvReLU2d(_FusedModule):
+    r"""Sequential container that holds a Conv2d module followed by a ReLU module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv2d), (relu, ReLU))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, relu)
+
+
+class ConvReLU3d(_FusedModule):
+    r"""Sequential container that holds a Conv3d module followed by a ReLU module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv3d), (relu, ReLU))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, relu)
+
+
+class LinearReLU(_FusedModule):
+    r"""Sequential container that holds a Linear module followed by a ReLU module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, linear, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((linear, Linear), (relu, ReLU))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(linear, relu)
+
+
+class ConvBn1d(_FusedModule):
+    r"""Sequential container that holds a Conv1d module followed by a BatchNorm1d module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv1d), (bn, BatchNorm1d))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn)
+
+
+class ConvBn2d(_FusedModule):
+    r"""Sequential container that holds a Conv2d module followed by a BatchNorm2d module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv2d), (bn, BatchNorm2d))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn)
+
+
+class ConvBnReLU1d(_FusedModule):
+    r"""Sequential container that holds Conv1d, BatchNorm1d and ReLU modules, in that order.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = (
+            (conv, Conv1d), (bn, BatchNorm1d), (relu, ReLU)
+        )
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn, relu)
+
+
+class ConvBnReLU2d(_FusedModule):
+    r"""Sequential container that holds Conv2d, BatchNorm2d and ReLU modules, in that order.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = (
+            (conv, Conv2d), (bn, BatchNorm2d), (relu, ReLU)
+        )
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn, relu)
+
+
+class ConvBn3d(_FusedModule):
+    r"""Sequential container that holds a Conv3d module followed by a BatchNorm3d module.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = ((conv, Conv3d), (bn, BatchNorm3d))
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn)
+
+
+class ConvBnReLU3d(_FusedModule):
+    r"""Sequential container that holds Conv3d, BatchNorm3d and ReLU modules, in that order.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, bn, relu):
+        # Pair each argument with the class it must match exactly (compared
+        # with ==, so subclasses fail), as seen by type_before_parametrizations.
+        required = (
+            (conv, Conv3d), (bn, BatchNorm3d), (relu, ReLU)
+        )
+        assert all(type_before_parametrizations(m) == t for m, t in required), (
+            "Incorrect types for input modules"
+            + "".join(f"{type_before_parametrizations(m)}" for m, _ in required)
+        )
+        super().__init__(conv, bn, relu)
+
+
+class BNReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the BatchNorm 2d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, batch_norm, relu):
+        assert (
+            type_before_parametrizations(batch_norm) == BatchNorm2d
+            and type_before_parametrizations(relu) == ReLU
+        ), (
+            f"Incorrect types for input modules{type_before_parametrizations(batch_norm)}"
+            f"{type_before_parametrizations(relu)}"
+        )
+        super().__init__(batch_norm, relu)
+
+
+class BNReLU3d(_FusedModule):
+    r"""This is a sequential container which calls the BatchNorm 3d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, batch_norm, relu):
+        assert (
+            type_before_parametrizations(batch_norm) == BatchNorm3d
+            and type_before_parametrizations(relu) == ReLU
+        ), (
+            f"Incorrect types for input modules{type_before_parametrizations(batch_norm)}"
+            f"{type_before_parametrizations(relu)}"
+        )
+        super().__init__(batch_norm, relu)
+
+
+class LinearBn1d(_FusedModule):
+    r"""This is a sequential container which calls the Linear and BatchNorm1d modules.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, linear, bn):
+        assert (
+            type_before_parametrizations(linear) == Linear
+            and type_before_parametrizations(bn) == BatchNorm1d
+        ), (
+            f"Incorrect types for input modules{type_before_parametrizations(linear)}"
+            f"{type_before_parametrizations(bn)}"
+        )
+        super().__init__(linear, bn)
+
+
+class LinearLeakyReLU(_FusedModule):
+    r"""This is a sequential container which calls the Linear and LeakyReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, linear, leaky_relu):
+        assert type(linear) is Linear and type(leaky_relu) is torch.nn.LeakyReLU, (
+            f"Incorrect types for input modules{type(linear)}{type(leaky_relu)}"
+        )
+        super().__init__(linear, leaky_relu)
+
+
+class LinearTanh(_FusedModule):
+    r"""This is a sequential container which calls the Linear and Tanh modules.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, linear, tanh):
+        assert type(linear) is Linear and type(tanh) is torch.nn.Tanh, (
+            f"Incorrect types for input modules{type(linear)}{type(tanh)}"
+        )
+        super().__init__(linear, tanh)
+
+
+class ConvAdd2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d modules with extra Add.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, add):
+        super().__init__(conv)
+        self.add = add
+
+    def forward(self, x1, x2):  # type: ignore[override]
+        r"""Applies convolution to x1 and adds the result to x2."""
+        return self.add(self[0](x1), x2)
+
+
+class ConvAddReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d, add, Relu.
+    During quantization this will be replaced with the corresponding fused module."""
+
+    def __init__(self, conv, add, relu):
+        super().__init__(conv)
+        self.add = add
+        self.relu = relu
+
+    def forward(self, x1, x2):  # type: ignore[override]
+        r"""Applies convolution to x1, adds the result to x2, and applies ReLU."""
+        return self.relu(self.add(self[0](x1), x2))
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f49dc17b617241eae01635293e1203d6f994f61
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18534bbc588e7480ac6529c6648c5976eadaea3a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py
@@ -0,0 +1,32 @@
+from .conv_fused import (
+    ConvBn1d,
+    ConvBn2d,
+    ConvBn3d,
+    ConvBnReLU1d,
+    ConvBnReLU2d,
+    ConvBnReLU3d,
+    ConvReLU1d,
+    ConvReLU2d,
+    ConvReLU3d,
+    freeze_bn_stats,
+    update_bn_stats,
+)
+from .linear_fused import LinearBn1d
+from .linear_relu import LinearReLU
+
+
+# Names exported by ``from .modules import *`` (the form the parent package's
+# __init__ uses); every entry is imported at the top of this file.
+__all__ = [
+    "LinearReLU",
+    "LinearBn1d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBn3d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBnReLU3d",
+    "update_bn_stats",
+    "freeze_bn_stats",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d192c40cfc9cb81d5283dac9d0ecfe8cacb77c5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fedc28632165898913829990fc0448331cc7594
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2505e061a09624a7f47415610567c7069ae3ed57
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03742ef1550d71d7777d1bd1abc6dca514501008
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..10f67764d8f05143e4bcc15ad1196f801015370a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
@@ -0,0 +1,958 @@
+# mypy: allow-untyped-defs
+import math
+from typing import ClassVar
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.qat as nnqat
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init
+from torch.nn.modules.utils import _pair, _single, _triple
+from torch.nn.parameter import Parameter
+from torch.nn.utils import fuse_conv_bn_weights
+
+
+# Names exported by a star-import of this module.
+__all__ = [
+    "ConvBn1d",
+    "ConvBnReLU1d",
+    "ConvReLU1d",
+    "ConvBn2d",
+    "ConvBnReLU2d",
+    "ConvReLU2d",
+    "ConvBn3d",
+    "ConvBnReLU3d",
+    "ConvReLU3d",
+    "update_bn_stats",
+    "freeze_bn_stats",
+]
+# Maps the ``dim`` argument of ``_ConvBnNd.__init__`` (1, 2 or 3) to the
+# BatchNorm class instantiated as the module's ``bn`` submodule.
+_BN_CLASS_MAP = {
+    1: nn.BatchNorm1d,
+    2: nn.BatchNorm2d,
+    3: nn.BatchNorm3d,
+}
+
+
+class _ConvBnNd(nn.modules.conv._ConvNd, nni._FusedModule):
+    _version = 2
+    _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]]
+
+    def __init__(
+        self,
+        # ConvNd args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        bias,
+        padding_mode,
+        # BatchNormNd args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+        dim=2,
+    ):
+        """Set up the conv parameters, the BatchNorm submodule and the weight fake-quantizer.
+
+        Args:
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            transposed, output_padding, groups, padding_mode: forwarded
+                unchanged to ``_ConvNd.__init__``.
+            bias: ``_ConvNd.__init__`` is always called with ``bias=False``;
+                when this argument is truthy a separate parameter of size
+                ``out_channels`` is created on this module, otherwise ``bias``
+                is registered as ``None``.
+            eps, momentum: passed to the BatchNorm constructor.
+            freeze_bn: when true, or when the module is not in training mode,
+                the BN statistics start out frozen.
+            qconfig: required; ``qconfig.weight()`` builds ``weight_fake_quant``.
+            dim: key into ``_BN_CLASS_MAP`` selecting the BatchNorm class.
+
+        Raises:
+            AssertionError: if ``qconfig`` is falsy.
+        """
+        nn.modules.conv._ConvNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            # bias is always disabled here; this module creates its own below
+            False,
+            padding_mode,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.freeze_bn = freeze_bn if self.training else True
+        # the two positional ``True`` values line up with the "affine" and
+        # "track_running_stats" notes in the signature above
+        self.bn = _BN_CLASS_MAP[dim](out_channels, eps, momentum, True, True)
+        self.weight_fake_quant = self.qconfig.weight()
+        if bias:
+            self.bias = Parameter(torch.empty(out_channels))
+        else:
+            self.register_parameter("bias", None)
+        # initialises bn.weight / bn.bias and, if present, the conv bias
+        self.reset_bn_parameters()
+
+        # this needs to be called after reset_bn_parameters,
+        # as they modify the same state
+        if self.training:
+            if freeze_bn:
+                self.freeze_bn_stats()
+            else:
+                self.update_bn_stats()
+        else:
+            self.freeze_bn_stats()
+
+        # selects between _forward_slow (True) and _forward_approximate (False)
+        self._enable_slow_path_for_better_numerical_stability = False
+
+    def reset_running_stats(self):
+        """Reset the running statistics of the wrapped BatchNorm by delegating to ``self.bn``."""
+        self.bn.reset_running_stats()
+
+    def reset_bn_parameters(self):
+        self.bn.reset_running_stats()
+        init.uniform_(self.bn.weight)
+        init.zeros_(self.bn.bias)
+        # note: below is actually for conv, not BN
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def update_bn_stats(self):
+        """Un-freeze BN: clear ``freeze_bn`` and set ``self.bn.training`` to True.
+
+        Returns:
+            ``self``, so calls can be chained.
+        """
+        self.freeze_bn = False
+        self.bn.training = True
+        return self
+
+    def freeze_bn_stats(self):
+        """Freeze BN: set ``freeze_bn`` and set ``self.bn.training`` to False.
+
+        Returns:
+            ``self``, so calls can be chained.
+        """
+        self.freeze_bn = True
+        self.bn.training = False
+        return self
+
+    def _forward(self, input):
+        if self._enable_slow_path_for_better_numerical_stability:
+            return self._forward_slow(input)
+        return self._forward_approximate(input)
+
+    def _forward_approximate(self, input):
+        """Approximated method to fuse conv and bn. It requires only one forward pass.
+        conv_orig = conv / scale_factor where scale_factor = bn.weight / running_std
+
+        The weight is scaled by ``scale_factor`` before fake-quantization, the
+        conv is run with a zero bias, the output is divided by ``scale_factor``
+        again, the conv bias (if any) is added, and the result is passed
+        through ``self.bn``.
+
+        NOTE: the output is divided by ``scale_factor``, so a zero entry in
+        ``bn.weight`` leads to a division by zero here; the ``_forward_slow``
+        docstring states that the slow path handles that case.
+
+        Raises:
+            AssertionError: if ``self.bn.running_var`` is None.
+        """
+        assert self.bn.running_var is not None
+        running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+        scale_factor = self.bn.weight / running_std
+        # [-1, 1, ..., 1]: broadcast the per-channel factor along dim 0 of the weight
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        # [1, -1, 1, ..., 1]: broadcast per-channel values along dim 1 of the conv output
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = self.weight_fake_quant(
+            self.weight * scale_factor.reshape(weight_shape)
+        )
+        # using zero bias here since the bias for original conv
+        # will be added later
+        if self.bias is not None:
+            zero_bias = torch.zeros_like(self.bias, dtype=input.dtype)
+        else:
+            zero_bias = torch.zeros(
+                self.out_channels, device=scaled_weight.device, dtype=input.dtype
+            )
+        conv = self._conv_forward(input, scaled_weight, zero_bias)
+        # undo the weight scaling so that self.bn sees the un-scaled conv output
+        conv_orig = conv / scale_factor.reshape(bias_shape)
+        if self.bias is not None:
+            conv_orig = conv_orig + self.bias.reshape(bias_shape)
+        conv = self.bn(conv_orig)
+        return conv
+
+    def _forward_slow(self, input):
+        """
+        A more accurate but slow method to compute conv bn fusion, following https://arxiv.org/pdf/1806.08342.pdf
+        It requires two forward passes but handles the case bn.weight == 0
+
+        Conv: Y = WX + B_c
+        Conv without bias: Y0 = WX = Y - B_c, Y = Y0 + B_c
+
+        Batch statistics:
+          mean_Y = Y.mean()
+                 = Y0.mean() + B_c
+          var_Y = (Y - mean_Y)^2.mean()
+                = (Y0 - Y0.mean())^2.mean()
+        BN (r: bn.weight, beta: bn.bias):
+          Z = r * (Y - mean_Y) / sqrt(var_Y + eps) + beta
+            = r * (Y0 - Y0.mean()) / sqrt(var_Y + eps) + beta
+
+        Fused Conv BN training (std_Y = sqrt(var_Y + eps)):
+          Z = (r * W / std_Y) * X + r * (B_c - mean_Y) / std_Y + beta
+            = (r * W / std_Y) * X - r * Y0.mean() / std_Y + beta
+
+        Fused Conv BN inference (running_std = sqrt(running_var + eps)):
+          Z = (r * W / running_std) * X - r * (running_mean - B_c) / running_std + beta
+
+        QAT with fused conv bn:
+          Z_train = fake_quant(r * W / running_std) * X * (running_std / std_Y) - r * Y0.mean() / std_Y + beta
+                  = conv(X, fake_quant(r * W / running_std)) * (running_std / std_Y) - r * Y0.mean() / std_Y + beta
+          Z_inference = conv(X, fake_quant(r * W / running_std)) - r * (running_mean - B_c) / running_std + beta
+
+        The training branch is taken when ``self.bn.training`` is true (the
+        flag toggled by ``update_bn_stats`` / ``freeze_bn_stats``), the
+        inference branch otherwise.
+
+        Raises:
+            AssertionError: if ``self.bn.running_var`` or
+                ``self.bn.running_mean`` is None.
+        """
+
+        assert self.bn.running_var is not None
+        assert self.bn.running_mean is not None
+
+        # using zero bias here since the bias for original conv
+        # will be added later
+        zero_bias = torch.zeros(
+            self.out_channels, device=self.weight.device, dtype=input.dtype
+        )
+
+        # [-1, 1, ..., 1]: broadcast per-channel values along dim 0 of the weight
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        # [1, -1, 1, ..., 1]: broadcast per-channel values along dim 1 of the conv output
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+
+        if self.bn.training:
+            # needed to compute batch mean/std
+            conv_out = self._conv_forward(input, self.weight, zero_bias)
+            # update bn statistics
+            with torch.no_grad():
+                conv_out_bias = (
+                    conv_out
+                    if self.bias is None
+                    else conv_out + self.bias.reshape(bias_shape)
+                )
+                # return value is discarded; the call is made for its effect on self.bn
+                self.bn(conv_out_bias)
+
+            # fused conv + bn without bias using bn running statistics
+            # (read after the self.bn(...) call above)
+            running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+            scale_factor = self.bn.weight / running_std
+            scaled_weight = self.weight_fake_quant(
+                self.weight * scale_factor.reshape(weight_shape)
+            )
+            # fused conv without bias for inference: (r * W / running_std) * X
+            conv_bn = self._conv_forward(input, scaled_weight, zero_bias)
+
+            # reduce over every dim except dim 1 to get per-channel statistics
+            avg_dims = [0] + list(range(2, len(self.weight.shape)))
+            batch_mean = conv_out.mean(avg_dims)
+            batch_var = torch.square(conv_out - batch_mean.reshape(bias_shape)).mean(
+                avg_dims
+            )
+            batch_std = torch.sqrt(batch_var + self.bn.eps)
+
+            # scale to use batch std in training mode
+            # conv(X, r * W / std_Y) = conv(X, r * W / running_std) * (running_std / std_Y)
+            unscale_factor = running_std / batch_std
+            conv_bn *= unscale_factor.reshape(bias_shape)
+
+            fused_mean = batch_mean
+            fused_std = batch_std
+        else:
+            # fused conv + bn without bias using bn running statistics
+            running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+            scale_factor = self.bn.weight / running_std
+            scaled_weight = self.weight_fake_quant(
+                self.weight * scale_factor.reshape(weight_shape)
+            )
+            # fused conv without bias for inference: (r * W / running_std) * X
+            conv_bn = self._conv_forward(input, scaled_weight, zero_bias)
+
+            # running_mean - B_c, matching the Z_inference formula in the docstring
+            fused_mean = self.bn.running_mean - (
+                self.bias if self.bias is not None else 0
+            )
+            fused_std = running_std
+
+        # fused bias = beta - r * mean / std
+        fused_bias = self.bn.bias - self.bn.weight * fused_mean / fused_std
+        conv_bn += fused_bias.reshape(bias_shape)
+
+        # HACK to let conv bias participate in loss to avoid DDP error (parameters
+        #   were not used in producing loss)
+        if self.bias is not None:
+            conv_bn += (self.bias - self.bias).reshape(bias_shape)
+
+        return conv_bn
+
+    def forward(self, input):
+        """Return the fused conv + BN output computed by ``self._forward``."""
+        return self._forward(input)
+
+    def train(self, mode=True):
+        """
+        Batchnorm's training behavior is using the self.training flag. Prevent
+        changing it if BN is frozen. This makes sure that calling `model.train()`
+        on a model with a frozen BN will behave properly.
+        """
+        self.training = mode
+        if not self.freeze_bn:
+            for module in self.children():
+                module.train(mode)
+        return self
+
+    # ===== Serialization version history =====
+    #
+    # Version 1/None
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- gamma : Tensor
+    #   |--- beta : Tensor
+    #   |--- running_mean : Tensor
+    #   |--- running_var : Tensor
+    #   |--- num_batches_tracked : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- bn : Module
+    #        |--- weight : Tensor (moved from v1.self.gamma)
+    #        |--- bias : Tensor (moved from v1.self.beta)
+    #        |--- running_mean : Tensor (moved from v1.self.running_mean)
+    #        |--- running_var : Tensor (moved from v1.self.running_var)
+    #        |--- num_batches_tracked : Tensor (moved from v1.self.num_batches_tracked)
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version == 1:
+            # BN related parameters and buffers were moved into the BN module for v2
+            v2_to_v1_names = {
+                "bn.weight": "gamma",
+                "bn.bias": "beta",
+                "bn.running_mean": "running_mean",
+                "bn.running_var": "running_var",
+                "bn.num_batches_tracked": "num_batches_tracked",
+            }
+            for v2_name, v1_name in v2_to_v1_names.items():
+                if prefix + v1_name in state_dict:
+                    state_dict[prefix + v2_name] = state_dict[prefix + v1_name]
+                    state_dict.pop(prefix + v1_name)
+                elif prefix + v2_name in state_dict:
+                    # there was a brief period where forward compatibility
+                    # for this module was broken (between
+                    # https://github.com/pytorch/pytorch/pull/38478
+                    # and https://github.com/pytorch/pytorch/pull/38820)
+                    # and modules emitted the v2 state_dict format while
+                    # specifying that version == 1. This patches the forward
+                    # compatibility issue by allowing the v2 style entries to
+                    # be used.
+                    pass
+                elif strict:
+                    missing_keys.append(prefix + v2_name)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a qat module from a float fused conv + bn module.
+
+        Args:
+            mod: a float module, either produced by torch.ao.quantization
+                utilities or directly from user. Its type must be exactly
+                ``cls._FLOAT_MODULE``, it must carry a truthy ``qconfig``
+                attribute, and ``mod[0]`` / ``mod[1]`` are read as the conv
+                and the batch norm.
+            use_precomputed_fake_quant: accepted but not read by this
+                implementation.
+
+        Returns:
+            A new ``cls`` instance constructed with ``freeze_bn=False``. Its
+            conv and BN parameters and buffers are assigned from ``mod``
+            (the same objects, not copies).
+
+        Raises:
+            AssertionError: on a type mismatch or a missing/falsy ``qconfig``.
+        """
+        # Exact type match: subclasses of _FLOAT_MODULE are rejected as well.
+        assert type(mod) is cls._FLOAT_MODULE, (
+            "qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        qconfig = mod.qconfig
+        conv, bn = mod[0], mod[1]  # type: ignore[index]
+        # The positional arguments follow the subclass constructors
+        # (e.g. ConvBn1d / ConvBn2d), not _ConvBnNd.__init__ itself.
+        qat_convbn = cls(
+            conv.in_channels,
+            conv.out_channels,
+            conv.kernel_size,
+            conv.stride,
+            conv.padding,
+            conv.dilation,
+            conv.groups,
+            conv.bias is not None,
+            conv.padding_mode,
+            bn.eps,
+            bn.momentum,
+            False,
+            qconfig,
+        )
+        qat_convbn.weight = conv.weight
+        qat_convbn.bias = conv.bias
+        qat_convbn.bn.weight = bn.weight
+        qat_convbn.bn.bias = bn.bias
+        qat_convbn.bn.running_mean = bn.running_mean
+        qat_convbn.bn.running_var = bn.running_var
+        # mypy error: Cannot determine type of 'num_batches_tracked'
+        qat_convbn.bn.num_batches_tracked = bn.num_batches_tracked
+        return qat_convbn
+
+    def to_float(self):
+        cls = type(self)
+        conv = cls._FLOAT_CONV_MODULE(  # type: ignore[attr-defined]
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.bias is not None,
+            self.padding_mode,
+        )
+        conv.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            conv.bias = torch.nn.Parameter(self.bias.detach())
+
+        if cls._FLOAT_BN_MODULE:  # type: ignore[attr-defined]
+            # fuse bn into conv
+            assert self.bn.running_var is not None and self.bn.running_mean is not None
+            conv.weight, conv.bias = fuse_conv_bn_weights(
+                conv.weight,
+                conv.bias,
+                self.bn.running_mean,
+                self.bn.running_var,
+                self.bn.eps,
+                self.bn.weight,
+                self.bn.bias,
+            )
+
+        if cls._FLOAT_RELU_MODULE:  # type: ignore[attr-defined]
+            modules = []
+            modules.append(conv)
+            relu = cls._FLOAT_RELU_MODULE()  # type: ignore[attr-defined]
+            modules.append(relu)
+            conv_relu = cls._FUSED_FLOAT_MODULE(*modules)  # type: ignore[attr-defined]
+            conv_relu.train(self.training)
+            return conv_relu
+        else:
+            conv.train(self.training)
+            return conv
+
+
+class ConvBn1d(_ConvBnNd, nn.Conv1d):
+    r"""
+    A ConvBn1d module is a module fused from Conv1d and BatchNorm1d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv1d` and
+    :class:`torch.nn.BatchNorm1d`.
+
+    Similar to :class:`torch.nn.Conv1d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm1d]] = nn.BatchNorm1d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None
+    _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvBn1d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d
+
+    def __init__(
+        self,
+        # Conv1d args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=None,
+        padding_mode="zeros",
+        # BatchNorm1d args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = _single(padding)
+        dilation = _single(dilation)
+        _ConvBnNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _single(0),
+            groups,
+            bias,
+            padding_mode,
+            eps,
+            momentum,
+            freeze_bn,
+            qconfig,
+            dim=1,
+        )
+
+
+class ConvBnReLU1d(ConvBn1d):
+    r"""
+    A ConvBnReLU1d module is a module fused from Conv1d, BatchNorm1d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv1d` and
+    :class:`torch.nn.BatchNorm1d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv1d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    # base class defines _FLOAT_MODULE as "ConvBn1d"
+    _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvBnReLU1d
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm1d]] = nn.BatchNorm1d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE: ClassVar[type[nn.Module] | None] = nni.ConvReLU1d
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv1d, BatchNorm1d, and ReLU."""
+        return F.relu(self._forward(input))
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(mod, use_precomputed_fake_quant)
+
+
+class ConvReLU1d(nnqat.Conv1d, nni._FusedModule):
+    r"""A ConvReLU1d module is a fused module of Conv1d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv1d` and
+    :class:`~torch.nn.ReLU`. There is no batch norm in this module
+    (``_FLOAT_BN_MODULE`` is ``None``).
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nni.ConvReLU1d]] = nni.ConvReLU1d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        qconfig=None,
+    ):
+        """Forward all arguments to the QAT Conv1d base and attach the weight fake-quantizer.
+
+        Raises:
+            AssertionError: if ``qconfig`` is falsy (raised here at the latest).
+        """
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+        )
+        # NOTE(review): qconfig is also handed to super().__init__ above; whether
+        # the base class already sets these attributes is not visible here.
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv1d and ReLU."""
+        # the weight is fake-quantized on every call; the bias is used as is
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class ConvBn2d(_ConvBnNd, nn.Conv2d):
+    r"""
+    A ConvBn2d module is a module fused from Conv2d and BatchNorm2d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv2d` and
+    :class:`torch.nn.BatchNorm2d`.
+
+    Similar to :class:`torch.nn.Conv2d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nni.ConvBn2d]] = nni.ConvBn2d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = nn.BatchNorm2d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None
+
+    def __init__(
+        self,
+        # ConvNd args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=None,
+        padding_mode="zeros",
+        # BatchNorm2d args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        _ConvBnNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _pair(0),
+            groups,
+            bias,
+            padding_mode,
+            eps,
+            momentum,
+            freeze_bn,
+            qconfig,
+            dim=2,
+        )
+
+
+class ConvBnReLU2d(ConvBn2d):
+    r"""
+    A ConvBnReLU2d module is a module fused from Conv2d, BatchNorm2d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv2d` and
+    :class:`torch.nn.BatchNorm2d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    # base class defines _FLOAT_MODULE as "ConvBn2d"
+    _FLOAT_MODULE: ClassVar[type[nni.ConvBnReLU2d]] = nni.ConvBnReLU2d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm2d]] = nn.BatchNorm2d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE: ClassVar[type[nni.ConvReLU2d] | None] = nni.ConvReLU2d
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv2d, BatchNorm2d, and ReLU."""
+        return F.relu(self._forward(input))
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(mod, use_precomputed_fake_quant)
+
+
+class ConvReLU2d(nnqat.Conv2d, nni._FusedModule):
+    r"""A ConvReLU2d module is a fused module of Conv2d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv2d` and
+    :class:`~torch.nn.ReLU`.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Module]] = nni.ConvReLU2d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        qconfig=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv2d and ReLU."""
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class ConvBn3d(_ConvBnNd, nn.Conv3d):
+    r"""
+    A ConvBn3d module is a module fused from Conv3d and BatchNorm3d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv3d` and
+    :class:`torch.nn.BatchNorm3d`.
+
+    Similar to :class:`torch.nn.Conv3d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nni.ConvBn3d]] = nni.ConvBn3d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = nn.BatchNorm3d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = None
+
+    def __init__(
+        self,
+        # ConvNd args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=None,
+        padding_mode="zeros",
+        # BatchNorm3d args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        _ConvBnNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _triple(0),
+            groups,
+            bias,
+            padding_mode,
+            eps,
+            momentum,
+            freeze_bn,
+            qconfig,
+            dim=3,
+        )
+
+
+class ConvBnReLU3d(ConvBn3d):
+    r"""
+    A ConvBnReLU3d module is a module fused from Conv3d, BatchNorm3d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv3d` and
+    :class:`torch.nn.BatchNorm3d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nni.ConvBnReLU3d]] = nni.ConvBnReLU3d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.BatchNorm3d]] = nn.BatchNorm3d
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.ReLU] | None] = nn.ReLU
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE: ClassVar[type[nni.ConvReLU3d] | None] = nni.ConvReLU3d
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv3d, BatchNorm3d, and ReLU."""
+        return F.relu(ConvBn3d._forward(self, input))
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class ConvReLU3d(nnqat.Conv3d, nni._FusedModule):
+    r"""A ConvReLU3d module is a fused module of Conv3d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv3d` and
+    :class:`~torch.nn.ReLU`.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nni.ConvReLU3d]] = nni.ConvReLU3d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d
+    _FLOAT_BN_MODULE: ClassVar[type[nn.Module] | None] = None
+    _FLOAT_RELU_MODULE: ClassVar[type[nn.Module] | None] = nn.ReLU
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        qconfig=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        r"""Performs forward pass through fused Conv3d and ReLU."""
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a QAT module from a floating point module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+def update_bn_stats(mod):
+    if type(mod) in {
+        ConvBnReLU1d,
+        ConvBnReLU2d,
+        ConvBnReLU3d,
+        ConvBn1d,
+        ConvBn2d,
+        ConvBn3d,
+    }:
+        mod.update_bn_stats()
+
+
+def freeze_bn_stats(mod):
+    if type(mod) in {
+        ConvBnReLU1d,
+        ConvBnReLU2d,
+        ConvBnReLU3d,
+        ConvBn1d,
+        ConvBn2d,
+        ConvBn3d,
+    }:
+        mod.freeze_bn_stats()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..8458cef76ee3a37bce33d924d2d60d2ca971a614
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
@@ -0,0 +1,191 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init
+from torch.nn.parameter import Parameter
+from torch.nn.utils.fusion import fuse_linear_bn_weights
+
+
+__all__ = [
+    "LinearBn1d",
+]
+
+
+class LinearBn1d(nn.modules.linear.Linear, nni._FusedModule):
+    r"""
+    A LinearBn1d module is a module fused from Linear and BatchNorm1d, attached
+    with FakeQuantize modules for weight, used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Linear` and
+    :class:`torch.nn.BatchNorm1d`.
+
+    Similar to :class:`torch.nn.Linear`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn: True while the BatchNorm statistics are frozen (bn kept in eval mode)
+        weight_fake_quant: fake quant module for weight
+
+    """
+
+    def __init__(
+        self,
+        # Linear args
+        in_features,
+        out_features,
+        bias=True,
+        # BatchNorm1d args
+        # num_features: out_features
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        nn.modules.linear.Linear.__init__(self, in_features, out_features, bias)
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.freeze_bn = freeze_bn if self.training else True
+        self.bn = nn.BatchNorm1d(out_features, eps, momentum, True, True)
+        self.weight_fake_quant = self.qconfig.weight()
+        if bias:
+            self.bias = Parameter(torch.empty(out_features))
+        else:
+            self.register_parameter("bias", None)
+        self.reset_bn_parameters()
+
+        # this needs to be called after reset_bn_parameters,
+        # as they modify the same state
+        if self.training:
+            if freeze_bn:
+                self.freeze_bn_stats()
+            else:
+                self.update_bn_stats()
+        else:
+            self.freeze_bn_stats()
+
+    def reset_running_stats(self):
+        self.bn.reset_running_stats()
+
+    def reset_bn_parameters(self):
+        self.bn.reset_running_stats()
+        init.uniform_(self.bn.weight)
+        init.zeros_(self.bn.bias)
+
+    def update_bn_stats(self):
+        self.freeze_bn = False
+        self.bn.training = True
+        return self
+
+    def freeze_bn_stats(self):
+        self.freeze_bn = True
+        self.bn.training = False
+        return self
+
+    def forward(self, input):
+        r"""Runs Linear with the BN-scaled, fake-quantized weight, then BatchNorm1d."""
+        assert self.bn.running_var is not None
+
+        # Scale the linear weights by BN's running statistics to reduce
+        # weight jitter, see https://arxiv.org/pdf/1806.08342.pdf, page 18
+        # for motivation.
+        #
+        # Instead of
+        #
+        #   x1 = F.linear(x0, fq(w), b)
+        #   x2 = self.bn(x1)
+        #
+        # We have
+        #
+        #   # scale the weight by previous batch's running statistics
+        #   scale_factor = bn.w / bn.running_std_from_prev_batch
+        #   # do the linear transformation without bias
+        #   x1_scaled = F.linear(x0, fq(w * scale_factor), 0)
+        #   # reverse the scaling and add original bias
+        #   x1_orig = x1_scaled / scale_factor + b
+        #   x2 = self.bn(x1_orig)
+
+        running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+        scale_factor = self.bn.weight / running_std
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = self.weight_fake_quant(
+            self.weight * scale_factor.reshape(weight_shape)
+        )
+        if self.bias is not None:
+            zero_bias = torch.zeros_like(self.bias)
+        else:
+            zero_bias = scaled_weight.new_zeros(self.out_features)  # match weight dtype
+        linear_out = F.linear(input, scaled_weight, zero_bias)
+        linear_out_orig = linear_out / scale_factor.reshape(bias_shape)
+        if self.bias is not None:
+            linear_out_orig = linear_out_orig + self.bias.reshape(bias_shape)
+        return self.bn(linear_out_orig)
+
+    def train(self, mode=True):
+        """
+        Batchnorm's training behavior is using the self.training flag. Prevent
+        changing it if BN is frozen. This makes sure that calling `model.train()`
+        on a model with a frozen BN will behave properly.
+        """
+        self.training = mode
+        if not self.freeze_bn:
+            for module in self.children():
+                module.train(mode)
+        return self
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a qat module from a float module or qparams_dict
+
+        Args:
+            mod: A float module, either produced by torch.ao.quantization
+                utilities or directly from the user.
+        """
+        assert type(mod) is nni.LinearBn1d, (
+            "qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + nni.LinearBn1d.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid config"
+        qconfig = mod.qconfig
+        linear, bn = mod[0], mod[1]
+        qat_linearbn = cls(
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            bn.eps,
+            bn.momentum,
+            False,
+            qconfig,
+        )
+        qat_linearbn.weight = linear.weight  # type: ignore[assignment]
+        qat_linearbn.bias = linear.bias  # type: ignore[assignment]
+        qat_linearbn.bn.weight = bn.weight  # type: ignore[assignment]
+        qat_linearbn.bn.bias = bn.bias  # type: ignore[assignment]
+        qat_linearbn.bn.running_mean = bn.running_mean  # type: ignore[assignment]
+        qat_linearbn.bn.running_var = bn.running_var  # type: ignore[assignment]
+        qat_linearbn.bn.num_batches_tracked = bn.num_batches_tracked  # type: ignore[assignment]
+        return qat_linearbn
+
+    def to_float(self):
+        linear = torch.nn.Linear(self.in_features, self.out_features)
+        assert self.bn.running_var is not None and self.bn.running_mean is not None
+        linear.weight, linear.bias = fuse_linear_bn_weights(
+            self.weight,
+            self.bias,
+            self.bn.running_mean,
+            self.bn.running_var,
+            self.bn.eps,
+            self.bn.weight,
+            self.bn.bias,
+        )
+        return linear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..183286ebb8dad25e49cd2fcd7c2dba2436003823
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.qat as nnqat
+import torch.nn.functional as F
+from torch.ao.nn.intrinsic.modules.fused import _FusedModule
+
+
+if TYPE_CHECKING:
+    from torch.ao.quantization.qconfig import QConfigAny
+
+
+__all__ = ["LinearReLU"]
+
+
+class LinearReLU(nnqat.Linear, _FusedModule):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules, attached with
+    FakeQuantize modules for weight, used in
+    quantization aware training.
+
+    We adopt the same interface as :class:`torch.nn.Linear`.
+
+    Similar to `torch.ao.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.qat.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    # pyrefly: ignore [bad-override]
+    _FLOAT_MODULE = nni.LinearReLU
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        qconfig: QConfigAny = None,
+    ) -> None:
+        super().__init__(in_features, out_features, bias, qconfig)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias))
+
+    @classmethod
+    def from_float(
+        cls,
+        mod: torch.nn.Module,
+        use_precomputed_fake_quant: bool = False,
+    ) -> LinearReLU:
+        return super().from_float(mod, use_precomputed_fake_quant)  # type: ignore[no-untyped-call,no-any-return]
+
+    def to_float(self) -> nni.LinearReLU:
+        linear = torch.nn.Linear(
+            self.in_features, self.out_features, self.bias is not None
+        )
+        linear.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            linear.bias = torch.nn.Parameter(self.bias.detach())
+        relu = torch.nn.ReLU()
+        return torch.ao.nn.intrinsic.LinearReLU(linear, relu)  # type: ignore[no-untyped-call]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6af3b4aeee893966323cc4e73a27ff41814fc251
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py
@@ -0,0 +1,15 @@
+from .modules import *  # noqa: F403
+
+
+__all__ = [
+    "BNReLU2d",
+    "BNReLU3d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "LinearReLU",
+    "LinearLeakyReLU",
+    "LinearTanh",
+    "ConvAdd2d",
+    "ConvAddReLU2d",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10580962ae775a5a49ac465319aaa6fd4edf2f11
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..956eef6919409cf2426f95790f2f066faeaff6cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a6c3c57c7828861b574e76b134aee2c23f0aad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py
@@ -0,0 +1,6 @@
+from .linear_relu import LinearReLU
+
+
+__all__ = [
+    "LinearReLU",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0e26bfd4772a75c5cb704d49b6d765155d4fb3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2debf206f3a9c51d4625a0676dc559e0c21add3d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..620d24ae43e466ecd7883acf7df627641ebfdb24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
@@ -0,0 +1,72 @@
+from typing import Any
+from typing_extensions import Self
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.quantized.dynamic as nnqd
+
+
+__all__ = ["LinearReLU"]
+
+
+class LinearReLU(nnqd.Linear):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules that can be used
+    for dynamic quantization.
+    Supports both FP16 and INT8 quantization.
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.dynamic.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.dynamic.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.intrinsic.quantized.dynamic.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    # pyrefly: ignore [bad-override]
+    _FLOAT_MODULE = nni.LinearReLU
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        dtype: torch.dtype = torch.qint8,
+    ) -> None:
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self._packed_params.dtype == torch.qint8:
+            # TODO check if we should set reduce_range = True by default here
+            Y = torch.ops.quantized.linear_relu_dynamic(
+                x, self._packed_params._packed_params, reduce_range=True
+            )
+        elif self._packed_params.dtype == torch.float16:
+            Y = torch.ops.quantized.linear_relu_dynamic_fp16(
+                x, self._packed_params._packed_params
+            )
+        else:
+            raise RuntimeError("Unsupported dtype on dynamic quantized linear relu!")
+        return Y.to(x.dtype)
+
+    def _get_name(self) -> str:
+        return "DynamicQuantizedLinearReLU"
+
+    @classmethod
+    def from_float(
+        cls, mod: torch.nn.Module, use_precomputed_fake_quant: bool = False
+    ) -> Self:
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_qlinear_relu: Any) -> Self:  # type: ignore[override]
+        return super().from_reference(ref_qlinear_relu[0])
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7fa4dcec2597e18c002489405894ea7251d5156
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py
@@ -0,0 +1,18 @@
+from .bn_relu import BNReLU2d, BNReLU3d
+from .conv_add import ConvAdd2d, ConvAddReLU2d
+from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
+from .linear_relu import LinearLeakyReLU, LinearReLU, LinearTanh
+
+
+__all__ = [
+    "LinearReLU",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "BNReLU2d",
+    "BNReLU3d",
+    "LinearLeakyReLU",
+    "LinearTanh",
+    "ConvAdd2d",
+    "ConvAddReLU2d",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57fbe9f680f564c2be42a8155f1c615a622afcf3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c9bbce6d61bde4e58f90703f5e5f3b2d5464220
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d50d2c92d0aae6038fd9a2ce8d9dbeaffbbe61f2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33e90ba6621d722045b0b5491b9cfd292f779110
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f697d44edac8de37994418f7c93c0fcdf2199d1b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f05618c0949e1164f05cbd1edbfb8eb6440063e9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
@@ -0,0 +1,113 @@
+# mypy: allow-untyped-defs
+
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.ao.nn.quantized as nnq
+
+
+__all__ = ["BNReLU2d", "BNReLU3d"]
+
+
+class BNReLU2d(nnq.BatchNorm2d):
+    r"""
+    A BNReLU2d module is a fused module of BatchNorm2d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.BatchNorm2d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU2d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        super().__init__(
+            num_features, eps=eps, momentum=momentum, device=device, dtype=dtype
+        )
+
+    def forward(self, input):
+        r"""Applies fused BatchNorm2d and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        return torch.ops.quantized.batch_norm2d_relu(
+            input,
+            self.weight,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+            self.eps,
+            self.scale,
+            self.zero_point,
+        )
+
+    def _get_name(self):
+        return "QuantizedBNReLU2d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        # TODO: Add qat support for BNReLU2d
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, bn_relu, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        return super().from_reference(bn_relu[0], output_scale, output_zero_point)
+
+
+class BNReLU3d(nnq.BatchNorm3d):
+    r"""
+    A BNReLU3d module is a fused module of BatchNorm3d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm3d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.BatchNorm3d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU3d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        super().__init__(
+            num_features, eps=eps, momentum=momentum, device=device, dtype=dtype
+        )
+
+    def forward(self, input):
+        r"""Applies fused BatchNorm3d and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        return torch.ops.quantized.batch_norm3d_relu(
+            input,
+            self.weight,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+            self.eps,
+            self.scale,
+            self.zero_point,
+        )
+
+    def _get_name(self):
+        return "QuantizedBNReLU3d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        # TODO: Add qat support for BNReLU3d
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, bn_relu, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        return super().from_reference(bn_relu[0], output_scale, output_zero_point)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..82d5673e7173c56b5b56d2bd48a0b154bbfdfe9e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
@@ -0,0 +1,153 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.ao.nn.quantized as nnq
+import torch.nn.functional as F
+
+
+_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding
+
+
+class ConvAdd2d(nnq.Conv2d):
+    r"""
+    A ConvAdd2d module is a fused module of Conv2d and Add
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAdd2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input, extra_input):  # type: ignore[override]
+        r"""Applies fused quantized Conv2d and addition."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != "zeros":
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return torch.ops.quantized.conv2d_add(
+            input, extra_input, self._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedConvAdd2d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+
+class ConvAddReLU2d(nnq.Conv2d):
+    r"""
+    A ConvAddReLU2d module is a fused module of Conv2d, Add and Relu
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAddReLU2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input, extra_input):  # type: ignore[override]
+        r"""Applies fused quantized Conv2d, addition, and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != "zeros":
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return torch.ops.quantized.conv2d_add_relu(
+            input, extra_input, self._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedConvAddReLU2d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c31df28905cd7c9c17147c965f5bd2199af2920a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
@@ -0,0 +1,276 @@
+# mypy: allow-untyped-defs
+
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.ao.nn.quantized as nnq
+import torch.nn.functional as F
+from torch.nn.utils import fuse_conv_bn_weights
+
+
+__all__ = [
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+]
+
+_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding
+
+
+# TODO: factor out the common parts to ConvNd
+class ConvReLU1d(nnq.Conv1d):
+    r"""
+    A ConvReLU1d module is a fused module of Conv1d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv1d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv1d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU1d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input):
+        r"""Applies fused quantized Conv1d and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != "zeros":
+            # Padding in Conv1d is stored as (p, p), need to get (p,)
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return torch.ops.quantized.conv1d_relu(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedConvReLU1d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU1d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight,
+                mod.bias,
+                mod.bn.running_mean,
+                mod.bn.running_var,
+                mod.bn.eps,
+                mod.bn.weight,
+                mod.bn.bias,
+            )
+        return super().from_float(mod, use_precomputed_fake_quant)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        assert type(ref_qconv) is not torch.ao.nn.intrinsic.ConvBnReLU1d, (
+            "BatchNorm1d should be fused into Conv1d before converting to reference module"
+        )
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+
+class ConvReLU2d(nnq.Conv2d):
+    r"""
+    A ConvReLU2d module is a fused module of Conv2d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input):
+        r"""Applies fused quantized Conv2d and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != "zeros":
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return torch.ops.quantized.conv2d_relu(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedConvReLU2d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU2d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight,
+                mod.bias,
+                mod.bn.running_mean,
+                mod.bn.running_var,
+                mod.bn.eps,
+                mod.bn.weight,
+                mod.bn.bias,
+            )
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        assert type(ref_qconv) is not torch.ao.nn.intrinsic.ConvBnReLU2d, (
+            "BatchNorm2d should be fused into Conv2d before converting to reference module"
+        )
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+
+class ConvReLU3d(nnq.Conv3d):
+    r"""
+    A ConvReLU3d module is a fused module of Conv3d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv3d`.
+
+    Attributes: Same as torch.ao.nn.quantized.Conv3d
+
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU3d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        assert padding_mode != "reflect", "Conv3d does not support reflection padding"
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input):
+        r"""Applies fused quantized Conv3d and ReLU."""
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != "zeros":
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return torch.ops.quantized.conv3d_relu(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedConvReLU3d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module."""
+        if type(mod) is torch.ao.nn.intrinsic.qat.ConvBnReLU3d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight,
+                mod.bias,
+                mod.bn.running_mean,
+                mod.bn.running_var,
+                mod.bn.eps,
+                mod.bn.weight,
+                mod.bn.bias,
+            )
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Creates a quantized module from a reference module."""
+        assert type(ref_qconv) is not torch.ao.nn.intrinsic.ConvBnReLU3d, (
+            "BatchNorm3d should be fused into Conv3d before converting to reference module"
+        )
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec84101ee0da62e3923362f444368b2a429d8b3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
@@ -0,0 +1,190 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.quantized as nnq
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+
+
+__all__ = [
+    "LinearReLU",
+    "LinearLeakyReLU",
+    "LinearTanh",
+]
+
+
+class LinearReLU(nnq.Linear):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.intrinsic.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    _FLOAT_MODULE = nni.LinearReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_relu(
+            x, self._packed_params._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedLinearReLU"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        return super().from_float(mod, use_precomputed_fake_quant)
+
+    @classmethod
+    def from_reference(cls, ref_linear_relu, output_scale, output_zero_point):
+        return super().from_reference(
+            ref_linear_relu[0], output_scale, output_zero_point
+        )
+
+
+class LinearLeakyReLU(nnq.Linear):
+    r"""
+    For onednn backend only
+    A LinearLeakyReLU module fused from Linear and LeakyReLU modules
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+        + negative_slope
+    Examples::
+        >>> # xdoctest: +SKIP
+        >>> m = nn.intrinsic.LinearLeakyReLU(20, 30, 0.01)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    _FLOAT_MODULE = nni.LinearLeakyReLU  # type: ignore[assignment]
+
+    def __init__(
+        self, in_features, out_features, negative_slope, bias=True, dtype=torch.qint8
+    ):
+        super().__init__(in_features, out_features, bias, dtype)
+        self.negative_slope = negative_slope
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_leaky_relu(
+            x,
+            self._packed_params._packed_params,
+            self.scale,
+            self.zero_point,
+            self.negative_slope,
+        )
+
+    def _get_name(self):
+        return "QuantizedLinearLeakyReLU"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        assert type(mod) is nni.LinearLeakyReLU, (
+            "Input float module should be LinearLeakyReLU"
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        activation_post_process = mod.activation_post_process
+        leaky_relu = mod[1]
+        mod = mod[0]
+        weight_post_process = mod.qconfig.weight()  # type: ignore[union-attr, operator]
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[union-attr,operator]
+        assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8"
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear_leaky_relu = cls(
+            mod.in_features, mod.out_features, leaky_relu.negative_slope, dtype=dtype
+        )
+        qlinear_leaky_relu.set_weight_bias(qweight, mod.bias)  # type: ignore[arg-type]
+        qlinear_leaky_relu.scale = float(act_scale)
+        qlinear_leaky_relu.zero_point = int(act_zp)
+        return qlinear_leaky_relu
+
+    @classmethod
+    def from_reference(cls, ref_mod, output_scale, output_zero_point):
+        linear = ref_mod[0]
+        leaky_relu = ref_mod[1]
+        qlinear_leaky_relu = cls(
+            linear.in_features, linear.out_features, leaky_relu.negative_slope
+        )
+        qweight = linear.get_quantized_weight()
+        qlinear_leaky_relu.set_weight_bias(qweight, linear.bias)
+        qlinear_leaky_relu.scale = float(output_scale)
+        qlinear_leaky_relu.zero_point = int(output_zero_point)
+        return qlinear_leaky_relu
+
+
+class LinearTanh(nnq.Linear):
+    r"""
+    A LinearTanh module fused from Linear and Tanh modules
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.intrinsic.LinearTanh(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    _FLOAT_MODULE = nni.LinearTanh  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_tanh(
+            x, self._packed_params._packed_params, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedLinearTanh"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        assert type(mod) is nni.LinearTanh, "Input float module should be LinearTanh"
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        activation_post_process = mod.activation_post_process
+        mod = mod[0]
+        weight_post_process = mod.qconfig.weight()  # type: ignore[union-attr,operator]
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[union-attr,operator]
+        assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8"
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear_tanh = cls(mod.in_features, mod.out_features, dtype=dtype)
+        qlinear_tanh.set_weight_bias(qweight, mod.bias)  # type: ignore[arg-type]
+        qlinear_tanh.scale = float(act_scale)
+        qlinear_tanh.zero_point = int(act_zp)
+        return qlinear_tanh
+
+    @classmethod
+    def from_reference(cls, ref_mod, output_scale, output_zero_point):
+        linear = ref_mod[0]
+        qlinear_tanh = cls(linear.in_features, linear.out_features)
+        qweight = linear.get_quantized_weight()
+        qlinear_tanh.set_weight_bias(qweight, linear.bias)
+        qlinear_tanh.scale = float(output_scale)
+        qlinear_tanh.zero_point = int(output_zero_point)
+        return qlinear_tanh
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e28e0968a60d7612ebbd26d5f607b4407c2d380
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/__init__.py
@@ -0,0 +1,13 @@
+from .conv import Conv1d, Conv2d, Conv3d
+from .embedding_ops import Embedding, EmbeddingBag
+from .linear import Linear
+
+
+__all__ = [
+    "Linear",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "Embedding",
+    "EmbeddingBag",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d228d56fce129860f0ebad805b042771b941804
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/conv.py
@@ -0,0 +1,312 @@
+# mypy: allow-untyped-defs
+from typing import ClassVar, Literal
+
+import torch
+import torch.nn as nn
+from torch.ao.nn.intrinsic import _FusedModule
+from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
+from torch.nn.modules.utils import _pair, _single, _triple
+
+
+__all__ = ["Conv1d", "Conv2d", "Conv3d"]
+
+
+class _ConvNd(nn.modules.conv._ConvNd):  # QAT conv base: the weight passes through weight_fake_quant before the conv
+    _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]]  # float type that from_float accepts; assigned by subclasses
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple[int, ...],
+        stride: tuple[int, ...],
+        padding: str | tuple[int, ...],
+        dilation: tuple[int, ...],
+        transposed: bool,
+        output_padding: tuple[int, ...],
+        groups: int,
+        bias: bool,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"],
+        qconfig=None,  # effectively required despite the None default; see the assert below
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        nn.modules.conv._ConvNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"  # NOTE: asserts are skipped under python -O
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)  # created with the same device/dtype kwargs as the conv
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)  # only the weight is fake-quantized; bias is passed as-is
+
+    @staticmethod  # staticmethod that takes cls explicitly; the subclasses' classmethods forward their cls to it
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # use_precomputed_fake_quant is accepted but never read here
+        r"""Create a qat module from a float module
+
+        Args:
+           `mod`: a float module, either produced by torch.ao.quantization utilities
+           or directly from user
+        """
+        assert type(mod) is cls._FLOAT_MODULE, (  # exact type match: subclasses of _FLOAT_MODULE are rejected
+            "qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        if issubclass(type(mod), _FusedModule):  # given the exact-type assert, only true when _FLOAT_MODULE is itself a fused type
+            mod = mod[0]  # take the first submodule of the fused module
+        qconfig = mod.qconfig
+        qat_conv = cls(
+            mod.in_channels,
+            mod.out_channels,
+            mod.kernel_size,
+            stride=mod.stride,
+            padding=mod.padding,
+            dilation=mod.dilation,
+            groups=mod.groups,
+            bias=mod.bias is not None,
+            padding_mode=mod.padding_mode,
+            qconfig=qconfig,
+        )
+        qat_conv.weight = mod.weight  # same parameter objects as the float module (assigned, not copied)
+        qat_conv.bias = mod.bias
+        return qat_conv
+
+    def to_float(self):
+        """Convert this qat module to a floating point module.
+        Handles both a single qat conv and the fused qat conv - relu modules.
+        """
+        cls = type(self)
+        conv = cls._FLOAT_CONV_MODULE(  # type: ignore[attr-defined]
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.bias is not None,
+            self.padding_mode,
+        )
+        conv.weight = torch.nn.Parameter(self.weight.detach())  # new Parameter object wrapping the detached weight
+        if self.bias is not None:
+            conv.bias = torch.nn.Parameter(self.bias.detach())
+        # fused conv + relu case: wrap the float conv and a relu in cls._FLOAT_MODULE
+        if issubclass(cls, _FusedModule):
+            modules = [conv]
+            assert hasattr(cls, "_FLOAT_RELU_MODULE")
+            relu = cls._FLOAT_RELU_MODULE()
+            modules.append(relu)
+            # pyrefly: ignore [missing-attribute]
+            fused = cls._FLOAT_MODULE(*modules)
+            fused.train(self.training)  # carry over this module's train/eval flag
+            return fused
+        else:
+            return conv  # NOTE(review): unlike the fused branch, train(self.training) is not applied here
+
+
+class Conv1d(_ConvNd, nn.Conv1d):
+    r"""
+    A Conv1d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as :class:`~torch.nn.Conv1d`
+
+    Similar to :class:`~torch.nn.Conv1d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d  # type checked by from_float
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d  # type instantiated by to_float
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: str | _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        qconfig=None,  # must be truthy; _ConvNd.__init__ asserts on it
+        device=None,
+        dtype=None,
+    ) -> None:
+        kernel_size_ = _single(kernel_size)  # normalize to the tuple form _ConvNd.__init__ is annotated to take
+        stride_ = _single(stride)
+        padding_ = padding if isinstance(padding, str) else _single(padding)  # string padding is passed through unchanged
+        dilation_ = _single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,  # always a regular (non-transposed) convolution
+            output_padding=_single(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        return super().from_float(  # the base from_float is a staticmethod, so cls is passed explicitly
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class Conv2d(_ConvNd, nn.Conv2d):
+    r"""
+    A Conv2d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv2d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv2d#torch.nn.Conv2d
+    for documentation.
+
+    Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d  # type checked by from_float
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d  # type instantiated by to_float
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: str | _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        qconfig=None,  # must be truthy; _ConvNd.__init__ asserts on it
+        device=None,
+        dtype=None,
+    ) -> None:
+        kernel_size_ = _pair(kernel_size)  # normalize to the tuple form _ConvNd.__init__ is annotated to take
+        stride_ = _pair(stride)
+        padding_ = padding if isinstance(padding, str) else _pair(padding)  # string padding is passed through unchanged
+        dilation_ = _pair(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,  # always a regular (non-transposed) convolution
+            output_padding=_pair(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input):  # same body as _ConvNd.forward
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        return super().from_float(  # the base from_float is a staticmethod, so cls is passed explicitly
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class Conv3d(_ConvNd, nn.Conv3d):
+    r"""
+    A Conv3d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv3d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv3d#torch.nn.Conv3d
+    for documentation.
+
+    Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d  # type checked by from_float
+    _FLOAT_CONV_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d  # type instantiated by to_float
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: str | _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        qconfig=None,  # must be truthy; _ConvNd.__init__ asserts on it
+        device=None,
+        dtype=None,
+    ) -> None:
+        kernel_size_ = _triple(kernel_size)  # normalize to the tuple form _ConvNd.__init__ is annotated to take
+        stride_ = _triple(stride)
+        padding_ = padding if isinstance(padding, str) else _triple(padding)  # string padding is passed through unchanged
+        dilation_ = _triple(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,  # always a regular (non-transposed) convolution
+            output_padding=_triple(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, input):  # same body as _ConvNd.forward
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        return super().from_float(  # the base from_float is a staticmethod, so cls is passed explicitly
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f69e70abcf1d43c4a96ca15dae355c31f66a627
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/embedding_ops.py
@@ -0,0 +1,251 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+
+__all__ = ["Embedding", "EmbeddingBag"]
+
+
+class Embedding(nn.Embedding):
+    r"""
+    An embedding module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Embedding`, please see
+    https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
+    for documentation.
+
+    Similar to `torch.nn.Embedding`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE = nn.Embedding  # exact float type that from_float accepts
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        sparse=False,
+        _weight=None,
+        device=None,
+        dtype=None,
+        qconfig=None,  # effectively required despite the None default; see the asserts below
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            padding_idx,
+            max_norm,
+            norm_type,
+            scale_grad_by_freq,
+            sparse,
+            _weight,
+            # pyrefly: ignore [bad-argument-type]
+            **factory_kwargs,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"  # NOTE: asserts are skipped under python -O
+        assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, (  # calls the weight factory only to read qscheme
+            "Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got "
+            + str(qconfig.weight().qscheme)
+        )
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)  # the instance actually used in forward
+
+    def forward(self, input) -> Tensor:
+        return F.embedding(
+            input,
+            self.weight_fake_quant(self.weight),  # look up rows of the fake-quantized weight
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # use_precomputed_fake_quant is accepted but never read here
+        r"""Create a qat module from a float module
+
+        Args: `mod` a float module, either produced by torch.ao.quantization utilities
+        or directly from user
+        """
+        assert type(mod) is cls._FLOAT_MODULE, (  # exact type match: subclasses of _FLOAT_MODULE are rejected
+            " qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        weight_qscheme = mod.qconfig.weight().qscheme  # type: ignore[union-attr, operator]
+        assert weight_qscheme == torch.per_channel_affine_float_qparams, (
+            "Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got "
+            + str(weight_qscheme)
+        )
+
+        qconfig = mod.qconfig
+        qat_embedding_bag = cls(  # despite the local name, this is an instance of cls (an Embedding)
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.padding_idx,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.sparse,
+            mod.weight,  # lands in the _weight parameter of __init__
+            qconfig=qconfig,
+        )
+
+        return qat_embedding_bag
+
+    def to_float(self):
+        embedding_bag = torch.nn.Embedding(  # despite the local name, a plain float nn.Embedding
+            self.num_embeddings,
+            self.embedding_dim,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+            None,  # no initial weight supplied; the weight is assigned right below
+        )
+        embedding_bag.weight = torch.nn.Parameter(self.weight.detach())  # new Parameter object wrapping the detached weight
+        embedding_bag.train(self.training)  # carry over this module's train/eval flag
+        return embedding_bag
+
+
+class EmbeddingBag(nn.EmbeddingBag):
+    r"""
+    An embedding bag module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.EmbeddingBag`, please see
+    https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag
+    for documentation.
+
+    Similar to `torch.nn.EmbeddingBag`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE = nn.EmbeddingBag  # exact float type that from_float accepts
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        mode="mean",
+        sparse=False,
+        _weight=None,
+        include_last_offset=False,
+        padding_idx=None,
+        qconfig=None,  # effectively required despite the None default; see the asserts below
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            max_norm,
+            norm_type,
+            scale_grad_by_freq,
+            mode,
+            sparse,
+            _weight,
+            include_last_offset,
+            padding_idx,
+            **factory_kwargs,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"  # NOTE: asserts are skipped under python -O
+        assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, (  # calls the weight factory only to read qscheme
+            "Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got "
+            + str(qconfig.weight().qscheme)
+        )
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)  # the instance actually used in forward
+
+    def forward(self, input, offsets=None, per_sample_weights=None) -> Tensor:
+        return F.embedding_bag(
+            input,
+            self.weight_fake_quant(self.weight),  # bag over rows of the fake-quantized weight
+            offsets,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.mode,
+            self.sparse,
+            per_sample_weights,
+            self.include_last_offset,
+            self.padding_idx,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # use_precomputed_fake_quant is accepted but never read here
+        r"""Create a qat module from a float module
+
+        Args: `mod` a float module, either produced by torch.ao.quantization utilities
+        or directly from user
+        """
+        assert type(mod) is cls._FLOAT_MODULE, (  # exact type match: subclasses of _FLOAT_MODULE are rejected
+            " qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        weight_qscheme = mod.qconfig.weight().qscheme  # type: ignore[union-attr, operator]
+        assert weight_qscheme == torch.per_channel_affine_float_qparams, (
+            "Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got "
+            + str(weight_qscheme)
+        )
+
+        qconfig = mod.qconfig
+        qat_embedding_bag = cls(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.mode,
+            mod.sparse,
+            mod.weight,  # lands in the _weight parameter of __init__
+            mod.include_last_offset,
+            mod.padding_idx,
+            qconfig=qconfig,
+        )
+
+        return qat_embedding_bag
+
+    def to_float(self):
+        embedding_bag = torch.nn.EmbeddingBag(
+            self.num_embeddings,
+            self.embedding_dim,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.mode,
+            self.sparse,
+            None,  # no initial weight supplied; the weight is assigned right below
+            self.include_last_offset,
+            self.padding_idx,
+        )
+        embedding_bag.weight = torch.nn.Parameter(self.weight.detach())  # new Parameter object wrapping the detached weight
+        embedding_bag.train(self.training)  # carry over this module's train/eval flag
+        return embedding_bag
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..5edf16ed3ea53d0323eda248b95703d5245b1786
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/qat/modules/linear.py
@@ -0,0 +1,97 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.ao.nn.intrinsic import LinearReLU
+from torch.nn.utils.parametrize import (
+    is_parametrized,
+    transfer_parametrizations_and_params,
+    type_before_parametrizations,
+)
+
+
+__all__ = ["Linear"]
+
+
+class Linear(nn.Linear):
+    r"""
+    A linear module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+    for documentation.
+
+    Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+
+    _FLOAT_MODULE = nn.Linear  # float type that from_float accepts
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        bias=True,
+        qconfig=None,  # effectively required despite the None default; see the assert below
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(in_features, out_features, bias, **factory_kwargs)
+        assert qconfig, "qconfig must be provided for QAT module"  # NOTE: asserts are skipped under python -O
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)  # created with the same device/dtype kwargs as the layer
+
+    def forward(self, input):
+        return F.linear(input, self.weight_fake_quant(self.weight), self.bias)  # only the weight is fake-quantized; bias is passed as-is
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # use_precomputed_fake_quant is accepted but never read here
+        r"""Create a qat module from a float module.
+        Args: `mod` a float module, either produced by torch.ao.quantization utilities
+        or directly from user
+        """
+        assert type_before_parametrizations(mod) == cls._FLOAT_MODULE, (  # compares the pre-parametrization type of mod
+            " qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        if type_before_parametrizations(mod) == LinearReLU:  # given the assert above, only true when cls._FLOAT_MODULE is LinearReLU
+            mod = mod[0]  # take the first submodule of the fused module
+
+        qconfig = mod.qconfig
+        qat_linear = cls(
+            mod.in_features,
+            mod.out_features,
+            bias=mod.bias is not None,
+            qconfig=qconfig,
+        )
+
+        if is_parametrized(mod, "weight"):
+            transfer_parametrizations_and_params(mod, qat_linear, "weight")  # parametrized weight: move parametrizations across
+        else:
+            qat_linear.weight = mod.weight  # same parameter object as the float module (assigned, not copied)
+
+        if is_parametrized(mod, "bias"):
+            transfer_parametrizations_and_params(mod, qat_linear, "bias")
+        else:
+            qat_linear.bias = mod.bias
+
+        return qat_linear
+
+    def to_float(self):
+        linear = torch.nn.Linear(
+            self.in_features, self.out_features, self.bias is not None
+        )
+        linear.weight = torch.nn.Parameter(self.weight.detach())  # new Parameter object wrapping the detached weight
+        if self.bias is not None:
+            linear.bias = torch.nn.Parameter(self.bias.detach())
+        linear.train(self.training)  # carry over this module's train/eval flag
+        return linear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c08593a717f43bc70cda9fe4596e55ddd2d7204e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..221107660158171ada5d1823cc193666c9e152e7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__init__.py
@@ -0,0 +1,9 @@
+from .activation import MultiheadAttention
+from .rnn import LSTM, LSTMCell
+
+
+__all__ = [
+    "LSTM",
+    "LSTMCell",
+    "MultiheadAttention",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbe78331a171b4402511b4389f9ff5b90d79775b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..217f3dff3cbe037a14783a2a4bf720d5b2528bba
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..975d8bfe0b6a84e9c737d3cb572ff36b60ec0352
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d808d50c366c68b8aa0d61a50b9f6db2d72c9ff2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/activation.py
@@ -0,0 +1,579 @@
+# mypy: allow-untyped-defs
+import warnings
+
+import torch
+import torch.jit  # this is needed to avoid a circular import
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+
+__all__ = ["MultiheadAttention"]
+
+
+class MultiheadAttention(nn.MultiheadAttention):
+    _FLOAT_MODULE = nn.MultiheadAttention
+
+    r"""Quantizable implementation of the MultiheadAttention.
+
+    Note::
+        Please, refer to :class:`~torch.nn.MultiheadAttention` for more
+        information
+
+    Allows the model to jointly attend to information from different
+    representation subspaces.
+    See reference: Attention Is All You Need
+
+    The original MHA module is not quantizable.
+    This reimplements it by explicitly instantiating the linear layers.
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+
+    Args:
+        embed_dim: total dimension of the model.
+        num_heads: parallel attention heads.
+        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
+        bias: add bias as module parameter. Default: True.
+        add_bias_kv: add bias to the key and value sequences at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        kdim: total number of features in key. Default: None.
+        vdim: total number of features in value. Default: None.
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+
+    Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set
+    to :attr:`embed_dim` such that query, key, and value have the same
+    number of features.
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> multihead_attn = nnqa.MultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+
+    Note::
+        Please, follow the quantization flow to convert the quantizable MHA.
+    """
+    __constants__ = ["batch_first"]
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+        add_bias_kv: bool = False,
+        add_zero_attn: bool = False,
+        kdim: int | None = None,
+        vdim: int | None = None,
+        batch_first: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            embed_dim,
+            num_heads,
+            dropout,
+            bias,
+            add_bias_kv,
+            add_zero_attn,
+            kdim,
+            vdim,
+            batch_first,
+            **factory_kwargs,
+        )
+        self.linear_Q = nn.Linear(
+            self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs
+        )
+        self.linear_K = nn.Linear(
+            self.kdim, self.embed_dim, bias=bias, **factory_kwargs
+        )
+        self.linear_V = nn.Linear(
+            self.vdim, self.embed_dim, bias=bias, **factory_kwargs
+        )
+        # for the type: ignore, see https://github.com/pytorch/pytorch/issues/58969
+        self.out_proj = nn.Linear(
+            self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs
+        )  # type: ignore[assignment]
+
+        # Functionals
+        self.q_scaling_product = torch.ao.nn.quantized.FloatFunctional()
+        # note: importing torch.ao.nn.quantized at top creates a circular import
+
+        # Quant/Dequant
+        self.quant_attn_output = torch.ao.quantization.QuantStub()
+        self.quant_attn_output_weights = torch.ao.quantization.QuantStub()
+        self.dequant_q = torch.ao.quantization.DeQuantStub()
+        self.dequant_k = torch.ao.quantization.DeQuantStub()
+        self.dequant_v = torch.ao.quantization.DeQuantStub()
+
+    def _get_name(self):
+        return "QuantizableMultiheadAttention"
+
+    @classmethod
+    def from_float(cls, other):
+        assert type(other) is cls._FLOAT_MODULE
+        assert hasattr(other, "qconfig"), "The float module must have 'qconfig'"
+        # Setting the dropout to 0.0!
+        observed = cls(
+            other.embed_dim,
+            other.num_heads,
+            other.dropout,
+            (other.in_proj_bias is not None),
+            (other.bias_k is not None),
+            other.add_zero_attn,
+            other.kdim,
+            other.vdim,
+            other.batch_first,
+        )
+        observed.bias_k = other.bias_k
+        observed.bias_v = other.bias_v
+        observed.qconfig = other.qconfig
+
+        # Set the linear weights
+        # for the type: ignores, see https://github.com/pytorch/pytorch/issues/58969
+        observed.out_proj.weight = other.out_proj.weight
+        observed.out_proj.bias = other.out_proj.bias
+        if other._qkv_same_embed_dim:
+            # Use separate params
+            bias = other.in_proj_bias
+            _start = 0
+            _end = _start + other.embed_dim
+            weight = other.in_proj_weight[_start:_end, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
+            observed.linear_Q.weight = torch.nn.Parameter(weight, weight.requires_grad)
+            observed.linear_Q.bias = bias
+
+            bias = other.in_proj_bias
+            _start = _end
+            _end = _start + other.embed_dim
+            weight = other.in_proj_weight[_start:_end, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
+            observed.linear_K.weight = torch.nn.Parameter(weight, weight.requires_grad)
+            observed.linear_K.bias = bias
+
+            bias = other.in_proj_bias
+            _start = _end
+            weight = other.in_proj_weight[_start:, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:], bias.requires_grad)
+            observed.linear_V.weight = torch.nn.Parameter(weight, weight.requires_grad)
+            observed.linear_V.bias = bias
+        else:
+            observed.linear_Q.weight = nn.Parameter(other.q_proj_weight)
+            observed.linear_K.weight = nn.Parameter(other.k_proj_weight)
+            observed.linear_V.weight = nn.Parameter(other.v_proj_weight)
+            if other.in_proj_bias is None:
+                # pyrefly: ignore [bad-assignment]
+                observed.linear_Q.bias = None
+                # pyrefly: ignore [bad-assignment]
+                observed.linear_K.bias = None
+                # pyrefly: ignore [bad-assignment]
+                observed.linear_V.bias = None
+            else:
+                observed.linear_Q.bias = nn.Parameter(
+                    other.in_proj_bias[0 : other.embed_dim]
+                )
+                observed.linear_K.bias = nn.Parameter(
+                    other.in_proj_bias[other.embed_dim : (other.embed_dim * 2)]
+                )
+                observed.linear_V.bias = nn.Parameter(
+                    other.in_proj_bias[(other.embed_dim * 2) :]
+                )
+        observed.eval()
+        # Explicit prepare
+        observed = torch.ao.quantization.prepare(observed, inplace=True)
+        return observed
+
+    @torch.jit.unused
+    def dequantize(self):
+        r"""Utility to convert the quantized MHA back to float.
+
+        The motivation for this is that it is not trivial to convert the weights
+        from the format that is used in the quantized version back to the
+        float.
+        """
+        fp = self._FLOAT_MODULE(
+            self.embed_dim,
+            self.num_heads,
+            self.dropout,
+            (self.linear_Q._weight_bias()[1] is not None),  # type: ignore[operator]
+            (self.bias_k is not None),
+            self.add_zero_attn,
+            self.kdim,
+            self.vdim,
+            self.batch_first,
+        )
+        assert fp._qkv_same_embed_dim == self._qkv_same_embed_dim
+        if self.bias_k is not None:
+            fp.bias_k = nn.Parameter(self.bias_k.dequantize())
+        if self.bias_v is not None:
+            fp.bias_v = nn.Parameter(self.bias_v.dequantize())
+
+        # Set the linear weights
+        # Note: Because the linear layers are quantized, mypy does not know how
+        # to deal with them -- might need to ignore the typing checks.
+        # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969
+        w, b = self.out_proj._weight_bias()  # type: ignore[operator, has-type]
+        fp.out_proj.weight = nn.Parameter(w.dequantize())
+        if b is not None:
+            fp.out_proj.bias = nn.Parameter(b)
+
+        wQ, bQ = self.linear_Q._weight_bias()  # type: ignore[operator]
+        wQ = wQ.dequantize()
+        wK, bK = self.linear_K._weight_bias()  # type: ignore[operator]
+        wK = wK.dequantize()
+        wV, bV = self.linear_V._weight_bias()  # type: ignore[operator]
+        wV = wV.dequantize()
+        if fp._qkv_same_embed_dim:
+            # Use separate params
+            _start = 0
+            _end = _start + fp.embed_dim
+            fp.in_proj_weight[_start:_end, :] = wQ
+            if fp.in_proj_bias is not None:
+                # pyrefly: ignore [bad-argument-type]
+                assert all(bQ == 0)
+                fp.in_proj_bias[_start:_end] = bQ
+
+            _start = _end
+            _end = _start + fp.embed_dim
+            fp.in_proj_weight[_start:_end, :] = wK
+            if fp.in_proj_bias is not None:
+                # pyrefly: ignore [bad-argument-type]
+                assert all(bK == 0)
+                fp.in_proj_bias[_start:_end] = bK
+
+            _start = _end
+            fp.in_proj_weight[_start:, :] = wV
+            if fp.in_proj_bias is not None:
+                # pyrefly: ignore [bad-argument-type]
+                assert all(bV == 0)
+                fp.in_proj_bias[_start:] = bV
+        else:
+            fp.q_proj_weight = nn.Parameter(wQ)
+            fp.k_proj_weight = nn.Parameter(wK)
+            fp.v_proj_weight = nn.Parameter(wV)
+            if fp.in_proj_bias is None:
+                # pyrefly: ignore [bad-assignment]
+                self.linear_Q.bias = None
+                # pyrefly: ignore [bad-assignment]
+                self.linear_K.bias = None
+                # pyrefly: ignore [bad-assignment]
+                self.linear_V.bias = None
+            else:
+                fp.in_proj_bias[0 : fp.embed_dim] = bQ
+                fp.in_proj_bias[fp.embed_dim : (fp.embed_dim * 2)] = bK
+                fp.in_proj_bias[(fp.embed_dim * 2) :] = bV
+
+        return fp
+
+    @classmethod
+    def from_observed(cls, other):
+        # The whole flow is float -> observed -> quantized
+        # This class does float -> observed only
+        # See nn.quantized.MultiheadAttention
+        raise NotImplementedError(
+            "It looks like you are trying to prepare an "
+            "MHA module. Please, see "
+            "the examples on quantizable MHAs."
+        )
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        key_padding_mask: Tensor | None = None,
+        need_weights: bool = True,
+        attn_mask: Tensor | None = None,
+        average_attn_weights: bool = True,
+        is_causal: bool = False,
+    ) -> tuple[Tensor, Tensor | None]:
+        r"""
+        Note::
+            Please, refer to :func:`~torch.nn.MultiheadAttention.forward` for more
+            information
+
+        Args:
+            query, key, value: map a query and a set of key-value pairs to an output.
+                See "Attention Is All You Need" for more details.
+            key_padding_mask: if provided, specified padding elements in the key will
+                be ignored by the attention. When given a binary mask and a value is True,
+                the corresponding value on the attention layer will be ignored.
+            need_weights: output attn_output_weights.
+            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+                the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+
+        Shape:
+            - Inputs:
+            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+              the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
+            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+              the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
+            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+              the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
+            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+              If a BoolTensor is provided, the positions with the
+              value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+              3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+              S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
+              positions. If a BoolTensor is provided, positions with ``True``
+              is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+              is provided, it will be added to the attention weight.
+            - is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask.
+              Default: ``False``.
+            - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
+              heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
+              effect when ``need_weights=True.``. Default: True (i.e. average weights across heads)
+
+            - Outputs:
+            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+              E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
+            - attn_output_weights: If ``average_attn_weights=True``, returns attention weights averaged
+              across heads of shape :math:`(N, L, S)`, where N is the batch size, L is the target sequence length,
+              S is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
+              head of shape :math:`(N, num_heads, L, S)`.
+        """
+        return self._forward_impl(
+            query,
+            key,
+            value,
+            key_padding_mask,
+            need_weights,
+            attn_mask,
+            average_attn_weights,
+            is_causal,
+        )
+
+    def _forward_impl(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        key_padding_mask: Tensor | None = None,
+        need_weights: bool = True,
+        attn_mask: Tensor | None = None,
+        average_attn_weights: bool = True,
+        is_causal: bool = False,
+    ) -> tuple[Tensor, Tensor | None]:
+        # This version will not deal with the static key/value pairs.
+        # Keeping it here for future changes.
+        #
+        # TODO: This method has some duplicate lines with the
+        # `torch.nn.functional.multi_head_attention`. Will need to refactor.
+        static_k = None
+        static_v = None
+
+        if attn_mask is not None and is_causal:
+            raise AssertionError("Only allow causal mask or attn_mask")
+
+        if is_causal:
+            raise AssertionError("causal mask not supported by AO MHA module")
+
+        if self.batch_first:
+            query, key, value = (x.transpose(0, 1) for x in (query, key, value))
+
+        tgt_len, bsz, embed_dim_to_check = query.size()
+        assert self.embed_dim == embed_dim_to_check
+        # allow MHA to have different sizes for the feature dimension
+        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
+
+        head_dim = self.embed_dim // self.num_heads
+        assert head_dim * self.num_heads == self.embed_dim, (
+            "embed_dim must be divisible by num_heads"
+        )
+        scaling = float(head_dim) ** -0.5
+
+        q = self.linear_Q(query)
+        k = self.linear_K(key)
+        v = self.linear_V(value)
+
+        q = self.q_scaling_product.mul_scalar(q, scaling)
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.uint8:
+                warnings.warn(
+                    "Byte tensor for `attn_mask` in `nn.MultiheadAttention` is deprecated. "
+                    "Use bool tensor instead.",
+                    stacklevel=3,
+                )
+                attn_mask = attn_mask.to(torch.bool)
+            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, (
+                f"Only float and bool types are supported for attn_mask, not {attn_mask.dtype}"
+            )
+
+            if attn_mask.dim() == 2:
+                attn_mask = attn_mask.unsqueeze(0)
+                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                    raise RuntimeError("The size of the 2D attn_mask is not correct.")
+            elif attn_mask.dim() == 3:
+                if list(attn_mask.size()) != [
+                    bsz * self.num_heads,
+                    query.size(0),
+                    key.size(0),
+                ]:
+                    raise RuntimeError("The size of the 3D attn_mask is not correct.")
+            else:
+                raise RuntimeError(
+                    f"attn_mask's dimension {attn_mask.dim()} is not supported"
+                )
+            # attn_mask's dim is 3 now.
+
+        # convert ByteTensor key_padding_mask to bool
+        if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+            warnings.warn(
+                "Byte tensor for `key_padding_mask` in `nn.MultiheadAttention` is deprecated. "
+                "Use bool tensor instead.",
+                stacklevel=3,
+            )
+            key_padding_mask = key_padding_mask.to(torch.bool)
+        if self.bias_k is not None and self.bias_v is not None:
+            if static_k is None and static_v is None:
+                # Explicitly assert that bias_k and bias_v are not None
+                # in a way that TorchScript can understand.
+                bias_k = self.bias_k
+                assert bias_k is not None
+                bias_v = self.bias_v
+                assert bias_v is not None
+
+                k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+                v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+                if attn_mask is not None:
+                    attn_mask = F.pad(attn_mask, (0, 1))
+                if key_padding_mask is not None:
+                    key_padding_mask = F.pad(key_padding_mask, (0, 1))
+            else:
+                assert static_k is None, "bias cannot be added to static key."
+                assert static_v is None, "bias cannot be added to static value."
+        else:
+            assert self.bias_k is None
+            assert self.bias_v is None
+
+        q = q.contiguous().view(tgt_len, bsz * self.num_heads, head_dim).transpose(0, 1)
+        if k is not None:
+            k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
+        if v is not None:
+            v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
+
+        if static_k is not None:
+            assert static_k.size(0) == bsz * self.num_heads
+            assert static_k.size(2) == head_dim
+            k = static_k
+
+        if static_v is not None:
+            assert static_v.size(0) == bsz * self.num_heads
+            assert static_v.size(2) == head_dim
+            v = static_v
+
+        # pyrefly: ignore [missing-attribute]
+        src_len = k.size(1)
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.add_zero_attn:
+            src_len += 1
+            # pyrefly: ignore [missing-attribute]
+            k_zeros = torch.zeros((k.size(0), 1) + k.size()[2:])
+            # pyrefly: ignore [missing-attribute]
+            if k.is_quantized:
+                k_zeros = torch.quantize_per_tensor(
+                    k_zeros,
+                    # pyrefly: ignore [missing-attribute]
+                    k.q_scale(),
+                    # pyrefly: ignore [missing-attribute]
+                    k.q_zero_point(),
+                    # pyrefly: ignore [missing-attribute]
+                    k.dtype,
+                )
+            # pyrefly: ignore [no-matching-overload]
+            k = torch.cat([k, k_zeros], dim=1)
+            # pyrefly: ignore [missing-attribute]
+            v_zeros = torch.zeros((v.size(0), 1) + k.size()[2:])
+            # pyrefly: ignore [missing-attribute]
+            if v.is_quantized:
+                v_zeros = torch.quantize_per_tensor(
+                    v_zeros,
+                    # pyrefly: ignore [missing-attribute]
+                    v.q_scale(),
+                    # pyrefly: ignore [missing-attribute]
+                    v.q_zero_point(),
+                    # pyrefly: ignore [missing-attribute]
+                    v.dtype,
+                )
+            # pyrefly: ignore [no-matching-overload]
+            v = torch.cat([v, v_zeros], dim=1)
+
+            if attn_mask is not None:
+                attn_mask = F.pad(attn_mask, (0, 1))
+            if key_padding_mask is not None:
+                key_padding_mask = F.pad(key_padding_mask, (0, 1))
+
+        # Leaving the quantized zone here
+        q = self.dequant_q(q)
+        k = self.dequant_k(k)
+        v = self.dequant_v(v)
+        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+        assert list(attn_output_weights.size()) == [
+            bsz * self.num_heads,
+            tgt_len,
+            src_len,
+        ]
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_output_weights.masked_fill_(attn_mask, float("-inf"))
+            else:
+                attn_output_weights += attn_mask
+
+        if key_padding_mask is not None:
+            attn_output_weights = attn_output_weights.view(
+                bsz, self.num_heads, tgt_len, src_len
+            )
+            attn_output_weights = attn_output_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float("-inf"),
+            )
+            attn_output_weights = attn_output_weights.view(
+                bsz * self.num_heads, tgt_len, src_len
+            )
+
+        attn_output_weights = F.softmax(attn_output_weights, dim=-1)
+        attn_output_weights = F.dropout(
+            attn_output_weights, p=self.dropout, training=self.training
+        )
+
+        attn_output = torch.bmm(attn_output_weights, v)
+        assert list(attn_output.size()) == [bsz * self.num_heads, tgt_len, head_dim]
+        if self.batch_first:
+            attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
+        else:
+            attn_output = (
+                attn_output.transpose(0, 1)
+                .contiguous()
+                .view(tgt_len, bsz, self.embed_dim)
+            )
+
+        # Reentering the quantized zone
+        attn_output = self.quant_attn_output(attn_output)
+        # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969
+        attn_output = self.out_proj(attn_output)  # type: ignore[has-type]
+        attn_output_weights = self.quant_attn_output_weights(attn_output_weights)
+
+        if need_weights:
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(
+                bsz, self.num_heads, tgt_len, src_len
+            )
+            if average_attn_weights:
+                attn_output_weights = attn_output_weights.mean(dim=1)
+            return attn_output, attn_output_weights
+        else:
+            return attn_output, None
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e4bd902d1565360f72a5c4098b6e6d1590a146
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantizable/modules/rnn.py
@@ -0,0 +1,604 @@
+"""
+We will recreate all the RNN modules as we require the modules to be decomposed
+into its building blocks to be able to observe.
+"""
+
+# mypy: allow-untyped-defs
+
+import numbers
+import warnings
+
+import torch
+from torch import Tensor
+
+
+__all__ = ["LSTMCell", "LSTM"]
+
+
+class LSTMCell(torch.nn.Module):
+    r"""A quantizable long short-term memory (LSTM) cell.
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell`
+
+    `split_gates`: specify True to compute the input/forget/cell/output gates separately
+    to avoid an intermediate tensor which is subsequently chunk'd. This optimization can
+    be beneficial for on-device inference latency. This flag is cascaded down from the
+    parent classes.
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> rnn = nnqa.LSTMCell(10, 20)
+        >>> input = torch.randn(6, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> cx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx, cx = rnn(input[i], (hx, cx))
+        ...     output.append(hx)
+    """
+
+    _FLOAT_MODULE = torch.nn.LSTMCell
+    __constants__ = ["split_gates"]  # for jit.script
+
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        *,
+        split_gates=False,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.input_size = input_dim
+        self.hidden_size = hidden_dim
+        self.bias = bias
+        self.split_gates = split_gates
+
+        if not split_gates:
+            self.igates: torch.nn.Module = torch.nn.Linear(
+                input_dim, 4 * hidden_dim, bias=bias, **factory_kwargs
+            )
+            self.hgates: torch.nn.Module = torch.nn.Linear(
+                hidden_dim, 4 * hidden_dim, bias=bias, **factory_kwargs
+            )
+            self.gates: torch.nn.Module = torch.ao.nn.quantized.FloatFunctional()
+        else:
+            # keep separate Linear layers for each gate
+            self.igates = torch.nn.ModuleDict()
+            self.hgates = torch.nn.ModuleDict()
+            self.gates = torch.nn.ModuleDict()
+            for g in ["input", "forget", "cell", "output"]:
+                # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]`
+                self.igates[g] = torch.nn.Linear(
+                    input_dim, hidden_dim, bias=bias, **factory_kwargs
+                )
+                # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]`
+                self.hgates[g] = torch.nn.Linear(
+                    hidden_dim, hidden_dim, bias=bias, **factory_kwargs
+                )
+                # pyre-fixme[29]: `Union[torch._tensor.Tensor, torch.nn.modules.module.Module]`
+                self.gates[g] = torch.ao.nn.quantized.FloatFunctional()
+
+        self.input_gate = torch.nn.Sigmoid()
+        self.forget_gate = torch.nn.Sigmoid()
+        self.cell_gate = torch.nn.Tanh()
+        self.output_gate = torch.nn.Sigmoid()
+
+        self.fgate_cx = torch.ao.nn.quantized.FloatFunctional()
+        self.igate_cgate = torch.ao.nn.quantized.FloatFunctional()
+        self.fgate_cx_igate_cgate = torch.ao.nn.quantized.FloatFunctional()
+
+        self.ogate_cy = torch.ao.nn.quantized.FloatFunctional()
+
+        self.initial_hidden_state_qparams: tuple[float, int] = (1.0, 0)
+        self.initial_cell_state_qparams: tuple[float, int] = (1.0, 0)
+        self.hidden_state_dtype: torch.dtype = torch.quint8
+        self.cell_state_dtype: torch.dtype = torch.quint8
+
+    def forward(
+        self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None
+    ) -> tuple[Tensor, Tensor]:
+        if hidden is None or hidden[0] is None or hidden[1] is None:
+            hidden = self.initialize_hidden(x.shape[0], x.is_quantized)
+        hx, cx = hidden
+
+        if not self.split_gates:
+            igates = self.igates(x)
+            hgates = self.hgates(hx)
+            gates = self.gates.add(igates, hgates)  # type: ignore[operator]
+
+            input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1)
+
+            input_gate = self.input_gate(input_gate)
+            forget_gate = self.forget_gate(forget_gate)
+            cell_gate = self.cell_gate(cell_gate)
+            out_gate = self.output_gate(out_gate)
+        else:
+            # apply each input + hidden projection and add together
+            gate = {}
+            for (key, gates), igates, hgates in zip(
+                self.gates.items(),  # type: ignore[operator]
+                self.igates.values(),  # type: ignore[operator]
+                self.hgates.values(),  # type: ignore[operator]
+            ):
+                gate[key] = gates.add(igates(x), hgates(hx))
+
+            input_gate = self.input_gate(gate["input"])
+            forget_gate = self.forget_gate(gate["forget"])
+            cell_gate = self.cell_gate(gate["cell"])
+            out_gate = self.output_gate(gate["output"])
+
+        fgate_cx = self.fgate_cx.mul(forget_gate, cx)
+        igate_cgate = self.igate_cgate.mul(input_gate, cell_gate)
+        fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate)
+        cy = fgate_cx_igate_cgate
+
+        # TODO: make this tanh a member of the module so its qparams can be configured
+        tanh_cy = torch.tanh(cy)
+        hy = self.ogate_cy.mul(out_gate, tanh_cy)
+        return hy, cy
+
+    def initialize_hidden(
+        self, batch_size: int, is_quantized: bool = False
+    ) -> tuple[Tensor, Tensor]:
+        h, c = (
+            torch.zeros((batch_size, self.hidden_size)),
+            torch.zeros((batch_size, self.hidden_size)),
+        )
+        if is_quantized:
+            (h_scale, h_zp) = self.initial_hidden_state_qparams
+            (c_scale, c_zp) = self.initial_cell_state_qparams
+            h = torch.quantize_per_tensor(
+                h, scale=h_scale, zero_point=h_zp, dtype=self.hidden_state_dtype
+            )
+            c = torch.quantize_per_tensor(
+                c, scale=c_scale, zero_point=c_zp, dtype=self.cell_state_dtype
+            )
+        return h, c
+
+    def _get_name(self):
+        return "QuantizableLSTMCell"
+
+    @classmethod
+    def from_params(cls, wi, wh, bi=None, bh=None, split_gates=False):
+        """Uses the weights and biases to create a new LSTM cell.
+
+        Args:
+            wi, wh: Weights for the input and hidden layers
+            bi, bh: Biases for the input and hidden layers
+        """
+        assert (bi is None) == (bh is None)  # Either both None or both have values
+        input_size = wi.shape[1]  # dim 1 of the input weight is used as the input width
+        hidden_size = wh.shape[1]  # dim 1 of the hidden weight is used as the hidden width
+        cell = cls(
+            input_dim=input_size,
+            hidden_dim=hidden_size,
+            bias=(bi is not None),  # bias is enabled only when bias tensors were supplied
+            split_gates=split_gates,
+        )
+
+        if not split_gates:  # fused layout: igates / hgates each take the whole weight (and bias)
+            cell.igates.weight = torch.nn.Parameter(wi)
+            if bi is not None:
+                cell.igates.bias = torch.nn.Parameter(bi)
+            cell.hgates.weight = torch.nn.Parameter(wh)
+            if bh is not None:
+                cell.hgates.bias = torch.nn.Parameter(bh)
+        else:
+            # split weight/bias
+            for w, b, gates in zip([wi, wh], [bi, bh], [cell.igates, cell.hgates]):  # first pass: input side, second pass: hidden side
+                for w_chunk, gate in zip(w.chunk(4, dim=0), gates.values()):  # type: ignore[operator]
+                    gate.weight = torch.nn.Parameter(w_chunk)  # chunks along dim 0 are assigned in gates.values() order
+
+                if b is not None:
+                    for b_chunk, gate in zip(b.chunk(4, dim=0), gates.values()):  # type: ignore[operator]
+                        gate.bias = torch.nn.Parameter(b_chunk)  # same chunk-to-gate pairing as for the weights
+
+        return cell  # new cell whose parameters wrap the given tensors
+
+    @classmethod
+    def from_float(cls, other, use_precomputed_fake_quant=False, split_gates=False):  # NOTE(review): use_precomputed_fake_quant is not referenced in this body
+        assert type(other) is cls._FLOAT_MODULE  # exact type check: subclasses of the float module are rejected
+        assert hasattr(other, "qconfig"), "The float module must have 'qconfig'"
+        observed = cls.from_params(  # build the cell from the float module's weight/bias attributes
+            other.weight_ih,
+            other.weight_hh,
+            other.bias_ih,
+            other.bias_hh,
+            split_gates=split_gates,
+        )
+        observed.qconfig = other.qconfig  # propagate the qconfig to the cell itself ...
+        observed.igates.qconfig = other.qconfig  # ... and to the input-side gate container
+        observed.hgates.qconfig = other.qconfig  # ... and to the hidden-side gate container
+        if split_gates:
+            # also apply qconfig directly to Linear modules
+            for g in observed.igates.values():
+                g.qconfig = other.qconfig
+            for g in observed.hgates.values():
+                g.qconfig = other.qconfig
+        return observed  # cell carrying the float module's parameters and qconfig
+
+
+class _LSTMSingleLayer(torch.nn.Module):
+    r"""A single one-directional LSTM layer.
+
+    The difference between a layer and a cell is that the layer can process a
+    sequence, while the cell only expects an instantaneous value.
+    """
+
+    def __init__(  # Wraps one LSTMCell; all arguments are forwarded to that cell.
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        *,
+        split_gates=False,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # bundled so they can be passed on as keyword arguments
+        super().__init__()
+        self.cell = LSTMCell(  # LSTMCell is resolved from module scope
+            input_dim, hidden_dim, bias=bias, split_gates=split_gates, **factory_kwargs
+        )
+
+    def forward(self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None):  # Run the cell once per index along dim 0 of x.
+        result = []  # collects the first element of the cell output at every step
+        seq_len = x.shape[0]  # number of steps = size of dim 0
+        for i in range(seq_len):
+            hidden = self.cell(x[i], hidden)  # the cell's output becomes the state passed to the next step
+            result.append(hidden[0])  # type: ignore[index]
+        result_tensor = torch.stack(result, 0)  # stack per-step outputs along a new dim 0
+        return result_tensor, hidden  # (stacked outputs, state returned by the last cell call)
+
+    @classmethod
+    def from_params(cls, *args, **kwargs):  # Build a layer around a cell created by LSTMCell.from_params(*args, **kwargs).
+        cell = LSTMCell.from_params(*args, **kwargs)
+        layer = cls(  # construct with matching sizes/flags read back from the cell
+            cell.input_size, cell.hidden_size, cell.bias, split_gates=cell.split_gates
+        )
+        layer.cell = cell  # replace the freshly constructed cell with the parameterized one
+        return layer
+
+
+class _LSTMLayer(torch.nn.Module):
+    r"""A single bi-directional LSTM layer."""
+
+    def __init__(  # Creates layer_fw always and layer_bw only when bidirectional is set.
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        bias: bool = True,
+        batch_first: bool = False,
+        bidirectional: bool = False,
+        device=None,
+        dtype=None,
+        *,
+        split_gates=False,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded unchanged to the single-direction layers
+        super().__init__()
+        self.batch_first = batch_first
+        self.bidirectional = bidirectional
+        self.layer_fw = _LSTMSingleLayer(  # forward-direction layer
+            input_dim, hidden_dim, bias=bias, split_gates=split_gates, **factory_kwargs
+        )
+        if self.bidirectional:
+            self.layer_bw = _LSTMSingleLayer(  # backward-direction layer, same configuration as layer_fw
+                input_dim,
+                hidden_dim,
+                bias=bias,
+                split_gates=split_gates,
+                **factory_kwargs,
+            )
+
+    def forward(self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None):  # Run layer_fw (and layer_bw if present); returns (result, (h, c)).
+        if self.batch_first:
+            x = x.transpose(0, 1)  # swap dims 0 and 1 so that dim 0 is the one the sub-layers iterate over
+        if hidden is None:
+            hx_fw, cx_fw = (None, None)
+        else:
+            hx_fw, cx_fw = hidden
+        hidden_bw: tuple[Tensor, Tensor] | None = None  # stays None unless both backward states are available
+        if self.bidirectional:  # index 1 of each given state goes to the backward layer, index 0 to the forward layer
+            if hx_fw is None:
+                hx_bw = None
+            else:
+                hx_bw = hx_fw[1]
+                hx_fw = hx_fw[0]
+            if cx_fw is None:
+                cx_bw = None
+            else:
+                cx_bw = cx_fw[1]
+                cx_fw = cx_fw[0]
+            if hx_bw is not None and cx_bw is not None:
+                hidden_bw = hx_bw, cx_bw
+        if hx_fw is None and cx_fw is None:
+            hidden_fw = None
+        else:
+            hidden_fw = (
+                torch.jit._unwrap_optional(hx_fw),
+                torch.jit._unwrap_optional(cx_fw),
+            )
+        result_fw, hidden_fw = self.layer_fw(x, hidden_fw)  # forward pass over dim 0 in the given order
+
+        if hasattr(self, "layer_bw") and self.bidirectional:
+            x_reversed = x.flip(0)  # the backward layer consumes the input reversed along dim 0
+            result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw)
+            result_bw = result_bw.flip(0)  # flip back so it lines up with result_fw along dim 0
+
+            result = torch.cat([result_fw, result_bw], result_fw.dim() - 1)  # concatenate both directions on the last dim
+            if hidden_fw is None and hidden_bw is None:
+                h = None
+                c = None
+            elif hidden_fw is None:
+                (h, c) = torch.jit._unwrap_optional(hidden_bw)
+            elif hidden_bw is None:
+                (h, c) = torch.jit._unwrap_optional(hidden_fw)
+            else:
+                h = torch.stack([hidden_fw[0], hidden_bw[0]], 0)  # type: ignore[list-item]
+                c = torch.stack([hidden_fw[1], hidden_bw[1]], 0)  # type: ignore[list-item]
+        else:
+            result = result_fw
+            h, c = torch.jit._unwrap_optional(hidden_fw)  # type: ignore[assignment]
+
+        if self.batch_first:
+            result.transpose_(0, 1)  # in-place swap of dims 0 and 1, undoing the transpose applied to the input
+
+        return result, (h, c)
+
+    @classmethod
+    def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs):
+        r"""
+        There is no FP equivalent of this class. This function is here just to
+        mimic the behavior of the `prepare` within the `torch.ao.quantization`
+        flow.
+        """
+        assert hasattr(other, "qconfig") or (qconfig is not None)
+
+        input_size = kwargs.get("input_size", other.input_size)  # each kwarg overrides the attribute read from `other`
+        hidden_size = kwargs.get("hidden_size", other.hidden_size)
+        bias = kwargs.get("bias", other.bias)
+        batch_first = kwargs.get("batch_first", other.batch_first)
+        bidirectional = kwargs.get("bidirectional", other.bidirectional)
+        split_gates = kwargs.get("split_gates", False)  # no fallback on `other`; defaults to False
+
+        layer = cls(
+            input_size,
+            hidden_size,
+            bias,
+            batch_first,
+            bidirectional,
+            split_gates=split_gates,
+        )
+        # pyrefly: ignore [bad-argument-type]
+        layer.qconfig = getattr(other, "qconfig", qconfig)  # other's qconfig wins; the argument is only a fallback
+        wi = getattr(other, f"weight_ih_l{layer_idx}")  # weights are required; a missing attribute raises AttributeError
+        wh = getattr(other, f"weight_hh_l{layer_idx}")
+        bi = getattr(other, f"bias_ih_l{layer_idx}", None)  # biases are optional and default to None
+        bh = getattr(other, f"bias_hh_l{layer_idx}", None)
+
+        layer.layer_fw = _LSTMSingleLayer.from_params(  # replace the forward layer with one built from other's tensors
+            wi, wh, bi, bh, split_gates=split_gates
+        )
+
+        if other.bidirectional:  # NOTE(review): tests other.bidirectional, not the possibly overridden local `bidirectional`
+            wi = getattr(other, f"weight_ih_l{layer_idx}_reverse")
+            wh = getattr(other, f"weight_hh_l{layer_idx}_reverse")
+            bi = getattr(other, f"bias_ih_l{layer_idx}_reverse", None)
+            bh = getattr(other, f"bias_hh_l{layer_idx}_reverse", None)
+            layer.layer_bw = _LSTMSingleLayer.from_params(
+                wi, wh, bi, bh, split_gates=split_gates
+            )
+        return layer
+
+
+class LSTM(torch.nn.Module):
+    r"""A quantizable long short-term memory (LSTM).
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTM`
+
+    Attributes:
+        layers : instances of the `_LSTMLayer`
+
+    .. note::
+        To access the weights and biases, you need to access them per layer.
+        See examples below.
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> rnn = nnqa.LSTM(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> c0 = torch.randn(2, 3, 20)
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+        >>> # To get the weights:
+        >>> # xdoctest: +SKIP
+        >>> print(rnn.layers[0].weight_ih)
+        tensor([[...]])
+        >>> print(rnn.layers[0].weight_hh)
+        AssertionError: There is no reverse path in the non-bidirectional layer
+    """
+
+    _FLOAT_MODULE = torch.nn.LSTM  # float module type that from_float() accepts
+
+    def __init__(  # Stores the configuration, validates dropout and builds num_layers _LSTMLayer modules.
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        device=None,
+        dtype=None,
+        *,
+        split_gates: bool = False,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded unchanged to every _LSTMLayer
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = float(dropout)  # NOTE(review): runs before the validation below, so a non-numeric dropout can fail here first
+        self.bidirectional = bidirectional
+        self.training = False  # Default to eval mode. If we want to train, we will explicitly set to training.
+
+        if (
+            not isinstance(dropout, numbers.Number)
+            # pyrefly: ignore [unsupported-operation]
+            or not 0 <= dropout <= 1
+            or isinstance(dropout, bool)
+        ):
+            raise ValueError(
+                "dropout should be a number in range [0, 1] "
+                "representing the probability of an element being "
+                "zeroed"
+            )
+        # pyrefly: ignore [unsupported-operation]
+        if dropout > 0:  # dropout is only warned about here; no dropout module is created in this class
+            warnings.warn(
+                "dropout option for quantizable LSTM is ignored. "
+                "If you are training, please, use nn.LSTM version "
+                "followed by `prepare` step.",
+                stacklevel=2,
+            )
+            if num_layers == 1:
+                warnings.warn(
+                    "dropout option adds dropout after all but last "
+                    "recurrent layer, so non-zero dropout expects "
+                    f"num_layers greater than 1, but got dropout={dropout} "
+                    f"and num_layers={num_layers}",
+                    stacklevel=2,
+                )
+
+        layers = [
+            _LSTMLayer(  # first layer consumes input_size-wide input
+                self.input_size,
+                self.hidden_size,
+                self.bias,
+                batch_first=False,  # forward() of this class handles batch_first itself before calling the layers
+                bidirectional=self.bidirectional,
+                split_gates=split_gates,
+                **factory_kwargs,
+            )
+        ]
+        layers.extend(
+            _LSTMLayer(
+                self.hidden_size,  # NOTE(review): input width is hidden_size even if bidirectional, though _LSTMLayer concatenates fw/bw on the last dim — confirm
+                self.hidden_size,
+                self.bias,
+                batch_first=False,
+                bidirectional=self.bidirectional,
+                split_gates=split_gates,
+                **factory_kwargs,
+            )
+            for _ in range(1, num_layers)  # one extra layer for each layer beyond the first
+        )
+        self.layers = torch.nn.ModuleList(layers)
+
+    def forward(self, x: Tensor, hidden: tuple[Tensor, Tensor] | None = None):  # Feed x through every layer in order; returns (output, (h, c)).
+        if self.batch_first:
+            x = x.transpose(0, 1)  # swap dims 0 and 1; undone before returning
+
+        max_batch_size = x.size(1)  # size of dim 1 after the optional transpose above
+        num_directions = 2 if self.bidirectional else 1
+        if hidden is None:
+            zeros = torch.zeros(  # default state with shape (num_directions, max_batch_size, hidden_size)
+                num_directions,
+                max_batch_size,
+                self.hidden_size,
+                dtype=torch.float,
+                device=x.device,
+            )
+            zeros.squeeze_(0)  # in place; removes dim 0 only when its size is 1 (single direction)
+            if x.is_quantized:
+                zeros = torch.quantize_per_tensor(  # quantize the default state with fixed scale=1.0 / zero_point=0
+                    zeros, scale=1.0, zero_point=0, dtype=x.dtype
+                )
+            hxcx = [(zeros, zeros) for _ in range(self.num_layers)]  # the same tensor object serves as h and c for every layer
+        else:
+            hidden_non_opt = torch.jit._unwrap_optional(hidden)
+            if isinstance(hidden_non_opt[0], Tensor):
+                hx = hidden_non_opt[0].reshape(  # split the leading dim into (num_layers, num_directions)
+                    self.num_layers, num_directions, max_batch_size, self.hidden_size
+                )
+                cx = hidden_non_opt[1].reshape(
+                    self.num_layers, num_directions, max_batch_size, self.hidden_size
+                )
+                hxcx = [
+                    (hx[idx].squeeze(0), cx[idx].squeeze(0))  # squeeze(0) drops the direction dim when it has size 1
+                    for idx in range(self.num_layers)
+                ]
+            else:
+                hxcx = hidden_non_opt  # first element is not a Tensor: used as-is and indexed per layer below
+
+        hx_list = []
+        cx_list = []
+        for idx, layer in enumerate(self.layers):
+            x, (h, c) = layer(x, hxcx[idx])  # each layer's output is the next layer's input
+            hx_list.append(torch.jit._unwrap_optional(h))
+            cx_list.append(torch.jit._unwrap_optional(c))
+        hx_tensor = torch.stack(hx_list)  # stacks per-layer states along a new dim 0
+        cx_tensor = torch.stack(cx_list)
+
+        # We are creating another dimension for bidirectional case
+        # need to collapse it
+        hx_tensor = hx_tensor.reshape(-1, hx_tensor.shape[-2], hx_tensor.shape[-1])
+        cx_tensor = cx_tensor.reshape(-1, cx_tensor.shape[-2], cx_tensor.shape[-1])
+
+        if self.batch_first:
+            x = x.transpose(0, 1)  # restore the caller's dim order for the output
+
+        return x, (hx_tensor, cx_tensor)
+
+    def _get_name(self):  # Returns a fixed display name instead of the class name.
+        return "QuantizableLSTM"
+
+    @classmethod
+    def from_float(cls, other, qconfig=None, split_gates=False):  # Build an observed LSTM from a float torch.nn.LSTM instance.
+        assert isinstance(other, cls._FLOAT_MODULE)
+        assert hasattr(other, "qconfig") or qconfig
+        observed = cls(  # other.dropout is passed through, so __init__ may emit its dropout warnings
+            other.input_size,
+            other.hidden_size,
+            other.num_layers,
+            other.bias,
+            other.batch_first,
+            other.dropout,
+            other.bidirectional,
+            split_gates=split_gates,
+        )
+        # pyrefly: ignore [bad-argument-type]
+        observed.qconfig = getattr(other, "qconfig", qconfig)  # other's qconfig wins; the argument is only a fallback
+        for idx in range(other.num_layers):
+            observed.layers[idx] = _LSTMLayer.from_float(  # replace each constructed layer with one built from other's tensors
+                other, idx, qconfig, batch_first=False, split_gates=split_gates
+            )
+
+        # Prepare the model
+        if other.training:
+            observed.train()
+            observed = torch.ao.quantization.prepare_qat(observed, inplace=True)
+        else:
+            observed.eval()
+            observed = torch.ao.quantization.prepare(observed, inplace=True)
+        return observed
+
+    @classmethod
+    def from_observed(cls, other):  # Always raises: this class does not implement the observed -> quantized step.
+        # The whole flow is float -> observed -> quantized
+        # This class does float -> observed only
+        raise NotImplementedError(
+            "It looks like you are trying to convert a "
+            "non-quantizable LSTM module. Please, see "
+            "the examples on quantizable LSTMs."
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..77e97d8595282f3d69963ee129fa473249e3ae29
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__init__.py
@@ -0,0 +1,39 @@
+from . import functional
+from .modules import *  # noqa: F403
+from .modules import MaxPool2d
+
+
+__all__ = [  # explicit export list; NOTE(review): MaxPool2d is imported by name above but is not listed here
+    "BatchNorm2d",
+    "BatchNorm3d",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "DeQuantize",
+    "ELU",
+    "Embedding",
+    "EmbeddingBag",
+    "GroupNorm",
+    "Hardswish",
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+    "LayerNorm",
+    "LeakyReLU",
+    "Linear",
+    "LSTM",
+    "MultiheadAttention",
+    "Quantize",
+    "ReLU6",
+    "Sigmoid",
+    "Softmax",
+    "Dropout",
+    "PReLU",
+    # Wrapper modules
+    "FloatFunctional",
+    "FXFloatFunctional",
+    "QFunctional",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42623256e6253b588ec56788a646b92d194e32cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6c44be65fdad7dad42c21a07331d76c95d7f6b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d79bdbfe83209f18b17cc8c7b245f322871d6c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1833c1d585aea7fa67c80b5f6eff38b370243b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..969fd6f121f5ddb72ed2e8e158e3ee7e990cfd0c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py
@@ -0,0 +1,26 @@
+from .conv import (
+    Conv1d,
+    Conv2d,
+    Conv3d,
+    ConvTranspose1d,
+    ConvTranspose2d,
+    ConvTranspose3d,
+)
+from .linear import Linear
+from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNNCell
+
+
+__all__ = [  # export list; contains exactly the names imported from .conv, .linear and .rnn above
+    "Linear",
+    "LSTM",
+    "GRU",
+    "LSTMCell",
+    "RNNCell",
+    "GRUCell",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b622b49c7e96bdface694f6184dbd94f5c3f98ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..958edd5594a24a3bebe7420507c148d67d8af6c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5221799d456a5b460fa0b053332b22e566f2dc85
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbd5adbb7df8c9be323288580935ac25a1ae1ddf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c3f6acd093477a44057ade1fb48107709eda89
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py
@@ -0,0 +1,530 @@
+# mypy: allow-untyped-defs
+r"""Dynamically quantized convolution modules."""
+
+import warnings
+from typing import ClassVar, Literal
+
+import torch
+import torch.ao.nn.quantized as nnq
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch._ops import ops
+from torch.ao.nn.quantized.modules.conv import _reverse_repeat_padding
+from torch.nn.common_types import _size_1_t
+from torch.nn.modules.utils import _pair, _single, _triple
+
+
+__all__ = [  # export list for the dynamically quantized convolution module classes
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+]
+
+
+class Conv1d(nnq.Conv1d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv1d` and :class:`~torch.ao.nn.quantized.dynamic.Conv1d` and
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.quantized.dynamic.Conv1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 100)
+        >>> output = m(input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d  # float counterpart of this module
+    _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+    _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+
+    def __init__(  # Warns about accuracy, normalizes size arguments with _single, then defers to nnq.Conv1d.
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+        reduce_range=True,  # NOTE(review): accepted but never referenced in this constructor
+    ):
+        warnings.warn(  # issued on every construction; _get_name() is called before super().__init__()
+            f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended",  # noqa: B950
+            stacklevel=2,
+        )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        # pyrefly: ignore [bad-assignment]
+        padding = padding if isinstance(padding, str) else _single(padding)  # string padding values are passed through unchanged
+        dilation = _single(dilation)
+
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):  # Fixed display name; also interpolated into the constructor warning.
+        return "DynamicQuantizedConv1d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:  # Validate rank, pad if needed, then call the dynamic conv1d op.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != "zeros":  # non-zero padding modes are applied explicitly with F.pad
+            # Padding in Conv1d is stored as (p, p), need to get (p,)
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv1d_dynamic(input, self._packed_params, reduce_range)  # reduce_range is forwarded to the op
+
+
+class Conv2d(nnq.Conv2d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv2d` and :class:`~torch.ao.nn.quantized.dynamic.Conv2d` and
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> output = m(input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d  # float counterpart of this module
+    _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+    _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+
+    def __init__(  # Warns about accuracy, normalizes size arguments with _pair, then defers to nnq.Conv2d.
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        warnings.warn(  # issued on every construction; _get_name() is called before super().__init__()
+            f"The current implementation of the {self._get_name()} module "
+            "has poor numerical accuracy and its use is not recommended",
+            stacklevel=2,
+        )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):  # Fixed display name; also interpolated into the constructor warning.
+        return "DynamicQuantizedConv2d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:  # Validate rank, pad if needed, then call the dynamic conv2d op.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != "zeros":  # non-zero padding modes are applied explicitly with F.pad
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv2d_dynamic(input, self._packed_params, reduce_range)  # reduce_range is forwarded to the op
+
+
+class Conv3d(nnq.Conv3d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv3d` and :class:`~torch.ao.nn.quantized.dynamic.Conv3d` and
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2))
+        >>> input = torch.randn(20, 16, 56, 56, 56)
+        >>> output = m(input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d  # float counterpart of this module
+    _NNIQAT_CONV_BN_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+    _NNI_CONV_RELU_MODULE: ClassVar[type[nn.Module] | None] = None  # set to None in this class
+
+    def __init__(  # Warns about accuracy, rejects reflect padding, normalizes sizes with _triple, then calls the parent's _init.
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        warnings.warn(  # issued on every construction, before any parent initialization
+            f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended",  # noqa: B950
+            stacklevel=2,
+        )
+        assert padding_mode != "reflect", "Conv3d does not support reflection padding"
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        super()._init(  # NOTE(review): unlike Conv1d/Conv2d here, this calls _init rather than super().__init__ — confirm _init initializes the Module
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,  # positional flag; presumably `transposed` — verify against the parent's _init signature
+            _triple(0),  # positional 3-tuple of zeros; presumably `output_padding` — verify against the parent's _init signature
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):  # Fixed display name; also interpolated into the constructor warning.
+        return "DynamicQuantizedConv3d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:  # Validate rank, pad if needed, then call the dynamic conv3d op.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != "zeros":  # non-zero padding modes are applied explicitly with F.pad
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv3d_dynamic(input, self._packed_params, reduce_range)  # reduce_range is forwarded to the op
+
+
+class ConvTranspose1d(nnq.ConvTranspose1d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose1d`.
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv1d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nndq.ConvTranspose1d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nndq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> output = m(input)
+        >>> # exact output size can be also specified as an argument
+        >>> downsample = nndq.Conv1d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nndq.ConvTranspose1d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12])
+    """
+
+    # NOTE(review): the docstring example above passes 2-element tuples for kernel_size,
+    # stride and padding to a 1-D module and uses `input` without defining it. The example
+    # is marked `xdoctest: +SKIP`, so it is never executed -- confirm and correct upstream.
+
+    # Float module class this quantized module corresponds to.
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose1d]] = nn.ConvTranspose1d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # A warning is emitted on every construction; its text states that the
+        # implementation has poor numerical accuracy and is not recommended.
+        warnings.warn(
+            f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended",  # noqa: B950
+            stacklevel=2,
+        )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # All arguments are forwarded positionally, unchanged, to the parent constructor.
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        # Fixed display name; also interpolated into the warning emitted by __init__.
+        return "DynamicQuantizedConvTranspose1d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Run the dynamic transposed 1-D convolution op on `input`; raises ValueError
+        # unless `input` has exactly three dimensions.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        # `reduce_range` is forwarded unchanged to the quantized op.
+        return torch.ops.quantized.conv_transpose1d_dynamic(
+            input, self._packed_params, reduce_range
+        )
+
+
+class ConvTranspose2d(nnq.ConvTranspose2d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose2d`.
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv2d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> output = m(input)
+        >>> # exact output size can be also specified as an argument
+        >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12])
+    """
+
+    # NOTE(review): the docstring example above instantiates `nnq.*` classes (the namespace
+    # of this class's parent) rather than this dynamic class, and uses `input` without
+    # defining it. It is marked `xdoctest: +SKIP` -- confirm the intended namespace upstream.
+
+    # Float module class this quantized module corresponds to.
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose2d]] = nn.ConvTranspose2d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # A warning is emitted on every construction; its text states that the
+        # implementation has poor numerical accuracy and is not recommended.
+        warnings.warn(
+            f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended",  # noqa: B950
+            stacklevel=2,
+        )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # All arguments are forwarded positionally, unchanged, to the parent constructor.
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        # Fixed display name; also interpolated into the warning emitted by __init__.
+        return "DynamicQuantizedConvTranspose2d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Run the dynamic transposed 2-D convolution op on `input`; raises ValueError
+        # unless `input` has exactly four dimensions.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        # `reduce_range` is forwarded unchanged to the quantized op.
+        return ops.quantized.conv_transpose2d_dynamic(
+            input, self._packed_params, reduce_range
+        )
+
+
+class ConvTranspose3d(nnq.ConvTranspose3d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose3d`.
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv3d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With cubic kernels and equal stride
+        >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
+        >>> # non-cubic kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
+        >>> output = m(input)
+        >>> # exact output size can be also specified as an argument
+        >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12, 12])
+    """
+
+    # NOTE(review): the docstring example above instantiates `nnq.*` classes (the namespace
+    # of this class's parent) rather than this dynamic class, and uses `input` without
+    # defining it. It is marked `xdoctest: +SKIP` -- confirm the intended namespace upstream.
+
+    # Float module class this quantized module corresponds to.
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose3d]] = nn.ConvTranspose3d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # A warning is emitted on every construction; its text states that the
+        # implementation has poor numerical accuracy and is not recommended.
+        warnings.warn(
+            f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended",  # noqa: B950
+            stacklevel=2,
+        )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # All arguments are forwarded positionally, unchanged, to the parent constructor.
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        # Fixed display name; also interpolated into the warning emitted by __init__.
+        return "DynamicQuantizedConvTranspose3d"
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Run the dynamic transposed 3-D convolution op on `input`; raises ValueError
+        # unless `input` has exactly five dimensions.
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, T, H, W)`!")
+        # `reduce_range` is forwarded unchanged to the quantized op.
+        return ops.quantized.conv_transpose3d_dynamic(
+            input, self._packed_params, reduce_range
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..523ff78c31cf141e680e0a3374bcb5f1252cf7d7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py
@@ -0,0 +1,168 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.quantized as nnq
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+
+
+__all__ = [
+    "Linear",
+]
+
+
+class Linear(nnq.Linear):
+    r"""
+    A dynamic quantized linear module with floating point tensor as inputs and outputs.
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation.
+
+    Similar to :class:`torch.nn.Linear`, attributes will be randomly
+    initialized at module creation time and will be overwritten later
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module which are of
+                         shape :math:`(\text{out\_features}, \text{in\_features})`.
+        bias (Tensor): the non-learnable floating point bias of the module of shape
+                       :math:`(\text{out\_features})`. If :attr:`bias` is ``True``,
+                       the values are initialized to zero.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.quantized.dynamic.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    # version used in this class is different from the parent class nnq.Linear
+    _version = 4
+
+    def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias_, dtype=dtype)
+        # We don't muck around with buffers or attributes or anything here
+        # to keep the module simple. *everything* is simply a Python attribute.
+        # Serialization logic is explicitly handled in the below serialization and
+        # deserialization modules
+        # Instance-level version: forward() branches on it and _load_from_state_dict
+        # overwrites it with the value found in the checkpoint metadata.
+        self.version = 4
+
+    def forward(self, x):
+        # Dispatch on the packed-parameter dtype: qint8 and float16 each have their own op;
+        # anything else raises RuntimeError. The result is cast to the dtype of `x`.
+        # Note that we can handle self.bias == None case.
+        if self._packed_params.dtype == torch.qint8:
+            # A missing version or one below 4 calls the op without `reduce_range`;
+            # version 4 and later pass reduce_range=True.
+            if self.version is None or self.version < 4:
+                Y = torch.ops.quantized.linear_dynamic(
+                    x, self._packed_params._packed_params
+                )
+            else:
+                Y = torch.ops.quantized.linear_dynamic(
+                    x, self._packed_params._packed_params, reduce_range=True
+                )
+        elif self._packed_params.dtype == torch.float16:
+            Y = torch.ops.quantized.linear_dynamic_fp16(
+                x, self._packed_params._packed_params
+            )
+        else:
+            raise RuntimeError("Unsupported dtype on dynamic quantized linear!")
+        return Y.to(x.dtype)
+
+    def _get_name(self):
+        # Fixed display name for this module.
+        return "DynamicQuantizedLinear"
+
+    def extra_repr(self):
+        # Report feature sizes and packed dtype; the quantization scheme is appended
+        # only when the packed dtype is qint8.
+        extra_repr_str = f"in_features={self.in_features}, out_features={self.out_features}, dtype={self._packed_params.dtype}"
+        if self._packed_params.dtype == torch.qint8:
+            extra_repr_str += f", qscheme={self.weight().qscheme()}"
+        return extra_repr_str
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # Record the checkpoint's version (None when the metadata has no "version" entry)
+        # so forward() can pick the matching op call.
+        version = local_metadata.get("version", None)
+        self.version = version
+        # The caller's `strict` is not forwarded; the parent is always called with False.
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a dynamic quantized module from a float module or qparams_dict
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        # NOTE(review): `use_precomputed_fake_quant` is accepted but never read in this body.
+        float_modules = [
+            torch.nn.Linear,
+            torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
+            torch.ao.nn.intrinsic.modules.fused.LinearReLU,
+            torch.ao.nn.qat.dynamic.Linear,
+        ]
+
+        # Exact type match: subclasses of the listed modules do not pass this check.
+        assert type(mod) in float_modules, (
+            "nn.quantized.dynamic.Linear.from_float only works for one of"
+            + str([float_mod.__name__ for float_mod in float_modules])
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        # For a fused LinearReLU only its first element is converted.
+        if type(mod) is nni.LinearReLU:
+            mod = mod[0]
+        # pyrefly: ignore [missing-attribute]
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            # pyrefly: ignore [not-callable]
+            weight_observer = mod.qconfig.weight()
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+
+            weight_observer = default_dynamic_qconfig.weight()
+        dtype = weight_observer.dtype
+        assert dtype in [torch.qint8, torch.float16], (
+            "The only supported dtypes for "
+            f"dynamic quantized linear are qint8 and float16 got: {dtype}"
+        )
+        # Run the observer on the weight before it is used by _quantize_weight below.
+        weight_observer(mod.weight)
+        if dtype == torch.qint8:
+            qweight = _quantize_weight(mod.weight.float(), weight_observer)
+        elif dtype == torch.float16:
+            # float16 path: the weight is only cast with .float() here, not quantized.
+            qweight = mod.weight.float()
+        else:
+            raise RuntimeError(
+                "Unsupported dtype specified for dynamic quantized Linear!"
+            )
+        # The new module is built with the default `bias_`; the float module's own bias
+        # (whatever it is) is installed by set_weight_bias.
+        qlinear = cls(mod.in_features, mod.out_features, dtype=dtype)
+        # pyrefly: ignore [bad-argument-type]
+        qlinear.set_weight_bias(qweight, mod.bias)
+        return qlinear
+
+    @classmethod
+    def from_reference(cls, ref_qlinear):  # type: ignore[override]
+        """Create a (fbgemm/qnnpack) dynamic quantized module from a reference quantized
+        module
+        Args:
+            ref_qlinear (Module): a reference quantized  module, either produced by
+            torch.ao.quantization functions or provided by the user
+        """
+        # Sizes and dtype are read from the reference module; its quantized weight and
+        # bias are then installed through set_weight_bias.
+        qlinear = cls(
+            ref_qlinear.in_features,
+            ref_qlinear.out_features,
+            dtype=ref_qlinear.weight_dtype,
+        )
+        qweight = ref_qlinear.get_quantized_weight()
+        bias = ref_qlinear.bias
+        qlinear.set_weight_bias(qweight, bias)
+        return qlinear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ebe4b6a15af499f38a0d70ca93870cf1d6c224f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py
@@ -0,0 +1,1366 @@
+# mypy: allow-untyped-defs
+import numbers
+import warnings
+from typing_extensions import deprecated
+
+import torch
+import torch.nn as nn
+from torch import Tensor  # noqa: F401
+from torch._jit_internal import Dict, List, Optional, Tuple, Union  # noqa: F401
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+from torch.nn.utils.rnn import PackedSequence
+
+
+__all__ = [
+    "pack_weight_bias",
+    "PackedParameter",
+    "RNNBase",
+    "LSTM",
+    "GRU",
+    "RNNCellBase",
+    "RNNCell",
+    "LSTMCell",
+    "GRUCell",
+    "apply_permutation",
+]
+
+
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    # Select the entries of `tensor` along `dim` in the order given by `permutation`
+    # (a thin wrapper over Tensor.index_select).
+    return tensor.index_select(dim, permutation)
+
+
+# Public, deprecated alias of _apply_permutation: the decorator attaches a FutureWarning
+# with the message below, and the call is forwarded unchanged.
+@deprecated(
+    "`apply_permutation` is deprecated, please use `tensor.index_select(dim, permutation)` instead",
+    category=FutureWarning,
+)
+def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return _apply_permutation(tensor, permutation, dim)
+
+
+def pack_weight_bias(qweight, bias, dtype):
+    if dtype == torch.qint8:
+        # for each layer, for each direction we need to quantize and pack
+        # weights and pack parameters in this order:
+        #
+        #   w_ih, w_hh
+        packed_weight = torch.ops.quantized.linear_prepack(qweight, bias)
+
+        return packed_weight
+    else:
+        # for each layer, for each direction we need to quantize and pack
+        # weights and pack parameters in this order:
+        #
+        #   packed_ih, packed_hh, b_ih, b_hh
+        packed_weight = torch.ops.quantized.linear_prepack_fp16(qweight, bias)
+
+        return packed_weight
+
+
+# Module wrapper that holds a single object in the plain attribute `param` and
+# writes/reads it through the state dict under the key `<prefix>param`.
+class PackedParameter(torch.nn.Module):
+    def __init__(self, param):
+        super().__init__()
+        self.param = param
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        # Let the parent save its entries first, then add `param` under `<prefix>param`.
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "param"] = self.param
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # `param` is read by direct indexing, so `<prefix>param` must be present in
+        # `state_dict` (no fallback is provided here).
+        self.param = state_dict[prefix + "param"]
+        # The caller's `strict` is not forwarded; the parent is always called with False.
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class RNNBase(torch.nn.Module):
+    _FLOAT_MODULE = nn.RNNBase
+
+    _version = 2
+
+    def __init__(
+        self,
+        mode,
+        input_size,
+        hidden_size,
+        num_layers=1,
+        bias=True,
+        batch_first=False,
+        dropout=0.0,
+        bidirectional=False,
+        dtype=torch.qint8,
+    ):
+        # Store the configuration, validate `dropout` and `mode`, then build one packed
+        # cell-parameter object per (layer, direction) from randomly generated weights.
+        super().__init__()
+
+        self.mode = mode
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        # NOTE: float() runs before the validation below, so a value float() cannot
+        # convert fails here rather than with the ValueError raised further down.
+        self.dropout = float(dropout)
+        self.bidirectional = bidirectional
+        self.dtype = dtype
+        self.version = 2
+        self.training = False
+        num_directions = 2 if bidirectional else 1
+
+        # "type: ignore" is required since ints and Numbers are not fully comparable
+        # https://github.com/python/mypy/issues/8566
+        if (
+            not isinstance(dropout, numbers.Number)
+            or not 0 <= dropout <= 1  # type: ignore[operator]
+            or isinstance(dropout, bool)
+        ):
+            raise ValueError(
+                "dropout should be a number in range [0, 1] "
+                "representing the probability of an element being "
+                "zeroed"
+            )
+        if dropout > 0 and num_layers == 1:  # type: ignore[operator]
+            warnings.warn(
+                "dropout option adds dropout after all but last "
+                "recurrent layer, so non-zero dropout expects "
+                f"num_layers greater than 1, but got dropout={dropout} and "
+                f"num_layers={num_layers}",
+                stacklevel=2,
+            )
+
+        # Only "LSTM" and "GRU" are accepted; the gate dimension is 4x or 3x hidden_size.
+        if mode == "LSTM":
+            gate_size = 4 * hidden_size
+        elif mode == "GRU":
+            gate_size = 3 * hidden_size
+        else:
+            raise ValueError("Unrecognized RNN mode: " + mode)
+
+        _all_weight_values = []
+        for layer in range(num_layers):
+            for _ in range(num_directions):
+                # Layers after the first take the previous layer's output for every direction.
+                layer_input_size = (
+                    input_size if layer == 0 else hidden_size * num_directions
+                )
+
+                # Random initial values (presumably placeholders that a later
+                # from_float / set_weight_bias call replaces -- TODO confirm).
+                w_ih = torch.randn(gate_size, layer_input_size).to(torch.float)
+                w_hh = torch.randn(gate_size, hidden_size).to(torch.float)
+                b_ih = torch.randn(gate_size).to(torch.float)
+                b_hh = torch.randn(gate_size).to(torch.float)
+                if dtype == torch.qint8:
+                    w_ih = torch.quantize_per_tensor(
+                        w_ih, scale=0.1, zero_point=0, dtype=torch.qint8
+                    )
+                    w_hh = torch.quantize_per_tensor(
+                        w_hh, scale=0.1, zero_point=0, dtype=torch.qint8
+                    )
+                    packed_ih = torch.ops.quantized.linear_prepack(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack(w_hh, b_hh)
+                    # self.version was set to 2 above, so the first branch is not taken
+                    # during construction; it mirrors the check used in set_weight_bias.
+                    if self.version is None or self.version < 2:
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, b_ih, b_hh
+                            )
+                        )
+                    else:
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, b_ih, b_hh, True
+                            )
+                        )
+                else:
+                    # Every dtype other than qint8 takes the fp16 prepack path.
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh)
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh
+                    )
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        self._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+    def _get_name(self):
+        # Fixed display name; __repr__ uses it as the prefix of the printed module.
+        return "DynamicQuantizedRNN"
+
+    def extra_repr(self):
+        s = "{input_size}, {hidden_size}"
+        if self.num_layers != 1:
+            s += ", num_layers={num_layers}"
+        if self.bias is not True:
+            s += ", bias={bias}"
+        if self.batch_first is not False:
+            s += ", batch_first={batch_first}"
+        if self.dropout != 0:
+            s += ", dropout={dropout}"
+        if self.bidirectional is not False:
+            s += ", bidirectional={bidirectional}"
+        return s.format(**self.__dict__)
+
+    def __repr__(self):
+        # We don't want to show `ModuleList` children, hence custom
+        # `__repr__`. This is the same as nn.Module.__repr__, except the check
+        # for the `PackedParameter` and `nn.ModuleList`.
+        # You should still override `extra_repr` to add more info.
+        extra_lines = []
+        extra_repr = self.extra_repr()
+        # empty string will be split into list ['']
+        if extra_repr:
+            extra_lines = extra_repr.split("\n")
+        child_lines = []
+        for key, module in self._modules.items():
+            if isinstance(module, (PackedParameter, nn.ModuleList)):
+                continue
+            mod_str = repr(module)
+            mod_str = nn.modules.module._addindent(mod_str, 2)
+            child_lines.append("(" + key + "): " + mod_str)
+        lines = extra_lines + child_lines
+
+        main_str = self._get_name() + "("
+        if lines:
+            # simple one-liner info, which most builtin Modules will use
+            if len(extra_lines) == 1 and not child_lines:
+                main_str += extra_lines[0]
+            else:
+                main_str += "\n  " + "\n  ".join(lines) + "\n"
+
+        main_str += ")"
+        return main_str
+
+    def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None:
+        # Validate `input`: it must have 2 dimensions when `batch_sizes` is given and 3
+        # otherwise, and its last dimension must equal self.input_size.
+        # Raises RuntimeError on either mismatch.
+        expected_input_dim = 2 if batch_sizes is not None else 3
+        if input.dim() != expected_input_dim:
+            raise RuntimeError(
+                f"input must have {expected_input_dim} dimensions, got {input.dim()}"
+            )
+        if self.input_size != input.size(-1):
+            raise RuntimeError(
+                f"input.size(-1) must be equal to input_size. Expected {self.input_size}, got {input.size(-1)}"
+            )
+
+    def get_expected_hidden_size(
+        self, input: Tensor, batch_sizes: Optional[Tensor]
+    ) -> tuple[int, int, int]:
+        # Return (num_layers * num_directions, mini_batch, hidden_size).
+        # The batch size is batch_sizes[0] when `batch_sizes` is given; otherwise it is
+        # dimension 0 of `input` if batch_first is set and dimension 1 if not.
+        if batch_sizes is not None:
+            mini_batch = int(batch_sizes[0])
+        else:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)
+        num_directions = 2 if self.bidirectional else 1
+        expected_hidden_size = (
+            self.num_layers * num_directions,
+            mini_batch,
+            self.hidden_size,
+        )
+        return expected_hidden_size
+
+    def check_hidden_size(
+        self,
+        hx: Tensor,
+        expected_hidden_size: tuple[int, int, int],
+        msg: str = "Expected hidden size {}, got {}",
+    ) -> None:
+        # Raise RuntimeError when hx.size() differs from `expected_hidden_size`. `msg` is a
+        # str.format template receiving the expected size and the actual size as a list.
+        if hx.size() != expected_hidden_size:
+            raise RuntimeError(msg.format(expected_hidden_size, list(hx.size())))
+
+    def check_forward_args(
+        self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]
+    ) -> None:
+        # Validate `input` first, then check that `hidden` has the size computed by
+        # get_expected_hidden_size for this input. Both checks raise RuntimeError.
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+        self.check_hidden_size(
+            hidden, expected_hidden_size, msg="Expected hidden size {}, got {}"
+        )
+
+    def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor:
+        # Return `hx` untouched when no permutation is given; otherwise reorder it with
+        # _apply_permutation using that helper's default dimension.
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx, permutation)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # Record the checkpoint's version on the instance (None when the metadata has no
+        # "version" entry); set_weight_bias branches on this value.
+        version = local_metadata.get("version", None)
+        self.version = version
+        # The caller's `strict` is not forwarded; the parent is always called with False.
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def set_weight_bias(self, weight_bias_dict):
+        # Rebuild self._all_weight_values from `weight_bias_dict`, which is indexed with
+        # keys of the form "weight_{ih|hh}_l{layer}{suffix}" and "bias_{ih|hh}_l{layer}{suffix}";
+        # the suffix is "_reverse" for the second direction and empty otherwise.
+        def weight_bias_name(ihhh, layer, suffix):
+            weight_name = f"weight_{ihhh}_l{layer}{suffix}"
+            bias_name = f"bias_{ihhh}_l{layer}{suffix}"
+            return weight_name, bias_name
+
+        num_directions = 2 if self.bidirectional else 1
+        # TODO: dedup with __init__ of RNNBase
+        _all_weight_values = []
+        for layer in range(self.num_layers):
+            for direction in range(num_directions):
+                suffix = "_reverse" if direction == 1 else ""
+                w_ih_name, b_ih_name = weight_bias_name("ih", layer, suffix)
+                w_hh_name, b_hh_name = weight_bias_name("hh", layer, suffix)
+                # Direct indexing: every expected key must be present in the dict.
+                w_ih = weight_bias_dict[w_ih_name]
+                b_ih = weight_bias_dict[b_ih_name]
+                w_hh = weight_bias_dict[w_hh_name]
+                b_hh = weight_bias_dict[b_hh_name]
+                # The packing path is chosen from the dtype of the input-hidden weight.
+                if w_ih.dtype == torch.qint8:
+                    packed_ih = torch.ops.quantized.linear_prepack(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack(w_hh, b_hh)
+                    # A missing version or one below 2 omits the trailing True argument.
+                    if self.version is None or self.version < 2:
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, b_ih, b_hh
+                            )
+                        )
+                    else:
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, b_ih, b_hh, True
+                            )
+                        )
+                else:
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh)
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh
+                    )
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        # Replaces the previous ModuleList wholesale.
+        self._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build a dynamic quantized LSTM/GRU from a float ``nn.LSTM``/``nn.GRU``.
+
+        Args:
+            mod: float module to convert. Its type must be exactly
+                ``torch.nn.LSTM`` or ``torch.nn.GRU``, it must carry a
+                ``qconfig`` attribute, and it must use bias.
+            use_precomputed_fake_quant: accepted for interface compatibility;
+                this method does not read it.
+
+        Returns:
+            A new ``LSTM`` or ``GRU`` (selected by ``mod.mode``) whose
+            ``_all_weight_values`` holds one ``PackedParameter`` per
+            (layer, direction), built from ``mod``'s weights and biases.
+
+        Raises:
+            AssertionError: if ``mod`` has an unsupported type, has no
+                ``qconfig`` attribute, or was created without bias.
+            RuntimeError: if the weight observer dtype is neither
+                ``torch.qint8`` nor ``torch.float16``.
+        """
+        # Explicit raises (instead of ``assert``) keep the validation active under
+        # ``python -O`` while preserving the AssertionError type callers may catch.
+        if type(mod) not in {torch.nn.LSTM, torch.nn.GRU}:
+            raise AssertionError(
+                "nn.quantized.dynamic.RNNBase.from_float only works for nn.LSTM and nn.GRU"
+            )
+        if not hasattr(mod, "qconfig"):
+            raise AssertionError("Input float module must have qconfig defined")
+
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer_method = mod.qconfig.weight
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+
+            weight_observer_method = default_dynamic_qconfig.weight
+
+        dtype = weight_observer_method().dtype
+        supported_scalar_types = [torch.qint8, torch.float16]
+        if dtype not in supported_scalar_types:
+            raise RuntimeError(
+                f"Unsupported dtype for dynamic RNN quantization: {dtype}"
+            )
+
+        # The per-layer ``bias_*`` attributes are read unconditionally below, so a
+        # bias-free module is rejected up front with a descriptive message.
+        if not mod.bias:
+            raise AssertionError(
+                "nn.quantized.dynamic.RNNBase.from_float requires a float module with bias=True"
+            )
+
+        # RNNBase can be either LSTM or GRU
+        qRNNBase: Union[LSTM, GRU]
+        if mod.mode == "LSTM":
+            qRNNBase = LSTM(
+                mod.input_size,
+                mod.hidden_size,
+                mod.num_layers,
+                mod.bias,
+                mod.batch_first,
+                mod.dropout,
+                mod.bidirectional,
+                dtype,
+            )
+        elif mod.mode == "GRU":
+            qRNNBase = GRU(
+                mod.input_size,
+                mod.hidden_size,
+                mod.num_layers,
+                mod.bias,
+                mod.batch_first,
+                mod.dropout,
+                mod.bidirectional,
+                dtype,
+            )
+        else:
+            raise NotImplementedError(
+                "Only LSTM/GRU is supported for QuantizedRNN for now"
+            )
+
+        num_directions = 2 if mod.bidirectional else 1
+
+        def quantize_and_pack(w, b):
+            # Run a fresh observer over the float weight, quantize the weight with
+            # it, then prepack the quantized weight together with its bias.
+            weight_observer = weight_observer_method()
+            weight_observer(w)
+            qweight = _quantize_weight(w.float(), weight_observer)
+            return torch.ops.quantized.linear_prepack(qweight, b)
+
+        _all_weight_values = []
+        for layer in range(qRNNBase.num_layers):
+            for direction in range(num_directions):
+                suffix = "_reverse" if direction == 1 else ""
+
+                weight_ih = getattr(mod, f"weight_ih_l{layer}{suffix}")
+                bias_ih = getattr(mod, f"bias_ih_l{layer}{suffix}")
+                weight_hh = getattr(mod, f"weight_hh_l{layer}{suffix}")
+                bias_hh = getattr(mod, f"bias_hh_l{layer}{suffix}")
+
+                if dtype == torch.qint8:
+                    packed_ih = quantize_and_pack(weight_ih, bias_ih)
+                    packed_hh = quantize_and_pack(weight_hh, bias_hh)
+                    if qRNNBase.version is None or qRNNBase.version < 2:
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, bias_ih, bias_hh
+                            )
+                        )
+                    else:
+                        # For version >= 2 an extra positional ``True`` is passed;
+                        # presumably the reduce_range flag -- TODO confirm against
+                        # the op schema.
+                        cell_params = (
+                            torch.ops.quantized.make_quantized_cell_params_dynamic(
+                                packed_ih, packed_hh, bias_ih, bias_hh, True
+                            )
+                        )
+
+                elif dtype == torch.float16:
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(
+                        weight_ih.float(), bias_ih
+                    )
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(
+                        weight_hh.float(), bias_hh
+                    )
+
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh
+                    )
+                else:
+                    # Defensive only: ``dtype`` was already validated above.
+                    raise RuntimeError(
+                        "Unsupported dtype specified for dynamic quantized LSTM!"
+                    )
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        qRNNBase._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+        return qRNNBase
+
+    def _weight_bias(self):
+        """Collect the weights and biases held by the packed cell parameters.
+
+        Returns:
+            A dict ``{"weight": {...}, "bias": {...}}`` whose inner dicts are keyed
+            ``weight_ih_l{layer}{suffix}`` / ``weight_hh_l{layer}{suffix}`` and
+            ``bias_ih_l{layer}{suffix}`` / ``bias_hh_l{layer}{suffix}``, where
+            ``suffix`` is ``"_reverse"`` for the second direction and empty
+            otherwise.
+        """
+        weights = {}
+        biases = {}
+        directions = 2 if self.bidirectional else 1
+        for layer in range(self.num_layers):
+            for direction in range(directions):
+                suffix = "_reverse" if direction == 1 else ""
+                # Entries are indexed layer by layer, one per direction.
+                index = layer * directions + direction
+                # packed weights are part of torchbind class, CellParamsSerializationType
+                # Within the packed weight class, the weight and bias are accessible as Tensors
+                cell_state = self._all_weight_values[index].param.__getstate__()  # type: ignore[index]
+                packed_pair = cell_state[0][4]
+                # Slot 0 holds the input-hidden packed params, slot 1 the hidden-hidden ones.
+                for slot, gate in enumerate(("ih", "hh")):
+                    weights[f"weight_{gate}_l{layer}{suffix}"] = packed_pair[
+                        slot
+                    ].__getstate__()[0][0]
+                for slot, gate in enumerate(("ih", "hh")):
+                    biases[f"bias_{gate}_l{layer}{suffix}"] = packed_pair[
+                        slot
+                    ].__getstate__()[0][1]
+        weight_bias_dict: Dict[str, Dict] = {"weight": weights, "bias": biases}
+        return weight_bias_dict
+
+    def get_weight(self):
+        """Return the ``"weight"`` sub-dict produced by ``_weight_bias()``."""
+        weight_bias = self._weight_bias()
+        return weight_bias["weight"]
+
+    def get_bias(self):
+        """Return the ``"bias"`` sub-dict produced by ``_weight_bias()``."""
+        weight_bias = self._weight_bias()
+        return weight_bias["bias"]
+
+
+class LSTM(RNNBase):
+    r"""
+    A dynamic quantized LSTM module with floating point tensor as inputs and outputs.
+    We adopt the same interface as `torch.nn.LSTM`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.LSTM(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> c0 = torch.randn(2, 3, 20)
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+    """
+
+    # pyrefly: ignore [bad-override]
+    _FLOAT_MODULE = nn.LSTM
+
+    __overloads__ = {"forward": ["forward_packed", "forward_tensor"]}
+
+    def __init__(self, *args, **kwargs):
+        # Forward every argument to the base class with the mode fixed to "LSTM".
+        super().__init__("LSTM", *args, **kwargs)
+
+    def _get_name(self):
+        return "DynamicQuantizedLSTM"
+
+    def forward_impl(
+        self,
+        input: Tensor,
+        hx: Optional[tuple[Tensor, Tensor]],
+        batch_sizes: Optional[Tensor],
+        max_batch_size: int,
+        sorted_indices: Optional[Tensor],
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
+        """Shared implementation behind ``forward_tensor`` and ``forward_packed``.
+
+        Args:
+            input: input data (the ``data`` of a packed sequence when
+                ``batch_sizes`` is given).
+            hx: optional ``(h_0, c_0)`` pair; zeros are used when ``None``.
+            batch_sizes: batch sizes of a packed sequence, or ``None`` for a
+                plain tensor input.
+            max_batch_size: batch dimension used to size the default state.
+            sorted_indices: permutation applied to a user-supplied ``hx``.
+
+        Returns:
+            ``(output, hidden)`` where ``output`` is the first element returned
+            by ``torch.quantized_lstm`` and ``hidden`` is the remainder.
+        """
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            zeros = torch.zeros(
+                self.num_layers * num_directions,
+                max_batch_size,
+                self.hidden_size,
+                dtype=input.dtype,
+                device=input.device,
+            )
+            # The same zero tensor serves as both initial hidden and cell state.
+            hx = (zeros, zeros)
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+
+        _all_params = [m.param for m in self._all_weight_values]
+        if batch_sizes is None:
+            # Plain tensor input: this overload takes ``batch_first``.
+            result = torch.quantized_lstm(
+                input,
+                hx,
+                _all_params,
+                self.bias,
+                self.num_layers,
+                float(self.dropout),
+                self.training,
+                self.bidirectional,
+                self.batch_first,
+                dtype=self.dtype,
+                use_dynamic=True,
+            )
+        else:
+            # Packed input: this overload takes ``batch_sizes`` and no ``batch_first``.
+            result = torch.quantized_lstm(
+                input,
+                batch_sizes,
+                hx,
+                _all_params,
+                self.bias,
+                self.num_layers,
+                float(self.dropout),
+                self.training,
+                self.bidirectional,
+                dtype=self.dtype,
+                use_dynamic=True,
+            )
+        output = result[0]
+        hidden = result[1:]
+
+        return output, hidden
+
+    @torch.jit.export
+    def forward_tensor(
+        self, input: Tensor, hx: Optional[tuple[Tensor, Tensor]] = None
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
+        """Run the LSTM on a plain (non-packed) tensor input."""
+        batch_sizes = None
+        # The batch dimension is 0 when ``batch_first`` is set, otherwise 1.
+        max_batch_size = input.size(0) if self.batch_first else input.size(1)
+        sorted_indices = None
+        unsorted_indices = None
+
+        output, hidden = self.forward_impl(
+            input, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    @torch.jit.export
+    def forward_packed(
+        self, input: PackedSequence, hx: Optional[tuple[Tensor, Tensor]] = None
+    ) -> tuple[PackedSequence, tuple[Tensor, Tensor]]:
+        """Run the LSTM on a ``PackedSequence`` and return a ``PackedSequence``."""
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
+
+        output_, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        output = PackedSequence(output_, batch_sizes, sorted_indices, unsorted_indices)
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    # "type: ignore" is required due to issue #43072
+    def permute_hidden(  # type: ignore[override]
+        self,
+        hx: tuple[Tensor, Tensor],
+        permutation: Optional[Tensor],
+    ) -> tuple[Tensor, Tensor]:
+        """Apply ``permutation`` to both state tensors; no-op when it is ``None``."""
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx[0], permutation), _apply_permutation(
+            hx[1], permutation
+        )
+
+    # "type: ignore" is required due to issue #43072
+    def check_forward_args(  # type: ignore[override]
+        self,
+        input: Tensor,
+        hidden: tuple[Tensor, Tensor],
+        batch_sizes: Optional[Tensor],
+    ) -> None:
+        """Validate ``input`` and both tensors of ``hidden`` against the expected sizes."""
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+
+        self.check_hidden_size(
+            hidden[0], expected_hidden_size, "Expected hidden[0] size {}, got {}"
+        )
+        self.check_hidden_size(
+            hidden[1], expected_hidden_size, "Expected hidden[1] size {}, got {}"
+        )
+
+    @torch.jit.ignore
+    def forward(self, input, hx=None):
+        """Dispatch to ``forward_packed`` or ``forward_tensor`` based on the input type."""
+        if isinstance(input, PackedSequence):
+            return self.forward_packed(input, hx)
+        else:
+            return self.forward_tensor(input, hx)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        """Delegate to the base-class ``from_float`` with the same arguments."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        """Create a dynamic quantized LSTM from a reference quantized module.
+
+        Args:
+            ref_mod: reference module exposing the constructor attributes read
+                below, ``weight_ih_l0_dtype`` and
+                ``get_quantized_weight_bias_dict()``.
+
+        Raises:
+            AssertionError: if ``ref_mod`` has no ``weight_ih_l0_dtype`` attribute.
+        """
+        # The message is one parenthesised expression so that both halves reach
+        # the exception; an explicit raise also survives ``python -O``.
+        if not hasattr(ref_mod, "weight_ih_l0_dtype"):
+            raise AssertionError(
+                "We are assuming weight_ih_l0 "
+                "exists in LSTM, may need to relax the assumption to support the use case"
+            )
+        qmod = cls(
+            ref_mod.input_size,
+            ref_mod.hidden_size,
+            ref_mod.num_layers,
+            ref_mod.bias,
+            ref_mod.batch_first,
+            ref_mod.dropout,
+            ref_mod.bidirectional,
+            # assuming there is layer 0, which should be OK
+            ref_mod.weight_ih_l0_dtype,
+        )
+        qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict())
+        return qmod
+
+
+class GRU(RNNBase):
+    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
+
+
+    For each element in the input sequence, each layer computes the following
+    function:
+
+    .. math::
+        \begin{array}{ll}
+            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+            n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
+            h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)}
+        \end{array}
+
+    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
+    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
+    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
+    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
+    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
+    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
+    variable which is :math:`0` with probability :attr:`dropout`.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
+            would mean stacking two GRUs together to form a `stacked GRU`,
+            with the second GRU taking in outputs of the first GRU and
+            computing the final results. Default: 1
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False``
+        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
+            GRU layer except the last layer, with dropout probability equal to
+            :attr:`dropout`. Default: 0
+        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``
+
+    Inputs: input, h_0
+        - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
+          of the input sequence. The input can also be a packed variable length
+          sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
+          for details.
+        - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the initial hidden state for each element in the batch.
+          Defaults to zero if not provided. If the RNN is bidirectional,
+          num_directions should be 2, else it should be 1.
+
+    Outputs: output, h_n
+        - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
+          containing the output features h_t from the last layer of the GRU,
+          for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          given as the input, the output will also be a packed sequence.
+          For the unpacked case, the directions can be separated
+          using ``output.view(seq_len, batch, num_directions, hidden_size)``,
+          with forward and backward being direction `0` and `1` respectively.
+
+          Similarly, the directions can be separated in the packed case.
+        - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the hidden state for `t = seq_len`
+
+          Like *output*, the layers can be separated using
+          ``h_n.view(num_layers, num_directions, batch, hidden_size)``.
+
+    Shape:
+        - Input1: :math:`(L, N, H_{in})` tensor containing input features where
+          :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
+        - Input2: :math:`(S, N, H_{out})` tensor
+          containing the initial hidden state for each element in the batch.
+          :math:`H_{out}=\text{hidden\_size}`
+          Defaults to zero if not provided. where :math:`S=\text{num\_layers} * \text{num\_directions}`
+          If the RNN is bidirectional, num_directions should be 2, else it should be 1.
+        - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
+        - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
+    Attributes:
+        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
+        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    .. note::
+        The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks.
+        In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the
+        previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix
+        `W` and addition of bias:
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn})
+            \end{aligned}
+
+        This is in contrast to PyTorch implementation, which is done after :math:`W_{hn} h_{(t-1)}`
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn}))
+            \end{aligned}
+
+        This implementation differs on purpose for efficiency.
+
+    .. include:: ../cudnn_persistent_rnn.rst
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.GRU(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> output, hn = rnn(input, h0)
+    """
+
+    # pyrefly: ignore [bad-override]
+    _FLOAT_MODULE = nn.GRU
+
+    __overloads__ = {"forward": ["forward_packed", "forward_tensor"]}
+
+    def __init__(self, *args, **kwargs):
+        # Forward every argument to the base class with the mode fixed to "GRU".
+        super().__init__("GRU", *args, **kwargs)
+
+    def _get_name(self):
+        return "DynamicQuantizedGRU"
+
+    def check_forward_args(
+        self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]
+    ) -> None:
+        """Validate ``input`` and ``hidden`` against the expected sizes."""
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+
+        self.check_hidden_size(
+            hidden, expected_hidden_size, "Expected hidden size {}, got {}"
+        )
+
+    def forward_impl(
+        self,
+        input: Tensor,
+        hx: Optional[Tensor],
+        batch_sizes: Optional[Tensor],
+        max_batch_size: int,
+        sorted_indices: Optional[Tensor],
+    ) -> tuple[Tensor, Tensor]:
+        """Shared implementation behind ``forward_tensor`` and ``forward_packed``.
+
+        Args:
+            input: input data (the ``data`` of a packed sequence when
+                ``batch_sizes`` is given).
+            hx: optional initial hidden state; zeros are used when ``None``.
+            batch_sizes: batch sizes of a packed sequence, or ``None`` for a
+                plain tensor input.
+            max_batch_size: batch dimension used to size the default state.
+            sorted_indices: permutation applied to a user-supplied ``hx``.
+
+        Returns:
+            ``(output, hidden)``: the first two elements returned by
+            ``torch.quantized_gru``.
+        """
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            zeros = torch.zeros(
+                self.num_layers * num_directions,
+                max_batch_size,
+                self.hidden_size,
+                dtype=input.dtype,
+                device=input.device,
+            )
+            hx = zeros
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+
+        _all_params = [m.param for m in self._all_weight_values]
+        if batch_sizes is None:
+            # Plain tensor input: this overload takes ``batch_first``.
+            result = torch.quantized_gru(
+                input,
+                hx,
+                _all_params,
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+                self.batch_first,
+            )
+        else:
+            # Packed input: this overload takes ``batch_sizes`` and no ``batch_first``.
+            result = torch.quantized_gru(
+                input,
+                batch_sizes,
+                hx,
+                _all_params,
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+            )
+        output = result[0]
+        hidden = result[1]
+
+        return output, hidden
+
+    @torch.jit.export
+    def forward_tensor(
+        self, input: Tensor, hx: Optional[Tensor] = None
+    ) -> tuple[Tensor, Tensor]:
+        """Run the GRU on a plain (non-packed) tensor input."""
+        batch_sizes = None
+        # The batch dimension is 0 when ``batch_first`` is set, otherwise 1.
+        max_batch_size = input.size(0) if self.batch_first else input.size(1)
+        sorted_indices = None
+        unsorted_indices = None
+
+        output, hidden = self.forward_impl(
+            input, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    @torch.jit.export
+    def forward_packed(
+        self, input: PackedSequence, hx: Optional[Tensor] = None
+    ) -> tuple[PackedSequence, Tensor]:
+        """Run the GRU on a ``PackedSequence`` and return a ``PackedSequence``."""
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
+        output_, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        output = PackedSequence(output_, batch_sizes, sorted_indices, unsorted_indices)
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor:
+        """Apply ``permutation`` to ``hx``; no-op when it is ``None``."""
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx, permutation)
+
+    @torch.jit.ignore
+    def forward(self, input, hx=None):
+        """Dispatch to ``forward_packed`` or ``forward_tensor`` based on the input type."""
+        if isinstance(input, PackedSequence):
+            return self.forward_packed(input, hx)
+        else:
+            return self.forward_tensor(input, hx)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        """Delegate to the base-class ``from_float`` with the same arguments."""
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        """Create a dynamic quantized GRU from a reference quantized module.
+
+        Args:
+            ref_mod: reference module exposing the constructor attributes read
+                below, ``weight_ih_l0_dtype`` and
+                ``get_quantized_weight_bias_dict()``.
+
+        Raises:
+            AssertionError: if ``ref_mod`` has no ``weight_ih_l0_dtype`` attribute.
+        """
+        # The message is one parenthesised expression so that both halves reach
+        # the exception; an explicit raise also survives ``python -O``.
+        if not hasattr(ref_mod, "weight_ih_l0_dtype"):
+            raise AssertionError(
+                "We are assuming weight_ih_l0 "
+                "exists in GRU, may need to relax the assumption to support the use case"
+            )
+        qmod = cls(
+            ref_mod.input_size,
+            ref_mod.hidden_size,
+            ref_mod.num_layers,
+            ref_mod.bias,
+            ref_mod.batch_first,
+            ref_mod.dropout,
+            ref_mod.bidirectional,
+            # assuming there is layer 0, which should be OK
+            ref_mod.weight_ih_l0_dtype,
+        )
+        qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict())
+        return qmod
+
+
+class RNNCellBase(torch.nn.Module):
+    """Common base for the dynamic quantized RNN cells.
+
+    Holds two prepacked linear weights (input-hidden and hidden-hidden) in
+    ``_packed_weight_ih`` / ``_packed_weight_hh`` and provides conversion from
+    float and reference modules plus state-dict (de)serialization of the
+    packed weights.
+    """
+
+    # _FLOAT_MODULE = nn.CellRNNBase
+    __constants__ = ["input_size", "hidden_size", "bias"]
+
+    def __init__(
+        self, input_size, hidden_size, bias=True, num_chunks=4, dtype=torch.qint8
+    ):
+        """Create the cell with randomly initialised, prepacked weights.
+
+        Args:
+            input_size: number of input features.
+            hidden_size: number of hidden features.
+            bias: whether ``bias_ih`` / ``bias_hh`` tensors are created.
+            num_chunks: multiplier applied to ``hidden_size`` for the first
+                dimension of the weights and biases.
+            dtype: weight dtype; ``torch.qint8`` selects the quantized prepack
+                op, any other value selects the fp16 prepack op.
+        """
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        self.weight_dtype = dtype
+        if bias:
+            self.bias_ih = torch.randn(num_chunks * hidden_size).to(dtype=torch.float)
+            self.bias_hh = torch.randn(num_chunks * hidden_size).to(dtype=torch.float)
+        else:
+            self.register_parameter("bias_ih", None)
+            self.register_parameter("bias_hh", None)
+
+        weight_ih = torch.randn(num_chunks * hidden_size, input_size).to(torch.float)
+        weight_hh = torch.randn(num_chunks * hidden_size, hidden_size).to(torch.float)
+
+        if dtype == torch.qint8:
+            # Quantize both weights, then prepack each one with its bias.
+            weight_ih = torch.quantize_per_tensor(
+                weight_ih, scale=1, zero_point=0, dtype=torch.qint8
+            )
+            weight_hh = torch.quantize_per_tensor(
+                weight_hh, scale=1, zero_point=0, dtype=torch.qint8
+            )
+            packed_weight_ih = torch.ops.quantized.linear_prepack(
+                weight_ih, self.bias_ih
+            )
+            packed_weight_hh = torch.ops.quantized.linear_prepack(
+                weight_hh, self.bias_hh
+            )
+        else:
+            # Float weights are prepacked with the fp16 op together with their bias.
+            packed_weight_ih = torch.ops.quantized.linear_prepack_fp16(
+                weight_ih, self.bias_ih
+            )
+            packed_weight_hh = torch.ops.quantized.linear_prepack_fp16(
+                weight_hh, self.bias_hh
+            )
+
+        self._packed_weight_ih = packed_weight_ih
+        self._packed_weight_hh = packed_weight_hh
+
+    def _get_name(self):
+        return "DynamicQuantizedRNNBase"
+
+    def extra_repr(self):
+        """Return the repr fragment; bias/nonlinearity appear only when non-default."""
+        s = "{input_size}, {hidden_size}"
+        if "bias" in self.__dict__ and self.bias is not True:
+            s += ", bias={bias}"
+        if "nonlinearity" in self.__dict__ and self.nonlinearity != "tanh":
+            s += ", nonlinearity={nonlinearity}"
+        return s.format(**self.__dict__)
+
+    def check_forward_input(self, input):
+        """Raise ``RuntimeError`` unless ``input.size(1)`` equals ``input_size``."""
+        if input.size(1) != self.input_size:
+            raise RuntimeError(
+                f"input has inconsistent input_size: got {input.size(1)}, expected {self.input_size}"
+            )
+
+    def check_forward_hidden(
+        self, input: Tensor, hx: Tensor, hidden_label: str = ""
+    ) -> None:
+        """Raise ``RuntimeError`` when ``hx`` disagrees with ``input``'s batch size or ``hidden_size``."""
+        if input.size(0) != hx.size(0):
+            raise RuntimeError(
+                f"Input batch size {input.size(0)} doesn't match hidden{hidden_label} batch size {hx.size(0)}"
+            )
+
+        if hx.size(1) != self.hidden_size:
+            raise RuntimeError(
+                f"hidden{hidden_label} has inconsistent hidden_size: got {hx.size(1)}, expected {self.hidden_size}"
+            )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        """Build a dynamic quantized cell from a float RNN cell.
+
+        Args:
+            mod: float module; its type must be exactly ``torch.nn.LSTMCell``,
+                ``torch.nn.GRUCell`` or ``torch.nn.RNNCell``, it must carry a
+                ``qconfig`` attribute, and it must use bias.
+            use_precomputed_fake_quant: accepted for interface compatibility;
+                this method does not read it.
+
+        Returns:
+            The matching quantized cell with weights packed from ``mod``.
+
+        Raises:
+            AssertionError: if ``mod`` has an unsupported type, has no
+                ``qconfig`` attribute, or was created without bias.
+            RuntimeError: if the weight observer dtype is neither
+                ``torch.qint8`` nor ``torch.float16``.
+        """
+        # Explicit raises keep the validation active under ``python -O`` while
+        # preserving the AssertionError type. Messages use implicit string
+        # concatenation so no source indentation leaks into them.
+        if type(mod) not in {
+            torch.nn.LSTMCell,
+            torch.nn.GRUCell,
+            torch.nn.RNNCell,
+        }:
+            raise AssertionError(
+                "nn.quantized.dynamic.RNNCellBase.from_float "
+                "only works for nn.LSTMCell, nn.GRUCell and nn.RNNCell"
+            )
+        if not hasattr(mod, "qconfig"):
+            raise AssertionError("Input float module must have qconfig defined")
+
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer_method = mod.qconfig.weight
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+
+            weight_observer_method = default_dynamic_qconfig.weight
+
+        dtype = weight_observer_method().dtype
+        supported_scalar_types = [torch.qint8, torch.float16]
+        if dtype not in supported_scalar_types:
+            raise RuntimeError(
+                f"Unsupported dtype for dynamic RNN quantization: {dtype}"
+            )
+
+        # ``mod.bias_ih`` / ``mod.bias_hh`` are packed below, so a bias-free cell is
+        # rejected up front with a descriptive message.
+        if not mod.bias:
+            raise AssertionError(
+                "nn.quantized.dynamic.RNNCellBase.from_float requires a float module with bias=True"
+            )
+
+        qRNNCellBase: Union[LSTMCell, GRUCell, RNNCell]
+
+        if type(mod) is torch.nn.LSTMCell:
+            qRNNCellBase = LSTMCell(
+                mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype
+            )
+        elif type(mod) is torch.nn.GRUCell:
+            qRNNCellBase = GRUCell(
+                mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype
+            )
+        elif type(mod) is torch.nn.RNNCell:
+            qRNNCellBase = RNNCell(
+                mod.input_size,
+                mod.hidden_size,
+                bias=mod.bias,
+                nonlinearity=mod.nonlinearity,
+                dtype=dtype,
+            )
+        else:
+            # Defensive only: the type was already validated above.
+            raise NotImplementedError(
+                "Only LSTMCell, GRUCell and RNNCell "
+                "are supported for QuantizedRNN for now"
+            )
+
+        def _observe_and_quantize_weight(weight):
+            # qint8: observe then quantize; otherwise just cast to float.
+            if dtype == torch.qint8:
+                weight_observer = weight_observer_method()
+                weight_observer(weight)
+                qweight = _quantize_weight(weight.float(), weight_observer)
+                return qweight
+            else:
+                return weight.float()
+
+        qRNNCellBase._packed_weight_ih = pack_weight_bias(
+            _observe_and_quantize_weight(mod.weight_ih), mod.bias_ih, dtype
+        )
+        qRNNCellBase._packed_weight_hh = pack_weight_bias(
+            _observe_and_quantize_weight(mod.weight_hh), mod.bias_hh, dtype
+        )
+        return qRNNCellBase
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        """Create a dynamic quantized cell from a reference quantized module.
+
+        Args:
+            ref_mod: reference module exposing ``input_size``, ``hidden_size``,
+                ``bias``, ``weight_ih_dtype``, ``bias_ih``, ``bias_hh``,
+                ``get_quantized_weight_ih()`` and ``get_quantized_weight_hh()``;
+                ``nonlinearity`` is forwarded to the constructor when present.
+
+        Raises:
+            AssertionError: if ``ref_mod`` has no ``weight_ih_dtype`` attribute.
+        """
+        # The message is one parenthesised expression so that both halves reach
+        # the exception; an explicit raise also survives ``python -O``.
+        if not hasattr(ref_mod, "weight_ih_dtype"):
+            raise AssertionError(
+                "We are assuming weight_ih "
+                "exists in reference module, may need to relax the assumption to support the use case"
+            )
+        if hasattr(ref_mod, "nonlinearity"):
+            qmod = cls(
+                ref_mod.input_size,
+                ref_mod.hidden_size,
+                ref_mod.bias,
+                ref_mod.nonlinearity,
+                dtype=ref_mod.weight_ih_dtype,
+            )
+        else:
+            qmod = cls(
+                ref_mod.input_size,
+                ref_mod.hidden_size,
+                ref_mod.bias,
+                dtype=ref_mod.weight_ih_dtype,
+            )
+        weight_bias_dict = {
+            "weight": {
+                "weight_ih": ref_mod.get_quantized_weight_ih(),
+                "weight_hh": ref_mod.get_quantized_weight_hh(),
+            },
+            "bias": {
+                "bias_ih": ref_mod.bias_ih,
+                "bias_hh": ref_mod.bias_hh,
+            },
+        }
+        qmod.set_weight_bias(weight_bias_dict)
+        return qmod
+
+    def _weight_bias(self):
+        """Return ``{"weight": {...}, "bias": {...}}`` read from the packed weights."""
+        # Returns a dict of weights and biases
+        weight_bias_dict: Dict[str, Dict] = {"weight": {}, "bias": {}}
+        w1, b1 = self._packed_weight_ih.__getstate__()[0]
+        w2, b2 = self._packed_weight_hh.__getstate__()[0]
+        # TODO: these can be simplified to one level? e.g. using weight_ih as key
+        # directly
+        weight_bias_dict["weight"]["weight_ih"] = w1
+        weight_bias_dict["weight"]["weight_hh"] = w2
+        weight_bias_dict["bias"]["bias_ih"] = b1
+        weight_bias_dict["bias"]["bias_hh"] = b2
+        return weight_bias_dict
+
+    def get_weight(self):
+        """Return the ``"weight"`` sub-dict of ``_weight_bias()``."""
+        return self._weight_bias()["weight"]
+
+    def get_bias(self):
+        """Return the ``"bias"`` sub-dict of ``_weight_bias()``."""
+        return self._weight_bias()["bias"]
+
+    def set_weight_bias(self, weight_bias_dict):
+        """Repack both weights from a dict shaped like the one ``_weight_bias()`` returns."""
+        # TODO: these can be simplified to one level? e.g. using weight_ih as key
+        # directly
+        self._packed_weight_ih = pack_weight_bias(
+            weight_bias_dict["weight"]["weight_ih"],
+            weight_bias_dict["bias"]["bias_ih"],
+            self.weight_dtype,
+        )
+        self._packed_weight_hh = pack_weight_bias(
+            weight_bias_dict["weight"]["weight_hh"],
+            weight_bias_dict["bias"]["bias_hh"],
+            self.weight_dtype,
+        )
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        # The packed weights are plain attributes, so they are written to the
+        # destination explicitly after the default entries.
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "_packed_weight_ih"] = self._packed_weight_ih
+        destination[prefix + "_packed_weight_hh"] = self._packed_weight_hh
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # Take the packed weights out of ``state_dict`` (this mutates it) before
+        # delegating; the parent is always called with ``strict=False``.
+        self._packed_weight_ih = state_dict.pop(prefix + "_packed_weight_ih")
+        self._packed_weight_hh = state_dict.pop(prefix + "_packed_weight_hh")
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class RNNCell(RNNCellBase):
+    r"""An Elman RNN cell with tanh or ReLU non-linearity.
+    A dynamic quantized RNNCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.RNNCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.RNNCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.RNNCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+
+    __constants__ = ["input_size", "hidden_size", "bias", "nonlinearity"]
+
+    def __init__(
+        self, input_size, hidden_size, bias=True, nonlinearity="tanh", dtype=torch.qint8
+    ):
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, dtype=dtype)
+        self.nonlinearity = nonlinearity
+
+    def _get_name(self):
+        return "DynamicQuantizedRNNCell"
+
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        self.check_forward_input(input)
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+        self.check_forward_hidden(input, hx, "")
+        if self.nonlinearity == "tanh":
+            ret = torch.ops.quantized.quantized_rnn_tanh_cell_dynamic(
+                input,
+                hx,
+                self._packed_weight_ih,
+                self._packed_weight_hh,
+                self.bias_ih,
+                self.bias_hh,
+            )
+        elif self.nonlinearity == "relu":
+            ret = torch.ops.quantized.quantized_rnn_relu_cell_dynamic(
+                input,
+                hx,
+                self._packed_weight_ih,
+                self._packed_weight_hh,
+                self.bias_ih,
+                self.bias_hh,
+            )
+        else:
+            ret = input  # TODO: remove when jit supports exception flow
+            raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}")
+        return ret
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class LSTMCell(RNNCellBase):
+    r"""A long short-term memory (LSTM) cell.
+
+    A dynamic quantized LSTMCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.LSTMCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.LSTMCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> cx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx, cx = rnn(input[i], (hx, cx))
+        ...     output.append(hx)
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, num_chunks=4, **kwargs)  # type: ignore[misc]
+
+    def _get_name(self):
+        return "DynamicQuantizedLSTMCell"
+
+    def forward(
+        self, input: Tensor, hx: Optional[tuple[Tensor, Tensor]] = None
+    ) -> tuple[Tensor, Tensor]:
+        self.check_forward_input(input)
+        if hx is None:
+            zeros = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+            hx = (zeros, zeros)
+        self.check_forward_hidden(input, hx[0], "[0]")
+        self.check_forward_hidden(input, hx[1], "[1]")
+        return torch.ops.quantized.quantized_lstm_cell_dynamic(
+            input,
+            hx,
+            self._packed_weight_ih,
+            self._packed_weight_hh,
+            self.bias_ih,
+            self.bias_hh,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class GRUCell(RNNCellBase):
+    r"""A gated recurrent unit (GRU) cell
+
+    A dynamic quantized GRUCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.GRUCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.GRUCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.GRUCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+
+    def __init__(self, input_size, hidden_size, bias=True, dtype=torch.qint8):
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, dtype=dtype)
+
+    def _get_name(self):
+        return "DynamicQuantizedGRUCell"
+
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        self.check_forward_input(input)
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+        self.check_forward_hidden(input, hx, "")
+        return torch.ops.quantized.quantized_gru_cell_dynamic(
+            input,
+            hx,
+            self._packed_weight_ih,
+            self._packed_weight_hh,
+            self.bias_ih,
+            self.bias_hh,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        return super().from_float(
+            mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..f84d41b58503ad1d86244c7aa358f09ad16acad2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/functional.py
@@ -0,0 +1,781 @@
+# mypy: allow-untyped-defs
+r"""Functional interface (quantized)."""
+
+import warnings
+
+import torch
+from torch import Tensor
+from torch.jit.annotations import BroadcastingList2
+from torch.nn.modules.utils import _pair, _triple
+
+from .modules.utils import _pair_from_first
+
+
+# Although some of the functions and docstrings are mirrored from the torch.nn,
+# we want to have them here for future changes.
+
+__all__ = [
+    "avg_pool2d",
+    "avg_pool3d",
+    "adaptive_avg_pool2d",
+    "adaptive_avg_pool3d",
+    "conv1d",
+    "conv2d",
+    "conv3d",
+    "interpolate",
+    "linear",
+    "max_pool1d",
+    "max_pool2d",
+    "celu",
+    "leaky_relu",
+    "hardtanh",
+    "hardswish",
+    "threshold",
+    "elu",
+    "hardsigmoid",
+    "clamp",
+    "upsample",
+    "upsample_bilinear",
+    "upsample_nearest",
+]
+
+
+def avg_pool2d(
+    input,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    count_include_pad=True,
+    divisor_override=None,
+):
+    r"""
+    Applies a 2D average-pooling operation over :math:`kH \times kW` regions with step size
+    :math:`sH \times sW`. The number of output features is equal to the number of
+    input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AvgPool2d` for details and output shape.
+
+    Args:
+        input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
+        kernel_size: size of the pooling region. Can be a single number or a
+          tuple `(kH, kW)`
+        stride: stride of the pooling operation. Can be a single number or a
+          tuple `(sH, sW)`. Default: :attr:`kernel_size`
+        padding: implicit zero paddings on both sides of the input. Can be a
+          single number or a tuple `(padH, padW)`. Default: 0
+        ceil_mode: when True, will use `ceil` instead of `floor` in the formula
+            to compute the output shape. Default: ``False``
+        count_include_pad: when True, will include the zero-padding in the
+            averaging calculation. Default: ``True``
+        divisor_override: if specified, it will be used as divisor, otherwise
+             size of the pooling region will be used. Default: None
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.avg_pool2d' must be quantized!")
+    return torch.nn.functional.avg_pool2d(
+        input,
+        kernel_size,
+        stride,
+        padding,
+        ceil_mode,
+        count_include_pad,
+        divisor_override,
+    )
+
+
+def avg_pool3d(
+    input,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    count_include_pad=True,
+    divisor_override=None,
+):
+    r"""
+    Applies a 3D average-pooling operation over :math:`kD \times kH \times kW` regions with step size
+    :math:`sD \times sH \times sW`. The number of output features is equal to the number of
+    input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    Args:
+        input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iD , iH , iW)`
+        kernel_size: size of the pooling region. Can be a single number or a
+          tuple `(kD, kH, kW)`
+        stride: stride of the pooling operation. Can be a single number or a
+          tuple `(sD, sH, sW)`. Default: :attr:`kernel_size`
+        padding: implicit zero paddings on both sides of the input. Can be a
+          single number or a tuple `(padD, padH, padW)`. Default: 0
+        ceil_mode: when True, will use `ceil` instead of `floor` in the formula
+            to compute the output shape. Default: ``False``
+        count_include_pad: when True, will include the zero-padding in the
+            averaging calculation. Default: ``True``
+        divisor_override: if specified, it will be used as divisor, otherwise
+             size of the pooling region will be used. Default: None
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.avg_pool3d' must be quantized!")
+    return torch.nn.functional.avg_pool3d(
+        input,
+        kernel_size,
+        stride,
+        padding,
+        ceil_mode,
+        count_include_pad,
+        divisor_override,
+    )
+
+
+def adaptive_avg_pool2d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor:
+    r"""
+    Applies a 2D adaptive average pooling over a quantized input signal composed
+    of several quantized input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool2d` for details and output shape.
+
+    Args:
+        output_size: the target output size (single integer or
+                     double-integer tuple)
+    """
+    if not input.is_quantized:
+        raise ValueError(
+            "Input to 'quantized.functional.adaptive_avg_pool2d' must be quantized!"
+        )
+    return torch.nn.functional.adaptive_avg_pool2d(input, output_size)
+
+
+def adaptive_avg_pool3d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor:
+    r"""
+    Applies a 3D adaptive average pooling over a quantized input signal composed
+    of several quantized input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool3d` for details and output shape.
+
+    Args:
+        output_size: the target output size (single integer or
+                     triple-integer tuple)
+    """
+    if not input.is_quantized:
+        raise ValueError(
+            "Input to 'quantized.functional.adaptive_avg_pool3d' must be quantized!"
+        )
+    return torch.nn.functional.adaptive_avg_pool3d(input, output_size)
+
+
+def conv1d(
+    input,
+    weight,
+    bias,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    padding_mode="zeros",
+    scale=1.0,
+    zero_point=0,
+    dtype=torch.quint8,
+):
+    r"""
+    Applies a 1D convolution over a quantized 1D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv1d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)`
+        weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kW)`
+        bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sW,)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padW,)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dW,)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the
+          number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(33, 16, 3, dtype=torch.float)
+        >>> inputs = torch.randn(20, 16, 50, dtype=torch.float)
+        >>> bias = torch.randn(33, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv1d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != "zeros":
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError(
+            "Only torch.quint8 is supported for activation tensor!"
+        )
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 3:
+        raise ValueError("Input shape must be `(N, C, L)`!")
+    stride = _pair_from_first(stride)
+    padding = _pair_from_first(padding)
+    dilation = _pair_from_first(dilation)
+
+    packed_params = torch.ops.quantized.conv1d_prepack(
+        weight, bias, stride, padding, dilation, groups
+    )
+    return torch.ops.quantized.conv1d(input, packed_params, scale, zero_point)
+
+
+def conv2d(
+    input,
+    weight,
+    bias,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    padding_mode="zeros",
+    scale=1.0,
+    zero_point=0,
+    dtype=torch.quint8,
+):
+    r"""
+    Applies a 2D convolution over a quantized 2D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv2d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
+        weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kH , kW)`
+        bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sH, sW)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padH, padW)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dH, dW)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the
+          number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(8, 4, 3, 3, dtype=torch.float)
+        >>> inputs = torch.randn(1, 4, 5, 5, dtype=torch.float)
+        >>> bias = torch.randn(8, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv2d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != "zeros":
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError(
+            "Only torch.quint8 is supported for activation tensor!"
+        )
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 4:
+        raise ValueError("Input shape must be `(N, C, H, W)`!")
+    stride = _pair(stride)
+    padding = _pair(padding)
+    dilation = _pair(dilation)
+
+    packed_params = torch.ops.quantized.conv2d_prepack(
+        weight, bias, stride, padding, dilation, groups
+    )
+    return torch.ops.quantized.conv2d(input, packed_params, scale, zero_point)
+
+
+def conv3d(
+    input,
+    weight,
+    bias,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    padding_mode="zeros",
+    scale=1.0,
+    zero_point=0,
+    dtype=torch.quint8,
+):
+    r"""
+    Applies a 3D convolution over a quantized 3D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv3d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape
+          :math:`(\text{minibatch} , \text{in\_channels} , iD , iH , iW)`
+        weight: quantized filters of shape
+          :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kD , kH , kW)`
+        bias: **non-quantized** bias tensor of shape
+          :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sD, sH, sW)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padD, padH, padW)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dD, dH, dW)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be
+          divisible by the number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for
+          quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(8, 4, 3, 3, 3, dtype=torch.float)
+        >>> inputs = torch.randn(1, 4, 5, 5, 5, dtype=torch.float)
+        >>> bias = torch.randn(8, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv3d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != "zeros":
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError(
+            "Only torch.quint8 is supported for activation tensor!"
+        )
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 5:
+        raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+    stride = _triple(stride)
+    padding = _triple(padding)
+    dilation = _triple(dilation)
+
+    packed_params = torch.ops.quantized.conv3d_prepack(
+        weight, bias, stride, padding, dilation, groups
+    )
+    return torch.ops.quantized.conv3d(input, packed_params, scale, zero_point)
+
+
+def interpolate(
+    input, size=None, scale_factor=None, mode="nearest", align_corners=None
+):
+    r"""Down/up samples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+
+    See :func:`torch.nn.functional.interpolate` for implementation details.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D/3D input is supported for quantized inputs
+
+    .. note:: Only the following modes are supported for the quantized inputs:
+
+        - `bilinear`
+        - `nearest`
+
+    Args:
+        input (Tensor): the input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'bilinear'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+            If set to ``False``, the input and output tensors are aligned by the corner
+            points of their corner pixels, and the interpolation uses edge value padding
+            for out-of-boundary values, making this operation *independent* of input size
+            when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
+            is ``'bilinear'``.
+            Default: ``False``
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.interpolate' must be quantized!")
+    return torch.nn.functional.interpolate(
+        input, size, scale_factor, mode, align_corners
+    )
+
+
+def linear(
+    input: Tensor,
+    weight: Tensor,
+    bias: Tensor | None = None,
+    scale: float | None = None,
+    zero_point: int | None = None,
+) -> Tensor:
+    r"""
+    Applies a linear transformation to the incoming quantized data:
+    :math:`y = xA^T + b`.
+    See :class:`~torch.ao.nn.quantized.Linear`
+
+    .. note::
+
+      The current implementation packs the weights on every call, which incurs a performance penalty.
+      If you want to avoid the overhead, use :class:`~torch.ao.nn.quantized.Linear`.
+
+    Args:
+      input (Tensor): Quantized input of type `torch.quint8`
+      weight (Tensor): Quantized weight of type `torch.qint8`
+      bias (Tensor): None or fp32 bias of type `torch.float`
+      scale (double): output scale. If None, derived from the input scale
+      zero_point (long): output zero point. If None, derived from the input zero_point
+
+    Shape:
+        - Input: :math:`(N, *, in\_features)` where `*` means any number of
+          additional dimensions
+        - Weight: :math:`(out\_features, in\_features)`
+        - Bias: :math:`(out\_features)`
+        - Output: :math:`(N, *, out\_features)`
+    """
+    if scale is None:
+        scale = input.q_scale()
+    if zero_point is None:
+        zero_point = input.q_zero_point()
+    _packed_params = torch.ops.quantized.linear_prepack(weight, bias)
+    return torch.ops.quantized.linear(input, _packed_params, scale, zero_point)
+
+
+def max_pool1d(
+    input,
+    kernel_size,
+    stride=None,
+    padding=0,
+    dilation=1,
+    ceil_mode=False,
+    return_indices=False,
+):
+    r"""Applies a 1D max pooling over a quantized input signal composed of
+    several quantized input planes.
+
+    .. note:: The input quantization parameters are propagated to the output.
+
+    See :class:`~torch.ao.nn.quantized.MaxPool1d` for details.
+    """
+    if return_indices:
+        raise NotImplementedError("return_indices is not yet implemented!")
+    if stride is None:
+        stride = torch.jit.annotate(list[int], [])
+    return torch.nn.functional.max_pool1d(
+        input,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        ceil_mode=ceil_mode,
+        return_indices=return_indices,
+    )
+
+
+def max_pool2d(
+    input,
+    kernel_size,
+    stride=None,
+    padding=0,
+    dilation=1,
+    ceil_mode=False,
+    return_indices=False,
+):
+    r"""Applies a 2D max pooling over a quantized input signal composed of
+    several quantized input planes.
+
+    .. note:: The input quantization parameters are propagated to the output.
+
+    See :class:`~torch.ao.nn.quantized.MaxPool2d` for details.
+    """
+    if return_indices:
+        raise NotImplementedError("return_indices is not yet implemented!")
+    if stride is None:
+        stride = torch.jit.annotate(list[int], [])
+    return torch.nn.functional.max_pool2d(
+        input,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        ceil_mode=ceil_mode,
+        return_indices=return_indices,
+    )
+
+
+def celu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.0) -> Tensor:
+    r"""celu(input, scale, zero_point, alpha=1.) -> Tensor
+
+    Applies the quantized CELU function element-wise.
+
+    .. math::
+        \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x / \alpha) - 1))
+
+    Args:
+        input: quantized input
+        alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.celu' must be quantized!")
+    return torch.ops.quantized.celu(input, scale, zero_point, alpha)
+
+
+def leaky_relu(
+    input: Tensor,
+    negative_slope: float = 0.01,
+    inplace: bool = False,
+    scale: float | None = None,
+    zero_point: int | None = None,
+):
+    r"""
+    Quantized version of :func:`~torch.nn.functional.leaky_relu`:
+    leaky_relu(input, negative_slope=0.01, inplace=False, scale=None, zero_point=None) -> Tensor
+
+    Applies element-wise,
+    :math:`\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)`
+
+    Args:
+        input: Quantized input
+        negative_slope: The slope of the negative input
+        inplace: Inplace modification of the input tensor
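+        scale, zero_point: Scale and zero point of the output tensor. Both must be given (and ``inplace`` must be False) to requantize the output; otherwise they are ignored.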
+
+    See :class:`~torch.nn.LeakyReLU` for more details.
+    """
+    if scale is not None and zero_point is not None:
+        assert not inplace, "Cannot rescale with `inplace`"
+        output = torch._empty_affine_quantized(
+            input.shape, scale=scale, zero_point=int(zero_point), dtype=input.dtype
+        )
+        torch._C._nn.leaky_relu(input, negative_slope, out=output)
+        return output
+    if inplace:
+        result = torch._C._nn.leaky_relu_(input, negative_slope)
+    else:
+        result = torch._C._nn.leaky_relu(input, negative_slope)
+    return result
+
+
+def hardtanh(
+    input: Tensor, min_val: float = -1.0, max_val: float = 1.0, inplace: bool = False
+) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardtanh`."""
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardtanh' must be quantized!")
+    if inplace:
+        return torch._C._nn.hardtanh_(input, min_val, max_val)
+    return torch._C._nn.hardtanh(input, min_val, max_val)
+
+
+def hardswish(input: Tensor, scale: float, zero_point: int) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardswish`.
+
+    Args:
+        input: quantized input
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardswish' must be quantized!")
+    return torch._ops.ops.quantized.hardswish(input, scale, zero_point)
+
+
+def threshold(input: Tensor, threshold: float, value: float) -> Tensor:
+    r"""Applies the quantized version of the threshold function element-wise:
+
+    .. math::
+        x = \begin{cases}
+                x & \text{if~} x > \text{threshold} \\
+                \text{value} & \text{otherwise}
+            \end{cases}
+
+    See :class:`~torch.nn.Threshold` for more details.
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.threshold' must be quantized!")
+    if threshold is None:
+        raise ValueError("Input to 'threshold' must be specified!")
+    if value is None:
+        raise ValueError("Input to 'value' must be specified!")
+    return torch._ops.ops.quantized.threshold(input, threshold, value)
+
+
+def elu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.0) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.elu`.
+
+    Args:
+        input: quantized input
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        alpha: the alpha constant
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.elu' must be quantized!")
+    return torch.ops.quantized.elu(input, scale, zero_point, alpha)
+
+
+def hardsigmoid(input: Tensor, inplace: bool = False) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardsigmoid`."""
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardsigmoid' must be quantized!")
+    if inplace:
+        return torch._C._nn.hardsigmoid_(input)  # type: ignore[attr-defined]
+    return torch._C._nn.hardsigmoid(input)
+
+
+def clamp(input: Tensor, min_: float, max_: float) -> Tensor:
+    r"""clamp(input, min\_, max\_) -> Tensor
+
+    Applies the clamp function element-wise.
+    See :func:`torch.clamp` for more details.
+
+    Args:
+        input: quantized input
+        min_: minimum value for clamping
+        max_: maximum value for clamping
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.clamp' must be quantized!")
+    return torch.clamp(input, min_, max_)
+
+
+def upsample(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    r"""Upsamples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`.
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to ``nn.quantized.functional.interpolate(...)``.
+
+    See :func:`torch.nn.functional.interpolate` for implementation details.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D input is supported for quantized inputs
+
+    .. note:: Only the following modes are supported for the quantized inputs:
+
+        - `bilinear`
+        - `nearest`
+
+    Args:
+        input (Tensor): quantized input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to be an integer.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'bilinear'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+            If set to ``False``, the input and output tensors are aligned by the corner
+            points of their corner pixels, and the interpolation uses edge value padding
+            for out-of-boundary values, making this operation *independent* of input size
+            when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
+            is ``'bilinear'``.
+            Default: ``False``
+
+    .. warning::
+        With ``align_corners = True``, the linearly interpolating modes
+        (`bilinear`) don't proportionally align the
+        output and input pixels, and thus the output values can depend on the
+        input size. This was the default behavior for these modes up to version
+        0.3.1. Since then, the default behavior is ``align_corners = False``.
+        See :class:`~torch.nn.Upsample` for concrete examples on how this
+        affects the outputs.
+    """
+    warnings.warn(
+        "nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.",
+        stacklevel=2,  # attribute the warning to the caller of upsample
+    )
+    return interpolate(input, size, scale_factor, mode, align_corners)
+
+
+def upsample_bilinear(input, size=None, scale_factor=None):
+    r"""Upsamples the input, using bilinear upsampling.
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to
+        ``nn.quantized.functional.interpolate(..., mode='bilinear', align_corners=True)``.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D inputs are supported
+
+    Args:
+        input (Tensor): quantized input
+        size (int or Tuple[int, int]): output spatial size.
+        scale_factor (int or Tuple[int, int]): multiplier for spatial size
+    """
+    # No warning category is passed because DeprecationWarning is ignored by default
+    warnings.warn(
+        "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.",
+        stacklevel=2,  # attribute the warning to the caller of upsample_bilinear
+    )
+    return interpolate(input, size, scale_factor, mode="bilinear", align_corners=True)
+
+
+def upsample_nearest(input, size=None, scale_factor=None):
+    r"""Upsamples the input, using nearest neighbours' pixel values.
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to ``nn.quantized.functional.interpolate(..., mode='nearest')``.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D inputs are supported
+
+    Args:
+        input (Tensor): quantized input
+        size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial
+            size.
+        scale_factor (int): multiplier for spatial size. Has to be an integer.
+    """
+    # No warning category is passed because DeprecationWarning is ignored by default
+    warnings.warn(
+        "nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead.",
+        stacklevel=2,  # attribute the warning to the caller of upsample_nearest
+    )
+    return interpolate(input, size, scale_factor, mode="nearest")
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3bad8c49350f56e5e58235570799a8d0968296d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__init__.py
@@ -0,0 +1,162 @@
+# mypy: allow-untyped-defs
+import torch
+
+# The quantized modules use `torch.nn` and `torch.ao.nn.quantizable`
+# packages. However, the `quantizable` package uses "lazy imports"
+# to avoid circular dependency.
+# Hence we need to include it here to make sure it is resolved before
+# they are used in the modules.
+import torch.ao.nn.quantizable
+from torch.nn.modules.pooling import MaxPool2d
+
+from .activation import (
+    ELU,
+    Hardswish,
+    LeakyReLU,
+    MultiheadAttention,
+    PReLU,
+    ReLU6,
+    Sigmoid,
+    Softmax,
+)
+from .batchnorm import BatchNorm2d, BatchNorm3d
+from .conv import (
+    Conv1d,
+    Conv2d,
+    Conv3d,
+    ConvTranspose1d,
+    ConvTranspose2d,
+    ConvTranspose3d,
+)
+from .dropout import Dropout
+from .embedding_ops import Embedding, EmbeddingBag
+from .functional_modules import FloatFunctional, FXFloatFunctional, QFunctional
+from .linear import Linear
+from .normalization import (
+    GroupNorm,
+    InstanceNorm1d,
+    InstanceNorm2d,
+    InstanceNorm3d,
+    LayerNorm,
+)
+from .rnn import LSTM
+
+
+__all__ = [
+    "BatchNorm2d",
+    "BatchNorm3d",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "DeQuantize",
+    "ELU",
+    "Embedding",
+    "EmbeddingBag",
+    "GroupNorm",
+    "Hardswish",
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+    "LayerNorm",
+    "LeakyReLU",
+    "Linear",
+    "LSTM",
+    "MultiheadAttention",
+    "Quantize",
+    "ReLU6",
+    "Sigmoid",
+    "Softmax",
+    "Dropout",
+    "PReLU",
+    # Wrapper modules
+    "FloatFunctional",
+    "FXFloatFunctional",
+    "QFunctional",
+]
+
+
+class Quantize(torch.nn.Module):
+    r"""Quantizes an incoming tensor with ``torch.quantize_per_tensor``.
+
+    Args:
+     `scale`: scale of the output Quantized Tensor
+     `zero_point`: zero_point of output Quantized Tensor
+     `dtype`: data type of output Quantized Tensor
+     `factory_kwargs`: Dictionary of kwargs used for configuring initialization
+         of internal buffers. Currently, `device` and `dtype` are supported.
+         Example: `factory_kwargs={'device': 'cuda', 'dtype': torch.float64}`
+         will initialize internal buffers as type `torch.float64` on the current CUDA device.
+         Note that `dtype` only applies to floating-point buffers.
+
+    Examples::
+        >>> t = torch.tensor([[1., -1.], [1., -1.]])
+        >>> scale, zero_point, dtype = 1.0, 2, torch.qint8
+        >>> qm = Quantize(scale, zero_point, dtype)
+        >>> # xdoctest: +SKIP
+        >>> qt = qm(t)
+        >>> print(qt)
+        tensor([[ 1., -1.],
+                [ 1., -1.]], size=(2, 2), dtype=torch.qint8, scale=1.0, zero_point=2)
+    """
+
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(self, scale, zero_point, dtype, factory_kwargs=None):
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        super().__init__()
+        self.register_buffer("scale", torch.tensor([scale], **factory_kwargs))
+        self.register_buffer(
+            "zero_point",
+            torch.tensor(
+                [zero_point],
+                dtype=torch.long,  # zero_point is always stored as an integer buffer
+                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},  # avoid a duplicate dtype kwarg
+            ),
+        )
+        self.dtype = dtype  # target quantized dtype, passed to quantize_per_tensor in forward
+
+    def forward(self, X):
+        return torch.quantize_per_tensor(
+            X, float(self.scale), int(self.zero_point), self.dtype  # one-element buffers -> Python scalars
+        )
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        assert hasattr(mod, "activation_post_process")  # qparams are taken from this attribute
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Quantize(
+            scale.float().item(),
+            zero_point.long().item(),
+            mod.activation_post_process.dtype,
+        )
+
+    def extra_repr(self):
+        return f"scale={self.scale}, zero_point={self.zero_point}, dtype={self.dtype}"
+
+
+class DeQuantize(torch.nn.Module):
+    r"""Dequantizes an incoming tensor by calling its ``dequantize()`` method.
+
+    Examples::
+        >>> input = torch.tensor([[1., -1.], [1., -1.]])
+        >>> scale, zero_point, dtype = 1.0, 2, torch.qint8
+        >>> qm = Quantize(scale, zero_point, dtype)
+        >>> # xdoctest: +SKIP
+        >>> quantized_input = qm(input)
+        >>> dqm = DeQuantize()
+        >>> dequantized = dqm(quantized_input)
+        >>> print(dequantized)
+        tensor([[ 1., -1.],
+                [ 1., -1.]], dtype=torch.float32)
+    """
+
+    def forward(self, Xq):
+        return Xq.dequantize()
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # neither argument is read
+        return DeQuantize()  # the module holds no state, so a fresh instance suffices
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6267c296257ac64d0540d25117ef3dc11b78c39d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb774117527e192d03049ea2a01ce0d044d64f9f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21f24a6b22fa14f5d48eaae306d23ce30de56010
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fe8d3d70361bced0c2150fdb2145f2741941bbc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f1a4ccc6b5e2c83389f25e439d721e66ae46180
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cb1a4ea9e0dd1d940740d754c1ac4421bc0bfa9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2b47e95416e269cbdfab48594d4beb5a1a3c72d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f835eaaeca3a8fc7fcd688e1366b41440e568f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..376c4b0891ef56e2fb28d5d3c6e51c7683e826e0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52b634c99c8e2a42ac4152dd199ddefdf4febeb2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6281545f7d827e3ae4ab96af82b19f8d658dbd8d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ecf1d5c9a1e2c198d89f284e109dd9410994b60
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/activation.py
@@ -0,0 +1,351 @@
+# mypy: allow-untyped-defs
+from warnings import warn
+
+import torch
+
+
+__all__ = [
+    "ReLU6",
+    "Hardswish",
+    "ELU",
+    "LeakyReLU",
+    "Sigmoid",
+    "Softmax",
+    "MultiheadAttention",
+    "PReLU",
+]
+
+
+class ReLU6(torch.nn.ReLU):
+    r"""Applies the element-wise function:
+
+    :math:`\text{ReLU6}(x) = \min(\max(x_0, x), q(6))`, where :math:`x_0` is the
+    zero_point, and :math:`q(6)` is the quantized representation of number 6.
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    .. image:: ../scripts/activation_images/ReLU6.png
+
+    Examples::
+
+        >>> m = nn.quantized.ReLU6()
+        >>> input = torch.randn(2)
+        >>> # xdoctest: +SKIP
+        >>> input = torch.quantize_per_tensor(input, 1.0, 0, dtype=torch.qint32)
+        >>> output = m(input)
+    """
+
+    def __init__(self, inplace=False):
+        super().__init__(inplace)
+        self.inplace = inplace  # presumably already set by the parent constructor; kept explicit
+
+    def forward(self, input):
+        return torch.ops.quantized.relu6(input, self.inplace)
+
+    def _get_name(self):
+        return "QuantizedReLU6"
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        return ReLU6(mod.inplace)
+
+
+class Hardswish(torch.nn.Hardswish):
+    r"""This is the quantized version of :class:`~torch.nn.Hardswish`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+
+    def __init__(self, scale, zero_point, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("scale", torch.tensor(scale, **factory_kwargs))  # stored as a buffer
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.hardswish(input, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return "QuantizedHardswish"
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Hardswish(float(scale), int(zero_point))
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # mod is not read; only the qparams are used
+        return cls(float(scale), int(zero_point))
+
+
+class ELU(torch.nn.ELU):
+    r"""This is the quantized equivalent of :class:`~torch.nn.ELU`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        alpha: the alpha constant, forwarded to the parent ``torch.nn.ELU`` constructor
+    """
+
+    def __init__(self, scale, zero_point, alpha=1.0):
+        super().__init__(alpha)
+        self.scale = scale  # plain attributes here, not registered buffers
+        self.zero_point = zero_point
+
+    def forward(self, input):
+        return torch.ao.nn.quantized.functional.elu(
+            input, self.scale, self.zero_point, self.alpha
+        )
+
+    def _get_name(self):
+        return "QuantizedELU"
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return ELU(float(scale), int(zero_point), mod.alpha)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # only mod.alpha is taken from mod
+        return cls(float(scale), int(zero_point), mod.alpha)
+
+
+class LeakyReLU(torch.nn.LeakyReLU):
+    r"""This is the quantized equivalent of :class:`~torch.nn.LeakyReLU`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        negative_slope: Controls the angle of the negative slope. Default: 1e-2
+    """
+
+    def __init__(
+        self,
+        scale: float,
+        zero_point: int,
+        negative_slope: float = 1e-2,
+        inplace: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(negative_slope, inplace)
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("scale", torch.tensor(scale, **factory_kwargs))  # stored as a buffer
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("zero_point", torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.leaky_relu(
+            input, self.negative_slope, self.inplace, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedLeakyReLU"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # slope and inplace are copied from mod
+        return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace)
+
+
+class Sigmoid(torch.nn.Sigmoid):
+    r"""This is the quantized equivalent of :class:`~torch.nn.Sigmoid`.
+
+    Args:
+        output_scale: quantization scale of the output tensor
+        output_zero_point: quantization zero point of the output tensor
+    """
+
+    def __init__(self, output_scale: float, output_zero_point: int):
+        super().__init__()
+        self.output_scale = output_scale  # plain attributes here, not registered buffers
+        self.output_zero_point = output_zero_point
+
+    def forward(self, input):
+        return torch.ops.quantized.sigmoid(
+            input, self.output_scale, self.output_zero_point
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        (
+            output_scale,
+            output_zero_point,
+        ) = mod.activation_post_process.calculate_qparams()
+        return cls(float(output_scale), int(output_zero_point))
+
+
+class Softmax(torch.nn.Softmax):
+    r"""This is the quantized version of :class:`~torch.nn.Softmax`.
+
+    Args:
+        dim: A dimension along which Softmax will be computed (so every slice along dim will sum to 1).
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+
+    def __init__(self, dim=None, scale=1.0, zero_point=0):
+        super().__init__()
+        self.dim = dim  # assigned directly rather than passed to the parent constructor
+        self.scale = scale
+        self.zero_point = zero_point
+
+    def forward(self, input):
+        dim = self.dim
+        if dim is None:  # no dim configured: fall back to the helper in torch.nn.functional
+            stacklevel = 3
+            # Note: adding the mypy ignore on _get_softmax_dim seems less bad
+            # than making `_get_softmax_dim` an official API.
+            dim = torch.nn.functional._get_softmax_dim(  # type: ignore[attr-defined]
+                "softmax", input.dim(), stacklevel
+            )
+        return torch.ops.quantized.softmax(input, dim, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return "QuantizedSoftmax"
+
+    @staticmethod
+    def from_float(mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Softmax(mod.dim, float(scale), int(zero_point))
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # only mod.dim is taken from mod
+        return cls(mod.dim, float(scale), int(zero_point))
+
+
+class MultiheadAttention(torch.ao.nn.quantizable.MultiheadAttention):
+    # pyrefly: ignore [bad-override]
+    _FLOAT_MODULE = torch.ao.nn.quantizable.MultiheadAttention
+
+    def _get_name(self):
+        return "QuantizedMultiheadAttention"
+
+    @classmethod
+    def from_float(cls, other):
+        # The whole flow is float -> observed -> quantized
+        # This class does observed -> quantized only
+        raise NotImplementedError(
+            "It looks like you are trying to convert a "
+            "non-observed MHA module. Please, see "
+            "the examples on quantizable MHAs."
+        )
+
+    @classmethod
+    def from_observed(cls, other):
+        converted = torch.ao.quantization.convert(
+            other,
+            mapping=None,
+            inplace=False,
+            remove_qconfig=True,
+            convert_custom_config_dict=None,
+        )
+        converted.__class__ = cls
+        # Remove the parameters for the bias_k and bias_v to quantize them
+        # TODO: This is a potential source of accuracy drop.
+        #       quantized cat takes the scale and zp of the first
+        #       element, which might lose the precision in the bias_k
+        #       and the bias_v (which are cat'ed with k/v being first).
+        if converted.bias_k is not None:
+            bias_k = converted._parameters.pop("bias_k")
+            sc, zp = torch._choose_qparams_per_tensor(bias_k, reduce_range=False)
+            bias_k = torch.quantize_per_tensor(bias_k, sc, zp, torch.quint8)
+            setattr(converted, "bias_k", bias_k)  # noqa: B010
+
+        if converted.bias_v is not None:
+            bias_v = converted._parameters.pop("bias_v")
+            sc, zp = torch._choose_qparams_per_tensor(
+                bias_k,  # type: ignore[possibly-undefined]
+                reduce_range=False,
+            )
+            bias_v = torch.quantize_per_tensor(bias_v, sc, zp, torch.quint8)
+            setattr(converted, "bias_v", bias_v)  # noqa: B010
+
+        del converted.in_proj_weight
+        del converted.in_proj_bias
+
+        return converted
+
+
+class PReLU(torch.nn.Module):
+    r"""This is the quantized equivalent of :class:`~torch.nn.PReLU`.
+
+    Args:
+        output_scale: quantization scale of the output tensor
+        output_zero_point: quantization zero point of the output tensor
+        num_parameters: number of parameters: 1, or the number of channels at input. Default: 1
+    """
+
+    def __init__(
+        self, output_scale: float, output_zero_point: int, num_parameters: int = 1
+    ) -> None:
+        super().__init__()
+        self.num_parameters = num_parameters
+        self.scale = output_scale
+        self.zero_point = output_zero_point
+        w = torch.randn(num_parameters, dtype=torch.float)  # placeholder; from_float/from_reference replace it
+        qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.quint8)
+        self.set_weight(qw)
+
+    def set_weight(self, w: torch.Tensor) -> None:
+        self.weight = w
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.prelu(
+            input, self.weight, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedPReLU"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        qprelu = cls(float(scale), int(zero_point), mod.num_parameters)
+        float_wt = mod.weight.float()
+        observer = mod.qconfig.weight()
+        observer(float_wt)  # run the weight through the observer before reading its qparams
+        if observer.dtype != torch.quint8:  # warn only; the weight is still quantized as quint8 below
+            warn(
+                f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}",
+                stacklevel=2,
+            )
+        wt_scale, wt_zp = observer.calculate_qparams()
+        qweight = torch.quantize_per_tensor(
+            float_wt, float(wt_scale), int(wt_zp), torch.quint8
+        )
+        qprelu.set_weight(qweight)
+        return qprelu
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # same weight handling as from_float
+        qprelu = cls(float(scale), int(zero_point), mod.num_parameters)
+        float_wt = mod.weight.float()
+        observer = mod.qconfig.weight()
+        observer(float_wt)  # run the weight through the observer before reading its qparams
+        if observer.dtype != torch.quint8:  # warn only; the weight is still quantized as quint8 below
+            warn(
+                f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}",
+                stacklevel=2,
+            )
+        wt_scale, wt_zp = observer.calculate_qparams()
+        qweight = torch.quantize_per_tensor(
+            float_wt, float(wt_scale), int(wt_zp), torch.quint8
+        )
+        qprelu.set_weight(qweight)
+        return qprelu
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e6779c08b1f6af61c2377335b984c7f75a29a6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/batchnorm.py
@@ -0,0 +1,130 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.intrinsic as nni
+
+
+__all__ = ["BatchNorm2d", "BatchNorm3d"]
+
+
+class _BatchNorm(torch.nn.modules.batchnorm._BatchNorm):
+    def __init__(
+        self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(num_features, eps, momentum, True, True, **factory_kwargs)  # NOTE(review): True, True presumably affine / track_running_stats - confirm
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("scale", torch.tensor(1.0, **factory_kwargs))  # output scale, initial value 1.0
+        # pyrefly: ignore [bad-argument-type]
+        self.register_buffer("zero_point", torch.tensor(0, **factory_kwargs))  # output zero point, initial value 0
+
+    @staticmethod  # takes cls explicitly; BatchNorm2d/BatchNorm3d.from_float pass their own class
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # the flag is not read by this method
+        activation_post_process = mod.activation_post_process  # read before mod is possibly unwrapped below
+        if type(mod) is cls._NNI_BN_RELU_MODULE:
+            mod = mod[0]  # presumably the batch-norm child of the fused module - confirm
+        scale, zero_point = activation_post_process.calculate_qparams()
+        new_mod = cls(mod.num_features, mod.eps)  # momentum is left at the constructor default
+        new_mod.weight = mod.weight
+        new_mod.bias = mod.bias
+        new_mod.running_mean = mod.running_mean
+        new_mod.running_var = mod.running_var
+        new_mod.scale = scale
+        new_mod.zero_point = zero_point
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, bn, output_scale, output_zero_point):
+        qbn = cls(
+            bn.num_features,
+            bn.eps,
+            bn.momentum,
+            device=bn.weight.device,
+            dtype=bn.weight.dtype,
+        )
+        qbn.weight = bn.weight
+        qbn.bias = bn.bias
+        qbn.running_mean = bn.running_mean
+        qbn.running_var = bn.running_var
+        qbn.scale = output_scale
+        qbn.zero_point = output_zero_point
+        return qbn
+
+
+class BatchNorm2d(_BatchNorm):
+    r"""This is the quantized version of :class:`~torch.nn.BatchNorm2d`."""
+
+    _NNI_BN_RELU_MODULE = nni.BNReLU2d  # type compared against in _BatchNorm.from_float
+
+    def __init__(
+        self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(num_features, eps, momentum, **factory_kwargs)
+
+    def _get_name(self):
+        return "QuantizedBatchNorm2d"
+
+    def _check_input_dim(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # disabling this since this is not symbolically traceable
+        # self._check_input_dim(input)
+        return torch.ops.quantized.batch_norm2d(
+            input,
+            self.weight,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+            self.eps,
+            self.scale,
+            self.zero_point,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        return _BatchNorm.from_float(  # shared implementation; cls is passed explicitly
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class BatchNorm3d(_BatchNorm):
+    r"""This is the quantized version of :class:`~torch.nn.BatchNorm3d`."""
+
+    _NNI_BN_RELU_MODULE = nni.BNReLU3d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(num_features, eps, momentum, **factory_kwargs)
+
+    def _get_name(self):
+        return "QuantizedBatchNorm3d"
+
+    def _check_input_dim(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # disabling this since this is not symbolically traceable
+        # self._check_input_dim(input)
+        return torch.ops.quantized.batch_norm3d(
+            input,
+            self.weight,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+            self.eps,
+            self.scale,
+            self.zero_point,
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        return _BatchNorm.from_float(
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..a292d616a86c31d22550faa7d38d256350e4e91a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/conv.py
@@ -0,0 +1,1244 @@
+# mypy: allow-untyped-defs
+r"""Quantized convolution modules."""
+
+from typing import ClassVar, Literal, Optional
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.nn as nn
+import torch.nn.functional as F
+from torch._ops import ops
+from torch.nn.common_types import _size_1_t
+from torch.nn.modules.utils import _pair, _single, _triple
+from torch.nn.utils import fuse_conv_bn_weights
+
+from .utils import _quantize_weight, WeightedQuantizedModule
+
+
+__all__ = [
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+]
+
+_SUPPORTED_PADDING = {"zeros", "reflect"}
+
+
+def _reverse_repeat_padding(padding: list[int]) -> list[int]:
+    """Return the entries of ``padding`` in reverse order, each one listed twice."""
+    doubled: list[int] = []
+    for amount in reversed(padding):
+        doubled += [amount, amount]
+    return doubled
+
+
+class _ConvNd(WeightedQuantizedModule):  # shared base of the quantized conv modules below
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # All subclasses have this signature (see PR #49702); they build state via _init().
+        raise NotImplementedError
+
+    def _init(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        bias,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups")
+        if out_channels % groups != 0:
+            raise ValueError("out_channels must be divisible by groups")
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.transposed = transposed
+        self.output_padding = output_padding
+        self.groups = groups
+        if padding_mode not in _SUPPORTED_PADDING:  # module-level allow-list
+            raise ValueError(
+                f"'padding_mode' {padding_mode} is not supported by quantized convolution"
+            )
+        self.padding_mode = padding_mode
+        # Initialize as NCHW. set_weight will internally transpose to NHWC.
+        if self.transposed:
+            weight_shape = [in_channels, out_channels // self.groups]  # transposed: in first
+        else:
+            weight_shape = [out_channels, in_channels // self.groups]
+        qweight = torch._empty_affine_quantized(
+            weight_shape + list(kernel_size),
+            scale=1,
+            zero_point=0,
+            dtype=torch.qint8,  # placeholder weight is always qint8; caller's dtype is dropped
+            **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
+        )
+        bias_float = (
+            torch.zeros(
+                out_channels,
+                dtype=torch.float,  # bias is kept as a float tensor
+                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
+            )
+            if bias
+            else None
+        )
+
+        self.set_weight_bias(qweight, bias_float)  # implemented by the concrete subclass
+        self.scale = 1.0  # default output scale; overwritten by get_qconv/from_reference
+        self.zero_point = 0  # default output zero point; overwritten likewise
+
+    def set_weight_bias(self, qweight, bias_float):  # abstract: subclasses pack the params
+        raise NotImplementedError
+
+    def bias(self):  # abstract: subclasses return the bias (may be None, see extra_repr)
+        raise NotImplementedError
+
+    def _weight_bias(self):  # abstract: subclasses return the (weight, bias) pair
+        raise NotImplementedError
+
+    def extra_repr(self):  # builds the argument summary string; defaults are omitted
+        s = (
+            "{in_channels}, {out_channels}, kernel_size={kernel_size}"
+            ", stride={stride}, scale={scale}, zero_point={zero_point}"
+        )
+        if self.padding != (0,) * len(self.padding):
+            s += ", padding={padding}"
+        if self.dilation != (1,) * len(self.dilation):
+            s += ", dilation={dilation}"
+        if self.output_padding != (0,) * len(self.output_padding):
+            s += ", output_padding={output_padding}"
+        if self.groups != 1:
+            s += ", groups={groups}"
+        if self.bias() is None:
+            s += ", bias=False"
+        return s.format(**self.__dict__)  # placeholders are filled from instance attributes
+
+    # ===== Serialization methods =====
+    # The special consideration here is that we have to unpack the weights into
+    # their regular QTensor form for serialization. Packed weights should not
+    # live outside the process in which they were created, rather they should be
+    # derived from the QTensor weight.
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # TODO: maybe change to this when https://github.com/pytorch/pytorch/pull/32958 is landed
+    #   self
+    #   |--- _packed_params : Conv2dPackedParamsBase or Conv3dPackedParamsBase
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        (w, b) = self._weight_bias()  # unpacked weight/bias supplied by the subclass
+        destination[prefix + "weight"] = w
+        destination[prefix + "bias"] = b
+        destination[prefix + "scale"] = torch.tensor(self.scale)  # scalars stored as tensors
+        destination[prefix + "zero_point"] = torch.tensor(self.zero_point)
+
+    @torch.jit.export
+    def __getstate__(self):
+        (w, b) = self._weight_bias()
+        return (  # positional layout must stay in sync with the indices in __setstate__
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.transposed,
+            self.output_padding,
+            self.groups,
+            self.padding_mode,
+            w,
+            b,
+            self.scale,
+            self.zero_point,
+            self.training,
+        )
+
+    # ===== Deserialization methods =====
+    # Counterpart to the serialization methods, we must pack the serialized
+    # QTensor weight into its packed format for use by the FBGEMM ops.
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        self.set_weight_bias(state_dict[prefix + "weight"], state_dict[prefix + "bias"])
+        state_dict.pop(prefix + "weight")  # consumed keys are removed before the base call
+        state_dict.pop(prefix + "bias")
+        self.scale = float(state_dict[prefix + "scale"])
+        state_dict.pop(prefix + "scale")
+        self.zero_point = int(state_dict[prefix + "zero_point"])
+        state_dict.pop(prefix + "zero_point")
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,  # ``strict`` is overridden to False; NOTE(review): reason not shown here
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @torch.jit.export
+    def __setstate__(self, state):  # inverse of __getstate__; indices mirror its tuple
+        self.in_channels = state[0]
+        self.out_channels = state[1]
+        self.kernel_size = state[2]
+        self.stride = state[3]
+        self.padding = state[4]
+        self.dilation = state[5]
+        self.transposed = state[6]
+        self.output_padding = state[7]
+        self.groups = state[8]
+        self.padding_mode = state[9]
+        self.set_weight_bias(state[10], state[11])  # re-pack from the saved weight/bias
+        self.scale = state[12]
+        self.zero_point = state[13]
+        self.training = state[14]
+
+    def __deepcopy__(self, memo):  # ``memo`` is not consulted by this implementation
+        new_instance = type(self).__new__(type(self))  # skips the subclass __init__
+        torch.nn.Module.__init__(new_instance)
+        state = self.__getstate__()
+        new_instance.__setstate__(state)
+        return new_instance
+
+    def __copy__(self):  # a shallow copy is served by the same path as a deep copy
+        return self.__deepcopy__({})
+
+    @classmethod
+    def get_qconv(cls, mod, activation_post_process, weight_post_process=None):
+        r"""Build a ``cls`` instance from ``mod`` with the given observers and return it."""
+        if weight_post_process is None:
+            weight_post_process = mod.qconfig.weight()  # fall back to the qconfig's observer
+        weight_post_process(mod.weight)
+        assert weight_post_process.dtype == torch.qint8, (
+            "Weight observer must have a dtype of qint8"
+        )
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        # the __init__ call used is the one from derived classes and not the one from _ConvNd
+        qconv = cls(
+            mod.in_channels,
+            mod.out_channels,
+            mod.kernel_size,
+            mod.stride,
+            mod.padding,
+            mod.dilation,
+            mod.groups,
+            mod.bias is not None,
+            mod.padding_mode,
+        )
+        qconv.set_weight_bias(qweight, mod.bias)
+        if (
+            activation_post_process is None
+            or activation_post_process.dtype == torch.float
+        ):
+            return qconv  # dynamic quantization doesn't need scale/zero_point
+        else:
+            act_scale, act_zp = activation_post_process.calculate_qparams()
+            qconv.scale = float(act_scale)
+            qconv.zero_point = int(act_zp)
+            return qconv
+
+    @staticmethod  # subclasses call this as _ConvNd.from_float(cls, mod, ...)
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # flag is unused in this body
+        if hasattr(mod, "weight_fake_quant"):
+            # assert type(mod) is cls.__QAT_MODULE, " nnq." + cls.__name__ + \
+            # ".from_float only works for " + cls.__QAT_MODULE.__name__
+            if type(mod) is cls._NNIQAT_CONV_BN_MODULE:
+                mod.weight, mod.bias = fuse_conv_bn_weights(  # overwrites ``mod`` in place
+                    mod.weight,
+                    mod.bias,
+                    mod.bn.running_mean,
+                    mod.bn.running_var,
+                    mod.bn.eps,
+                    mod.bn.weight,
+                    mod.bn.bias,
+                )
+            assert hasattr(mod, "activation_post_process"), (
+                "Input QAT module must have observer attached"
+            )
+            weight_post_process = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            assert type(mod) is cls._FLOAT_MODULE, (
+                " nnq."
+                + cls.__name__
+                + ".from_float only works for "
+                + cls._FLOAT_MODULE.__name__
+                + " but got:"
+                + str(type(mod))
+            )
+            assert hasattr(mod, "qconfig"), (
+                "Input float module must have qconfig defined."
+            )
+            activation_post_process = (
+                None
+                if not hasattr(mod, "activation_post_process")
+                else mod.activation_post_process
+            )
+            if type(mod) in [
+                cls._NNI_CONV_RELU_MODULE,
+                cls._NNI_CONV_ADD_MODULE,
+                cls._NNI_CONV_ADD_RELU_MODULE,
+            ]:
+                mod = mod[0]  # continue with element 0 of the fused module (presumably the conv)
+            weight_post_process = mod.qconfig.weight()
+        return cls.get_qconv(mod, activation_post_process, weight_post_process)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+        Args:
+            ref_qconv (Module): a reference quantized  module, either produced by torch.ao.quantization
+                                utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qconv = cls(
+            ref_qconv.in_channels,
+            ref_qconv.out_channels,
+            ref_qconv.kernel_size,  # type: ignore[arg-type]
+            ref_qconv.stride,  # type: ignore[arg-type]
+            ref_qconv.padding,  # type: ignore[arg-type]
+            ref_qconv.dilation,  # type: ignore[arg-type]
+            ref_qconv.groups,
+            ref_qconv.bias is not None,  # type: ignore[arg-type]
+            ref_qconv.padding_mode,
+            device=ref_qconv.weight.device,
+            dtype=ref_qconv.weight.dtype,
+        )
+        qweight = ref_qconv.get_quantized_weight()
+        qconv.set_weight_bias(qweight, ref_qconv.bias)  # replaces the placeholder params
+        qconv.scale = float(output_scale)
+        qconv.zero_point = int(output_zero_point)
+        return qconv
+
+
+class Conv1d(_ConvNd):
+    r"""Applies a 1D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv1d`.
+
+    .. note::
+        Only `zeros` and `reflect` are supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (float):       scalar for the output scale
+        zero_point (int):    scalar for the output zero point
+
+    See :class:`~torch.nn.Conv1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> m = nn.quantized.Conv1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 100)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0,
+        ...                                     dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv1d]] = nn.Conv1d
+    _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn1d
+    _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU1d
+    _NNI_CONV_ADD_MODULE: ClassVar[Optional[type[nn.Module]]] = None
+    _NNI_CONV_ADD_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = None
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        # pyrefly: ignore [bad-assignment]
+        padding = padding if isinstance(padding, str) else _single(padding)
+        dilation = _single(dilation)
+
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,  # transposed
+            _single(0),  # output_padding
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv1d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == "zeros":
+            self._packed_params = torch.ops.quantized.conv1d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups
+            )
+        else:  # other modes are padded with F.pad in forward(), so zero padding is packed
+            self._packed_params = torch.ops.quantized.conv1d_prepack(
+                w, b, self.stride, _pair(0), self.dilation, self.groups
+            )
+
+    def _weight_bias(self):
+        w, b = torch.ops.quantized.conv1d_unpack(self._packed_params)
+        return w, b
+
+    def weight(self):  # first element of the unpacked (weight, bias) pair
+        return self._weight_bias()[0]
+
+    def bias(self):  # second element of the unpacked (weight, bias) pair
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != "zeros":
+            # Use only the first padding entry (p,). NOTE(review): original note says storage is (p, p) - confirm
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv1d(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class Conv2d(_ConvNd):
+    r"""Applies a 2D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv2d`.
+
+    .. note::
+        Only `zeros` and `reflect` are supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (float):       scalar for the output scale
+        zero_point (int):    scalar for the output zero point
+
+    See :class:`~torch.nn.Conv2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.Conv2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv2d]] = nn.Conv2d
+    _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn2d
+    _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU2d
+    _NNI_CONV_ADD_MODULE: ClassVar[type[nni.ConvAdd2d]] = nni.ConvAdd2d
+    _NNI_CONV_ADD_RELU_MODULE: ClassVar[type[nni.ConvAddReLU2d]] = nni.ConvAddReLU2d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,  # transposed
+            _pair(0),  # output_padding
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv2d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == "zeros":
+            self._packed_params = torch.ops.quantized.conv2d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups
+            )
+        else:  # other modes are padded with F.pad in forward(), so zero padding is packed
+            self._packed_params = torch.ops.quantized.conv2d_prepack(
+                w, b, self.stride, _pair(0), self.dilation, self.groups
+            )
+
+    def _weight_bias(self):
+        return self._packed_params.unpack()
+
+    def weight(self):  # first element of the unpacked (weight, bias) pair
+        return self._weight_bias()[0]
+
+    def bias(self):  # second element of the unpacked (weight, bias) pair
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != "zeros":  # pad explicitly; the packed op was given zero padding
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv2d(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+class Conv3d(_ConvNd):
+    r"""Applies a 3D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv3d`.
+
+    .. note::
+        Only `zeros` is supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (float):       scalar for the output scale
+        zero_point (int):    scalar for the output zero point
+
+    See :class:`~torch.nn.Conv3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.Conv3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2))
+        >>> input = torch.randn(20, 16, 56, 56, 56)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.Conv3d]] = nn.Conv3d
+    _NNIQAT_CONV_BN_MODULE: ClassVar[Optional[type[nn.Module]]] = nniqat.ConvBn3d
+    _NNI_CONV_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = nni.ConvReLU3d
+    _NNI_CONV_ADD_MODULE: ClassVar[Optional[type[nn.Module]]] = None
+    _NNI_CONV_ADD_RELU_MODULE: ClassVar[Optional[type[nn.Module]]] = None
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        assert padding_mode != "reflect", "Conv3d does not support reflection padding"
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,  # transposed
+            _triple(0),  # output_padding
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv3d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == "zeros":
+            self._packed_params = torch.ops.quantized.conv3d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups
+            )
+        else:  # other modes are padded with F.pad in forward(), so zero padding is packed
+            self._packed_params = torch.ops.quantized.conv3d_prepack(
+                w, b, self.stride, _triple(0), self.dilation, self.groups
+            )
+
+    def _weight_bias(self):
+        return self._packed_params.unpack()
+
+    def weight(self):  # first element of the unpacked (weight, bias) pair
+        return self._weight_bias()[0]
+
+    def bias(self):  # second element of the unpacked (weight, bias) pair
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != "zeros":  # NOTE(review): __init__ asserts against "reflect" - confirm reachability
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(
+                input, _reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return ops.quantized.conv3d(
+            input, self._packed_params, self.scale, self.zero_point
+        )
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(
+            cls, mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+        )
+
+
+# === Transposed Convolutions ===
+
+
+class _ConvTransposeNd(_ConvNd):  # shared base of the quantized transposed-conv modules
+    _FLOAT_MODULE: ClassVar[type[nn.modules.conv._ConvNd]]
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        bias,
+        padding_mode,
+        device=None,
+        dtype=None,
+    ):
+        if padding_mode != "zeros":  # stricter than _ConvNd._init: no other mode is accepted
+            raise ValueError(
+                f'Only "zeros" padding mode is supported for {self.__class__.__name__}'
+            )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _input_padding(
+        self, kernel_size: list[int], dilation: list[int], padding: list[int]
+    ) -> list[int]:
+        res = torch.jit.annotate(list[int], [])
+        for kdx in range(len(kernel_size)):
+            pad = dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx]  # d * (k - 1) - p
+            res.append(pad)
+        return res  # one entry per kernel dimension, in the same order
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # type: ignore[override]
+        r"""Creates a quantized module from a float module or qparams_dict.
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        # derived classes override cls._FLOAT_MODULE attribute
+        msg = (
+            " nnq."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__  # type: ignore[attr-defined]
+        )
+        assert type(mod) is cls._FLOAT_MODULE, msg
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined."
+        weight_post_process = mod.qconfig.weight()  # type: ignore[operator, union-attr]
+        weight_post_process(mod.weight)
+        assert weight_post_process.dtype == torch.qint8, (
+            "Weight observer must have a dtype of qint8"
+        )
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        # the __init__ call used is the one from derived classes and not the one from _ConvTransposeNd
+        qconv = cls(  # positional order here: ..., padding, output_padding, groups, bias, dilation
+            mod.in_channels,
+            mod.out_channels,
+            mod.kernel_size,  # type: ignore[call-arg]
+            mod.stride,
+            mod.padding,
+            mod.output_padding,
+            mod.groups,
+            mod.bias is not None,
+            mod.dilation,
+            mod.padding_mode,
+        )
+        qconv.set_weight_bias(qweight, mod.bias)
+        if (
+            not hasattr(mod, "activation_post_process")
+            or mod.activation_post_process.dtype == torch.float
+        ):
+            return qconv  # dynamic quantization doesn't need scale/zero_point
+        else:
+            act_scale, act_zp = mod.activation_post_process.calculate_qparams()  # type: ignore[operator, union-attr]
+            qconv.scale = float(act_scale)
+            qconv.zero_point = int(act_zp)
+            return qconv
+
+    @staticmethod  # ``cls`` is an explicit first argument; presumably supplied by subclasses
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):  # type: ignore[override]
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+        Args:
+            ref_qconvt (Module): a reference quantized  module, either produced by torch.ao.quantization
+                                 utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qconv = cls(
+            ref_qconvt.in_channels,
+            ref_qconvt.out_channels,
+            ref_qconvt.kernel_size,  # type: ignore[arg-type]
+            ref_qconvt.stride,  # type: ignore[arg-type]
+            ref_qconvt.padding,  # type: ignore[arg-type]
+            ref_qconvt.output_padding,  # type: ignore[arg-type]
+            ref_qconvt.groups,
+            ref_qconvt.bias is not None,  # type: ignore[arg-type]
+            ref_qconvt.dilation,  # type: ignore[arg-type]
+            ref_qconvt.padding_mode,
+            device=ref_qconvt.weight.device,
+            dtype=ref_qconvt.weight.dtype,
+        )
+        qweight = ref_qconvt.get_quantized_weight()
+        qconv.set_weight_bias(qweight, ref_qconvt.bias)  # replaces the placeholder params
+        qconv.scale = float(output_scale)
+        qconv.zero_point = int(output_zero_point)
+        return qconv
+
+
+class ConvTranspose1d(_ConvTransposeNd):
+    r"""Applies a 1D transposed convolution operator over a quantized input
+    signal composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose1d`.
+
+    .. note:: Currently only the QNNPACK engine is implemented.
+        Please, set the `torch.backends.quantized.engine = 'qnnpack'`
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv1d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> torch.backends.quantized.engine = 'qnnpack'
+        >>> from torch.ao.nn import quantized as nnq
+        >>> # With kernel size 3 and stride 2
+        >>> m = nnq.ConvTranspose1d(16, 33, 3, stride=2)
+        >>> # larger kernel, with stride and padding
+        >>> m = nnq.ConvTranspose1d(16, 33, 5, stride=2, padding=4)
+        >>> input = torch.randn(20, 16, 50)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv1d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose1d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter)
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12])
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose1d]] = nn.ConvTranspose1d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # Normalize the spatial arguments with ``_single`` while forwarding them.
+        # The base-class call is positional and takes ``dilation`` *before*
+        # ``output_padding`` (the reverse of this signature's order).  The bare
+        # ``True`` between them is presumably the base class's ``transposed``
+        # flag -- NOTE(review): the base signature is outside this chunk,
+        # confirm against ``_ConvTransposeNd.__init__``.
+        super().__init__(
+            in_channels,
+            out_channels,
+            _single(kernel_size),
+            _single(stride),
+            _single(padding),
+            _single(dilation),
+            True,
+            _single(output_padding),
+            groups,
+            bias,
+            padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def _get_name(self):
+        return "QuantizedConvTranspose1d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        """Prepack ``w`` and the optional bias ``b`` into ``self._packed_params``.
+
+        The module's stride, padding, output_padding, dilation and groups are
+        passed to the prepack op along with the two tensors.
+        """
+        self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(
+            w, b, self.stride, self.padding,
+            self.output_padding, self.dilation, self.groups,
+        )
+
+    def _weight_bias(self):
+        unpacked_w, unpacked_b = torch.ops.quantized.conv_transpose1d_unpack(self._packed_params)
+        return unpacked_w, unpacked_b
+
+    def weight(self):
+        # Weight half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[0]
+
+    def bias(self):
+        # Bias half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # The rank is read as len(shape) instead of ndim because of a JIT issue:
+        # https://github.com/pytorch/pytorch/issues/23890
+        rank = len(input.shape)
+        if rank != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        packed = self._packed_params
+        return torch.ops.quantized.conv_transpose1d(input, packed, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):  # type: ignore[override]
+        # The shared builder is a staticmethod, so ``cls`` is forwarded by hand.
+        build = _ConvTransposeNd.from_reference
+        return build(cls, ref_qconvt, output_scale, output_zero_point)
+
+
+class ConvTranspose2d(_ConvTransposeNd):
+    r"""Applies a 2D transposed convolution operator over a quantized input
+    image composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose2d`.
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv2d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # QNNPACK or FBGEMM as backend
+        >>> torch.backends.quantized.engine = 'qnnpack'
+        >>> import torch.ao.nn.quantized as nnq
+        >>> # With square kernels and equal stride
+        >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter)
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12])
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose2d]] = nn.ConvTranspose2d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # Normalize the spatial arguments with ``_pair`` while forwarding them.
+        # The base-class call is positional and takes ``dilation`` *before*
+        # ``output_padding`` (the reverse of this signature's order).  The bare
+        # ``True`` between them is presumably the base class's ``transposed``
+        # flag -- NOTE(review): the base signature is outside this chunk,
+        # confirm against ``_ConvTransposeNd.__init__``.
+        super().__init__(
+            in_channels,
+            out_channels,
+            _pair(kernel_size),
+            _pair(stride),
+            _pair(padding),
+            _pair(dilation),
+            True,
+            _pair(output_padding),
+            groups,
+            bias,
+            padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def _get_name(self):
+        return "QuantizedConvTranspose2d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        """Prepack ``w`` and the optional bias ``b`` into ``self._packed_params``.
+
+        The module's stride, padding, output_padding, dilation and groups are
+        passed to the prepack op along with the two tensors.
+        """
+        self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(
+            w, b, self.stride, self.padding,
+            self.output_padding, self.dilation, self.groups,
+        )
+
+    def _weight_bias(self):
+        unpacked_w, unpacked_b = torch.ops.quantized.conv2d_unpack(self._packed_params)
+        return unpacked_w, unpacked_b
+
+    def weight(self):
+        # Weight half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[0]
+
+    def bias(self):
+        # Bias half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # The rank is read as len(shape) instead of ndim because of a JIT issue:
+        # https://github.com/pytorch/pytorch/issues/23890
+        rank = len(input.shape)
+        if rank != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        packed = self._packed_params
+        return ops.quantized.conv_transpose2d(input, packed, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):  # type: ignore[override]
+        # The shared builder is a staticmethod, so ``cls`` is forwarded by hand.
+        build = _ConvTransposeNd.from_reference
+        return build(cls, ref_qconvt, output_scale, output_zero_point)
+
+
+class ConvTranspose3d(_ConvTransposeNd):
+    r"""Applies a 3D transposed convolution operator over a quantized input
+    volume composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose3d`.
+
+    .. note:: Currently only the FBGEMM engine is implemented.
+        Please, set the `torch.backends.quantized.engine = 'fbgemm'`
+
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv3d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> torch.backends.quantized.engine = 'fbgemm'
+        >>> from torch.ao.nn import quantized as nnq
+        >>> # With cubic kernels and equal stride
+        >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
+        >>> # non-cubic kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
+        >>> input = torch.randn(20, 16, 50, 100, 100)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12, 12, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter)
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12, 12])
+    """
+
+    _FLOAT_MODULE: ClassVar[type[nn.ConvTranspose3d]] = nn.ConvTranspose3d
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+    ):
+        # Normalize the spatial arguments with ``_triple`` while forwarding them.
+        # The base-class call is positional and takes ``dilation`` *before*
+        # ``output_padding`` (the reverse of this signature's order).  The bare
+        # ``True`` between them is presumably the base class's ``transposed``
+        # flag -- NOTE(review): the base signature is outside this chunk,
+        # confirm against ``_ConvTransposeNd.__init__``.
+        super().__init__(
+            in_channels,
+            out_channels,
+            _triple(kernel_size),
+            _triple(stride),
+            _triple(padding),
+            _triple(dilation),
+            True,
+            _triple(output_padding),
+            groups,
+            bias,
+            padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def _get_name(self):
+        return "QuantizedConvTranspose3d"
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        """Prepack ``w`` and the optional bias ``b`` into ``self._packed_params``.
+
+        The module's stride, padding, output_padding, dilation and groups are
+        passed to the prepack op along with the two tensors.
+        """
+        self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(
+            w, b, self.stride, self.padding,
+            self.output_padding, self.dilation, self.groups,
+        )
+
+    def _weight_bias(self):
+        unpacked_w, unpacked_b = torch.ops.quantized.conv3d_unpack(self._packed_params)
+        return unpacked_w, unpacked_b
+
+    def weight(self):
+        # Weight half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[0]
+
+    def bias(self):
+        # Bias half of the unpacked ``(weight, bias)`` pair.
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # The rank is read as len(shape) instead of ndim because of a JIT issue:
+        # https://github.com/pytorch/pytorch/issues/23890
+        rank = len(input.shape)
+        if rank != 5:
+            raise ValueError("Input shape must be `(N, C, T, H, W)`!")
+        packed = self._packed_params
+        return ops.quantized.conv_transpose3d(input, packed, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):  # type: ignore[override]
+        # The shared builder is a staticmethod, so ``cls`` is forwarded by hand.
+        build = _ConvTransposeNd.from_reference
+        return build(cls, ref_qconvt, output_scale, output_zero_point)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..3744ca30d5a49ba92cbb86690f2683af02d594fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/dropout.py
@@ -0,0 +1,30 @@
+# mypy: allow-untyped-defs
+import torch
+
+
+__all__ = ["Dropout"]
+
+
+class Dropout(torch.nn.Dropout):
+    r"""Quantized stand-in for :class:`~torch.nn.Dropout`: ``forward`` is the identity,
+    so models whose fp32 form used dropout work on quantized tensors in train
+    and eval mode.
+
+    Args:
+        p: probability of an element to be zeroed
+        inplace: can optionally do the operation in-place. Default: ``False``
+    """
+
+    def forward(self, input):  # identity: no elements are zeroed
+        return input
+
+    def _get_name(self):
+        return "QuantizedDropout"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):  # flag is unused here
+        return cls(p=mod.p, inplace=mod.inplace)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):  # scale/zero_point are unused here
+        return cls(p=mod.p, inplace=mod.inplace)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e843653ed27a49fa62d0f7e3408a7ac04f48fdf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py
@@ -0,0 +1,413 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.nn as nn
+from torch import Tensor  # noqa: F401
+from torch._jit_internal import List, Optional  # noqa: F401
+
+from .utils import _hide_packed_params_repr, _quantize_weight
+
+
+__all__ = ["EmbeddingPackedParams", "Embedding", "EmbeddingBag"]
+
+
+class EmbeddingPackedParams(torch.nn.Module):
+    _version = 1
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8):
+        # Holder for the prepacked weight of a quantized embedding.  Only quint8
+        # and quint4x2 are accepted; construction packs an empty placeholder of
+        # shape (num_embeddings, embedding_dim), per-channel quantized on axis 0.
+        super().__init__()
+        self.dtype = dtype
+        if self.dtype not in [torch.quint8, torch.quint4x2]:
+            raise NotImplementedError(
+                f"Unsupported dtype on quantized embedding! Supports quint8 and quint4x2. Got dtype: {dtype}"
+            )
+        placeholder = torch._empty_per_channel_affine_quantized(
+            [num_embeddings, embedding_dim],
+            scales=torch.ones(num_embeddings, dtype=torch.float),
+            zero_points=torch.zeros(num_embeddings, dtype=torch.float),
+            axis=0,
+            dtype=self.dtype,
+        )
+        self.set_weight(placeholder)
+
+    @torch.jit.export
+    def set_weight(self, weight: torch.Tensor) -> None:
+        # Prepack ``weight`` and keep the result as ``self._packed_weight``.
+        if self.dtype not in [torch.quint8, torch.quint4x2]:
+            raise NotImplementedError(
+                "Unsupported dtype for quantized embedding prepack! Supports quint8 and quint4x2."
+            )
+        self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight)
+
+    @torch.jit.export
+    def _weight(self):
+        # Unpack ``self._packed_weight`` back into a weight tensor.
+        if self.dtype not in [torch.quint8, torch.quint4x2]:
+            raise NotImplementedError(
+                "Unsupported dtype for quantized embedding unpack! Supports quint8 and quint4x2."
+            )
+        return torch.ops.quantized.embedding_bag_unpack(self._packed_weight)
+
+    def forward(self, x):
+        return x
+
+    # Version 1
+    #   self
+    #   |--- _packed_weight : Tensor representing weight of EmbeddingPackedParamsBase
+    #   |--- dtype : torch.dtype
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[f"{prefix}dtype"] = self.dtype
+        destination[f"{prefix}_packed_weight"] = self._weight()  # stored unpacked
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        # Consume this module's own two entries before delegating, so the base
+        # class never sees them.  ``dtype`` is restored first because
+        # ``set_weight`` checks it before prepacking.
+        self.dtype = state_dict.pop(prefix + "dtype")
+        self.set_weight(state_dict.pop(prefix + "_packed_weight"))
+
+        # The base-class call receives a literal ``False`` in place of ``strict``.
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def __repr__(self):
+        return repr(self._weight())
+
+
+class Embedding(torch.nn.Module):
+    r"""
+    A quantized Embedding module with quantized packed weights as inputs.
+    We adopt the same interface as `torch.nn.Embedding`, please see
+    https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html for documentation.
+
+    Unless ``_weight`` is given, the packed weight is an uninitialized placeholder
+    at module creation time and is meant to be overwritten later (``set_weight``).
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`.
+
+    Examples::
+        >>> m = nn.quantized.Embedding(num_embeddings=10, embedding_dim=12)
+        >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8])
+        >>> output = m(indices)
+        >>> print(output.size())
+        torch.Size([9, 12])
+
+    """
+
+    _version = 1
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: Optional[int] = None,
+        max_norm: Optional[float] = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        _weight: Optional[Tensor] = None,
+        dtype=torch.quint8,
+    ) -> None:
+        """Create the module and pack its weight.
+
+        ``padding_idx``, ``max_norm``, ``norm_type``, ``scale_grad_by_freq`` and
+        ``sparse`` mirror ``torch.nn.Embedding``'s signature but are not used by
+        this constructor.  ``_weight``, when given, is packed as the module's
+        weight and must have shape ``(num_embeddings, embedding_dim)``.  ``dtype``
+        must be ``torch.quint8`` or ``torch.quint4x2`` (NotImplementedError otherwise).
+        """
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.dtype = dtype
+
+        if _weight is not None:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+
+        # EmbeddingPackedParams validates ``dtype`` and packs an empty placeholder
+        # of that same dtype.  A second placeholder used to be built here, always
+        # as quint8, which disagreed with the 4-bit op forward() selects for
+        # quint4x2; only an explicitly supplied weight is packed on top now.
+        self._packed_params = EmbeddingPackedParams(num_embeddings, embedding_dim, dtype)
+        if _weight is not None:
+            self._packed_params.set_weight(_weight)
+
+    def forward(self, indices: Tensor) -> Tensor:
+        if self.dtype == torch.quint4x2:
+            return torch.ops.quantized.embedding_4bit(
+                self._packed_params._packed_weight, indices
+            )
+        else:
+            return torch.ops.quantized.embedding_byte(
+                self._packed_params._packed_weight, indices
+            )
+
+    def _get_name(self):
+        return "QuantizedEmbedding"
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, EmbeddingPackedParams)
+
+    def extra_repr(self):
+        extra_repr_str = (
+            f"num_embeddings={self.num_embeddings}, embedding_dim={self.embedding_dim}, "
+            f"dtype={self._packed_params.dtype}, qscheme={self.weight().qscheme()}"
+        )
+
+        return extra_repr_str
+
+    def set_weight(self, w: torch.Tensor) -> None:
+        self._packed_params.set_weight(w)
+
+    def weight(self):
+        return self._packed_params._weight()
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a quantized embedding module from a float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by user
+            use_precomputed_fake_quant (bool): accepted but not used by this method
+
+        The weight observer comes from ``mod.weight_fake_quant`` when present,
+        otherwise from ``mod.qconfig`` (defaulting to
+        ``float_qparams_weight_only_qconfig``).  Its dtype is passed on to the
+        new module so that ``forward`` picks the op matching the packed weight.
+        """
+        if hasattr(mod, "weight_fake_quant"):
+            assert type(mod) is torch.ao.nn.qat.Embedding, (
+                f"nnq.{cls.__name__}.from_float with fake quant only works for "
+                f"{torch.ao.nn.qat.Embedding.__name__}"
+            )
+            weight_observer = mod.weight_fake_quant
+        else:
+            assert type(mod) is nn.Embedding, (
+                f"nnq.{cls.__name__}.from_float only works for {nn.Embedding.__name__}"
+            )
+            assert hasattr(mod, "qconfig"), (
+                "Embedding input float module must have qconfig defined"
+            )
+            from torch.ao.quantization import float_qparams_weight_only_qconfig
+
+            if mod.qconfig is not None and mod.qconfig.weight is not None:  # type: ignore[union-attr]
+                weight_observer = mod.qconfig.weight()  # type: ignore[union-attr, operator]
+            else:
+                weight_observer = float_qparams_weight_only_qconfig.weight()
+
+        dtype = weight_observer.dtype
+        is_float_qparams_qconfig = (
+            weight_observer.qscheme == torch.per_channel_affine_float_qparams
+        )
+        assert is_float_qparams_qconfig, (
+            "Embedding quantization is only supported with float_qparams_weight_only_qconfig."
+        )
+
+        assert dtype == torch.quint8 or dtype == torch.quint4x2, (
+            f"The only supported dtype for nnq.Embedding is torch.quint8 and torch.quint4x2, got {dtype}"
+        )
+
+        # Run the observer to calculate qparams.
+        weight_observer(mod.weight)
+        qweight = _quantize_weight(mod.weight.float(), weight_observer)
+
+        # Create quantized Embedding module (same dtype as the observer) and pass in the quantized weight
+        qembedding = Embedding(mod.num_embeddings, mod.embedding_dim, dtype=dtype)
+        qembedding.set_weight(qweight)
+        return qembedding
+
+    @classmethod
+    def from_reference(cls, ref_embedding):
+        """Build a quantized module from a reference module's settings and quantized weight."""
+        return cls(
+            ref_embedding.num_embeddings,
+            ref_embedding.embedding_dim,
+            ref_embedding.padding_idx,
+            ref_embedding.max_norm,
+            ref_embedding.norm_type,
+            ref_embedding.scale_grad_by_freq,
+            ref_embedding.sparse,
+            ref_embedding.get_quantized_weight(),
+            ref_embedding.weight_dtype,
+        )
+
+
+class EmbeddingBag(Embedding):
+    r"""
+    A quantized EmbeddingBag module with quantized packed weights as inputs.
+    The interface follows `torch.nn.EmbeddingBag`; for its documentation see
+    https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html.
+
+    As with :class:`~torch.nn.EmbeddingBag`, the weight present right after
+    construction is only a placeholder that is expected to be overwritten later
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`.
+
+    Examples::
+        >>> m = nn.quantized.EmbeddingBag(num_embeddings=10, embedding_dim=12, include_last_offset=True, mode='sum')
+        >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8, 6, 6, 9, 1, 6, 8, 8, 3, 2, 3, 6, 3, 6, 5, 7, 0, 8, 4, 6, 5, 8, 2, 3])
+        >>> offsets = torch.tensor([0, 19, 20, 28, 28, 32])
+        >>> output = m(indices, offsets)
+        >>> print(output.size())
+        torch.Size([5, 12])
+
+    """
+
+    _version = 1
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        max_norm: Optional[float] = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        mode: str = "sum",
+        sparse: bool = False,
+        _weight: Optional[Tensor] = None,
+        include_last_offset: bool = False,
+        dtype=torch.quint8,
+    ) -> None:
+        # max_norm, norm_type, scale_grad_by_freq and sparse are accepted but unused.
+        super().__init__(num_embeddings, embedding_dim, _weight=_weight, dtype=dtype)
+        self.dtype = dtype
+        self.mode = mode
+        self.include_last_offset = include_last_offset
+        self.pruned_weights = False
+
+    def forward(
+        self,
+        indices: Tensor,
+        offsets: Optional[Tensor] = None,
+        per_sample_weights: Optional[Tensor] = None,
+        compressed_indices_mapping: Optional[Tensor] = None,
+    ) -> Tensor:
+        packed_weight = self._packed_params._packed_weight
+        if self.dtype == torch.quint4x2:
+            return torch.ops.quantized.embedding_bag_4bit(
+                packed_weight,
+                indices,
+                offsets,
+                False,  # NOTE(review): presumably the op's scale_grad_by_freq slot -- confirm
+                0,  # NOTE(review): presumably the op's mode slot; self.mode is not forwarded
+                self.pruned_weights,
+                per_sample_weights,
+                compressed_indices_mapping,
+                self.include_last_offset,
+            )
+        return torch.ops.quantized.embedding_bag_byte(
+            packed_weight,
+            indices,
+            offsets,
+            False,  # same positional literals as the 4-bit call above
+            0,
+            self.pruned_weights,
+            per_sample_weights,
+            compressed_indices_mapping,
+            self.include_last_offset,
+        )
+
+    def _get_name(self):
+        return "QuantizedEmbeddingBag"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a quantized embedding_bag module from a float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by user
+            use_precomputed_fake_quant (bool): accepted but not used by this method
+
+        The observer is ``mod.weight_fake_quant`` when present, otherwise it is
+        built from ``mod.qconfig`` (default: ``float_qparams_weight_only_qconfig``).
+        Its dtype selects the dtype of the returned module.
+        """
+        if hasattr(mod, "weight_fake_quant"):
+            observer = mod.weight_fake_quant
+        else:
+            assert type(mod) is nn.EmbeddingBag, (
+                f"nnq.{cls.__name__}.from_float only works for {nn.EmbeddingBag.__name__}"
+            )
+            assert hasattr(mod, "qconfig"), (
+                "EmbeddingBag input float module must have qconfig defined"
+            )
+            from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig
+
+            if mod.qconfig is not None and mod.qconfig.weight is not None:  # type: ignore[union-attr]
+                observer = mod.qconfig.weight()  # type: ignore[union-attr, operator]
+            else:
+                observer = float_qparams_weight_only_qconfig.weight()
+
+        dtype = observer.dtype
+        uses_float_qparams = observer.qscheme == torch.per_channel_affine_float_qparams
+        assert uses_float_qparams, (
+            "EmbeddingBag quantization is only supported with float_qparams_weight_only_qconfig."
+        )
+
+        assert dtype in (torch.quint8, torch.quint4x2), (
+            f"The only supported dtype for nnq.EmbeddingBag is torch.quint8 and torch.quint4x2, got {dtype}"
+        )
+
+        # Run the observer to calculate qparams.
+        observer(mod.weight)
+        quantized_weight = _quantize_weight(mod.weight.float(), observer)
+
+        # Create quantized EmbeddingBag module and pass in the quantized weight
+        qbag = EmbeddingBag(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            max_norm=mod.max_norm,
+            norm_type=mod.norm_type,
+            scale_grad_by_freq=mod.scale_grad_by_freq,
+            mode=mod.mode,
+            sparse=mod.sparse,
+            include_last_offset=mod.include_last_offset,
+            dtype=dtype,
+        )
+        qbag.set_weight(quantized_weight)
+        return qbag
+
+    @classmethod
+    def from_reference(cls, ref_embedding_bag):
+        """Build a quantized module from a reference module's settings and quantized weight."""
+        return cls(
+            ref_embedding_bag.num_embeddings,
+            ref_embedding_bag.embedding_dim,
+            ref_embedding_bag.max_norm,
+            ref_embedding_bag.norm_type,
+            ref_embedding_bag.scale_grad_by_freq,
+            ref_embedding_bag.mode,
+            ref_embedding_bag.sparse,
+            ref_embedding_bag.get_quantized_weight(),
+            ref_embedding_bag.include_last_offset,
+            ref_embedding_bag.weight_dtype,
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..acb578d0cc7989ecedd92fcb30664d50b4c18f87
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/functional_modules.py
@@ -0,0 +1,298 @@
+# mypy: allow-untyped-defs
+
+import torch
+from torch import Tensor
+from torch._ops import ops
+
+
+__all__ = ["FloatFunctional", "FXFloatFunctional", "QFunctional"]
+
+
+class FloatFunctional(torch.nn.Module):
+    r"""State collector class for float operations.
+
+    The instance of this class can be used instead of the ``torch.`` prefix for
+    some operations. See example usage below.
+
+    .. note::
+
+        This class does not provide a ``forward`` hook. Instead, you must use
+        one of the underlying functions (e.g. ``add``).
+
+    Examples::
+
+        >>> f_add = FloatFunctional()
+        >>> a = torch.tensor(3.0)
+        >>> b = torch.tensor(4.0)
+        >>> f_add.add(a, b)  # Equivalent to ``torch.add(a, b)``
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+        - matmul
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Identity by default, so the observed ops below return their result
+        # unchanged until something replaces this attribute.
+        self.activation_post_process = torch.nn.Identity()
+
+    def forward(self, x):
+        r"""Always raises; call one of the named operations instead."""
+        raise RuntimeError(
+            "FloatFunctional is not intended to use the "
+            + "'forward'. Please use the underlying operation"
+        )
+
+    # Operations. Those that call ``activation_post_process`` are the ones whose
+    # result gets observed; the ``*_scalar`` variants return the raw result.
+
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.add(Tensor, Tensor)``"""
+        r = torch.add(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.add(Tensor, float)``"""
+        r = torch.add(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``"""
+        r = torch.mul(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.mul(Tensor, float)``"""
+        r = torch.mul(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    def cat(self, x: list[Tensor], dim: int = 0) -> Tensor:
+        r"""Operation equivalent to ``torch.cat``"""
+        r = torch.cat(x, dim=dim)
+        r = self.activation_post_process(r)
+        return r
+
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``relu(torch.add(x,y))``"""
+        r = torch.add(x, y)
+        r = torch.nn.functional.relu(r)
+        r = self.activation_post_process(r)
+        return r
+
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``"""
+        r = torch.matmul(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+
+class FXFloatFunctional(torch.nn.Module):
+    r"""module to replace FloatFunctional module before FX graph mode quantization,
+    since activation_post_process will be inserted in top level module directly
+
+    Methods here return the plain ``torch`` result; none applies an observer.
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+        - matmul
+    """
+
+    def forward(self, x):
+        r"""Always raises ``RuntimeError``.
+
+        Use one of the named operations (e.g. ``add``) instead.
+        """
+        raise RuntimeError(
+            "FloatFunctional is not intended to use the "
+            + "'forward'. Please use the underlying operation"
+        )
+
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.add(Tensor, Tensor)``"""
+        r = torch.add(x, y)
+        return r
+
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.add(Tensor, float)``"""
+        r = torch.add(x, y)
+        return r
+
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``"""
+        r = torch.mul(x, y)
+        return r
+
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.mul(Tensor, float)``"""
+        r = torch.mul(x, y)
+        return r
+
+    def cat(self, x: list[Tensor], dim: int = 0) -> Tensor:
+        r"""Operation equivalent to ``torch.cat``"""
+        r = torch.cat(x, dim=dim)
+        return r
+
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``relu(torch.add(x,y))``"""
+        r = torch.add(x, y)
+        r = torch.nn.functional.relu(r)
+        return r
+
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``"""
+        r = torch.matmul(x, y)
+        return r
+
+
+class QFunctional(torch.nn.Module):
+    r"""Wrapper class for quantized operations.
+
+    The instance of this class can be used instead of the
+    ``torch.ops.quantized`` prefix. See example usage below.
+
+    .. note::
+
+        This class does not provide a ``forward`` hook. Instead, you must use
+        one of the underlying functions (e.g. ``add``).
+
+    Examples::
+
+        >>> q_add = QFunctional()
+        >>> # xdoctest: +SKIP
+        >>> a = torch.quantize_per_tensor(torch.tensor(3.0), 1.0, 0, torch.qint32)
+        >>> b = torch.quantize_per_tensor(torch.tensor(4.0), 1.0, 0, torch.qint32)
+        >>> q_add.add(a, b)  # Equivalent to ``torch.ops.quantized.add(a, b, 1.0, 0)``
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+        - matmul
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Output quantization parameters, kept as plain Python attributes.
+        self.scale = 1.0
+        self.zero_point = 0
+        self.activation_post_process = torch.nn.Identity()
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        r"""Store ``scale`` and ``zero_point`` as tensors next to the base state."""
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "scale"] = torch.tensor(self.scale)
+        destination[prefix + "zero_point"] = torch.tensor(self.zero_point)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        r"""Pop ``scale``/``zero_point`` from ``state_dict``, then defer to the base."""
+        self.scale = float(state_dict.pop(prefix + "scale"))
+        self.zero_point = int(state_dict.pop(prefix + "zero_point"))
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def _get_name(self):
+        return "QFunctional"
+
+    def extra_repr(self):
+        r"""Return the ``scale``/``zero_point`` summary used in the module repr."""
+        return f"scale={self.scale}, zero_point={self.zero_point}"
+
+    def forward(self, x):
+        r"""Always raises; call one of the named operations instead."""
+        raise RuntimeError(
+            "Functional is not intended to use the "
+            + "'forward'. Please use the underlying operation"
+        )
+
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.add``"""
+        r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``"""
+        r = ops.quantized.add_scalar(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``"""
+        r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``"""
+        r = ops.quantized.mul_scalar(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    def cat(self, x: list[Tensor], dim: int = 0) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.cat``"""
+        r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim)
+        r = self.activation_post_process(r)
+        return r
+
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.add_relu``"""
+        r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r"""Operation equivalent to ``torch.ops.quantized.matmul(Tensor, Tensor)``"""
+        r = ops.quantized.matmul(x, y, scale=self.scale, zero_point=self.zero_point)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build a ``QFunctional`` from an observed ``FloatFunctional``."""
+        assert type(mod) is FloatFunctional, (
+            "QFunctional.from_float expects an instance of FloatFunctional"
+        )
+        scale, zero_point = mod.activation_post_process.calculate_qparams()  # type: ignore[operator]
+        new_mod = QFunctional()
+        new_mod.scale = float(scale)
+        new_mod.zero_point = int(zero_point)
+        return new_mod
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..84fa07b4a02207a34c16747d52d7283ad2ecfc8f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/linear.py
@@ -0,0 +1,361 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+from collections.abc import Iterable
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.nn as nn
+from torch.nn.utils.fusion import fuse_linear_bn_weights
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from .utils import _hide_packed_params_repr, _quantize_weight, WeightedQuantizedModule
+
+
+__all__ = ["LinearPackedParams", "Linear"]
+
+
+class LinearPackedParams(torch.nn.Module):
+    r"""Module that stores the prepacked weight and bias of a quantized linear op."""
+    _version = 3
+
+    def __init__(self, dtype=torch.qint8):
+        super().__init__()
+        self.dtype = dtype
+        if self.dtype == torch.qint8:
+            wq = torch._empty_affine_quantized(
+                [1, 1], scale=1.0, zero_point=0, dtype=torch.qint8
+            )
+        elif self.dtype == torch.float16:
+            wq = torch.zeros([1, 1], dtype=torch.float)
+        else:
+            # Reject unsupported dtypes explicitly rather than leaving ``wq`` unbound.
+            raise RuntimeError("Unsupported dtype on dynamic quantized linear!")
+        self.set_weight_bias(wq, None)
+
+    @torch.jit.export
+    def set_weight_bias(self, weight: torch.Tensor, bias: torch.Tensor | None) -> None:
+        if self.dtype == torch.qint8:
+            self._packed_params = torch.ops.quantized.linear_prepack(weight, bias)
+        elif self.dtype == torch.float16:
+            self._packed_params = torch.ops.quantized.linear_prepack_fp16(weight, bias)
+        else:
+            raise RuntimeError("Unsupported dtype on dynamic quantized linear!")
+
+    @torch.jit.export
+    def _weight_bias(self):
+        if self.dtype == torch.qint8:
+            return torch.ops.quantized.linear_unpack(self._packed_params)
+        elif self.dtype == torch.float16:
+            return torch.ops.quantized.linear_unpack_fp16(self._packed_params)
+        else:
+            raise RuntimeError("Unsupported dtype on dynamic quantized linear!")
+
+    def forward(self, x):
+        return x
+
+    # Version 1
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- dtype : torch.dtype
+    #
+    # Version 3
+    #   self
+    #   |--- _packed_params : (Tensor, Tensor) representing (weight, bias)
+    #                         of LinearPackedParams
+    #   |--- dtype : torch.dtype
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "dtype"] = self.dtype
+        destination[prefix + "_packed_params"] = self._weight_bias()
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            self.dtype = torch.qint8
+        else:
+            self.dtype = state_dict.pop(prefix + "dtype")
+
+        if version is None or version < 3:
+            weight = state_dict.pop(prefix + "weight")
+            bias = state_dict.pop(prefix + "bias")
+            self.set_weight_bias(weight, bias)
+
+        if version == 3:
+            weight, bias = state_dict.pop(prefix + "_packed_params")
+            self.set_weight_bias(weight, bias)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def __repr__(self):
+        return self._weight_bias().__repr__()
+
+
+class Linear(WeightedQuantizedModule):
+    r"""
+    A quantized linear module with quantized tensor as inputs and outputs.
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation.
+
+    Similar to :class:`~torch.nn.Linear`, attributes will be randomly
+    initialized at module creation time and will be overwritten later
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{out\_features}, \text{in\_features})`.
+        bias (Tensor): the non-learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized to zero.
+        scale: `scale` parameter of output Quantized Tensor, type: double
+        zero_point: `zero_point` parameter for output Quantized Tensor, type: long
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> m = nn.quantized.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> # xdoctest: +SKIP
+        >>> input = torch.quantize_per_tensor(input, 1.0, 0, torch.quint8)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    _version = 3
+    _FLOAT_MODULE = (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear)
+
+    def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8):
+        super().__init__()
+        # We don't muck around with buffers or attributes or anything here
+        # to keep the module simple. *everything* is simply a Python attribute.
+        # Serialization logic is explicitly handled in the below serialization and
+        # deserialization modules
+        self.in_features = in_features
+        self.out_features = out_features
+        bias = torch.zeros(out_features, dtype=torch.float) if bias_ else None
+
+        if dtype == torch.qint8:
+            qweight = torch._empty_affine_quantized(
+                [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8
+            )
+        elif dtype == torch.float16:
+            qweight = torch.zeros([out_features, in_features], dtype=torch.float)
+        else:
+            raise RuntimeError("Unsupported dtype specified for quantized Linear!")
+
+        self._packed_params = LinearPackedParams(dtype)
+        self._packed_params.set_weight_bias(qweight, bias)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    def _get_name(self):
+        return "QuantizedLinear"
+
+    def extra_repr(self):
+        return (
+            f"in_features={self.in_features}, out_features={self.out_features}, scale={self.scale}, "
+            f"zero_point={self.zero_point}, qscheme={self.weight().qscheme()}"
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Apply ``torch.ops.quantized.linear`` with the packed params and output qparams."""
+        return torch.ops.quantized.linear(
+            x, self._packed_params._packed_params, self.scale, self.zero_point
+        )
+
+    # ===== Serialization methods =====
+    # The special consideration here is that we have to unpack the weights into their
+    # regular QTensor form for serialization. Packed weights should not live
+    # outside the process in which they were created, rather they should be derived
+    # from the QTensor weight.
+    #
+    # Version 1
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- weight : Tensor
+    #        |--- bias : Tensor
+    #
+    # Version 3
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- _packed_params : (Tensor, Tensor) representing weight, bias
+    #                              of LinearPackedParams C++ struct
+    #
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        r"""Store ``scale`` and ``zero_point`` as tensors next to the base state."""
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "scale"] = torch.tensor(self.scale)
+        destination[prefix + "zero_point"] = torch.tensor(self.zero_point)
+
+    # ===== Deserialization methods =====
+    # Counterpart to the serialization methods, we must pack the serialized QTensor
+    # weight into its packed format for use by the FBGEMM ops.
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        r"""Pop ``scale``/``zero_point``, remap version-1 keys, then defer to the base."""
+        self.scale = float(state_dict.pop(prefix + "scale"))
+        self.zero_point = int(state_dict.pop(prefix + "zero_point"))
+
+        version = local_metadata.get("version", None)
+
+        if version is None or version == 1:
+            # We moved the parameters into a LinearPackedParameters submodule
+            weight = state_dict.pop(prefix + "weight")
+            bias = state_dict.pop(prefix + "bias")
+            state_dict.update(
+                {
+                    prefix + "_packed_params.weight": weight,
+                    prefix + "_packed_params.bias": bias,
+                }
+            )
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    # Function rather than property to make sure that JIT serialization doesn't
+    # register this as an attribute
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        r"""Return the first element (the weight) of the unpacked params."""
+        return self._weight_bias()[0]
+
+    def bias(self):
+        r"""Return the second element (the bias) of the unpacked params."""
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: torch.Tensor | None) -> None:
+        r"""Repack ``w`` and ``b`` into the ``_packed_params`` submodule."""
+        self._packed_params.set_weight_bias(w, b)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a quantized module from an observed float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+            use_precomputed_fake_quant (bool): if True, the module will reuse min/max
+                          values from the precomputed fake quant module.
+        """
+        if hasattr(mod, "weight_fake_quant"):
+            if type_before_parametrizations(mod) == nniqat.LinearBn1d:
+                mod.weight, mod.bias = fuse_linear_bn_weights(
+                    mod.weight,
+                    mod.bias,
+                    mod.bn.running_mean,
+                    mod.bn.running_var,
+                    mod.bn.eps,
+                    mod.bn.weight,
+                    mod.bn.bias,
+                )
+            weight_post_process = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            # ``_FLOAT_MODULE`` may be a single type or an iterable of types.
+            # Normalise it in a local variable so that the class attribute is
+            # never overwritten as a side effect of this call.
+            float_modules = cls._FLOAT_MODULE
+            if not isinstance(float_modules, Iterable):
+                float_modules = [float_modules]
+            supported_modules = ", ".join(
+                [float_mod.__name__ for float_mod in float_modules]
+            )
+            error_msg = f"nnq.{cls.__name__}.from_float only works for {supported_modules}, but got: {type(mod)}"
+            # ``error_msg`` is already fully formatted; pass it through as is.
+            assert type_before_parametrizations(mod) in float_modules, error_msg
+            assert hasattr(mod, "qconfig"), (
+                "Input float module must have qconfig defined"
+            )
+            activation_post_process = mod.activation_post_process
+            if type_before_parametrizations(mod) == nni.LinearReLU:
+                mod = mod[0]
+            weight_post_process = (
+                mod.qconfig.weight()
+                if not hasattr(mod, "weight_fake_quant")
+                else mod.weight_fake_quant
+            )
+
+        if not use_precomputed_fake_quant:
+            # Observer may not have been called yet
+            # Observer might have been called in the previous stage via PTQ algorithm e.g. AdaRound
+            weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()
+        assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8"
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear = cls(mod.in_features, mod.out_features, dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias)
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
+
+    @classmethod
+    def from_reference(cls, ref_qlinear, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+
+        Args:
+            ref_qlinear (Module): a reference quantized linear module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qlinear = cls(ref_qlinear.in_features, ref_qlinear.out_features)
+        qweight = ref_qlinear.get_quantized_weight()
+        qlinear.set_weight_bias(qweight, ref_qlinear.bias)
+
+        qlinear.scale = float(output_scale)
+        qlinear.zero_point = int(output_zero_point)
+        return qlinear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa335b4699db5519e2e53f27aa18958b5afced94
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/normalization.py
@@ -0,0 +1,358 @@
+# mypy: allow-untyped-defs
+import torch
+
+
+__all__ = [
+    "LayerNorm",
+    "GroupNorm",
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+]
+
+
+class LayerNorm(torch.nn.LayerNorm):
+    r"""This is the quantized version of :class:`~torch.nn.LayerNorm`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+
+    def __init__(
+        self,
+        normalized_shape,
+        weight,
+        bias,
+        scale,
+        zero_point,
+        eps=1e-5,
+        elementwise_affine=True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        tensor_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            normalized_shape,
+            eps=eps,
+            elementwise_affine=elementwise_affine,
+            # pyrefly: ignore [bad-argument-type]
+            **tensor_kwargs,
+        )
+        self.weight, self.bias = weight, bias
+        for buffer_name, value in (("scale", scale), ("zero_point", zero_point)):
+            # pyrefly: ignore [bad-argument-type]
+            self.register_buffer(buffer_name, torch.tensor(value, **tensor_kwargs))
+
+    def forward(self, input):
+        r"""Run ``torch.ops.quantized.layer_norm`` with the stored output qparams."""
+        return torch.ops.quantized.layer_norm(
+            input,
+            self.normalized_shape,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self.eps,
+            output_scale=self.scale,
+            output_zero_point=self.zero_point,
+        )
+
+    def _get_name(self):
+        return "QuantizedLayerNorm"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build the quantized module from ``mod`` and its observer's qparams."""
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(
+            mod.normalized_shape,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            mod.elementwise_affine,
+        )
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        r"""Build the quantized module from ``mod`` with explicit output qparams."""
+        return cls(
+            mod.normalized_shape,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            mod.elementwise_affine,
+        )
+
+
+class GroupNorm(torch.nn.GroupNorm):
+    r"""This is the quantized version of :class:`~torch.nn.GroupNorm`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+
+    __constants__ = ["num_groups", "num_channels", "eps", "affine"]
+
+    def __init__(
+        self,
+        num_groups,
+        num_channels,
+        weight,
+        bias,
+        scale,
+        zero_point,
+        eps=1e-5,
+        affine=True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        tensor_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(num_groups, num_channels, eps, affine, **tensor_kwargs)
+        # Replace whatever the base constructor set up with the supplied tensors.
+        self.weight, self.bias = weight, bias
+        for buffer_name, value in (("scale", scale), ("zero_point", zero_point)):
+            # pyrefly: ignore [bad-argument-type]
+            self.register_buffer(buffer_name, torch.tensor(value, **tensor_kwargs))
+
+    def forward(self, input):
+        r"""Run ``torch.ops.quantized.group_norm`` with the stored output qparams."""
+        return torch.ops.quantized.group_norm(
+            input,
+            self.num_groups,
+            self.weight,
+            self.bias,
+            self.eps,
+            self.scale,
+            self.zero_point,
+        )
+
+    def _get_name(self):
+        return "QuantizedGroupNorm"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build the quantized module from ``mod`` and its observer's qparams."""
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(
+            mod.num_groups,
+            mod.num_channels,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            mod.affine,
+        )
+
+
+class InstanceNorm1d(torch.nn.InstanceNorm1d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm1d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+    """
+
+    def __init__(
+        self,
+        num_features,
+        weight,
+        bias,
+        scale,
+        zero_point,
+        eps=1e-5,
+        momentum=0.1,
+        affine=False,
+        track_running_stats=False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
+        )
+        self.weight, self.bias = weight, bias
+        for buffer_name, value in (("scale", scale), ("zero_point", zero_point)):
+            # pyrefly: ignore [bad-argument-type]
+            self.register_buffer(buffer_name, torch.tensor(value, **factory_kwargs))
+
+    def forward(self, input):
+        r"""Run ``torch.ops.quantized.instance_norm`` with the stored output qparams."""
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedInstanceNorm1d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build the quantized module from ``mod`` and its observer's qparams."""
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            # Keyword on purpose: the positional slot after ``eps`` is ``momentum``.
+            affine=mod.affine,
+        )
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        r"""Build the quantized module from ``mod`` with explicit output qparams."""
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            affine=mod.affine,
+        )
+
+
+class InstanceNorm2d(torch.nn.InstanceNorm2d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm2d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+    """
+
+    def __init__(
+        self,
+        num_features,
+        weight,
+        bias,
+        scale,
+        zero_point,
+        eps=1e-5,
+        momentum=0.1,
+        affine=False,
+        track_running_stats=False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
+        )
+        self.weight, self.bias = weight, bias
+        for buffer_name, value in (("scale", scale), ("zero_point", zero_point)):
+            # pyrefly: ignore [bad-argument-type]
+            self.register_buffer(buffer_name, torch.tensor(value, **factory_kwargs))
+
+    def forward(self, input):
+        r"""Run ``torch.ops.quantized.instance_norm`` with the stored output qparams."""
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedInstanceNorm2d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build the quantized module from ``mod`` and its observer's qparams."""
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            # Keyword on purpose: the positional slot after ``eps`` is ``momentum``.
+            affine=mod.affine,
+        )
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        r"""Build the quantized module from ``mod`` with explicit output qparams."""
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            affine=mod.affine,
+        )
+
+
+class InstanceNorm3d(torch.nn.InstanceNorm3d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm3d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+    """
+
+    def __init__(
+        self,
+        num_features,
+        weight,
+        bias,
+        scale,
+        zero_point,
+        eps=1e-5,
+        momentum=0.1,
+        affine=False,
+        track_running_stats=False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
+        )
+        self.weight, self.bias = weight, bias
+        for buffer_name, value in (("scale", scale), ("zero_point", zero_point)):
+            # pyrefly: ignore [bad-argument-type]
+            self.register_buffer(buffer_name, torch.tensor(value, **factory_kwargs))
+
+    def forward(self, input):
+        r"""Run ``torch.ops.quantized.instance_norm`` with the stored output qparams."""
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale, self.zero_point
+        )
+
+    def _get_name(self):
+        return "QuantizedInstanceNorm3d"
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Build the quantized module from ``mod`` and its observer's qparams."""
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            # Keyword on purpose: the positional slot after ``eps`` is ``momentum``.
+            affine=mod.affine,
+        )
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        r"""Build the quantized module from ``mod`` with explicit output qparams."""
+        return cls(
+            mod.num_features,
+            mod.weight,
+            mod.bias,
+            float(scale),
+            int(zero_point),
+            mod.eps,
+            affine=mod.affine,
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..5040b8c97d050102779c742989dd4f52cd3bffa8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/rnn.py
@@ -0,0 +1,59 @@
+from typing import Any
+
+import torch
+
+
+__all__ = [
+    "LSTM",
+]
+
+
+class LSTM(torch.ao.nn.quantizable.LSTM):
+    r"""Quantized counterpart of the quantizable LSTM.
+
+    Argument types and semantics follow :class:`~torch.nn.LSTM`.
+
+    Attributes:
+        layers : instances of the `_LSTMLayer`
+
+    .. note::
+        Weights and biases are reached through the individual layers.
+        See examples in :class:`~torch.ao.nn.quantizable.LSTM`
+
+    Examples::
+        >>> # xdoctest: +SKIP
+        >>> custom_module_config = {
+        ...     'float_to_observed_custom_module_class': {
+        ...         nn.LSTM: nn.quantizable.LSTM,
+        ...     },
+        ...     'observed_to_quantized_custom_module_class': {
+        ...         nn.quantizable.LSTM: nn.quantized.LSTM,
+        ...     }
+        ... }
+        >>> tq.prepare(model, prepare_custom_module_class=custom_module_config)
+        >>> tq.convert(model, convert_custom_module_class=custom_module_config)
+    """
+
+    _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM  # type: ignore[assignment]
+
+    def _get_name(self) -> str:
+        return "QuantizedLSTM"
+
+    @classmethod
+    def from_float(cls, *args: Any, **kwargs: Any) -> None:
+        # Conversion runs float -> observed -> quantized; this class covers
+        # only observed -> quantized, so this entry point always raises.
+        message = (
+            "It looks like you are trying to convert a "
+            "non-observed LSTM module. Please, see "
+            "the examples on quantizable LSTMs."
+        )
+        raise NotImplementedError(message)
+
+    @classmethod
+    def from_observed(cls: type["LSTM"], other: torch.ao.nn.quantizable.LSTM) -> "LSTM":
+        assert isinstance(other, cls._FLOAT_MODULE)  # type: ignore[has-type]
+        quantized = torch.ao.quantization.convert(other, inplace=False, remove_qconfig=True)
+        # Re-tag the converted copy so it is an instance of this class.
+        quantized.__class__ = cls
+        return quantized
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..330070913a7521871f123a3e076264498a6ef612
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/modules/utils.py
@@ -0,0 +1,144 @@
+# mypy: allow-untyped-defs
+import abc
+import collections
+import itertools
+
+import torch
+from torch.nn.modules.module import _addindent
+
+
+__all__ = [
+    "WeightedQuantizedModule",
+]
+
+
+class WeightedQuantizedModule(torch.nn.Module, metaclass=abc.ABCMeta):
+    """Wrapper for quantized modules that can be lowered from reference modules."""
+
+    @classmethod
+    @abc.abstractmethod
+    def from_reference(cls, ref_module, output_scale, output_zero_point):
+        raise NotImplementedError  # abstract: subclasses provide the conversion
+
+
+def _get_weight_observer(observer):
+    # A FakeQuantize observer exposes the wrapped observer as
+    # `activation_post_process`; anything else (e.g. a
+    # UniformQuantizationObserverBase observer) is returned unchanged.
+    unwrapped = getattr(observer, "activation_post_process", observer)
+    return unwrapped
+
+
+def _needs_weight_clamping(observer, dtype):
+    observer = _get_weight_observer(observer)
+    if dtype not in [torch.qint8, torch.quint8, torch.qint32]:
+        return False  # only these quantized integer dtypes are range-checked
+    limits = torch.iinfo(dtype)
+    return observer.quant_min > limits.min or observer.quant_max < limits.max
+
+
+def _clamp_weights(qweight, observer, scale, zp):
+    """Clamp ``qweight``'s integer values to the observer's quant_min/quant_max."""
+    if not _needs_weight_clamping(observer, qweight.dtype):
+        return qweight
+    observer = _get_weight_observer(observer)
+    min_, max_ = observer.quant_min, observer.quant_max
+    # Doing this because can't use torch.ops.quantized.clamp() with per_channel qscheme yet.
+    qw_int_max = torch.clone(qweight.int_repr()).fill_(max_)
+    qw_int_min = torch.clone(qweight.int_repr()).fill_(min_)
+    qw_int = torch.minimum(torch.maximum(qweight.int_repr(), qw_int_min), qw_int_max)
+
+    if observer.qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]:
+        qweight = torch._make_per_tensor_quantized_tensor(
+            qw_int, scale.item(), zp.item()
+        )
+    elif observer.qscheme in [
+        torch.per_channel_symmetric,
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]:
+        qweight = torch._make_per_channel_quantized_tensor(
+            qw_int, scale, zp, axis=observer.ch_axis
+        )
+    else:
+        # f-string: qscheme is not a str, so "+" concatenation would raise TypeError.
+        raise ValueError(f"Unexpected qscheme {observer.qscheme}")
+    return qweight
+
+
+def _quantize_weight(float_wt, observer):
+    """Quantize ``float_wt`` per ``observer``'s qscheme; ValueError if unsupported."""
+    wt_scale, wt_zp = observer.calculate_qparams()
+    if observer.qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]:
+        qweight = torch.quantize_per_tensor(
+            float_wt, float(wt_scale), int(wt_zp), torch.qint8
+        )
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    elif observer.qscheme in [torch.per_channel_symmetric, torch.per_channel_affine]:
+        qweight = torch.quantize_per_channel(
+            float_wt,
+            wt_scale.to(torch.double),
+            wt_zp.to(torch.int64),
+            observer.ch_axis,
+            torch.qint8,
+        )
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    elif observer.qscheme == torch.per_channel_affine_float_qparams:
+        qweight = torch.quantize_per_channel(
+            float_wt,
+            wt_scale.to(torch.float),
+            wt_zp.to(torch.float),
+            observer.ch_axis,
+            observer.dtype,
+        )
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    else:
+        raise ValueError(f"Unexpected qscheme {observer.qscheme}")
+    return qweight
+
+
+def _ntuple_from_first(n):
+    """Return a parser that yields an ``n``-tuple repeating one value.
+
+    The parser descends into ``x[0]`` while ``x`` is a Sequence whose length
+    differs from ``n``; whatever remains is then repeated ``n`` times."""
+
+    def parse(x):
+        while isinstance(x, collections.abc.Sequence) and len(x) != n:
+            x = x[0]
+        return (x,) * n
+
+    return parse
+
+
+def _hide_packed_params_repr(self, params):
+    """Render ``self`` like ``nn.Module.__repr__`` but omit ``params`` children.
+
+    Children in ``self._modules`` that are instances of ``params`` are skipped
+    so `PackedParams` submodules do not show up in the printed module tree."""
+    extra_text = self.extra_repr()
+    # Splitting an empty string would give [''], so only split non-empty text.
+    extra_lines = extra_text.split("\n") if extra_text else []
+
+    # Indent each kept child's repr by two spaces and label it with its key.
+    child_lines = [
+        "(" + key + "): " + _addindent(repr(module), 2)
+        for key, module in self._modules.items()
+        if not isinstance(module, params)
+    ]
+
+    body = extra_lines + child_lines
+    if not body:
+        inner = ""
+    elif len(extra_lines) == 1 and not child_lines:
+        # simple one-liner info, which most builtin Modules will use
+        inner = extra_lines[0]
+    else:
+        # Multi-line form: every entry on its own line, indented two spaces.
+        inner = "\n  " + "\n  ".join(body) + "\n"
+
+    # Assemble `Name(<inner>)`.
+    return self._get_name() + "(" + inner + ")"
+
+
+_pair_from_first = _ntuple_from_first(2)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1e15e9c1516d30f7ca9ee47b21b267533de75b6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__init__.py
@@ -0,0 +1,19 @@
+from .modules import *  # noqa: F403
+
+
+__all__ = [
+    "Linear",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "RNNCell",
+    "LSTMCell",
+    "GRUCell",
+    "LSTM",
+    "GRU",
+    "Embedding",
+    "EmbeddingBag",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d10eff8aa431fc79df0a413aa0d38ad8a868df60
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe97c22f5a46a5eafc1432075fc57dd44c3aa8d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py
@@ -0,0 +1,29 @@
+from .conv import (
+    Conv1d,
+    Conv2d,
+    Conv3d,
+    ConvTranspose1d,
+    ConvTranspose2d,
+    ConvTranspose3d,
+)
+from .linear import Linear
+from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNNCell
+from .sparse import Embedding, EmbeddingBag
+
+
+__all__ = [
+    "Linear",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "RNNCell",
+    "LSTMCell",
+    "GRUCell",
+    "LSTM",
+    "GRU",
+    "Embedding",
+    "EmbeddingBag",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb4ea23be0524260424bb4967ca2ab09731ebc8f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e60cd9afa36c944d42abf1f27cf14669df7d367c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f44fcbf3dc5c5687f28b66ec63eaedb622b288c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d26ab1d3135fcaa5abc6a1f5a31609106e32d50
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..786d2a34375072315e16b1cb844774231cefe023
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8414ec75339cc2f25caf242f4d08df63f79c45ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..3273b89cc70ab21a87a0369e71c3ceff19615111
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/conv.py
@@ -0,0 +1,518 @@
+# mypy: allow-untyped-defs
+from typing import Any, Literal, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.common_types import _size_1_t
+
+from .utils import ReferenceQuantizedModule
+
+
+__all__ = [
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+]
+
+
+class _ConvNd(torch.nn.modules.conv._ConvNd, ReferenceQuantizedModule):
+    """Base for the reference (unpacked-weight) quantized convolution modules.
+    Parameters are not packed here, since weight packing is an optimization for
+    the quantized backends supported in PyTorch (fbgemm/qnnpack); this is useful
+    when a user wants to run the module on other backends like Glow."""
+
+    __annotations__ = {"bias": Optional[torch.Tensor]}
+    _IS_REFERENCE = True
+
+    @staticmethod
+    def from_float(cls, float_conv, weight_qparams):
+        has_bias = float_conv.bias is not None
+        ref_conv = cls(
+            float_conv.in_channels,
+            float_conv.out_channels,
+            float_conv.kernel_size,  # type: ignore[arg-type]
+            float_conv.stride,  # type: ignore[arg-type]
+            float_conv.padding,  # type: ignore[arg-type]
+            float_conv.dilation,  # type: ignore[arg-type]
+            float_conv.groups,
+            has_bias,  # type: ignore[arg-type]
+            float_conv.padding_mode,
+            device=float_conv.weight.device,
+            dtype=float_conv.weight.dtype,
+            weight_qparams=weight_qparams,
+        )
+        ref_conv.weight = torch.nn.Parameter(float_conv.weight.detach())
+        if has_bias:
+            ref_conv.bias = torch.nn.Parameter(float_conv.bias.detach())
+        return ref_conv
+
+
+class Conv1d(_ConvNd, nn.Conv1d):
+    """Reference quantized Conv1d: ``F.conv1d`` applied with ``self.get_weight()``."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.Conv1d.__init__(
+            self,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Run ``F.conv1d`` on ``x`` using the weight returned by ``get_weight()``.
+
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv1d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv1d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv1d
+        """
+        return F.conv1d(
+            x,
+            self.get_weight(),
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv1d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+
+class Conv2d(_ConvNd, nn.Conv2d):
+    """Reference quantized Conv2d: ``F.conv2d`` applied with ``self.get_weight()``."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.Conv2d.__init__(
+            self,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Run ``F.conv2d`` on ``x`` using the weight returned by ``get_weight()``.
+
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv2d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv2d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv2d
+        """
+        return F.conv2d(
+            x,
+            self.get_weight(),
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv2d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+
+class Conv3d(_ConvNd, nn.Conv3d):
+    """Reference quantized Conv3d: ``F.conv3d`` applied with ``self.get_weight()``."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.Conv3d.__init__(
+            self,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Run ``F.conv3d`` on ``x`` using the weight returned by ``get_weight()``.
+
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv3d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv3d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv3d
+        """
+        return F.conv3d(
+            x,
+            self.get_weight(),
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def _get_name(self):
+        return "QuantizedConv3d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+
+class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd):
+    """Base for the reference (unpacked-weight) quantized ConvTranspose modules.
+    Parameters are not packed here, since weight packing is an optimization for
+    the quantized backends supported in PyTorch (fbgemm/qnnpack); this is useful
+    when a user wants to run the module on other backends like Glow."""
+
+    @staticmethod
+    def from_float(cls, float_conv, weight_qparams):
+        has_bias = float_conv.bias is not None
+        ref_conv = cls(
+            float_conv.in_channels,
+            float_conv.out_channels,
+            float_conv.kernel_size,  # type: ignore[arg-type]
+            float_conv.stride,  # type: ignore[arg-type]
+            float_conv.padding,  # type: ignore[arg-type]
+            float_conv.output_padding,  # type: ignore[arg-type]
+            float_conv.groups,
+            has_bias,  # type: ignore[arg-type]
+            float_conv.dilation,  # type: ignore[arg-type]
+            float_conv.padding_mode,
+            device=float_conv.weight.device,
+            dtype=float_conv.weight.dtype,
+            weight_qparams=weight_qparams,
+        )
+        ref_conv.weight = torch.nn.Parameter(float_conv.weight.detach())
+        if has_bias:
+            ref_conv.bias = torch.nn.Parameter(float_conv.bias.detach())
+        return ref_conv
+
+
+class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        output_padding: _size_1_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_1_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.ConvTranspose1d.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            padding_mode,
+            device,
+            dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(
+        self, x: torch.Tensor, output_size: list[int] | None = None
+    ) -> torch.Tensor:
+        r"""Run ``F.conv_transpose1d`` on ``x``; ``output_size`` picks the output padding.
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.convTranspose1d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.convTranspose1d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv1d
+        """
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        output_padding = self._output_padding(
+            x,  # the input tensor (previously the ``input`` builtin was passed)
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            1,  # num_spatial_dims; must precede dilation in _output_padding
+            self.dilation,  # type: ignore[arg-type]
+        )
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose1d(
+            x,
+            weight_quant_dequant,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose1d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
+
+
+class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.ConvTranspose2d.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode,
+            device,
+            dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(
+        self, x: torch.Tensor, output_size: list[int] | None = None
+    ) -> torch.Tensor:
+        r"""Run ``F.conv_transpose2d`` on ``x``; ``output_size`` picks the output padding.
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.convTranspose2d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.convTranspose2d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv2d
+        """
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        output_padding = self._output_padding(
+            x,  # the input tensor (previously the ``input`` builtin was passed)
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            2,  # num_spatial_dims; must precede dilation in _output_padding
+            self.dilation,  # type: ignore[arg-type]
+        )
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose2d(
+            x,
+            weight_quant_dequant,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose2d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
+
+
+class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        bias=True,
+        dilation=1,
+        padding_mode="zeros",
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ):
+        nn.ConvTranspose3d.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            bias,
+            dilation,
+            # pyrefly: ignore [bad-argument-type]
+            padding_mode,
+            device,
+            dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(
+        self, x: torch.Tensor, output_size: list[int] | None = None
+    ) -> torch.Tensor:
+        r"""Run ``F.conv_transpose3d`` on ``x``; ``output_size`` picks the output padding.
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.convTranspose3d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.convTranspose3d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv3d
+        """
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        output_padding = self._output_padding(
+            x,  # the input tensor (previously the ``input`` builtin was passed)
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            3,  # num_spatial_dims; must precede dilation in _output_padding
+            self.dilation,  # type: ignore[arg-type]
+        )
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose3d(
+            x,
+            weight_quant_dequant,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose3d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):  # type: ignore[override]
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..6014fab24036c30b183f5622d12aae4a345baedb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/linear.py
@@ -0,0 +1,69 @@
+from typing import Any
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .utils import ReferenceQuantizedModule
+
+
+__all__ = ["Linear"]
+
+
+class Linear(nn.Linear, ReferenceQuantizedModule):
+    """Reference quantized linear module for the FX Graph Mode Quantization
+    workflow.
+
+    Activations are floating point Tensors and the weight is kept in floating
+    point as well; forward quantizes and dequantizes the weight before running
+    the floating point functional linear operator.
+    """
+
+    _IS_REFERENCE = True
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias_: bool = True,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+        weight_qparams: dict[str, Any] | None = None,
+    ) -> None:
+        super().__init__(in_features, out_features, bias=bias_, device=device, dtype=dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self) -> str:
+        return "QuantizedLinear(Reference)"
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Run ``F.linear`` on ``x`` using the weight returned by ``get_weight()``.
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.linear ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.linear --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized linear
+        """
+        dequantized_weight = self.get_weight()
+        return F.linear(x, dequantized_weight, self.bias)
+
+    @classmethod
+    def from_float(
+        cls, float_linear: nn.Linear, weight_qparams: dict[str, Any]
+    ) -> "Linear":
+        has_bias = float_linear.bias is not None
+        ref_linear = Linear(
+            float_linear.in_features,
+            float_linear.out_features,
+            has_bias,
+            device=float_linear.weight.device,
+            dtype=float_linear.weight.dtype,
+            weight_qparams=weight_qparams,
+        )
+        ref_linear.weight = torch.nn.Parameter(float_linear.weight.detach())
+        if has_bias:
+            ref_linear.bias = torch.nn.Parameter(float_linear.bias.detach())
+        return ref_linear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bdbfb81430b4db9e09ea752310732b89f47bfa1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py
@@ -0,0 +1,861 @@
+# mypy: allow-untyped-defs
+from typing import Any
+
+import torch
+import torch.nn as nn
+from torch import _VF, Tensor
+from torch.nn.utils.rnn import PackedSequence
+
+from .utils import _quantize_and_dequantize_weight, _quantize_weight
+
+
+__all__ = [
+    "RNNCellBase",
+    "RNNCell",
+    "LSTMCell",
+    "GRUCell",
+    "RNNBase",
+    "LSTM",
+    "GRU",
+    "get_quantized_weight",
+]
+
+
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return torch.index_select(tensor, dim, permutation)  # pick entries along `dim`
+
+
+def _get_weight_and_quantization_params(module, wn):
+    """Collect the weight named ``wn`` on ``module`` together with its qparams.
+
+    Returns the list ``[weight, qscheme, dtype, scale, zero_point, axis_int]``,
+    where each qparam is read from the attribute ``wn + suffix`` and is None
+    when ``module`` has no such attribute.
+    """
+    suffixes = ("_qscheme", "_dtype", "_scale", "_zero_point", "_axis_int")
+    collected = [getattr(module, wn)]
+    # getattr with a default is the same as "hasattr, else None".
+    collected.extend(getattr(module, wn + suffix, None) for suffix in suffixes)
+    return collected
+
+
+def get_quantized_weight(module, wn):
+    """Return ``_quantize_weight`` applied to weight ``wn``, or None if ``wn`` is absent."""
+    if hasattr(module, wn):
+        weight, *qparams = _get_weight_and_quantization_params(module, wn)
+        return _quantize_weight(weight, *qparams)
+    return None
+
+
+def _get_quantize_and_dequantized_weight(module, wn):
+    """Return weight ``wn`` after ``_quantize_and_dequantize_weight``, or None if absent."""
+    if hasattr(module, wn):
+        weight, *qparams = _get_weight_and_quantization_params(module, wn)
+        return _quantize_and_dequantize_weight(weight, *qparams)
+    return None
+
+
+class RNNCellBase(nn.RNNCellBase):
+    """Common base of the reference quantized RNN cells in this file.
+
+    For every weight named in ``weight_qparams_dict`` the module stores
+    ``<name>_qscheme`` and ``<name>_dtype`` attributes and, when the qscheme is
+    not None, ``<name>_scale``/``<name>_zero_point``/``<name>_axis`` buffers and
+    a ``<name>_axis_int`` attribute.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool,
+        num_chunks: int,
+        device=None,
+        dtype=None,
+        weight_qparams_dict=None,
+    ) -> None:
+        super().__init__(
+            input_size, hidden_size, bias, num_chunks, device=device, dtype=dtype
+        )
+        # TODO(jerryzh168): maybe make this arg a required arg
+        if weight_qparams_dict is None:
+            # Fallback: both weights share one per-tensor quint8 qparams dict.
+            default_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0,
+            }
+            weight_qparams_dict = dict(
+                weight_ih=default_qparams,
+                weight_hh=default_qparams,
+                is_decomposed=False,
+            )
+        assert len(weight_qparams_dict) == 3, (
+            "Expected length for weight_qparams_dict to be 3 for QuantizedRNNCellBase(Reference)"
+        )
+        self._init_weight_qparams_dict(weight_qparams_dict, device)
+
+    def _init_weight_qparams_dict(self, weight_qparams_dict, device):
+        """Attach the qparams in ``weight_qparams_dict`` to this module.
+
+        Every key other than ``"is_decomposed"`` is treated as a weight name
+        whose value is a dict with ``qscheme``, ``dtype`` and, for a non-None
+        qscheme, ``scale``/``zero_point`` (plus ``axis`` when per-channel).
+        """
+        assert weight_qparams_dict is not None
+
+        def as_buffer(value, dtype):
+            # Tensors are copied as they are; other values are created on `device`.
+            if isinstance(value, torch.Tensor):
+                return value.detach().clone()
+            return torch.tensor(value, dtype=dtype, device=device)
+
+        self.is_decomposed = weight_qparams_dict["is_decomposed"]
+        for key, weight_qparams in weight_qparams_dict.items():
+            if key == "is_decomposed":
+                continue
+            # TODO: refactor the duplicated code to utils.py
+            weight_qscheme = weight_qparams["qscheme"]
+            weight_dtype = weight_qparams["dtype"]
+            setattr(self, key + "_qscheme", weight_qscheme)
+            setattr(self, key + "_dtype", weight_dtype)
+            supported = (None, torch.per_tensor_affine, torch.per_channel_affine)
+            assert weight_qscheme in supported, Exception(
+                f"qscheme: {weight_qscheme} is not support in {self._get_name()}"
+            )
+            if weight_qscheme is None:
+                continue
+            scale = as_buffer(weight_qparams["scale"], torch.float)
+            self.register_buffer(key + "_scale", scale)
+            zero_point = as_buffer(weight_qparams["zero_point"], torch.int)
+            self.register_buffer(key + "_zero_point", zero_point)
+            # Without per-channel the axis buffer is added for TorchScriptability, not used.
+            axis = weight_qparams["axis"] if weight_qscheme == torch.per_channel_affine else 0
+            self.register_buffer(key + "_axis", as_buffer(axis, torch.int))
+            setattr(self, key + "_axis_int", getattr(self, key + "_axis").item())
+
+    def _get_name(self):
+        return "QuantizedRNNCellBase(Reference)"
+
+    def get_quantized_weight_ih(self):  # quantized form of weight_ih
+        return get_quantized_weight(self, "weight_ih")
+
+    def get_quantized_weight_hh(self):  # quantized form of weight_hh
+        return get_quantized_weight(self, "weight_hh")
+
+    def get_weight_ih(self):  # weight_ih after quantize + dequantize
+        return _get_quantize_and_dequantized_weight(self, "weight_ih")
+
+    def get_weight_hh(self):  # weight_hh after quantize + dequantize
+        return _get_quantize_and_dequantized_weight(self, "weight_hh")
+
+
+class RNNCell(RNNCellBase):
+    """Reference quantized version of ``nn.RNNCell``.
+
+    We store weight_qparams for both weights (weight_ih and weight_hh); pass a
+    ``weight_qparams_dict`` that maps each weight name, e.g. weight_ih, to the
+    weight_qparams for that weight.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        nonlinearity: str = "tanh",
+        device=None,
+        dtype=None,
+        weight_qparams_dict: dict[str, Any] | None = None,
+    ) -> None:
+        # This cell always passes num_chunks=1 to the base class.
+        super().__init__(
+            input_size,
+            hidden_size,
+            bias,
+            num_chunks=1,
+            device=device,
+            dtype=dtype,
+            weight_qparams_dict=weight_qparams_dict,
+        )
+        self.nonlinearity = nonlinearity
+
+    def _get_name(self):
+        return "QuantizedRNNCell(Reference)"
+
+    # TODO: refactor nn.RNNCell to have a _forward that takes weight_ih and weight_hh as input
+    # and remove duplicated code, same for the other two Cell modules
+    def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor:
+        """Compute one cell step with quantize-dequantized weights.
+
+        ``input`` is 1-D (unbatched) or 2-D (batched); an unbatched call gets an
+        unbatched result. When ``hx`` is None a zero hidden state is used.
+
+        Raises:
+            RuntimeError: if ``self.nonlinearity`` is neither "tanh" nor "relu".
+        """
+        assert input.dim() in (1, 2), (
+            f"RNNCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            # Add a leading batch dimension; it is squeezed away before returning.
+            input = input.unsqueeze(0)
+            if hx is not None:
+                hx = hx.unsqueeze(0)
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+
+        # The weights are re-quantized and dequantized on every call.
+        if self.nonlinearity == "tanh":
+            ret = _VF.rnn_tanh_cell(
+                input, hx, self.get_weight_ih(), self.get_weight_hh(),
+                self.bias_ih, self.bias_hh,
+            )
+        elif self.nonlinearity == "relu":
+            ret = _VF.rnn_relu_cell(
+                input, hx, self.get_weight_ih(), self.get_weight_hh(),
+                self.bias_ih, self.bias_hh,
+            )
+        else:
+            ret = input  # TODO: remove when jit supports exception flow
+            raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}")
+
+        return ret if is_batched else ret.squeeze(0)
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        """Create a reference cell from ``mod``, sharing (not copying) its parameters."""
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.bias,
+            mod.nonlinearity,
+            mod.weight_ih.device,
+            mod.weight_ih.dtype,
+            weight_qparams_dict,
+        )
+        # Weights and biases are assigned by reference.
+        for name in ("weight_ih", "weight_hh", "bias_ih", "bias_hh"):
+            setattr(ref_mod, name, getattr(mod, name))
+        return ref_mod
+
+
+class LSTMCell(RNNCellBase):
+    """Reference quantized version of ``nn.LSTMCell``.
+
+    We store weight_qparams for both weights (weight_ih and weight_hh); pass a
+    ``weight_qparams_dict`` that maps each weight name, e.g. weight_ih, to the
+    weight_qparams for that weight.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        weight_qparams_dict: dict[str, Any] | None = None,
+    ) -> None:
+        # This cell always passes num_chunks=4 to the base class.
+        super().__init__(
+            input_size,
+            hidden_size,
+            bias,
+            num_chunks=4,
+            device=device,
+            dtype=dtype,
+            weight_qparams_dict=weight_qparams_dict,
+        )
+
+    def _get_name(self):
+        return "QuantizedLSTMCell(Reference)"
+
+    def forward(
+        self, input: Tensor, hx: tuple[Tensor, Tensor] | None = None
+    ) -> tuple[Tensor, Tensor]:
+        """Compute one LSTM step with quantize-dequantized weights.
+
+        ``input`` is 1-D (unbatched) or 2-D (batched) and ``hx`` is an optional
+        pair of tensors that defaults to zeros. Returns the pair produced by
+        ``_VF.lstm_cell``, unbatched again if ``input`` was unbatched.
+        """
+        assert input.dim() in (1, 2), (
+            f"LSTMCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+            if hx is not None:
+                hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0))
+        if hx is None:
+            zeros = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+            hx = (zeros, zeros)
+
+        ret = _VF.lstm_cell(
+            input, hx, self.get_weight_ih(), self.get_weight_hh(),
+            self.bias_ih, self.bias_hh,
+        )
+        if is_batched:
+            return ret
+        return (ret[0].squeeze(0), ret[1].squeeze(0))
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict, use_precomputed_fake_quant=False):
+        """Create a reference cell from ``mod``, sharing (not copying) its parameters.
+
+        ``use_precomputed_fake_quant`` is accepted but not used in this method.
+        """
+        ref_mod = cls(
+            mod.input_size, mod.hidden_size, mod.bias,
+            mod.weight_ih.device, mod.weight_ih.dtype, weight_qparams_dict,
+        )
+        for name in ("weight_ih", "weight_hh", "bias_ih", "bias_hh"):
+            setattr(ref_mod, name, getattr(mod, name))
+        return ref_mod
+
+
+class GRUCell(RNNCellBase):
+    """Reference quantized version of ``nn.GRUCell``.
+
+    We store weight_qparams for both weights (weight_ih and weight_hh); pass a
+    ``weight_qparams_dict`` that maps each weight name, e.g. weight_ih, to the
+    weight_qparams for that weight.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        weight_qparams_dict: dict[str, Any] | None = None,
+    ) -> None:
+        # This cell always passes num_chunks=3 to the base class.
+        super().__init__(
+            input_size,
+            hidden_size,
+            bias,
+            num_chunks=3,
+            device=device,
+            dtype=dtype,
+            weight_qparams_dict=weight_qparams_dict,
+        )
+
+    def _get_name(self):
+        return "QuantizedGRUCell(Reference)"
+
+    def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor:
+        """Compute one GRU step with quantize-dequantized weights.
+
+        ``input`` is 1-D (unbatched) or 2-D (batched); an unbatched call gets an
+        unbatched result. When ``hx`` is None a zero hidden state is used.
+        Returns the output of ``_VF.gru_cell``.
+        """
+        assert input.dim() in (1, 2), (
+            f"GRUCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+            if hx is not None:
+                hx = hx.unsqueeze(0)
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+
+        # The weights are re-quantized and dequantized on every call.
+        ret = _VF.gru_cell(
+            input, hx, self.get_weight_ih(), self.get_weight_hh(),
+            self.bias_ih, self.bias_hh,
+        )
+        return ret if is_batched else ret.squeeze(0)
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        """Create a reference cell from ``mod``, sharing (not copying) its parameters."""
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.bias,
+            mod.weight_ih.device,
+            mod.weight_ih.dtype,
+            weight_qparams_dict,
+        )
+        # Weights and biases are assigned by reference.
+        for name in ("weight_ih", "weight_hh", "bias_ih", "bias_hh"):
+            setattr(ref_mod, name, getattr(mod, name))
+        return ref_mod
+
+
+class RNNBase(nn.RNNBase):
+    """Common base of the multi-layer reference quantized RNN modules in this file.
+
+    For every weight named in ``weight_qparams_dict`` the module stores
+    ``<name>_qscheme`` and ``<name>_dtype`` attributes and, when the qscheme is
+    not None, ``<name>_scale``/``<name>_zero_point``/``<name>_axis`` buffers and
+    a ``<name>_axis_int`` attribute.
+    """
+
+    def __init__(
+        self,
+        mode: str,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        proj_size: int = 0,
+        device=None,
+        dtype=None,
+        weight_qparams_dict: dict[str, Any] | None = None,
+    ) -> None:
+        super().__init__(
+            mode, input_size, hidden_size, num_layers,
+            bias, batch_first, dropout, bidirectional,
+            proj_size, device, dtype,
+        )
+        # TODO(jerryzh168): maybe make this arg a required arg
+        if weight_qparams_dict is None:
+            # Fallback: every flat weight shares one per-tensor quint8 qparams dict.
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0,
+            }
+            weight_qparams_dict = {"is_decomposed": False}  # type: ignore[dict-item]
+            for wn in self._flat_weights_names:
+                if wn.startswith("weight"):
+                    weight_qparams_dict[wn] = weight_qparams
+        self._init_weight_qparams_dict(weight_qparams_dict, device)
+
+    def _init_weight_qparams_dict(self, weight_qparams_dict, device):
+        """Record the qparams of each weight on the module.
+
+        Args:
+            weight_qparams_dict: maps ``"is_decomposed"`` to a flag and every other
+                key (a weight name) to a dict with ``qscheme``, ``dtype`` and, when
+                the qscheme is not None, ``scale``, ``zero_point`` (and ``axis`` for
+                ``torch.per_channel_affine``).
+            device: device argument used when creating the qparam buffers.
+        """
+
+        def as_buffer(value, dtype):
+            # torch.tensor(<Tensor>) emits a copy-construct UserWarning, so tensors are
+            # copied explicitly; the resulting dtype/device are the same either way.
+            if isinstance(value, torch.Tensor):
+                return value.detach().clone().to(dtype=dtype, device=device)
+            return torch.tensor(value, dtype=dtype, device=device)
+
+        self.is_decomposed = weight_qparams_dict["is_decomposed"]
+        for key, weight_qparams in weight_qparams_dict.items():
+            if key == "is_decomposed":
+                continue
+            weight_qscheme = weight_qparams["qscheme"]
+            weight_dtype = weight_qparams["dtype"]
+            setattr(self, key + "_qscheme", weight_qscheme)
+            setattr(self, key + "_dtype", weight_dtype)
+            supported = (None, torch.per_tensor_affine, torch.per_channel_affine)
+            assert weight_qscheme in supported, Exception(
+                f"qscheme: {weight_qscheme} is not support in {self._get_name()}"
+            )
+            if weight_qscheme is None:
+                continue
+            scale = as_buffer(weight_qparams["scale"], torch.float)
+            self.register_buffer(key + "_scale", scale)
+            zero_point = as_buffer(weight_qparams["zero_point"], torch.int)
+            self.register_buffer(key + "_zero_point", zero_point)
+            # Without per-channel quantization the axis buffer is added for
+            # TorchScriptability only and is not used.
+            axis = weight_qparams["axis"] if weight_qscheme == torch.per_channel_affine else 0
+            self.register_buffer(key + "_axis", as_buffer(axis, torch.int))
+            setattr(self, key + "_axis_int", getattr(self, key + "_axis").item())
+
+
+class LSTM(RNNBase):
+    """Reference quantized LSTM module.
+
+    Stores weight_qparams for every weight in _flat_weights; pass a
+    `weight_qparams_dict` mapping weight names (e.g. weight_ih_l0) to them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__("LSTM", *args, **kwargs)
+
+    # For the override ignore, see torch/nn/modules/module.py::_forward_unimplemented
+    def permute_hidden(  # type: ignore[override]
+        self,
+        hx: tuple[Tensor, Tensor],
+        permutation: Tensor | None,
+    ) -> tuple[Tensor, Tensor]:
+        """Reorder both tensors of ``hx`` along dim 1 by ``permutation`` (no-op if None)."""
+        if permutation is None:
+            return hx
+        h, c = hx
+        return _apply_permutation(h, permutation), _apply_permutation(c, permutation)
+
+    def get_expected_cell_size(
+        self, input: Tensor, batch_sizes: Tensor | None
+    ) -> tuple[int, int, int]:
+        """Return the expected hidden[1] shape: (layers * directions, batch, hidden_size)."""
+        if batch_sizes is not None:
+            mini_batch = int(batch_sizes[0])
+        else:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)
+        num_directions = 2 if self.bidirectional else 1
+        return (
+            self.num_layers * num_directions,
+            mini_batch,
+            self.hidden_size,
+        )
+
+    # In the future, we should prevent mypy from applying contravariance rules here.
+    # See torch/nn/modules/module.py::_forward_unimplemented
+    def check_forward_args(  # type: ignore[override]
+        self, input: Tensor, hidden: tuple[Tensor, Tensor], batch_sizes: Tensor | None
+    ):
+        """Validate ``input``, then hidden[0] against the expected hidden size
+        and hidden[1] against the expected cell size.
+        """
+        self.check_input(input, batch_sizes)
+        self.check_hidden_size(
+            hidden[0],
+            self.get_expected_hidden_size(input, batch_sizes),
+            "Expected hidden[0] size {}, got {}",
+        )
+        self.check_hidden_size(
+            hidden[1],
+            self.get_expected_cell_size(input, batch_sizes),
+            "Expected hidden[1] size {}, got {}",
+        )
+
+    def get_quantized_weight_bias_dict(self):
+        """Map each flat weight name to its quantized weight or (unquantized) bias.
+
+        Entries whose name starts with "weight" go through ``get_quantized_weight``;
+        all other entries are returned as stored, and names that are not attributes
+        of the module map to None, e.g.
+        {
+          "weight_ih_l0": quantized_weight,
+          "bias_ih_l0": unquantized_bias,
+          ...
+        }
+        """
+        quantized_weight_bias_dict = {}
+        for wn in self._flat_weights_names:
+            if wn.startswith("weight"):
+                # get_quantized_weight already returns None for a missing attribute.
+                quantized_weight_bias_dict[wn] = get_quantized_weight(self, wn)
+            else:
+                quantized_weight_bias_dict[wn] = getattr(self, wn, None)
+        return quantized_weight_bias_dict
+
+    def get_flat_weights(self):
+        """Return the flat weights in ``_flat_weights_names`` order, with every
+        "weight*" entry quantize-dequantized and missing attributes as None.
+        """
+        flat_weights = []
+        for wn in self._flat_weights_names:
+            if wn.startswith("weight"):
+                weight = _get_quantize_and_dequantized_weight(self, wn)
+            else:
+                weight = getattr(self, wn, None)
+            flat_weights.append(weight)
+        return flat_weights
+
+    def forward(self, input, hx=None):  # noqa: F811
+        """Run the LSTM with quantize-dequantized flat weights.
+
+        ``input`` is a ``PackedSequence`` or a tensor; a 3-D tensor is treated as
+        batched and any other tensor gets a batch dimension inserted. ``hx`` is an
+        optional pair of tensors; when omitted, zeros are used.
+
+        Returns ``(output, hidden)``: ``output`` is a ``PackedSequence`` for packed
+        input and ``hidden`` is a pair of tensors.
+
+        Raises:
+            RuntimeError: if the dimensionality of ``hx`` does not match ``input``.
+        """
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        batch_sizes = None
+        if isinstance(orig_input, PackedSequence):
+            # A PackedSequence unpacks into (data, batch_sizes, sorted_indices, unsorted_indices).
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = int(batch_sizes[0])
+        else:
+            batch_sizes = None
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            # With proj_size > 0 the default h uses proj_size features; the default c
+            # always uses hidden_size features.
+            real_hidden_size = (
+                self.proj_size if self.proj_size > 0 else self.hidden_size
+            )
+            h_zeros = torch.zeros(
+                self.num_layers * num_directions, max_batch_size, real_hidden_size,
+                dtype=input.dtype, device=input.device,
+            )
+            c_zeros = torch.zeros(
+                self.num_layers * num_directions, max_batch_size, self.hidden_size,
+                dtype=input.dtype, device=input.device,
+            )
+            hx = (h_zeros, c_zeros)
+        else:
+            if batch_sizes is None:  # If not PackedSequence input.
+                if is_batched:  # type: ignore[possibly-undefined]
+                    if hx[0].dim() != 3 or hx[1].dim() != 3:
+                        msg = (
+                            "For batched 3-D input, hx and cx should "
+                            f"also be 3-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors"
+                        )
+                        raise RuntimeError(msg)
+                else:
+                    if hx[0].dim() != 2 or hx[1].dim() != 2:
+                        msg = (
+                            "For unbatched 2-D input, hx and cx should "
+                            f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors"
+                        )
+                        raise RuntimeError(msg)
+                    # Insert the batch dimension (dim 1) that the unbatched states lack.
+                    hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1))
+
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+        flat_weights = self.get_flat_weights()
+        # The packed call takes batch_sizes and no batch_first argument.
+        if batch_sizes is None:
+            result = _VF.lstm(
+                input, hx, flat_weights, self.bias, self.num_layers,
+                self.dropout, self.training, self.bidirectional, self.batch_first,
+            )
+        else:
+            result = _VF.lstm(
+                input, batch_sizes, hx, flat_weights, self.bias, self.num_layers,
+                self.dropout, self.training, self.bidirectional,
+            )
+        # result[0] is the output; the remaining entries form the hidden state.
+        output = result[0]
+        hidden = result[1:]
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(
+                output,
+                # pyrefly: ignore [bad-argument-type]
+                batch_sizes,
+                sorted_indices,
+                unsorted_indices,
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                # Undo the batch dimension that was inserted for unbatched input.
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1))
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def _get_name(self):
+        return "QuantizedLSTM(Reference)"
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        """Create a reference LSTM from float ``mod``, sharing its flat weights.
+        ``proj_size`` is forwarded so that a projected LSTM is rebuilt with matching
+        weight names and shapes (it was previously dropped, i.e. treated as 0).
+        """
+        ref_mod = cls(
+            mod.input_size, mod.hidden_size, mod.num_layers, mod.bias,
+            mod.batch_first, mod.dropout, mod.bidirectional,
+            proj_size=getattr(mod, "proj_size", 0),
+            weight_qparams_dict=weight_qparams_dict,
+        )
+        for wn in mod._flat_weights_names:
+            setattr(ref_mod, wn, getattr(mod, wn))
+        return ref_mod
+
+
+class GRU(RNNBase):
+    """Reference quantized GRU module.
+
+    Stores weight_qparams for every weight in _flat_weights; pass a
+    `weight_qparams_dict` mapping weight names (e.g. weight_ih_l0) to them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Only a ``proj_size`` passed by keyword is detected and rejected here.
+        if "proj_size" in kwargs:
+            msg = "proj_size argument is only supported for LSTM, not RNN or GRU"
+            raise ValueError(msg)
+        super().__init__("GRU", *args, **kwargs)
+
+    def get_quantized_weight_bias_dict(self):
+        """Map each flat weight name to its quantized weight or (unquantized) bias.
+
+        Entries whose name starts with "weight" go through ``get_quantized_weight``;
+        all other entries are returned as stored, and names that are not attributes
+        of the module map to None, e.g.
+        {
+          "weight_ih_l0": quantized_weight,
+          "bias_ih_l0": unquantized_bias,
+          ...
+        }
+        """
+        quantized_weight_bias_dict = {}
+        for wn in self._flat_weights_names:
+            if wn.startswith("weight"):
+                # get_quantized_weight already returns None for a missing attribute.
+                quantized_weight_bias_dict[wn] = get_quantized_weight(self, wn)
+            else:
+                quantized_weight_bias_dict[wn] = getattr(self, wn, None)
+        return quantized_weight_bias_dict
+
+    def get_flat_weights(self):
+        """Return the flat weights in ``_flat_weights_names`` order, with every
+        "weight*" entry quantize-dequantized and missing attributes as None.
+        """
+        flat_weights = []
+        for wn in self._flat_weights_names:
+            if wn.startswith("weight"):
+                weight = _get_quantize_and_dequantized_weight(self, wn)
+            else:
+                weight = getattr(self, wn, None)
+            flat_weights.append(weight)
+        return flat_weights
+
+    def forward(self, input, hx=None):  # noqa: F811
+        """Run the GRU with quantize-dequantized flat weights.
+
+        ``input`` is a ``PackedSequence`` or a 2-D (unbatched) / 3-D (batched)
+        tensor and ``hx`` an optional hidden state; when omitted, zeros are used.
+
+        Returns:
+            ``(output, hidden)``; ``output`` is a ``PackedSequence`` for packed
+            input.
+
+        Raises:
+            RuntimeError: if the dimensionality of ``hx`` does not match ``input``.
+        """
+        # Note: this is copied from the forward of GRU in https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py
+        # only changed self._flat_weights to self.get_flat_weights()
+        # TODO: maybe we can try inheriting from that class and define get_flat_weights
+        # as a @property? this might interfere with TorchScript, if we remove that
+        # requirement in the future we should be able to do this
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            # A PackedSequence unpacks into (data, batch_sizes, sorted_indices, unsorted_indices).
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = int(batch_sizes[0])
+        else:
+            batch_sizes = None
+            assert input.dim() in (2, 3), (
+                f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
+            )
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)
+                if hx is not None:
+                    if hx.dim() != 2:
+                        raise RuntimeError(
+                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor"
+                        )
+                    # Insert the batch dimension (dim 1) that the unbatched state lacks.
+                    hx = hx.unsqueeze(1)
+            elif hx is not None and hx.dim() != 3:
+                raise RuntimeError(
+                    f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor"
+                )
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            hx = torch.zeros(
+                self.num_layers * num_directions,
+                max_batch_size,
+                self.hidden_size,
+                dtype=input.dtype,
+                device=input.device,
+            )
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+        flat_weights = self.get_flat_weights()
+        # The packed call takes batch_sizes and no batch_first argument.
+        if batch_sizes is None:
+            result = _VF.gru(
+                input, hx, flat_weights, self.bias, self.num_layers,
+                self.dropout, self.training, self.bidirectional, self.batch_first,
+            )
+        else:
+            result = _VF.gru(
+                input, batch_sizes, hx, flat_weights, self.bias, self.num_layers,
+                self.dropout, self.training, self.bidirectional,
+            )
+        # result[0] is the output and result[1] the hidden state.
+        output = result[0]
+        hidden = result[1]
+
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(
+                output,
+                # pyrefly: ignore [bad-argument-type]
+                batch_sizes,
+                sorted_indices,
+                unsorted_indices,
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                # Undo the batch dimension that was inserted for unbatched input.
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = hidden.squeeze(1)
+
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def _get_name(self):
+        return "QuantizedGRU(Reference)"
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        """Create a reference GRU from float ``mod``.
+
+        Every attribute named in ``mod._flat_weights_names`` is assigned to the new
+        module by reference (no copy).
+        """
+        ref_mod = cls(
+            mod.input_size, mod.hidden_size, mod.num_layers, mod.bias,
+            mod.batch_first, mod.dropout, mod.bidirectional,
+            weight_qparams_dict=weight_qparams_dict,
+        )
+        for wn in mod._flat_weights_names:
+            setattr(ref_mod, wn, getattr(mod, wn))
+        return ref_mod
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ff80997c1439c50a456df328b4068ae0c419a01
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py
@@ -0,0 +1,163 @@
+# mypy: allow-untyped-defs
+from typing import Any
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from .utils import ReferenceQuantizedModule
+
+
+__all__ = ["Embedding", "EmbeddingBag"]
+
+
+class Embedding(nn.Embedding, ReferenceQuantizedModule):
+    """A reference quantized Embedding module that fits into the
+    FX Graph Mode Quantization workflow, activation will be floating point Tensor,
+    we will store floating point weight as well in the module, but in forward we'll
+    quantize and dequantize the weight before running the floating point functional
+    embedding operator.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: int | None = None,
+        max_norm: float | None = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        _weight: Tensor | None = None,
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ) -> None:
+        """Initialise the float embedding, then attach the weight qparams.
+
+        ``weight_qparams`` is forwarded to ``_init_weight_qparams`` together
+        with ``device``; all other arguments go to ``nn.Embedding``.
+        """
+        # ``device`` and ``dtype`` must be passed by keyword: ``nn.Embedding``
+        # declares ``_freeze`` between ``_weight`` and ``device``, so passing
+        # them positionally would bind ``device`` to ``_freeze`` and ``dtype``
+        # to ``device``.
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            padding_idx,
+            max_norm,
+            norm_type,
+            scale_grad_by_freq,
+            sparse,
+            _weight,
+            device=device,
+            dtype=dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self):
+        """Return the fixed label identifying this reference quantized module."""
+        return "QuantizedEmbedding(Reference)"
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Look up ``input`` in the fake-quantized weight returned by ``get_weight``."""
+        weight_quant_dequant = self.get_weight()
+        return F.embedding(
+            input,
+            weight_quant_dequant,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams):
+        """Create a reference module from the float embedding ``mod``.
+
+        ``mod.weight`` is handed over as the initial weight, and its device and
+        dtype are reused for the new module.
+        """
+        return cls(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.padding_idx,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.sparse,
+            mod.weight,
+            mod.weight.device,
+            mod.weight.dtype,
+            weight_qparams,
+        )
+
+
+class EmbeddingBag(nn.EmbeddingBag, ReferenceQuantizedModule):
+    """A reference quantized EmbeddingBag module that fits into the
+    FX Graph Mode Quantization workflow, activation will be floating point Tensor,
+    we will store floating point weight as well in the module, but in forward we'll
+    quantize and dequantize the weight before running the floating point functional
+    embedding_bag operator.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        max_norm: float | None = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        mode: str = "mean",
+        sparse: bool = False,
+        _weight: Tensor | None = None,
+        include_last_offset: bool = False,
+        padding_idx: int | None = None,
+        device=None,
+        dtype=None,
+        weight_qparams: dict[str, Any] | None = None,
+    ) -> None:
+        """Initialise the float embedding bag, then attach the weight qparams.
+
+        ``weight_qparams`` is forwarded to ``_init_weight_qparams`` together
+        with ``device``; all other arguments go to ``nn.EmbeddingBag``.
+        """
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            max_norm,
+            norm_type,
+            scale_grad_by_freq,
+            mode,
+            sparse,
+            _weight,
+            include_last_offset,
+            padding_idx,
+            device,
+            dtype,
+        )
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self):
+        """Return the fixed label identifying this reference quantized module."""
+        # Distinct from the plain Embedding label so the two module kinds can
+        # be told apart wherever this name is printed.
+        return "QuantizedEmbeddingBag(Reference)"
+
+    def forward(
+        self,
+        input: Tensor,
+        offsets: Tensor | None = None,
+        per_sample_weights: Tensor | None = None,
+    ) -> Tensor:
+        """Run ``F.embedding_bag`` on the fake-quantized weight from ``get_weight``."""
+        weight_quant_dequant = self.get_weight()
+        return F.embedding_bag(
+            input,
+            weight_quant_dequant,
+            offsets,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.mode,
+            self.sparse,
+            per_sample_weights,
+            self.include_last_offset,
+            self.padding_idx,
+        )
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams, use_precomputed_fake_quant=False):
+        """Create a reference module from the float embedding bag ``mod``.
+
+        ``mod.weight`` is handed over as the initial weight, and its device and
+        dtype are reused. ``use_precomputed_fake_quant`` is accepted but not
+        read by this implementation.
+        """
+        return cls(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.mode,
+            mod.sparse,
+            mod.weight,
+            mod.include_last_offset,
+            mod.padding_idx,
+            mod.weight.device,
+            mod.weight.dtype,
+            weight_qparams,
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bdbcd4a6739e528e679c67b6a6614ea373801d3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/quantized/reference/modules/utils.py
@@ -0,0 +1,438 @@
+# mypy: allow-untyped-defs
+import typing
+
+import torch
+
+
+__all__ = [
+    "ReferenceQuantizedModule",
+]
+
+
+class ReferenceQuantizedModule(torch.nn.Module):
+    """Shared weight-quantization plumbing for reference quantized modules.
+
+    Stores the quantization parameters of a weight and offers fake-quantized
+    and quantized views of it. The class reads ``self.weight`` but never
+    defines it, so the concrete module combined with this class must provide it.
+    """
+
+    def _init_weight_qparams(self, weight_qparams, device):
+        """Record the weight quantization parameters on the module.
+
+        Reads the keys ``qscheme``, ``dtype``, ``scale``, ``zero_point`` (plus
+        ``axis`` for per-channel qschemes) and the optional ``is_decomposed``,
+        ``quant_min`` and ``quant_max`` from ``weight_qparams``. Scale, zero
+        point and axis are registered as buffers; newly built tensors are
+        placed on ``device``.
+        """
+        # No qparams supplied: fall back to per-tensor affine quint8 with an
+        # identity scale and a zero offset.
+        if weight_qparams is None:
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0,
+            }
+        # pyrefly: ignore [bad-assignment]
+        self.weight_qscheme: torch.qscheme = weight_qparams["qscheme"]
+        self.weight_dtype = weight_qparams["dtype"]
+        assert self.weight_qscheme in [
+            None,
+            torch.per_tensor_affine,
+            torch.per_channel_affine,
+            torch.per_channel_affine_float_qparams,
+        ], (
+            f"qscheme: {self.weight_qscheme} is not support in reference quantized {self._get_name()}"
+        )
+        if self.weight_dtype in [
+            torch.quint8,
+            torch.qint8,
+            torch.quint4x2,
+            torch.qint32,
+        ]:
+            # Keep the caller's zero-point dtype when it is already a tensor;
+            # plain Python values become ``torch.int`` tensors.
+            zero_point_dtype = (
+                weight_qparams["zero_point"].dtype
+                if isinstance(weight_qparams["zero_point"], torch.Tensor)
+                else torch.int
+            )
+            # Tensors are detached and cloned so the buffers do not alias the
+            # caller's objects.
+            w_scale = weight_qparams["scale"]
+            w_scale_tensor = (
+                w_scale.detach().clone()
+                if isinstance(w_scale, torch.Tensor)
+                else torch.tensor(w_scale, dtype=torch.float, device=device)
+            )
+            self.register_buffer("weight_scale", w_scale_tensor)
+            w_zp = weight_qparams["zero_point"]
+            w_zp_tensor = (
+                w_zp.detach().clone()
+                if isinstance(w_zp, torch.Tensor)
+                else torch.tensor(w_zp, dtype=zero_point_dtype, device=device)
+            )
+            self.register_buffer("weight_zero_point", w_zp_tensor)
+            if self.weight_qscheme in [
+                torch.per_channel_affine,
+                torch.per_channel_affine_float_qparams,
+            ]:
+                # Per-channel qschemes require an explicit "axis" entry.
+                w_axis = weight_qparams["axis"]
+                w_axis_tensor = (
+                    w_axis.detach().clone()
+                    if isinstance(w_axis, torch.Tensor)
+                    else torch.tensor(w_axis, dtype=torch.int, device=device)
+                )
+                self.register_buffer("weight_axis", w_axis_tensor)
+            else:
+                # added for TorchScriptability, not used
+                self.register_buffer(
+                    "weight_axis", torch.tensor(0, dtype=torch.int, device=device)
+                )
+        else:
+            # added for TorchScriptability, and for torch.float
+            self.register_buffer(
+                "weight_scale", torch.tensor(1.0, dtype=torch.float, device=device)
+            )
+            self.register_buffer(
+                "weight_zero_point", torch.tensor(0, dtype=torch.int, device=device)
+            )
+            self.register_buffer(
+                "weight_axis", torch.tensor(0, dtype=torch.int, device=device)
+            )
+        # pyrefly: ignore [bad-assignment]
+        self.is_decomposed: bool = weight_qparams.get("is_decomposed", False)
+        # store weight_axis as weight_axis_int due to some constraints of torchdynamo.export
+        # for capturing `.item` operations
+        self.weight_axis_int: int = self.weight_axis.item()  # type: ignore[operator, assignment]
+        # quant_min / quant_max are optional; they stay None when absent.
+        # pyrefly: ignore [bad-assignment]
+        self.weight_quant_min: int | None = weight_qparams.get("quant_min")
+        # pyrefly: ignore [bad-assignment]
+        self.weight_quant_max: int | None = weight_qparams.get("quant_max")
+
+    def get_weight(self):
+        """
+        Fake quantize (quantize and dequantize) the weight with
+        the quantization parameters for weight, this is used to
+        simulate the numerics for the quantized weight in a quantized
+        model
+        """
+        # suppress mypy warning
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        # ``is_decomposed`` selects the helper that additionally takes the
+        # quant_min / quant_max bounds.
+        if self.is_decomposed:
+            return _quantize_and_dequantize_weight_decomposed(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                # pyrefly: ignore [bad-argument-type]
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+                self.weight_quant_min,
+                self.weight_quant_max,
+            )
+        else:
+            return _quantize_and_dequantize_weight(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                # pyrefly: ignore [bad-argument-type]
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+            )
+
+    def get_quantized_weight(self):
+        """Return the weight quantized (not dequantized) with the stored qparams.
+
+        Uses the decomposed helper when ``is_decomposed`` is set, otherwise
+        ``_quantize_weight``.
+        """
+        # suppress mypy warning
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        # assert isinstance(self.weight_axis, torch.Tensor)
+        if self.is_decomposed:
+            return _quantize_weight_decomposed(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                # pyrefly: ignore [bad-argument-type]
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+                self.weight_quant_min,
+                self.weight_quant_max,
+            )
+        else:
+            return _quantize_weight(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                # pyrefly: ignore [bad-argument-type]
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+            )
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        """Save the regular module state, then add the weight qparam entries."""
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        # Adds qscheme and dtype (and, depending on the qscheme, scale,
+        # zero point and axis) under ``prefix``.
+        _save_weight_qparams(
+            destination,
+            prefix,
+            self.weight_qscheme,
+            self.weight_dtype,
+            self.weight_scale,
+            self.weight_zero_point,
+            self.weight_axis,
+        )
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """Restore the weight qparams, then load the remaining module state.
+
+        Each key reported by ``_get_weight_qparam_keys`` is assigned onto the
+        module and removed from ``state_dict`` before the base loader runs.
+        """
+        for key in _get_weight_qparam_keys(state_dict, prefix):
+            setattr(self, key, state_dict[prefix + key])
+            state_dict.pop(prefix + key)
+
+        # NOTE: the caller's ``strict`` is not forwarded; the base loader is
+        # always invoked with strict=False.
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+def _quantize_weight_decomposed(
+    weight: torch.Tensor,
+    weight_qscheme: torch.qscheme,
+    weight_dtype: torch.dtype,
+    weight_scale: torch.Tensor,
+    weight_zero_point: torch.Tensor,
+    weight_axis: int,
+    weight_quant_min: int | None,
+    weight_quant_max: int | None,
+) -> torch.Tensor:
+    """Quantize ``weight`` with the ``quantized_decomposed`` ops.
+
+    Per-tensor affine uses ``quantize_per_tensor``; the two per-channel
+    qschemes use ``quantize_per_channel`` along ``weight_axis``. If either
+    bound is ``None``, both are replaced by the full range of the underlying
+    integer dtype.
+
+    Raises:
+        ValueError: for any other qscheme, or a dtype other than
+            quint8 / qint8 / qint32.
+    """
+    # TODO: add an util function for converting qdtype to dtype
+    int_repr_dtype_of = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+    }
+    qvalue_bounds: dict[torch.dtype, tuple[int, int]] = {
+        torch.uint8: (0, 255),
+        torch.int8: (-128, 127),
+        torch.int32: (-2147483648, 2147483647),  # torch.jit interprets 2**31 as a float
+    }
+
+    per_tensor = weight_qscheme == torch.per_tensor_affine
+    per_channel = weight_qscheme in [
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]
+    # TODO: torch.quint4x2 is not supported
+    dtype_supported = weight_dtype in [torch.quint8, torch.qint8, torch.qint32]
+    if not dtype_supported or not (per_tensor or per_channel):
+        raise ValueError(
+            f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}"
+        )
+
+    target_dtype = int_repr_dtype_of[weight_dtype]
+    if weight_quant_min is None or weight_quant_max is None:
+        weight_quant_min, weight_quant_max = qvalue_bounds[target_dtype]
+
+    if per_tensor:
+        return torch.ops.quantized_decomposed.quantize_per_tensor(
+            weight,
+            weight_scale,
+            weight_zero_point,
+            weight_quant_min,
+            weight_quant_max,
+            target_dtype,
+        )
+    return torch.ops.quantized_decomposed.quantize_per_channel(
+        weight,
+        weight_scale,
+        weight_zero_point,
+        weight_axis,
+        weight_quant_min,
+        weight_quant_max,
+        target_dtype,
+    )  # type: ignore[arg-type]
+
+
+def _dequantize_weight_decomposed(
+    weight: torch.Tensor,
+    weight_qscheme: torch.qscheme,
+    weight_dtype: torch.dtype,
+    weight_scale: torch.Tensor,
+    weight_zero_point: torch.Tensor,
+    weight_axis: int,
+    weight_quant_min: int | None,
+    weight_quant_max: int | None,
+) -> torch.Tensor:
+    """Dequantize ``weight`` with the ``quantized_decomposed`` ops.
+
+    Per-tensor affine uses ``dequantize_per_tensor``; the two per-channel
+    qschemes use ``dequantize_per_channel`` along ``weight_axis``. If either
+    bound is ``None``, both are replaced by the full range of the underlying
+    integer dtype.
+
+    Raises:
+        ValueError: for any other qscheme, or a dtype other than
+            quint8 / qint8 / qint32.
+    """
+    # TODO: get the quant_min and quant_max from activation_post_process
+    _DTYPE_TO_QVALUE_BOUNDS: dict[torch.dtype, tuple[int, int]] = {
+        torch.uint8: (0, 255),
+        torch.int8: (-128, 127),
+        torch.int32: (-2147483648, 2147483647),  # torch.jit interprets 2**31 as a float
+    }
+    # TODO: add an util function for converting qdtype to dtype
+    _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+    }
+    # Validate the dtype before indexing the table, so an unsupported dtype
+    # (e.g. torch.quint4x2, TODO: not supported) yields the same ValueError as
+    # an unsupported qscheme rather than a bare KeyError.
+    if weight_dtype not in [torch.quint8, torch.qint8, torch.qint32]:
+        raise ValueError(
+            f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}"
+        )
+    weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype]
+    if weight_quant_min is None or weight_quant_max is None:
+        weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[weight_dtype_]
+    if weight_qscheme == torch.per_tensor_affine:
+        weight = torch.ops.quantized_decomposed.dequantize_per_tensor(
+            weight,
+            weight_scale,
+            weight_zero_point,
+            weight_quant_min,
+            weight_quant_max,
+            weight_dtype_,
+        )
+        return weight
+    elif weight_qscheme in [
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]:
+        weight = torch.ops.quantized_decomposed.dequantize_per_channel(
+            weight,
+            weight_scale,
+            weight_zero_point,
+            weight_axis,
+            weight_quant_min,
+            weight_quant_max,
+            weight_dtype_,
+        )  # type: ignore[arg-type]
+        return weight
+    raise ValueError(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}")
+
+
+def _quantize_weight(
+    weight: torch.Tensor,
+    weight_qscheme: torch.qscheme,
+    weight_dtype: torch.dtype,
+    weight_scale: torch.Tensor,
+    weight_zero_point: torch.Tensor,
+    weight_axis_int: int,
+) -> torch.Tensor:
+    """Quantize ``weight`` with the native quantized tensor ops.
+
+    ``torch.float16`` is handled first as a plain cast. Otherwise per-tensor
+    affine goes through ``torch.quantize_per_tensor`` and the per-channel
+    qschemes through ``torch.quantize_per_channel`` along ``weight_axis_int``.
+
+    Raises:
+        ValueError: if the qscheme / dtype combination is not handled.
+    """
+    if weight_dtype == torch.float16:
+        return weight.to(weight_dtype)
+
+    per_tensor_dtypes = [torch.quint8, torch.qint8, torch.qint32]
+    per_channel_dtypes = [torch.quint8, torch.qint8, torch.quint4x2, torch.qint32]
+    per_channel_qschemes = [
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]
+
+    if weight_qscheme == torch.per_tensor_affine:
+        if weight_dtype in per_tensor_dtypes:
+            return torch.quantize_per_tensor(
+                weight, weight_scale, weight_zero_point, weight_dtype
+            )
+    elif weight_qscheme in per_channel_qschemes:
+        if weight_dtype in per_channel_dtypes:
+            return torch.quantize_per_channel(
+                weight, weight_scale, weight_zero_point, weight_axis_int, weight_dtype
+            )  # type: ignore[arg-type]
+
+    raise ValueError(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}")
+
+
+def _quantize_and_dequantize_weight_decomposed(
+    weight: torch.Tensor,
+    weight_qscheme: torch.qscheme,
+    weight_dtype: torch.dtype,
+    weight_scale: torch.Tensor,
+    weight_zero_point: torch.Tensor,
+    weight_axis_int: int,
+    weight_quant_min: int | None,
+    weight_quant_max: int | None,
+) -> torch.Tensor:
+    """Round-trip ``weight`` through the decomposed quantize / dequantize helpers.
+
+    For the three affine qschemes the weight is quantized and immediately
+    dequantized with the same parameters. Any other qscheme returns
+    ``weight`` itself, untouched.
+    """
+    affine_qschemes = [
+        torch.per_tensor_affine,
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]
+    if weight_qscheme not in affine_qschemes:
+        return weight
+
+    quantized = _quantize_weight_decomposed(
+        weight,
+        weight_qscheme,
+        weight_dtype,
+        weight_scale,
+        weight_zero_point,
+        weight_axis_int,
+        weight_quant_min,
+        weight_quant_max,
+    )
+    return _dequantize_weight_decomposed(
+        quantized,
+        weight_qscheme,
+        weight_dtype,
+        weight_scale,
+        weight_zero_point,
+        weight_axis_int,
+        weight_quant_min,
+        weight_quant_max,
+    )
+
+
+def _quantize_and_dequantize_weight(
+    weight: torch.Tensor,
+    weight_qscheme: torch.qscheme,
+    weight_dtype: torch.dtype,
+    weight_scale: torch.Tensor,
+    weight_zero_point: torch.Tensor,
+    weight_axis_int: int,
+) -> torch.Tensor:
+    """Round-trip ``weight`` through ``_quantize_weight`` and ``dequantize()``.
+
+    For the three affine qschemes the weight is quantized and the result's
+    ``dequantize()`` is returned. Any other qscheme returns ``weight`` itself,
+    untouched.
+    """
+    affine_qschemes = [
+        torch.per_tensor_affine,
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]
+    if weight_qscheme not in affine_qschemes:
+        return weight
+
+    quantized = _quantize_weight(
+        weight,
+        weight_qscheme,
+        weight_dtype,
+        weight_scale,
+        weight_zero_point,
+        weight_axis_int,
+    )
+    return quantized.dequantize()
+
+
+def _save_weight_qparams(
+    destination,
+    prefix,
+    weight_qscheme,
+    weight_dtype,
+    weight_scale,
+    weight_zero_point,
+    weight_axis,
+):
+    """Write the weight quantization parameters into ``destination``.
+
+    ``weight_qscheme`` and ``weight_dtype`` are always stored. Scale and zero
+    point are stored only when a qscheme is set, and the axis only for
+    ``torch.per_channel_affine``. Every key is ``prefix`` plus the field name.
+    """
+    destination[prefix + "weight_qscheme"] = weight_qscheme
+    destination[prefix + "weight_dtype"] = weight_dtype
+    if weight_qscheme is None:
+        return
+
+    destination[prefix + "weight_scale"] = weight_scale
+    destination[prefix + "weight_zero_point"] = weight_zero_point
+    if weight_qscheme == torch.per_channel_affine:
+        destination[prefix + "weight_axis"] = weight_axis
+
+
+def _get_weight_qparam_keys(state_dict: dict[str, typing.Any], prefix: str):
+    """List the weight qparam field names stored in ``state_dict`` under ``prefix``.
+
+    Mirrors ``_save_weight_qparams``: qscheme and dtype are always listed,
+    scale and zero point only when the stored qscheme is not ``None``, and
+    the axis only for ``torch.per_channel_affine``. The returned names do not
+    include ``prefix``.
+
+    Raises:
+        KeyError: if ``prefix + "weight_qscheme"`` is missing from ``state_dict``.
+    """
+    keys = ["weight_qscheme", "weight_dtype"]
+    weight_qscheme = state_dict[prefix + "weight_qscheme"]
+    if weight_qscheme is not None:
+        keys.append("weight_scale")
+        keys.append("weight_zero_point")
+        # Compare against the qscheme constant. The function
+        # ``torch.quantize_per_channel`` never equals a qscheme, so comparing
+        # against it meant the axis key was never returned.
+        if weight_qscheme == torch.per_channel_affine:
+            keys.append("weight_axis")
+    return keys
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fda5a58f2984ee05b0d167297b458f62c37fc59
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__init__.py
@@ -0,0 +1 @@
+from . import quantized
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3759036fb1abc7dfca136083371b20d04cfb1613
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef66c90b0e8ecdbc7cd2cfb4c1cecf0bc38e8466
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__init__.py
@@ -0,0 +1,10 @@
+from torch.ao.nn.sparse.quantized import dynamic
+
+from .linear import Linear, LinearPackedParams
+
+
+__all__ = [
+    "dynamic",
+    "Linear",
+    "LinearPackedParams",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b04421eb3ae125677272c6afa6aab5c5b7f0b1dc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..466a2ee60241cb8404c6f75bc8b07f6353f0d8a1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96949caf4a30afd66e6d05c1e5737327d362bd9e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ecfd8793dc08b96ed64f47f531724aa8a866d0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
@@ -0,0 +1,6 @@
+from .linear import Linear
+
+
+__all__ = [
+    "Linear",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6875d89edeb8aac39104411720c8b6d8d08c3748
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c2a7f384930a32cf41c49659a5cb4d9f0ca1b26
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..d327cabd0d3681cce4ec4b7d62f0f9e734ad0730
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py
@@ -0,0 +1,191 @@
+# mypy: allow-untyped-defs
+
+import torch
+import torch.ao.nn.intrinsic as nni
+from torch.ao.nn.quantized.modules.utils import (
+    _hide_packed_params_repr,
+    _quantize_weight,
+)
+from torch.ao.nn.sparse.quantized import linear
+from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern
+
+
+__all__ = ["Linear"]
+
+
+class Linear(torch.nn.Module):
+    r"""
+    A dynamically quantized sparse linear module with float tensor as inputs and outputs.
+    """
+
+    _version = 1
+    # Tag written into the state dict on save and checked again on load.
+    _op_type = "sparse_dynamic"
+    _FLOAT_MODULE = torch.nn.Linear
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        row_block_size,
+        col_block_size,
+        bias=True,
+        dtype=torch.qint8,
+    ):
+        """Create the module with an uninitialised qint8 weight.
+
+        Raises:
+            NotImplementedError: if ``dtype`` is not ``torch.qint8``.
+        """
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError(
+                "Only QINT8 is supported for Sparse Quantized Linear Dynamic"
+            )
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        # The boolean flag is replaced by the actual bias tensor (or None).
+        if bias:
+            bias = torch.zeros(self.out_features, dtype=torch.float)
+        else:
+            bias = None
+
+        qweight = torch._empty_affine_quantized(
+            [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8
+        )
+        self._packed_params = linear.LinearPackedParams(
+            row_block_size=row_block_size, col_block_size=col_block_size, dtype=dtype
+        )
+        self._packed_params.set_weight_bias(
+            qweight, bias, row_block_size, col_block_size
+        )
+
+    def _get_name(self):
+        """Return the fixed label identifying this module."""
+        return "SparseQuantizedDynamicLinear"
+
+    def extra_repr(self):
+        """Describe the feature sizes and the weight's qscheme."""
+        return f"in_features={self.in_features}, out_features={self.out_features}, qscheme={self.weight().qscheme()}"
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, linear.LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply ``torch.ops.sparse.qlinear_dynamic`` to ``x`` with the packed parameters."""
+        return torch.ops.sparse.qlinear_dynamic(x, self._packed_params._packed_params)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        """Save the regular module state plus the ``op_type`` tag."""
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "op_type"] = self._op_type
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """Validate and strip the ``op_type`` tag, then load the module state.
+
+        The caller's ``strict`` is not forwarded; the base loader is always
+        invoked with strict=False.
+        """
+        # ``_save_to_state_dict`` stores ``_op_type`` verbatim (a string), so
+        # compare it as a string against this class's own tag.
+        op_type = state_dict[prefix + "op_type"]
+        assert op_type == self._op_type, (
+            f"Cannot load from op_type [{op_type}], expecting [{self._op_type}]"
+        )
+        state_dict.pop(prefix + "op_type")
+
+        # Without version metadata there is nothing to compare.
+        version = local_metadata.get("version", None)
+        assert version is None or version <= self._version
+
+        # Is this code valid? In old quantization it seemed to be used to load
+        # older model
+        # Only remap the flat ``weight`` / ``bias`` keys when they are present;
+        # ``_save_to_state_dict`` above does not write them.
+        if prefix + "weight" in state_dict:
+            weight = state_dict.pop(prefix + "weight")
+            bias = state_dict.pop(prefix + "bias")
+            state_dict.update(
+                {
+                    prefix + "_packed_params.weight": weight,
+                    prefix + "_packed_params.bias": bias,
+                }
+            )
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def _weight_bias(self):
+        """Return whatever the packed-params module reports for weight and bias."""
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        """Return the first element of ``_weight_bias()``."""
+        return self._weight_bias()[0]
+
+    def bias(self):
+        """Return the second element of ``_weight_bias()``."""
+        return self._weight_bias()[1]
+
+    def set_weight_bias(
+        self,
+        w: torch.Tensor,
+        b: torch.Tensor | None,
+        row_block_size: int | None,
+        col_block_size: int | None,
+    ) -> None:
+        """Repack ``w`` and ``b`` and update the feature sizes from ``w.shape``."""
+        assert row_block_size is not None and col_block_size is not None
+        self.out_features = w.shape[0]
+        self.in_features = w.shape[1]
+        self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a quantized sparse dynamic module from a float module.
+
+        We only care about the convert at this stage, no need for observers just yet.
+        """
+        assert type(mod) is cls._FLOAT_MODULE, (
+            " nnq."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        # TODO: Need to add options to qconfig to avoid the calibration.
+        # TODO: Add calibration for the sparsity
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        # NOTE(review): the type assertion above already requires the exact
+        # ``_FLOAT_MODULE`` type, so this branch looks unreachable - confirm.
+        if type(mod) is nni.LinearReLU:
+            mod = mod[0]
+        # pyrefly: ignore [missing-attribute]
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            # pyrefly: ignore [not-callable]
+            weight_observer = mod.qconfig.weight()
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+
+            weight_observer = default_dynamic_qconfig.weight()
+
+        # It is important to multiply by the mask BEFORE calling the `weight_observer`
+        # TODO (zaf): Mask might not be part of the qconfig (T83295194)
+        weight = mod.weight
+        if getattr(mod.qconfig, "mask", False):
+            weight = mod.qconfig.mask * mod.weight
+
+        weight_observer(weight)
+        dtype = weight_observer.dtype
+        assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8"
+        _w_sc, w_zp = weight_observer.calculate_qparams()
+        if isinstance(w_zp, torch.Tensor):
+            assert not torch.any(w_zp.bool()), "All weight zero points must map to 0"
+        else:
+            assert w_zp == 0, "Weight zero point must map to 0"
+        qweight = _quantize_weight(weight.float(), weight_observer)
+
+        row_block_size, col_block_size = LinearBlockSparsePattern.block_size()
+        qlinear = cls(
+            mod.in_features,
+            mod.out_features,
+            row_block_size,
+            col_block_size,
+            dtype=dtype,
+        )
+        # pyrefly: ignore [bad-argument-type]
+        qlinear.set_weight_bias(qweight, mod.bias, row_block_size, col_block_size)
+        return qlinear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..f106a32abfbf960b989c8eba860db2dec4a7fe4c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/linear.py
@@ -0,0 +1,274 @@
+# mypy: allow-untyped-defs
+
+import torch
+from torch.ao.nn.quantized.modules.utils import (
+    _hide_packed_params_repr,
+    _quantize_weight,
+)
+
+
+__all__ = ["LinearPackedParams", "Linear"]
+
+
+# TODO (zaf): Inherit from `quantized.LinearPackedParams` (T83294430)
+class LinearPackedParams(torch.nn.Module):
+    """Container for the prepacked block-sparse qint8 weight and bias of a linear layer."""
+
+    _version = 1
+
+    def __init__(self, row_block_size=1, col_block_size=4, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Linear prepacking only supports QINT8")
+        self.dtype = dtype
+        # Pack a 1x1 placeholder weight so that `_packed_params` is set right away.
+        wq = torch._empty_affine_quantized(
+            [1, 1], scale=1.0, zero_point=0, dtype=torch.qint8
+        )
+        self.set_weight_bias(wq, None, row_block_size, col_block_size)
+
+    def _get_name(self):
+        return "SparseQuantizedLinearPackedParams"
+
+    @torch.jit.export
+    def set_weight_bias(
+        self,
+        weight: torch.Tensor,
+        bias: torch.Tensor | None,
+        row_block_size: int | None,
+        col_block_size: int | None,
+    ) -> None:
+        """Prepack `weight` and `bias` with `torch.ops.sparse.qlinear_prepack`."""
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params = torch.ops.sparse.qlinear_prepack(
+            weight, bias, row_block_size, col_block_size
+        )
+
+    @torch.jit.export
+    def _weight_bias(self):
+        """Unpack and return `(weight, bias, row_block_size, col_block_size)`."""
+        (weight, bias, block_sizes) = torch.ops.sparse.qlinear_unpack(
+            self._packed_params
+        )
+        return (weight, bias, block_sizes[0], block_sizes[1])
+
+    def forward(self, x):
+        """Return `x` unchanged."""
+        return x
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "dtype"] = self.dtype
+        destination[prefix + "_packed_params"] = self._weight_bias()
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict,
+        missing_keys, unexpected_keys, error_msgs,
+    ):
+        """Restore `dtype` and the packed params from the keys `_save_to_state_dict` writes."""
+        # `local_metadata` may carry no "version" entry; comparing None with an int
+        # raises TypeError, so the version is only checked when it is present.
+        version = local_metadata.get("version", None)
+        assert version is None or version <= self._version
+
+        self.dtype = state_dict.pop(prefix + "dtype")
+        weight, bias, row_block_size, col_block_size = state_dict.pop(
+            prefix + "_packed_params"
+        )
+        self.set_weight_bias(weight, bias, row_block_size, col_block_size)
+
+        # The base implementation is always run with strict=False.
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs,
+        )
+
+    @torch.jit.export
+    def __getstate__(self):
+        return self._packed_params, self.training, self.dtype
+
+    @torch.jit.export
+    def __setstate__(self, state):
+        (self._packed_params, self.training, self.dtype) = state
+
+    def __repr__(self):
+        """Show the unpacked weight/bias tuple instead of the default module repr."""
+        return repr(self._weight_bias())
+
+
+# TODO (zaf): Inherit from `quantized.Linear` (T83294430)
+class Linear(torch.nn.Module):
+    r"""
+    A quantized sparse linear module with quantized tensor as inputs and outputs.
+
+    Weight and bias are held by a `LinearPackedParams` submodule. `scale` and
+    `zero_point` (taken from the activation observer in `from_float`) are passed
+    to `torch.ops.sparse.qlinear` on every `forward` call.
+    """
+
+    _version = 1
+    _FLOAT_MODULE = torch.nn.Linear
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        row_block_size,
+        col_block_size,
+        bias=True,
+        dtype=torch.qint8,
+    ):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError(
+                "Only QINT8 is supported for Sparse Quantized Linear"
+            )
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        # `bias` becomes a zero float tensor (or None); the weight is a placeholder.
+        bias = torch.zeros(self.out_features, dtype=torch.float) if bias else None
+        qweight = torch._empty_affine_quantized(
+            [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8
+        )
+        self._packed_params = LinearPackedParams(
+            row_block_size=row_block_size, col_block_size=col_block_size, dtype=dtype
+        )
+        self._packed_params.set_weight_bias(
+            qweight, bias, row_block_size, col_block_size
+        )
+        self.scale = 1.0
+        self.zero_point = 0
+
+    @classmethod
+    def _get_name(cls):
+        return "SparseQuantizedLinear"
+
+    def extra_repr(self):
+        return (
+            f"in_features={self.in_features}, out_features={self.out_features}, scale={self.scale}, "
+            f"zero_point={self.zero_point}, qscheme={self.weight().qscheme()}"
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.sparse.qlinear(
+            x, self._packed_params._packed_params, self.scale, self.zero_point
+        )
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "scale"] = torch.tensor(self.scale)
+        destination[prefix + "zero_point"] = torch.tensor(self.zero_point)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """Restore `scale` and `zero_point`, then defer to the base loader.
+
+        Raises:
+            KeyError: if the "scale" or "zero_point" entry is missing.
+        """
+        self.scale = float(state_dict.pop(prefix + "scale"))
+        self.zero_point = int(state_dict.pop(prefix + "zero_point"))
+
+        # `_save_to_state_dict` above never writes "op_type", so the key is dropped
+        # only when present instead of raising KeyError on a save/load round-trip.
+        state_dict.pop(prefix + "op_type", None)
+
+        # A missing "version" yields None, which cannot be compared with an int.
+        version = local_metadata.get("version", None)
+        assert version is None or version <= self._version
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            False,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(
+        self,
+        w: torch.Tensor,
+        b: torch.Tensor | None,
+        row_block_size: int | None,
+        col_block_size: int | None,
+    ) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size)
+
+    @classmethod
+    def from_float(cls, mod, use_precomputed_fake_quant=False):
+        r"""Create a quantized sparse module from a float module.
+
+        We only care about the convert at this stage, no need for observers just yet.
+
+        Args:
+            mod: float `torch.nn.Linear` with `sparse_params`, `qconfig` and `activation_post_process`.
+            use_precomputed_fake_quant: accepted but not used by this method.
+
+        TODO(zaf): Need to add the sparse params to the qconfig
+        """
+        assert type(mod) is cls._FLOAT_MODULE, (
+            cls._get_name() + ".from_float only works for " + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "sparse_params"), (
+            "Expecting the Linear to have `sparse_params`. Make sure you have provided arguments "
+            'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.'
+        )
+        sparse_block_shape = mod.sparse_params.get("sparse_block_shape", None)  # type: ignore[operator, union-attr]
+        assert isinstance(sparse_block_shape, (tuple, list))
+        assert len(sparse_block_shape) == 2
+        row_block_size, col_block_size = sparse_block_shape[0], sparse_block_shape[1]
+        # TODO: Need to add options to qconfig to avoid the calibration.
+        # TODO: Add calibration for the sparsity
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        activation_post_process = mod.activation_post_process
+        weight_post_process = mod.qconfig.weight()  # type: ignore[operator, union-attr]
+
+        # Assumption is that the weight is already sparsified by the
+        # `sparsifier.convert`
+        weight = mod.weight
+
+        weight_post_process(weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[operator, union-attr]
+        assert dtype == torch.qint8, "Weight observer must have dtype torch.qint8"
+        _, w_zp = weight_post_process.calculate_qparams()
+        if isinstance(w_zp, torch.Tensor):
+            assert not torch.any(w_zp.bool()), "All weight zero points must map to 0"
+        else:
+            assert w_zp == 0, "Weight zero point must map to 0"
+        qweight = _quantize_weight(weight.float(), weight_post_process)
+
+        qlinear = cls(
+            mod.in_features, mod.out_features, row_block_size, col_block_size, dtype=dtype
+        )
+        qlinear.set_weight_bias(qweight, mod.bias, row_block_size, col_block_size)
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cfd4a5973dfa8a5219f5ca97246424ae17a6308
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/nn/sparse/quantized/utils.py
@@ -0,0 +1,62 @@
+import threading
+
+
+__all__ = ["LinearBlockSparsePattern"]
+
+
+def _is_valid_linear_block_sparse_pattern(
+    row_block_size: int, col_block_size: int
+) -> bool:
+    """Return True for the two supported block shapes, 1x4 and 8x1."""
+    supported_shapes = ((1, 4), (8, 1))
+    return (row_block_size, col_block_size) in supported_shapes
+
+
+# This is a stop-gap measure as current flow does not allow module
+# specific block sparse pattern.
+# In fact there is no way to convey sparse pattern via module config
+# of quantization flow. Thus using the global context to convey
+# sparsity pattern.
+# Once the flow supports it, this should be removed.
+class LinearBlockSparsePattern:
+    """Class-level (global) block-sparse pattern that is switched with a `with` block.
+
+    Constructing an instance acquires the class-level lock and installs the new
+    block size; `__exit__` restores the previously active size and releases it.
+    """
+
+    rlock = threading.RLock()
+    row_block_size: int = 1
+    col_block_size: int = 4
+    prev_row_block_size: int = 1
+    prev_col_block_size: int = 4
+
+    def __init__(self, row_block_size: int = 1, col_block_size: int = 4):
+        assert _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size)
+        shared = LinearBlockSparsePattern
+        # NOTE: the lock taken here is only released by `__exit__`, so instances
+        # must be used as context managers.
+        shared.rlock.acquire()
+        # Keep the outgoing size on the instance: the class-level `prev_*` slots are
+        # overwritten by a nested pattern, which made the outer exit restore wrongly.
+        self._saved_block_size = (shared.row_block_size, shared.col_block_size)
+        shared.prev_row_block_size, shared.prev_col_block_size = self._saved_block_size
+        shared.row_block_size = row_block_size
+        shared.col_block_size = col_block_size
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        backtrace: object | None,
+    ) -> None:
+        shared = LinearBlockSparsePattern
+        shared.row_block_size, shared.col_block_size = self._saved_block_size
+        shared.rlock.release()
+
+    @staticmethod
+    def block_size() -> tuple[int, int]:
+        return (LinearBlockSparsePattern.row_block_size, LinearBlockSparsePattern.col_block_size)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cfcd51af91e25c13076c8930312ba9c9df4e7bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec6195b14e08e26e450cf6f3c332e41f0dc89354
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f3d2d766dd1db6ffcaf85704ce7ff7d17a0bb4e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py
new file mode 100644
index 0000000000000000000000000000000000000000..026ac73606e307bedd500a801a76ba1a97c4c655
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite.py
@@ -0,0 +1,568 @@
+# mypy: allow-untyped-defs
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.nn as nn
+from torch.ao.quantization import prepare
+from torch.ao.quantization.quantization_mappings import (
+    get_default_compare_output_module_list,
+)
+
+
+NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST = {  # given to `prepare` as `observer_non_leaf_module_list`
+    nnqd.Linear,
+    nnq.Linear,
+    nnqd.LSTM,
+    nn.LSTM,
+}
+
+
+def _find_match(
+    str_list: dict[str, Any] | list[str],
+    key_str: str,
+    postfix: str,
+) -> str | None:
+    """Return the first entry of `str_list` that names the same module as `key_str`.
+
+    `key_str` must end in `postfix`. Its module path (all components but the last)
+    is compared with every candidate stripped of its last one or last two
+    components. For the "_packed_params" postfix a second pass strips two
+    components from `key_str` as well, provided something is left to compare.
+    Returns None when nothing matches.
+    """
+    key_parts = key_str.split(".")
+    if key_parts[-1] != postfix:
+        return None
+
+    def first_candidate(module_path: list[str]) -> str | None:
+        for candidate in str_list:
+            parts = candidate.split(".")
+            # Compare component lists rather than concatenated strings, so that
+            # distinct paths such as "a.bc" and "ab.c" are never confused.
+            if module_path == parts[:-1] or module_path == parts[:-2]:
+                return candidate
+        return None
+
+    found = first_candidate(key_parts[:-1])
+    # For matching "fc.weight" and "fc._packed_params._packed_params"
+    if found is None and postfix == "_packed_params" and any(key_parts[:-2]):
+        found = first_candidate(key_parts[:-2])
+    return found
+
+
+def compare_weights(
+    float_dict: dict[str, Any], quantized_dict: dict[str, Any]
+) -> dict[str, dict[str, torch.Tensor]]:
+    r"""Compare the weights of the float module with its corresponding quantized
+    module. Return a dict with key corresponding to module names and each entry being
+    a dictionary with two keys 'float' and 'quantized', containing the float and
+    quantized weights. This dict can be used to compare and compute the quantization
+    error of the weights of float and quantized models.
+
+    Example usage::
+
+        wt_compare_dict = compare_weights(float_model.state_dict(), qmodel.state_dict())
+        for key in wt_compare_dict:
+            print(
+                key,
+                compute_error(
+                    wt_compare_dict[key]["float"],
+                    wt_compare_dict[key]["quantized"].dequantize(),
+                ),
+            )
+
+    Args:
+        float_dict: state dict of the float model
+        quantized_dict: state dict of the quantized model
+
+    Return:
+        weight_dict: dict with key corresponding to module names and each entry being
+        a dictionary with two keys 'float' and 'quantized', containing the float and
+        quantized weights
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_weights")
+    weight_dict: dict[str, dict] = {}
+    for key, quantized_value in quantized_dict.items():
+        match_key = _find_match(float_dict, key, "weight")
+        if match_key is not None:
+            weight_dict[key] = {"float": float_dict[match_key], "quantized": quantized_value}
+            continue
+
+        # For matching "fc.weight" and "fc._packed_params._packed_params"
+        match_key = _find_match(float_dict, key, "_packed_params")
+        if match_key is not None:
+            weight_dict[key] = {"float": float_dict[match_key], "quantized": quantized_value[0]}
+
+        # For LSTM: the key must end in "_all_weight_values.LAYER.param".
+        # The length check keeps shorter keys ending in "param" from raising IndexError.
+        split_str = key.split(".")
+        if len(split_str) < 3 or split_str[-1] != "param":
+            continue
+        if split_str[-3] != "_all_weight_values":
+            continue
+        layer = split_str[-2]
+        module_name = ".".join(split_str[:-3])
+        float_weight_ih_key = module_name + ".weight_ih_l" + layer
+        float_weight_hh_key = module_name + ".weight_hh_l" + layer
+        if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict:
+            # NOTE(review): the weight_ih pair is stored and then immediately
+            # overwritten by the weight_hh pair (unchanged from the original code),
+            # so only weight_hh is reported for this key.
+            cell_params = quantized_value.__getstate__()[0][4]
+            weight_dict[key] = {}
+            weight_dict[key]["float"] = float_dict[float_weight_ih_key]
+            weight_dict[key]["quantized"] = cell_params[0].__getstate__()[0][0]
+            weight_dict[key]["float"] = float_dict[float_weight_hh_key]
+            weight_dict[key]["quantized"] = cell_params[1].__getstate__()[0][0]
+
+    return weight_dict
+
+
+def _get_logger_dict_helper(
+    mod: nn.Module,
+    target_dict: dict[str, Any],
+    prefix: str = "",
+) -> None:
+    r"""Recursively collect the stats of the Logger modules found under ``mod``.
+
+    Args:
+        mod: module whose children are searched for Logger instances
+        target_dict: output dict; gets the key ``prefix + ".stats"`` ("stats" for the root)
+        prefix: dotted name of ``mod`` ("" for the root module)
+    """
+    dotted_prefix = prefix + "." if prefix != "" else prefix
+
+    # Only the first direct child that is a Logger contributes an entry.
+    first_logger = next(
+        (child for child in mod.children() if isinstance(child, Logger)), None
+    )
+    if first_logger is not None:
+        target_dict[dotted_prefix + "stats"] = first_logger.stats
+
+    for child_name, submodule in mod.named_children():
+        child_prefix = dotted_prefix + child_name if prefix else child_name
+        _get_logger_dict_helper(submodule, target_dict, child_prefix)
+
+
+def get_logger_dict(mod: nn.Module, prefix: str = "") -> dict[str, dict]:
+    r"""Walk ``mod`` and gather the stats of every attached logger into one dict.
+    Mainly intended for debugging quantization accuracy.
+
+    Type of loggers supported:
+        ShadowLogger: used to log the outputs of the quantized module and its matching float shadow module,
+        OutputLogger: used to log the outputs of the modules
+
+    Args:
+        mod: module we want to save all logger stats
+        prefix: prefix for the current module
+
+    Return:
+        dict with one entry per module that has a Logger child, keyed by the module path plus ".stats"
+
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.get_logger_dict")
+
+    logger_stats: dict[str, dict] = {}
+    _get_logger_dict_helper(mod, logger_stats, prefix)
+    return logger_stats
+
+
+class Logger(nn.Module):
+    r"""Base class for stats logging; its ``forward`` does nothing and returns None."""
+
+    def __init__(self):
+        super().__init__()
+        self.stats = {}  # starts empty; subclasses in this file add their own list entries
+        # We only insert observer if the op is quantized with static quantization,
+        # which is identified by activation_observer.dtype == quint8.  This is needed
+        # when attaching Logger as observer for FX mode
+        self.dtype = torch.quint8
+
+    def forward(self, x):
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+
+
+class ShadowLogger(Logger):
+    r"""Class used in Shadow module to record the outputs of the original and
+    shadow modules: ``forward(x, y)`` appends x to "quantized" and y to "float".
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.stats["float"] = []
+        self.stats["quantized"] = []
+
+    def forward(self, x, y):  # type: ignore[override]
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        if len(x) > 1:  # NOTE(review): also true for a tensor whose first dim is > 1 - confirm intent
+            x = x[0]
+        if len(y) > 1:  # NOTE(review): same caveat as for `x`
+            y = y[0]
+        self.stats["quantized"].append(x.detach())
+        self.stats["float"].append(y.detach())
+
+
+class OutputLogger(Logger):
+    r"""Logger that appends every input to ``stats["tensor_val"]`` and returns it unchanged."""
+
+    def __init__(self):
+        super().__init__()
+        self.stats["tensor_val"] = []
+
+    def forward(self, x):
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        self.stats["tensor_val"].append(x)  # stored as-is, without `detach()`
+        return x
+
+
+def _convert_tuple_to_list(t: Any) -> Any:
+    return t if type(t) is not tuple else list(map(_convert_tuple_to_list, t))  # exact tuples only
+
+
+def _dequantize_tensor_list(t: Any) -> Any:
+    """Dequantize the quantized tensors in `t`, recursing into (nested) lists."""
+    if type(t) is list:
+        return [_dequantize_tensor_list(item) for item in t]
+    # Non-list values are expected to expose `is_quantized` (i.e. be tensors).
+    if t.is_quantized:
+        return t.dequantize()
+    return t
+
+
+class Shadow(nn.Module):
+    r"""Shadow module attaches the float module to its matching quantized module
+    as the shadow. Then it uses Logger module to process the outputs of both
+    modules.
+
+    Every entry point runs the quantized module first, then the float module on
+    the dequantized inputs, passes both outputs to the logger and returns the
+    quantized output.
+
+    Args:
+        q_module: module quantized from float_module that we want to shadow
+        float_module: float module used to shadow q_module
+        logger_cls: type of logger used to process the outputs of q_module and
+            float_module. ShadowLogger or custom loggers can be used.
+    """
+
+    def __init__(self, q_module, float_module, logger_cls):
+        super().__init__()
+        self.orig_module = q_module
+        self.shadow_module = float_module
+        # `dequant` is not referenced by any method of this class.
+        self.dequant = nnq.DeQuantize()
+        self.logger = logger_cls()
+
+    def forward(self, *x) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # Tuples become lists so that nested inputs can be dequantized element-wise.
+        quantized_args = _convert_tuple_to_list(x)
+        quantized_out = self.orig_module(*quantized_args)
+        float_out = self.shadow_module(*_dequantize_tensor_list(quantized_args))
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def add(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # The float shadow receives the dequantized versions of both operands.
+        quantized_out = self.orig_module.add(x, y)
+        float_out = self.shadow_module.add(x.dequantize(), y.dequantize())
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def add_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # Only the tensor operand is dequantized; `y` is forwarded unchanged.
+        quantized_out = self.orig_module.add_scalar(x, y)
+        float_out = self.shadow_module.add_scalar(x.dequantize(), y)
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def mul(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        quantized_out = self.orig_module.mul(x, y)
+        float_out = self.shadow_module.mul(x.dequantize(), y.dequantize())
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def mul_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # Only the tensor operand is dequantized; `y` is forwarded unchanged.
+        quantized_out = self.orig_module.mul_scalar(x, y)
+        float_out = self.shadow_module.mul_scalar(x.dequantize(), y)
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def cat(self, x: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # Each tensor of the list is dequantized for the float shadow.
+        quantized_out = self.orig_module.cat(x, dim)
+        float_out = self.shadow_module.cat([t.dequantize() for t in x], dim)
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+    def add_relu(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        quantized_out = self.orig_module.add_relu(x, y)
+        float_out = self.shadow_module.add_relu(x.dequantize(), y.dequantize())
+        self.logger(quantized_out, float_out)
+        return quantized_out
+
+
+def prepare_model_with_stubs(
+    float_module: nn.Module,
+    q_module: nn.Module,
+    module_swap_list: set[type],
+    logger_cls: Callable,
+) -> None:
+    r"""Wrap quantized children of ``q_module`` in Shadow modules, in place.
+
+    A child of ``q_module`` is replaced by ``Shadow(child, float_child, logger_cls)``
+    when the same-named child of ``float_module`` has a type in ``module_swap_list``
+    and the two children do not consist of identical module types. Children whose
+    float type is not in the list are processed recursively instead.
+
+    Example usage::
+
+        prepare_model_with_stubs(float_model, q_model, module_swap_list, Logger)
+        q_model(data)
+        ob_dict = get_logger_dict(q_model)
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+        module_swap_list: list of float module types to attach the shadow
+        logger_cls: type of logger to be used in shadow module to process the outputs of
+            quantized module and its float shadow module
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite.prepare_model_with_stubs"
+    )
+
+    float_children = dict(float_module.named_children())
+
+    # Replacements are collected first and installed after the loop, so
+    # `_modules` is not changed while its children are being iterated.
+    shadows = {}
+    for child_name, q_child in q_module.named_children():
+        if child_name not in float_children:
+            continue
+        float_child = float_children[child_name]
+
+        if type(float_child) not in module_swap_list:
+            prepare_model_with_stubs(float_child, q_child, module_swap_list, logger_cls)
+        elif not _is_identical_module_type(q_child, float_child):
+            # Shadow only modules whose types differ from the float module's.
+            shadows[child_name] = Shadow(q_child, float_child, logger_cls)
+
+    q_module._modules.update(shadows)
+
+
+def _is_identical_module_type(mod1, mod2):
+    """Return True when both module trees list the same module types in the same order."""
+    types_of_first = list(map(type, mod1.modules()))
+    types_of_second = list(map(type, mod2.modules()))
+    return types_of_first == types_of_second
+
+
+def compare_model_stub(
+    float_model: nn.Module,
+    q_model: nn.Module,
+    module_swap_list: set[type],
+    *data,
+    logger_cls=ShadowLogger,
+) -> dict[str, dict]:
+    r"""Run ``q_model`` with float shadow modules attached and return the logged outputs.
+
+    The quantized modules selected by ``module_swap_list`` are first wrapped by
+    prepare_model_with_stubs() in Shadow modules, each holding the quantized
+    module, its float counterpart and a logger. ``q_model`` is then called with
+    ``data``; inside every Shadow the float module receives the dequantized
+    input of the quantized module. Finally get_logger_dict() collects the stats.
+
+    With the default ShadowLogger every entry of the result has the two keys
+    'float' and 'quantized', holding the outputs of the float shadow module and of
+    the quantized module. The result can be used to compute the module level
+    quantization error.
+
+    Example usage::
+
+        module_swap_list = [
+            torchvision.models.quantization.resnet.QuantizableBasicBlock
+        ]
+        ob_dict = compare_model_stub(float_model, qmodel, module_swap_list, data)
+        for key in ob_dict:
+            print(
+                key,
+                compute_error(
+                    ob_dict[key]["float"], ob_dict[key]["quantized"].dequantize()
+                ),
+            )
+
+    Args:
+        float_model: float model used to generate the q_model
+        q_model: model quantized from float_model
+        module_swap_list: list of float module types at which shadow modules will
+            be attached.
+        data: input data used to run the prepared q_model
+        logger_cls: type of logger to be used in shadow module to process the outputs of
+            quantized module and its float shadow module
+
+    Return:
+        the dict produced by get_logger_dict(q_model) after the run
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_model_stub")
+    prepare_model_with_stubs(float_model, q_model, module_swap_list, logger_cls)
+    q_model(*data)
+    return get_logger_dict(q_model)
+
+
+def get_matching_activations(
+    float_module: nn.Module,
+    q_module: nn.Module,
+) -> dict[str, dict[str, torch.Tensor]]:
+    r"""Find the matching activation between float and quantized modules.
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+
+    Return:
+        act_dict: maps each quantized logger key that has recorded values to a dict
+        with the keys 'float' and 'quantized' holding the matching activations
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite.get_matching_activations"
+    )
+    float_dict = get_logger_dict(float_module)
+    quantized_dict = get_logger_dict(q_module)
+    float_keys = sorted(float_dict, reverse=True)  # loop-invariant, so sorted once
+    act_dict: dict[str, dict] = {}
+    for key, quantized_stats in quantized_dict.items():
+        if len(quantized_stats["tensor_val"]) == 0:
+            continue
+        match_key = _find_match(float_keys, key, "stats")
+        if match_key is not None:
+            act_dict[key] = {}
+            act_dict[key]["float"] = float_dict[match_key]["tensor_val"]
+            act_dict[key]["quantized"] = quantized_stats["tensor_val"]
+    return act_dict
+
+
+def prepare_model_outputs(
+    float_module: nn.Module,
+    q_module: nn.Module,
+    logger_cls=OutputLogger,
+    allow_list=None,
+) -> None:
+    r"""Attach ``logger_cls`` observers to both modules, in place, via ``prepare``.
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+        logger_cls: type of logger to be attached to float_module and q_module
+        allow_list: module types to attach loggers to; None selects the default list
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite.prepare_model_outputs"
+    )
+    allow_list = get_default_compare_output_module_list() if allow_list is None else allow_list
+
+    # The same QConfig (logger as activation observer, no weight observer) is
+    # assigned to both modules before each one is prepared.
+    logging_qconfig = torch.ao.quantization.QConfig(activation=logger_cls, weight=None)
+    float_module.qconfig = logging_qconfig  # type: ignore[assignment]
+    prepare(
+        float_module, inplace=True, allow_list=allow_list, prepare_custom_config_dict={}
+    )
+    q_module.qconfig = logging_qconfig  # type: ignore[assignment]
+    prepare(
+        q_module,
+        inplace=True,
+        allow_list=allow_list,
+        observer_non_leaf_module_list=NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST,
+        prepare_custom_config_dict={},
+    )
+
+
+def compare_model_outputs(
+    float_model: nn.Module,
+    q_model: nn.Module,
+    *data,
+    logger_cls=OutputLogger,
+    allow_list=None,
+) -> dict[str, dict[str, torch.Tensor]]:
+    r"""Run both models on the same input and pair up their logged activations.
+    Loggers are attached with prepare_model_outputs(), ``float_model`` and
+    ``q_model`` are each called with ``data``, and the recorded values are matched
+    by get_matching_activations(). Every entry of the result has the two keys
+    'float' and 'quantized'; the dict can be used to compare and compute the
+    propagation quantization error.
+
+    Example usage::
+
+        act_compare_dict = compare_model_outputs(float_model, qmodel, data)
+        for key in act_compare_dict:
+            print(
+                key,
+                compute_error(
+                    act_compare_dict[key]["float"],
+                    act_compare_dict[key]["quantized"].dequantize(),
+                ),
+            )
+
+    Args:
+        float_model: float model used to generate the q_model
+        q_model: model quantized from float_model
+        data: input data used to run the prepared float_model and q_model
+        logger_cls: type of logger to be attached to float_module and q_module
+        allow_list: list of module types to attach logger
+
+    Return:
+        act_compare_dict: dict with key corresponding to quantized module names
+        and each entry being a dictionary with two keys 'float' and 'quantized',
+        containing the matching float and quantized activations
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite.compare_model_outputs"
+    )
+    allow_list = get_default_compare_output_module_list() if allow_list is None else allow_list
+    prepare_model_outputs(float_model, q_model, logger_cls, allow_list)
+    # Both models are run on the same data after the loggers were attached, so
+    # that each logger records the activations of this input.
+    float_model(*data)
+    q_model(*data)
+    return get_matching_activations(float_model, q_model)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py
new file mode 100644
index 0000000000000000000000000000000000000000..1861d0160db152e73debda3bda7f714ca4bbf601
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/_numeric_suite_fx.py
@@ -0,0 +1,1121 @@
+# mypy: allow-untyped-defs
+"""
+This module contains tooling to compare weights and activations
+across models. Example usage::
+
+    import copy
+    import torch
+    import torch.ao.quantization.quantize_fx as quantize_fx
+    import torch.ao.ns._numeric_suite_fx as ns
+
+    m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)).eval()
+    mp = quantize_fx.prepare_fx(m, {"": torch.ao.quantization.default_qconfig})
+    # We convert a copy because we need the original prepared model
+    # to be available for comparisons, and `quantize_fx.convert_fx` is inplace.
+    mq = quantize_fx.convert_fx(copy.deepcopy(mp))
+
+    #
+    # Comparing weights
+    #
+
+    # extract weight pairs
+    weight_comparison = ns.extract_weights("a", mp, "b", mq)
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        weight_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr"
+    )
+
+    # weight_comparison contains the weights from `mp` and `mq` stored
+    # in pairs, and can be used for further analysis.
+
+
+    #
+    # Comparing activations, with error propagation
+    #
+
+    # add loggers
+    mp_ns, mq_ns = ns.add_loggers(
+        "a", copy.deepcopy(mp), "b", copy.deepcopy(mq), ns.OutputLogger
+    )
+
+    # send an example datum to capture intermediate activations
+    datum = torch.randn(1, 1, 1, 1)
+    mp_ns(datum)
+    mq_ns(datum)
+
+    # extract intermediate activations
+    act_comparison = ns.extract_logger_info(mp_ns, mq_ns, ns.OutputLogger, "b")
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        act_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr"
+    )
+
+    # act_comparison contains the activations from `mp_ns` and `mq_ns` stored
+    # in pairs, and can be used for further analysis.
+
+    #
+    # Comparing activations, without error propagation
+    #
+
+    # create shadow model
+    mp_shadows_mq = ns.add_shadow_loggers(
+        "a", copy.deepcopy(mp), "b", copy.deepcopy(mq), ns.OutputLogger
+    )
+
+    # send an example datum to capture intermediate activations
+    datum = torch.randn(1, 1, 1, 1)
+    mp_shadows_mq(datum)
+
+    # extract intermediate activations
+    shadow_act_comparison = ns.extract_shadow_logger_info(
+        mp_shadows_mq, ns.OutputLogger, "b"
+    )
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        shadow_act_comparison, "a", "b", torch.ao.ns.fx.utils.compute_sqnr, "sqnr"
+    )
+
+    # shadow_act_comparison contains the activations captured by `mp_shadows_mq`
+    # for models "a" and "b", stored in pairs, and can be used for further analysis.
+
+"""
+
+import collections
+from collections.abc import Callable
+from typing import Any, TYPE_CHECKING
+
+import torch
+import torch.ao.quantization.quantize_fx as quantize_fx
+import torch.nn as nn
+from torch.ao.ns.fx.graph_matcher import get_matching_subgraph_pairs
+from torch.ao.ns.fx.mappings import get_base_name_to_sets_of_related_ops
+from torch.ao.ns.fx.n_shadows_utils import (
+    _get_dedup_subgraphs,
+    create_add_loggers_graph,
+    create_n_transformed_and_logged_copies_of_subgraph,
+    create_results_comparison,
+    extract_weight_comparison,
+    group_results_by_subgraph,
+    OutputProp,
+    print_n_shadows_summary,
+    SHADOW_WRAPPER_NODE_NAME_PREFIX,
+)
+from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.backend_config import BackendConfig
+from torch.ao.quantization.backend_config.utils import (
+    get_fusion_pattern_to_root_node_getter,
+)
+from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr
+from torch.ao.quantization.fx.match_utils import _find_matches
+from torch.ao.quantization.fx.qconfig_mapping_utils import (
+    _generate_node_name_to_qconfig,
+)
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from .fx.graph_passes import add_loggers_to_model, create_a_shadows_b
+from .fx.ns_types import NSNodeTargetType, NSResultsType, NSSingleResultValuesType
+from .fx.utils import (
+    get_target_type_str,
+    maybe_add_missing_fqns,
+    rekey_logger_info_on_node_name_of_model,
+)
+from .fx.weight_utils import extract_weight_from_node
+
+
+if TYPE_CHECKING:
+    from torch.ao.quantization.qconfig import QConfigAny
+
+# Layout of the entries OutputLogger keeps in `stats_rnn`: a tensor paired
+# with a 2-tuple of tensors, i.e. (tensor, (tensor, tensor)).
+RNNReturnType = tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]
+
+
+class OutputLogger(nn.Module):
+    """
+    Base class for capturing intermediate values.
+
+    ``forward`` always returns its input unchanged. While both ``enabled``
+    and ``save_activations`` are True it also records the input: a tensor is
+    detached and appended to ``stats``; a 2-tuple whose second element has
+    length 2 is detached element by element and appended to ``stats_rnn``.
+    Inputs of any other form are passed through without being recorded.
+    """
+
+    # values recorded by `forward` for tensor inputs
+    stats: list[torch.Tensor]
+    # values recorded by `forward` for (tensor, (tensor, tensor)) inputs
+    stats_rnn: list[RNNReturnType]
+
+    # Mark as impure so that calls to it will not be removed during DCE.
+    _is_impure = True
+
+    def __init__(
+        self,
+        ref_node_name: str,
+        prev_node_name: str,
+        model_name: str,
+        ref_name: str,
+        prev_node_target_type: str,
+        ref_node_target_type: str,
+        results_type: str,
+        index_within_arg: int,
+        index_of_arg: int,
+        fqn: str | None,
+        qconfig_str: str | None = "",
+    ):
+        super().__init__()
+        self.stats: list[torch.Tensor] = []
+        self.stats_rnn: list[RNNReturnType] = []
+
+        # name of the node which was responsible for adding this logger
+        # Note:
+        # - if we are logging node outputs, this is the same as prev_node_name
+        # - if we are logging node inputs, this is the name of the node
+        #   whose input this logger is logging.
+        #
+        # example, where logger1 is logging input of op1 and logger2 is logging
+        #    the output of op1:
+        #
+        #  x1 -> logger1 -> op1 -> logger2 -> x2
+        #
+        # in this example,
+        #   - logger1's prev_node_name is x1 and ref_node_name is op1
+        #   - logger2's prev_node_name is op1 and ref_node_name is op1
+        self.ref_node_name = ref_node_name
+        # name of the node whose output this Logger is capturing
+        self.prev_node_name = prev_node_name
+
+        # name of the model from which the node originated
+        self.model_name = model_name
+        # reference name, used to match loggers from separate models
+        # to each other
+        self.ref_name = ref_name
+        # type of the target of the node whose output this logger is logging
+        self.prev_node_target_type = prev_node_target_type
+        # type of the target of the node which was responsible for adding this
+        # logger
+        self.ref_node_target_type = ref_node_target_type
+        # what kind of values are inside of stats
+        self.results_type = results_type
+        # index of this node within the arg of the input/output node
+        # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1
+        self.index_within_arg = index_within_arg
+        # index of this node within the args of the input/output node
+        # for example, in add(x1, x2), x2 would have index_of_arg == 1
+        self.index_of_arg = index_of_arg
+        # fully qualified name
+        self.fqn = fqn
+        # loggers may be added before prepare_fx, in which case we do not
+        # want to collect the results of calibration, only the results after
+        # convert_fx; this flag controls whether the logger collects data
+        self.enabled = True
+        # string representation of qconfig
+        self.qconfig_str = qconfig_str
+        # this can be turned off to reduce memory usage during calibration
+        self.save_activations = True
+
+    # Note: cannot annotate the type of x because TorchScript does not support
+    #   the Union type.
+    def forward(self, x):
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        # TODO(future PR): consider designing this better, as the difference
+        # between these two flags is subtle and not obvious.
+        # with either flag off, the input is returned without being recorded
+        if not self.enabled:
+            return x
+        if not self.save_activations:
+            return x
+        # TODO(future PR): consider refactoring this to better reuse the parent
+        # class
+        # only the two layouts below are recorded; any other input falls
+        # through to the final `return x` untouched
+        if isinstance(x, torch.Tensor):
+            self.stats.append(x.detach())
+        elif isinstance(x, tuple) and len(x) == 2 and len(x[1]) == 2:
+            # (tensor, (tensor, tensor)) layout, see RNNReturnType
+            new_res = (x[0].detach(), (x[1][0].detach(), x[1][1].detach()))
+            self.stats_rnn.append(new_res)
+        return x
+
+    def __repr__(self):
+        # printable view of the instance attributes, without the
+        # `training` flag and underscore-prefixed entries
+        clean_dict = {
+            k: v
+            for k, v in self.__dict__.items()
+            # skip nn.Module keys
+            if (k != "training") and not k.startswith("_")
+        }
+        return f"OutputLogger({clean_dict})"
+
+
+class OutputComparisonLogger(OutputLogger):
+    """
+    Variant of OutputLogger whose ``forward`` also receives the reference
+    activation, so the comparison is computed while data flows through the
+    logger (at calibration time) instead of afterwards.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO(future PR): make the comparison function configurable
+        # function used to compare the logged value against the reference
+        self.comparison_fn = torch.ao.ns.fx.utils.compute_sqnr
+        self.comparison_fn_name = "sqnr"
+        # comparisons of logger input versus reference, one entry per call
+        self.comparisons = []
+
+    def forward(self, x, x_ref):  # type: ignore[override]
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        if self.enabled:
+            if not isinstance(x, torch.Tensor):
+                raise AssertionError("non-tensor inputs not yet supported")
+            if self.save_activations:
+                # keep the raw activation around, for debugging
+                self.stats.append(x.detach())
+            # the comparison is recorded even when activations are not saved
+            self.comparisons.append(self.comparison_fn(x, x_ref))
+        return x
+
+    def __repr__(self):
+        shown = {}
+        for name, value in self.__dict__.items():
+            # skip nn.Module keys
+            if name == "training" or name.startswith("_"):
+                continue
+            shown[name] = value
+        return f"OutputComparisonLogger({shown})"
+
+
+class NSTracer(quantize_fx.QuantizationTracer):
+    """
+    A QuantizationTracer which additionally reports observer and
+    fake_quantize modules as leaf modules.
+    """
+
+    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
+        # fmt: off
+        """
+        """  # blank docblock to make autodoc happy
+        # fmt: on
+        always_leaf = (
+            torch.ao.quantization.ObserverBase,
+            torch.ao.quantization.FakeQuantizeBase,
+        )
+        if isinstance(m, always_leaf):
+            return True
+        # everything else follows the regular quantization tracer rules
+        return super().is_leaf_module(m, module_qualified_name)
+
+
+def _extract_weights_one_model(
+    model_name: str,
+    model: GraphModule,
+    nodes_and_names_to_instrument: list[tuple[Node, str]],
+    results: NSResultsType,
+    op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]]
+    | None = None,
+) -> None:
+    """
+    Extract the weight of each given node of `model` and store it in `results`.
+
+    Args:
+        model_name: string name of the model, used as a key in `results`
+        model: graph module the nodes belong to
+        nodes_and_names_to_instrument: (node, ref_name) pairs; `ref_name` is
+          the top level key under which the node's weight is stored
+        results: data structure to populate, modified inplace. Each extracted
+          weight is stored as a single-element list at
+          `results[ref_name][<weight result type>][model_name]`
+        op_to_type_to_weight_extraction_fn: optional override, passed through
+          to `extract_weight_from_node`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx._extract_weights_one_model"
+    )
+    # the result type is the same for every node, so look it up once
+    res_type = NSSingleResultValuesType.WEIGHT.value
+    for node, ref_name in nodes_and_names_to_instrument:
+        extracted_weight = extract_weight_from_node(
+            node, model, op_to_type_to_weight_extraction_fn
+        )
+        if not extracted_weight:
+            # nothing was extracted for this node, leave `results` untouched
+            continue
+        # create both nesting levels on demand, so that an existing
+        # `results[ref_name]` without a weight entry does not raise KeyError
+        per_model = results.setdefault(ref_name, {}).setdefault(res_type, {})
+        per_model[model_name] = [extracted_weight]
+
+
+def _extract_weights_impl(
+    model_name_a: str,
+    gm_a: GraphModule,
+    model_name_b: str,
+    gm_b: GraphModule,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+    op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]]
+    | None = None,
+) -> NSResultsType:
+    """
+    Match the subgraphs of `gm_a` and `gm_b` and collect the weights of the
+    base op node of every matched pair.
+
+    Return:
+        NSResultsType holding the weights of both models, rekeyed with
+        `rekey_logger_info_on_node_name_of_model` using `model_name_b`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx._extract_weights_impl"
+    )
+    pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map
+    )
+
+    # one (base op node, match name) list per model
+    to_instrument_a: list[tuple[Node, str]] = [
+        (sub_a.base_op_node, name) for name, (sub_a, _sub_b) in pairs.items()
+    ]
+    to_instrument_b: list[tuple[Node, str]] = [
+        (sub_b.base_op_node, name) for name, (_sub_a, sub_b) in pairs.items()
+    ]
+
+    # populate the results, model A first and model B second
+    results: NSResultsType = {}
+    per_model = (
+        (model_name_a, gm_a, to_instrument_a),
+        (model_name_b, gm_b, to_instrument_b),
+    )
+    for cur_name, cur_gm, cur_nodes in per_model:
+        _extract_weights_one_model(
+            cur_name,
+            cur_gm,
+            cur_nodes,
+            results,
+            op_to_type_to_weight_extraction_fn,
+        )
+
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(results)
+
+    # rekey on names of nodes in gm_b
+    return rekey_logger_info_on_node_name_of_model(results, model_name_b)
+
+
+def extract_weights(
+    model_name_a: str,
+    model_a: nn.Module,
+    model_name_b: str,
+    model_b: nn.Module,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+    op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]]
+    | None = None,
+) -> NSResultsType:
+    """
+    Trace model A and model B, extract their weights, and return a comparison.
+
+    Args:
+        model_name_a: string name of model A to use in results
+        model_a: model A
+        model_name_b: string name of model B to use in results
+        model_b: model B
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        unmatchable_types_map: optional override of unmatchable types, subject to change
+        op_to_type_to_weight_extraction_fn: optional override of function which extracts weight
+            from a type, subject to change
+
+    Return:
+        NSResultsType, containing the weight comparisons
+    """
+
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.extract_weights")
+    if base_name_to_sets_of_related_ops is None:
+        base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops()
+
+    # TODO(future PR): expose these
+    skipped_module_names: list[str] = []
+    skipped_module_classes: list[Callable] = []
+
+    # trace each model with its own tracer, model A first
+    traced = []
+    for cur_model in (model_a, model_b):
+        cur_tracer = NSTracer(skipped_module_names, skipped_module_classes)
+        cur_gm = GraphModule(cur_model, cur_tracer.trace(cur_model))
+        # carry over the node name to scope mapping, if the model has one
+        maybe_node_name_to_scope = _get_observed_graph_module_attr(
+            cur_model, "node_name_to_scope"
+        )
+        if maybe_node_name_to_scope is not None:
+            cur_gm._node_name_to_scope = maybe_node_name_to_scope
+        traced.append(cur_gm)
+    gm_a, gm_b = traced
+
+    return _extract_weights_impl(
+        model_name_a,
+        gm_a,
+        model_name_b,
+        gm_b,
+        base_name_to_sets_of_related_ops,
+        unmatchable_types_map,
+        op_to_type_to_weight_extraction_fn,
+    )
+
+
+def _add_loggers_one_model(
+    model_name: str,
+    model: GraphModule,
+    nodes_and_names_to_instrument_inputs: list[tuple[Node, str, str]],
+    nodes_and_names_to_instrument_outputs: list[tuple[Node, str, str]],
+    logger_cls: Callable,
+) -> nn.Module:
+    """
+    Convert the (node, ref_name, ref_node_type) triples into per-node lookup
+    tables and hand them to `add_loggers_to_model`.
+
+    Return:
+        the model returned by `add_loggers_to_model`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx._add_loggers_one_model"
+    )
+
+    # TODO(future PR): do not observe nodes we do not care
+    #   about (both fp32, denylist, etc)
+    # when a node appears more than once, its last triple wins
+    inputs_lookup: dict[Node, tuple[str, str]] = {
+        node: (ref_name, ref_node_type)
+        for node, ref_name, ref_node_type in nodes_and_names_to_instrument_inputs
+    }
+    outputs_lookup: dict[Node, tuple[str, str]] = {
+        node: (ref_name, ref_node_type)
+        for node, ref_name, ref_node_type in nodes_and_names_to_instrument_outputs
+    }
+
+    return add_loggers_to_model(
+        model,
+        inputs_lookup,
+        outputs_lookup,
+        logger_cls,
+        model_name,
+    )
+
+
+def _add_loggers_impl(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> tuple[nn.Module, nn.Module]:
+    """
+    Match the subgraphs of `gm_a` and `gm_b` and add loggers to every matched
+    pair: always on the outputs, and on the inputs if `should_log_inputs`.
+
+    Return:
+        tuple of (model A with loggers, model B with loggers)
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._add_loggers_impl")
+    pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map
+    )
+    inputs_a, inputs_b = [], []
+    outputs_a, outputs_b = [], []
+    for match_name, (subgraph_a, subgraph_b) in pairs.items():
+        type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
+        type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
+        # Note: for matching activations we always use end_node,
+        # such as observing the output of relu in linear-relu
+        outputs_a.append((subgraph_a.end_node, match_name, type_a))
+        outputs_b.append((subgraph_b.end_node, match_name, type_b))
+        if not should_log_inputs:
+            continue
+        # Note: for matching inputs we use start_node, such as observing
+        # the input of linear in linear-relu
+        inputs_a.append((subgraph_a.start_node, match_name, type_a))
+        inputs_b.append((subgraph_b.start_node, match_name, type_b))
+
+    # instrument model A first, then model B
+    return (
+        _add_loggers_one_model(name_a, gm_a, inputs_a, outputs_a, logger_cls),
+        _add_loggers_one_model(name_b, gm_b, inputs_b, outputs_b, logger_cls),
+    )
+
+
+def add_loggers(
+    name_a: str,
+    model_a: nn.Module,
+    name_b: str,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    should_log_inputs: bool = False,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> tuple[nn.Module, nn.Module]:
+    """
+    Trace model A and model B and instrument both with loggers.
+
+    Args:
+        name_a: string name of model A to use in results
+        model_a: model A
+        name_b: string name of model B to use in results
+        model_b: model B
+        logger_cls: class of Logger to use
+        should_log_inputs: whether to log inputs
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        unmatchable_types_map: optional override of unmatchable types, subject to change
+
+    Return:
+        Returns a tuple of (model_a_with_loggers, model_b_with_loggers).  Modifies both models inplace.
+    """
+
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.add_loggers")
+    # TODO(future PR): expose these
+    skipped_module_names: list[str] = []
+    skipped_module_classes: list[Callable] = []
+
+    # trace each model with its own tracer, model A first
+    traced = []
+    for cur_model in (model_a, model_b):
+        cur_tracer = NSTracer(skipped_module_names, skipped_module_classes)
+        cur_gm = GraphModule(cur_model, cur_tracer.trace(cur_model))
+        # carry over the node name to scope mapping, if the model has one
+        maybe_node_name_to_scope = _get_observed_graph_module_attr(
+            cur_model, "node_name_to_scope"
+        )
+        if maybe_node_name_to_scope is not None:
+            cur_gm._node_name_to_scope = maybe_node_name_to_scope
+        traced.append(cur_gm)
+    gm_a, gm_b = traced
+
+    return _add_loggers_impl(
+        name_a,
+        gm_a,
+        name_b,
+        gm_b,
+        logger_cls,
+        should_log_inputs=should_log_inputs,
+        base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops,
+        unmatchable_types_map=unmatchable_types_map,
+    )
+
+
+def _extract_logger_info_one_model(
+    model: nn.Module,
+    results: NSResultsType,
+    logger_cls: Callable,
+) -> None:
+    """
+    Collect the data of every logger found in `model` into `results`.
+
+    A submodule counts as a logger if it is an instance of `logger_cls`, or
+    if it is a scripted module whose original name is "OutputLogger".
+
+    Args:
+        model: model to traverse with `named_modules`
+        results: data structure to populate, modified inplace. One dict per
+          logger is appended to
+          `results[ref_name][results_type][model_name]`, and each such list
+          is kept sorted by (index_of_arg, index_within_arg)
+        logger_cls: class of Logger to look for
+
+    Raises:
+        AssertionError: if a logger's model name is already a key of
+          `results[ref_name]`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx._extract_logger_info_one_model"
+    )
+    for _gm_name, mod in model.named_modules():
+        # TODO(future PR): better check when scripted
+        is_logger = isinstance(mod, logger_cls) or (  # type: ignore[arg-type]
+            isinstance(mod, torch.jit.RecursiveScriptModule)
+            and mod.original_name == "OutputLogger"
+        )
+        if not is_logger:
+            continue
+
+        results_for_ref = results.setdefault(mod.ref_name, {})
+        if mod.model_name in results_for_ref:
+            raise AssertionError(f"{mod.model_name} is already present in results")
+        entries = results_for_ref.setdefault(mod.results_type, {}).setdefault(
+            mod.model_name, []
+        )
+
+        # values recorded in `stats_rnn` take precedence over `stats`
+        stats_to_use = mod.stats_rnn if len(mod.stats_rnn) > 0 else mod.stats
+        data = {
+            "type": mod.results_type,
+            "values": stats_to_use,
+            "ref_node_name": mod.ref_node_name,
+            "ref_node_target_type": mod.ref_node_target_type,
+            "prev_node_name": mod.prev_node_name,
+            "prev_node_target_type": mod.prev_node_target_type,
+            "index_within_arg": mod.index_within_arg,
+            "index_of_arg": mod.index_of_arg,
+            "fqn": mod.fqn,
+            "qconfig_str": mod.qconfig_str,
+        }
+        # only loggers which compute comparisons have these attributes
+        if hasattr(mod, "comparisons"):
+            data["comparisons"] = mod.comparisons
+            data["comparison_fn_name"] = mod.comparison_fn_name
+        else:
+            data["comparisons"] = []
+            data["comparison_fn_name"] = ""
+        entries.append(data)
+        # ensure the list stays sorted; the key is a tuple of ints so that
+        # indices compare numerically (as strings, "10:0" sorts before "2:0")
+        entries.sort(key=lambda res: (res["index_of_arg"], res["index_within_arg"]))
+
+
+# TODO(future PR): align on naming
+# this is the equivalent of just the comparison extraction part of `ns.compare_model_outputs`
+def extract_logger_info(
+    model_a: nn.Module,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    model_name_to_use_for_layer_names: str,
+) -> NSResultsType:
+    """
+    Visit every logger of `model_a` and then of `model_b`, and gather the
+    information they logged into a single results structure.
+
+    Args:
+        model_a: model A
+        model_b: model B
+        logger_cls: class of Logger to use
+        model_name_to_use_for_layer_names: string name of model to use for
+          layer names in the output
+
+    Return:
+        NSResultsType, containing the logged comparisons
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx.extract_logger_info"
+    )
+    collected: NSResultsType = {}
+    _extract_logger_info_one_model(model_a, collected, logger_cls)
+    _extract_logger_info_one_model(model_b, collected, logger_cls)
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(collected)
+    # rekey on the node names of the model chosen by the caller
+    return rekey_logger_info_on_node_name_of_model(
+        collected, model_name_to_use_for_layer_names
+    )
+
+
+def _add_shadow_loggers_impl(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> nn.Module:
+    """
+    Match the subgraphs of `gm_a` and `gm_b` and pass the matched pairs to
+    `create_a_shadows_b`.
+
+    Return:
+        the module returned by `create_a_shadows_b`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx._add_shadow_loggers_impl"
+    )
+    pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b, base_name_to_sets_of_related_ops, unmatchable_types_map
+    )
+    return create_a_shadows_b(
+        name_a,
+        gm_a,
+        name_b,
+        gm_b,
+        pairs,
+        logger_cls,
+        should_log_inputs=should_log_inputs,
+        node_type_to_io_type_map=node_type_to_io_type_map,
+    )
+
+
+def add_shadow_loggers(
+    name_a: str,
+    model_a: nn.Module,
+    name_b: str,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    should_log_inputs: bool = False,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> nn.Module:
+    """
+    Trace model A and model B and instrument them with shadow loggers.
+
+    Args:
+        name_a: string name of model A to use in results
+        model_a: model A
+        name_b: string name of model B to use in results
+        model_b: model B
+        logger_cls: class of Logger to use
+        should_log_inputs: whether to log inputs
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        node_type_to_io_type_map: optional override, passed through to
+          `_add_shadow_loggers_impl`
+        unmatchable_types_map: optional override of unmatchable types, subject to change
+
+    Return:
+        the module returned by `_add_shadow_loggers_impl`
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx.add_shadow_loggers"
+    )
+    # TODO(future PR): expose these
+    skipped_module_names: list[str] = []
+    skipped_module_classes: list[Callable] = []
+
+    # trace each model with its own tracer, model A first
+    traced = []
+    for cur_model in (model_a, model_b):
+        cur_tracer = NSTracer(skipped_module_names, skipped_module_classes)
+        cur_gm = GraphModule(cur_model, cur_tracer.trace(cur_model))
+        # carry over the node name to scope mapping, if the model has one
+        maybe_node_name_to_scope = _get_observed_graph_module_attr(
+            cur_model, "node_name_to_scope"
+        )
+        if maybe_node_name_to_scope is not None:
+            cur_gm._node_name_to_scope = maybe_node_name_to_scope
+        traced.append(cur_gm)
+    gm_a, gm_b = traced
+
+    return _add_shadow_loggers_impl(
+        name_a,
+        gm_a,
+        name_b,
+        gm_b,
+        logger_cls,
+        should_log_inputs=should_log_inputs,
+        base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops,
+        node_type_to_io_type_map=node_type_to_io_type_map,
+        unmatchable_types_map=unmatchable_types_map,
+    )
+
+
+def extract_shadow_logger_info(
+    model_a_shadows_b: nn.Module,
+    logger_cls: Callable,
+    model_name_to_use_for_layer_names: str,
+) -> NSResultsType:
+    """
+    Visit every logger of a shadow model and gather the information they
+    logged.
+
+    Args:
+        model_a_shadows_b: shadow model
+        logger_cls: class of Logger to use
+        model_name_to_use_for_layer_names: string name of model to use for
+          layer names in the output
+
+    Return:
+        NSResultsType, containing the logged comparisons
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api._numeric_suite_fx.extract_shadow_logger_info"
+    )
+    collected: NSResultsType = collections.defaultdict(dict)
+    _extract_logger_info_one_model(model_a_shadows_b, collected, logger_cls)
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(collected)
+    # rekey on the node names of the model chosen by the caller
+    rekeyed = rekey_logger_info_on_node_name_of_model(
+        collected, model_name_to_use_for_layer_names
+    )
+    # always hand a plain dict back to the caller
+    return dict(rekeyed)
+
+
+def extend_logger_results_with_comparison(
+    results: NSResultsType,
+    model_name_1: str,
+    model_name_2: str,
+    comparison_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+    comparison_name: str,
+) -> None:
+    """
+    Compares the logged values from `model_name_2` against the corresponding
+    values in `model_name_1`, using `comparison_fn`. Records the result
+    in `model_name_2`'s results under `comparison_name`. Modifies `results` inplace.
+
+    Results of the two models correspond to each other when their
+    `index_within_arg` and `index_of_arg` are equal. Values are compared
+    pairwise with `zip`, so surplus values of the longer list are ignored.
+
+    Args:
+        results: the result data structure from `extract_logger_info` or
+          `extract_shadow_logger_info`.
+        model_name_1: string name of model 1
+        model_name_2: string name of model 2
+        comparison_fn: function to compare two Tensors, called as
+          `comparison_fn(value_of_model_1, value_of_model_2)`
+        comparison_name: key under which the list of comparison results is
+          stored in each result of `model_name_2`
+
+    Raises:
+        AssertionError: if either model name is missing from a results
+          entry, or if a result of `model_name_2` has no corresponding
+          result of `model_name_1`
+    """
+    for results_type_to_results in results.values():
+        for model_name_to_results in results_type_to_results.values():
+            for required_name in (model_name_1, model_name_2):
+                if required_name not in model_name_to_results:
+                    raise AssertionError(f"{required_name} not found in results")
+
+            # index the results of model 1 by argument position; setdefault
+            # keeps the first result seen for a position, which is the one a
+            # front-to-back search would find
+            results_1_by_position = {}
+            for cur_result_1 in model_name_to_results[model_name_1]:
+                position_1 = (
+                    cur_result_1["index_within_arg"],
+                    cur_result_1["index_of_arg"],
+                )
+                results_1_by_position.setdefault(position_1, cur_result_1)
+
+            for result_2 in model_name_to_results[model_name_2]:
+                position_2 = (result_2["index_within_arg"], result_2["index_of_arg"])
+                # find corresponding result_1
+                result_1 = results_1_by_position.get(position_2)
+                if result_1 is None:
+                    raise AssertionError("Expected result_1 to be not None")
+
+                result_2[comparison_name] = [
+                    comparison_fn(value_1, value_2)
+                    for value_1, value_2 in zip(result_1["values"], result_2["values"])
+                ]
+
+
+def prepare_n_shadows_model(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_multi_mapping: QConfigMultiMapping,
+    backend_config: BackendConfig,
+    custom_prepare_fn: Callable | None = None,
+    custom_prepare_kwargs: dict[str, Any] | None = None,
+    custom_tracer: Any = None,
+) -> GraphModule:
+    """
+    Given a model with a graph with M ops such as
+
+
+      args_kwargs_m -> op_m -> output_m
+
+
+    And a set of N qconfigs for each op, creates a new model, with
+    the subgraph of each `op_m` transformed into
+
+    .. code::
+
+           |---------> op_m_n -> log_m_n
+           |                     /
+      args_kwargs_m ---------> op_m -> log_m_0
+
+    Where op_m_n is op_m wrapped in a submodule and transformed with
+    qconfig_n, and its inner graph looks like
+
+    .. code::
+
+      args_m -------- op_m_prepared_with_qconfig_n -> out_m_n
+                  /
+      kwargs_m ---
+
+    This is useful for testing different quantization of multiple layers in
+    a single pass through the model.
+
+    High level TODOs for future PRs:
+    * figure out a better way to name the output structure
+    * return a results data structure instead of printing it out
+    * add examples to docblocks
+    """
+
+    if custom_tracer is None:
+        tracer = quantize_fx.QuantizationTracer([], [])
+    else:
+        tracer = custom_tracer
+    mt = torch.fx.GraphModule(model, tracer.trace(model))
+    # this is necessary to ensure logger FQNs get populated
+    mt._node_name_to_scope = tracer.node_name_to_scope  # type: ignore[assignment]
+
+    # run example input propagation, we need this to call prepare_fx on
+    # individual subgraphs
+    output_prop = OutputProp(mt)
+    output_prop.propagate(*example_inputs)
+
+    # Find the set of subgraphs in the original graph which we need to
+    # consider.
+    modules = dict(mt.named_modules(remove_duplicate=False))
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
+    root_node_getter_mapping = get_fusion_pattern_to_root_node_getter(backend_config)
+    standalone_module_names: list[str] = []
+    standalone_module_classes: list[type] = []
+    custom_module_classes: list[type] = []
+    matches = _find_matches(
+        mt.graph,
+        modules,
+        patterns,
+        root_node_getter_mapping,
+        standalone_module_names,
+        standalone_module_classes,
+        custom_module_classes,
+    )
+    subgraphs_dedup: dict[str, list[Node]] = _get_dedup_subgraphs(matches)
+
+    # generate node to qconfig for each subgraph
+    # TODO(future PR): deduplicate repeating entries
+    list_of_node_name_to_qconfig: list[dict[str, QConfigAny]] = []
+    for qconfig_mapping in qconfig_multi_mapping.qconfig_mappings_list:
+        node_name_to_qconfig = _generate_node_name_to_qconfig(
+            mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope
+        )
+        list_of_node_name_to_qconfig.append(node_name_to_qconfig)
+
+    # For each region in the model, do the following:
+    #   For each qconfig for that region, do the following:
+    #     1. create a copy of the region wrapped in a module
+    #     2. pass original args, original kwargs, and expected output to module
+    #     3. add an output comparison logger and hook it up to compare
+    #        actual output to expected output
+    #     4. run `prepare_fx` on the module
+    for subgraph_idx, (match_name, nodes_in_this_subgraph) in enumerate(
+        subgraphs_dedup.items()
+    ):
+        create_n_transformed_and_logged_copies_of_subgraph(
+            mt,
+            subgraph_idx,
+            match_name,
+            nodes_in_this_subgraph,
+            qconfig_multi_mapping.qconfig_mappings_list,
+            list_of_node_name_to_qconfig,
+            custom_prepare_fn,
+            custom_prepare_kwargs,  # type: ignore[arg-type]
+        )
+
+    return mt
+
+
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _prepare_n_shadows_add_loggers_model(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> torch.nn.Module:
+    r"""
+    Note: this API is not recommended for wide usage, it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+
+    This creates a model which provides logging for the following
+    problem: if we quantize `model` with `qconfig_mapping` and feed
+    the same input through both models, log the comparisons of
+    corresponding intermediate layers.
+
+    The problem is solved with a single model.  Specifically, we
+    partition `model` into N subgraphs, create a copy of each relevant
+    subgraph, wrap it in a module, apply the quantization API to that
+    module, and hook up loggers to measure the comparisons.
+
+    Example starting graph:
+
+      x0 -> op0 -> x1 -> op1 -> x2
+
+    Example config: quantize op0 to int8, do nothing to op1.
+    The following graph will be created:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_0 -> x2_1 ----> clog
+
+    Where op0_0 is op0, op0_1 is op0 wrapped in a submodule and quantized
+    to int8, op1_0 is op1 (appearing in the graph twice), log is a logger,
+    and clog is a comparison logger.
+    """
+
+    tracer = quantize_fx.QuantizationTracer([], [])
+    mt = torch.fx.GraphModule(model, tracer.trace(model))
+    # this is necessary to ensure logger FQNs get populated
+    mt._node_name_to_scope = tracer.node_name_to_scope  # type: ignore[assignment]
+
+    # run example input propagation, we need this to call prepare_fx on
+    # individual subgraphs
+    output_prop = OutputProp(mt)
+    output_prop.propagate(*example_inputs)
+
+    # Find the set of subgraphs in the original graph which we need to
+    # consider.
+    modules = dict(mt.named_modules(remove_duplicate=False))
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
+    root_node_getter_mapping = get_fusion_pattern_to_root_node_getter(backend_config)
+    standalone_module_names: list[str] = []
+    standalone_module_classes: list[type] = []
+    custom_module_classes: list[type] = []
+    matches = _find_matches(
+        mt.graph,
+        modules,
+        patterns,
+        root_node_getter_mapping,
+        standalone_module_names,
+        standalone_module_classes,
+        custom_module_classes,
+    )
+    subgraphs_dedup: dict[str, list[Node]] = _get_dedup_subgraphs(matches)
+
+    # generate node to qconfig for each subgraph
+    node_name_to_qconfig = _generate_node_name_to_qconfig(
+        mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope
+    )
+
+    # Now, mutate the graph to be the add_loggers graph with propagation
+    # error.
+    create_add_loggers_graph(mt, subgraphs_dedup, qconfig_mapping, node_name_to_qconfig)
+
+    return mt
+
+
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _n_shadows_compare_weights(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> NSResultsType:
+    """
+    Note: this API is not recommended for wide usage, it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+    """
+    qconfig_multi_mapping = QConfigMultiMapping.from_list_qconfig_mapping(
+        [qconfig_mapping]
+    )
+    mp = prepare_n_shadows_model(
+        model, example_inputs, qconfig_multi_mapping, backend_config
+    )
+    # passing inputs through the model is necessary to populate
+    # observers which observe weights with real values
+    mp(*example_inputs)
+    mq = convert_n_shadows_model(mp)
+    weight_comparison = extract_weight_comparison(mq)
+    return weight_comparison
+
+
+# TODO(future PR): consider aligning API signature with other similar quantization
+# functions (enable_fake_quant, etc)
+def loggers_set_enabled(model: torch.nn.Module, enabled: bool) -> None:
+    """
+    Sets the `enabled` setting on a `model`'s loggers
+    """
+    for _, child in model.named_modules():
+        if isinstance(child, OutputLogger):
+            child.enabled = enabled
+
+
+# TODO(future PR): consider aligning API signature with other similar quantization
+# functions (enable_fake_quant, etc)
+def loggers_set_save_activations(
+    model: torch.nn.Module,
+    save_activations: bool,
+) -> None:
+    """
+    Sets the `save_activations` setting on a `model`'s loggers
+    """
+    for _name, child in model.named_modules():
+        if isinstance(child, OutputLogger):
+            child.save_activations = save_activations
+
+
+def convert_n_shadows_model(
+    model: GraphModule,
+    custom_convert_fn: Callable | None = None,
+    custom_convert_kwargs: dict[str, Any] | None = None,
+) -> GraphModule:
+    """
+    Given a model from `prepare_n_shadows_model`, runs `convert_fx`
+    on each shadow submodule.
+    """
+    for node in model.graph.nodes:
+        # TODO(future PR): consider matching in a safer way than
+        # node name string match
+        if node.name.startswith(SHADOW_WRAPPER_NODE_NAME_PREFIX):
+            orig_mod = getattr(model, node.name)
+            if custom_convert_fn is None:
+                converted_mod = torch.ao.quantization.quantize_fx.convert_fx(orig_mod)
+            else:
+                if custom_convert_kwargs is None:
+                    custom_convert_kwargs = {}
+                converted_mod = custom_convert_fn(orig_mod, **custom_convert_kwargs)
+            setattr(model, node.name, converted_mod)
+
+    return model
+
+
+def extract_results_n_shadows_model(model: torch.nn.Module) -> NSResultsType:
+    """
+    Extracts logger results from `model`.
+    """
+    results: NSResultsType = {}
+    _extract_logger_info_one_model(model, results, OutputLogger)
+    return results
+
+
+def print_comparisons_n_shadows_model(results: NSResultsType) -> None:
+    """
+    Prints a summary of extracted `results`.
+    """
+    results_grouped = group_results_by_subgraph(results)
+    results_comparison = create_results_comparison(results_grouped)
+    print_n_shadows_summary(results_comparison)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef5330d2dfed35fc4ebce6ec88ec0448788115f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ad7877ed8be7cdcfc8a99527c6470c731c1e82c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c64e7f16784f5b3262ad614ba382218500395abf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c3b7decef5cea4527ec121108aa3618ec211075
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c36befccf69e1da2ee85ff06462303af6f112e0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d7df37b978fc4ece793d06d4b56e29f7201a722
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee739633fe41f1ccffb1b40bccbb1bdd3a526724
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee6d25c59ae9c398a1d10e3e7a910f538b2d8954
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2969564680c7c0a4de1efa03d7bc78dbd51a5045
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e15a5ad25aefa23e2ce1f558e5e3e581eb505bf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fdad3f2d9bc49094c0da3264012cc206c28ab86
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_matcher.py
@@ -0,0 +1,485 @@
+# mypy: allow-untyped-defs
+import collections
+import enum
+from typing import Any
+
+import torch
+from torch.ao.quantization import FakeQuantizeBase, ObserverBase
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.fx import GraphModule
+from torch.fx.graph import Graph, Node
+
+from .mappings import get_base_name_to_sets_of_related_ops, get_unmatchable_types_map
+from .ns_types import NSNodeTargetType, NSSubgraph
+from .pattern_utils import (
+    end_node_matches_reversed_fusion,
+    get_reversed_fusions,
+    get_type_a_related_to_b,
+)
+
+
+toq = torch.ops.quantized
+
+
+def _get_output_nodes(g: Graph) -> list[Node]:
+    return [n for n in g.nodes if n.op == "output"]
+
+
+class _NSGraphMatchableSubgraphsIterator:
+    """
+    Iterates through the graph of gm, starting with the output nodes
+    and continuing backwards.
+    1. Returns matchable subgraphs, in order. A subgraph is defined by
+       (start_node, end_node).
+    2. Skips over non-matchable subgraphs
+    """
+
+    def __init__(
+        self,
+        gm: GraphModule,
+        non_matchable_functions: set[NSNodeTargetType],
+        non_matchable_modules: set[NSNodeTargetType],
+        non_matchable_methods: set[NSNodeTargetType],
+    ):
+        self.gm: GraphModule = gm
+        self.non_matchable_functions: set[NSNodeTargetType] = non_matchable_functions
+        self.non_matchable_modules: set[NSNodeTargetType] = non_matchable_modules
+        self.non_matchable_methods: set[NSNodeTargetType] = non_matchable_methods
+        self.seen_nodes: set[Node] = set()
+        self.stack: list[Node] = []
+        for start_node in _get_output_nodes(self.gm.graph):
+            self.stack.append(start_node)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self) -> NSSubgraph:
+        """
+        Returns the next matchable subgraph.
+        """
+        while len(self.stack) > 0:
+            cur_end_node = self.stack.pop()
+            if cur_end_node in self.seen_nodes:
+                continue
+
+            # for subgraphs which are single nodes, start_node == end_node
+            # for subgraphs with more than one node, start_node != end_node
+            cur_start_node = cur_end_node
+            # Subgraphs like linear-relu have the base node as the start node.
+            # Subgraphs like dequantize-linear-relu-to(torch.float16) have the
+            #   base node as the second node.
+            # The cur_base_op_node var will move to the actual node during
+            #   the fusion matching later in this code block.
+            cur_base_op_node = cur_end_node
+
+            # Check for potential fusions. For now, we are greedy
+            # and always skip all non-base nodes of a fusion.  For example,
+            # if we match linear-relu backwards, we will always skip the
+            # relu node and attempt to match the linear node.  This can
+            # be made configurable later if needed.
+            for _reverse_fusion_ops, base_op_idx in get_reversed_fusions():
+                is_match = end_node_matches_reversed_fusion(
+                    cur_end_node, _reverse_fusion_ops, self.gm, self.seen_nodes
+                )
+                if is_match:
+                    # navigate to the base node
+                    for rev_fusion_idx in range(len(_reverse_fusion_ops) - 1):
+                        # pyrefly: ignore [bad-argument-type]
+                        self.seen_nodes.add(cur_start_node)
+                        # for now, assume that there are no other nodes
+                        # which need to be added to the stack
+                        cur_start_node = cur_start_node.args[0]  # type: ignore[assignment]
+                        # if the base op index matches the current node, set it
+                        rev_base_op_idx = len(_reverse_fusion_ops) - 2 - base_op_idx
+                        if rev_fusion_idx == rev_base_op_idx:
+                            cur_base_op_node = cur_start_node
+                    break
+
+            # pyrefly: ignore [bad-argument-type]
+            self.seen_nodes.add(cur_start_node)
+            # add args of previous nodes to stack
+            # pyrefly: ignore [missing-attribute]
+            for arg in cur_start_node.all_input_nodes:
+                self._recursively_add_node_arg_to_stack(arg)
+
+            # skip unmatchable nodes
+            # note: this check is done on the base op node, i.e.
+            # if we are matching linear-relu in reverse, this would do the matchable
+            # check on the linear
+            # pyrefly: ignore [bad-argument-type]
+            if not self._is_matchable(cur_base_op_node):
+                continue
+
+            # If an observer or a fake_quant was not matched as a part of
+            # a pattern of multiple nodes, ignore it. One case where this is
+            # relevant is an observer on a graph input, which was added because
+            # it is necessary for the next node.
+            if cur_end_node.op == "call_module" and cur_start_node is cur_end_node:
+                maybe_obs = getattr_from_fqn(self.gm, cur_end_node.target)  # type: ignore[arg-type]
+                if isinstance(maybe_obs, (ObserverBase, FakeQuantizeBase)):
+                    continue
+
+            return NSSubgraph(
+                # pyrefly: ignore [bad-argument-type]
+                start_node=cur_start_node,
+                end_node=cur_end_node,
+                # pyrefly: ignore [bad-argument-type]
+                base_op_node=cur_base_op_node,
+            )
+
+        raise StopIteration
+
+    def _recursively_add_node_arg_to_stack(self, arg: Any) -> None:
+        """
+        Adds all of the nodes in this arg to the stack, properly navigating
+        through lists, dicts and tuples.
+        """
+        if isinstance(arg, Node):
+            self.stack.append(arg)
+        elif (
+            isinstance(arg, torch.fx.immutable_collections.immutable_list)
+            or type(arg) is tuple
+        ):
+            for inner_arg in arg:
+                self._recursively_add_node_arg_to_stack(inner_arg)
+        elif isinstance(arg, torch.fx.immutable_collections.immutable_dict):
+            for value in arg.values():
+                self._recursively_add_node_arg_to_stack(value)
+
+    def _is_matchable(self, node: Node) -> bool:
+        if node.op == "call_function":
+            return node.target not in self.non_matchable_functions
+        elif node.op == "call_module":
+            if not isinstance(node.target, str):
+                raise AssertionError(f"Expected str, got {type(node.target)}")
+            target_mod = getattr_from_fqn(self.gm, node.target)
+            return not any(
+                isinstance(target_mod, t)  # type: ignore[arg-type]
+                for t in self.non_matchable_modules
+            )
+        elif node.op == "call_method":
+            return node.target not in self.non_matchable_methods
+        else:
+            return False
+
+
+class GraphMatchingException(Exception):
+    """
+    Exception raised when two graphs cannot be matched.
+    """
+
+
+class SubgraphTypeRelationship(enum.Enum):
+    # same type, known
+    # example: F.linear and F.linear, or nn.Conv2d and nn.Conv2d
+    EQUAL = enum.auto()
+    # same type, but the type is not known to Numerical Suite
+    # (user defined type, etc).
+    EQUAL_BUT_UKNOWN = enum.auto()
+    # known, same subgraph_relationship set, but not the same type
+    # example: F.linear and toq.linear
+    RELATED_BUT_NOT_EQUAL = enum.auto()
+    # not related
+    NOT_RELATED = enum.auto()
+
+
+def _get_subgraph_relationship_type(
+    subgraph_a: NSSubgraph,
+    subgraph_b: NSSubgraph,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    type_a_related_to_b: set[tuple[NSNodeTargetType, NSNodeTargetType]],
+) -> SubgraphTypeRelationship:
+    node_a = subgraph_a.base_op_node
+    node_b = subgraph_b.base_op_node
+
+    # TODO(next): make this code handle matching by what is before the base op
+    if node_a.op != node_b.op:
+        if not (
+            node_a.op in ("call_function", "call_method")
+            and node_b.op in ("call_function", "call_method")
+        ):
+            return SubgraphTypeRelationship.NOT_RELATED
+
+    if node_a.op in ("call_function", "call_method"):
+        key = (node_a.target, node_b.target)
+
+        if key not in type_a_related_to_b:
+            if node_a.target == node_b.target:
+                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
+            else:
+                return SubgraphTypeRelationship.NOT_RELATED
+        # after this point, we are dealing with known types
+
+        if node_a.target == node_b.target:
+            node_a_has_prev = subgraph_a.base_op_node == subgraph_a.start_node
+            node_b_has_prev = subgraph_b.base_op_node == subgraph_b.start_node
+            if node_a_has_prev and (not node_b_has_prev):
+                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+            elif (not node_a_has_prev) and node_b_has_prev:
+                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+            elif (not node_a_has_prev) and (not node_b_has_prev):
+                return SubgraphTypeRelationship.EQUAL
+            else:
+                # TODO(future PR): check for matches start_op_node and base_op_node
+                return SubgraphTypeRelationship.EQUAL
+
+        if key in type_a_related_to_b:
+            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+        else:
+            return SubgraphTypeRelationship.NOT_RELATED
+    elif node_a.op == "call_module":
+        if (
+            subgraph_a.base_op_node != subgraph_a.start_node
+            or subgraph_b.base_op_node != subgraph_b.start_node
+        ):
+            raise AssertionError(
+                "Matching call_module patterns where base_op_node != start_node is not supported yet"
+            )
+        # for call_module, we need to look up the modules to do the type check
+        if not isinstance(node_a.target, str):
+            raise AssertionError(f"Expected str, got {type(node_a.target)}")
+        mod_a = getattr_from_fqn(gm_a, node_a.target)
+        if not isinstance(node_b.target, str):
+            raise AssertionError(f"Expected str, got {type(node_b.target)}")
+        mod_b = getattr_from_fqn(gm_b, node_b.target)
+
+        key = (type(mod_a), type(mod_b))
+
+        if key not in type_a_related_to_b:
+            if type(mod_a) is type(mod_b):
+                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
+            else:
+                return SubgraphTypeRelationship.NOT_RELATED
+        elif type(mod_a) is type(mod_b):
+            return SubgraphTypeRelationship.EQUAL
+        else:
+            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+
+    return SubgraphTypeRelationship.NOT_RELATED
+
+
+def _get_name_for_subgraph(
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]],
+    existing_names: set[str],
+) -> str:
+    """
+    Returns a unique name for a subgraph. This name is based on two things:
+    1. the name of the set containing the underlying type of the base op in the
+       subgraph (i.e. 'torch.nn.functional.linear' if this is related to a linear op)
+    2. the number of previous subgraphs with related underlying type of the base op
+
+    For example, in the graph
+
+    linear0 -> relu0 -> linear1 -> relu1
+
+    The subgraphs are (linear0, relu0) and (linear1, relu1).  If we iterate
+    from the output node backwards, the name given to (linear1, relu1) will be
+    `base_op_torch.nn.functional.linear_0`, and the name given to (linear0, relu0)
+    will be `base_op_torch.nn.functional.linear_1`.
+
+    Why are we not just using the node name? Answer: because of two requirements:
+    A. fusions must be supported
+    B. some Numeric Suite APIs can be called without having all of the models in memory
+
+    For example, let's say we need to match nodes of
+
+    (1) ... -> linear0 -> relu0 -> ...
+
+    And
+
+    (2) ... -> linear_relu0 -> ...
+
+    Without being able to inspect them together. With the current naming scheme, if
+    we iterate through both of these graphs in the same order, and assuming the rest
+    of the graphs match, both of these subgraphs will get the same name without
+    (1) and (2) knowing anything about each other.
+    """
+    target_type = _get_node_target_type(subgraph_a.base_op_node, gm_a)
+    target_base_type = None
+    for base_name, sets_of_related_ops in base_name_to_sets_of_related_ops.items():
+        if target_type in sets_of_related_ops:
+            target_base_type = base_name
+    target_base_name = "base_op_" + str(target_base_type)
+    counter = 0
+    proposed_name = target_base_name + "_" + str(counter)
+    while proposed_name in existing_names:
+        counter += 1
+        proposed_name = target_base_name + "_" + str(counter)
+    existing_names.add(proposed_name)
+    return proposed_name
+
+
+def _get_node_target_type(node: Node, gm: GraphModule) -> NSNodeTargetType | None:
+    if node.op in ("call_function", "call_method"):
+        return node.target
+    elif node.op == "call_module":
+        if not isinstance(node.target, str):
+            raise AssertionError(f"Expected str, got {type(node.target)}")
+        mod = getattr_from_fqn(gm, node.target)
+        return type(mod)
+    return None
+
+
+def get_matching_subgraph_pairs(
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]] | None = None,
+    unmatchable_types_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> dict[str, tuple[NSSubgraph, NSSubgraph]]:
+    """
+    Matches matchable subgraphs of graph_a to graph_b.
+
+    For a node, "matchable" is defined as a node which is not an observer,
+    fake_quant, quant or dequant.
+
+    A subgraph can contain one or more nodes.  A subgraph is matchable if
+    at least one node inside of it is matchable.  Currently, all nodes in
+    a subgraph must be matchable (because we assume no observers will be
+    inserted in the middle of a fusion).
+
+    A subgraph is defined by (start_node, end_node).  We assume that only
+    start_node and end_node are linked with the surrounding graph, all other
+    nodes in a subgraph are self-contained.
+
+    A pair of nodes is "related" if both nodes represent the same mathematical
+    operation across different quantization flavors. For example,
+    `F.linear` and `torch.ops.quantized.linear` are related, and
+    `F.linear` and `torch.nn.Conv` are not related.
+
+    For each matchable pair of nodes node_a and node_b, they will match
+    if node_a and node_b are related.
+
+    For graphs A and B, they will match iff:
+    1. the number of matchable subgraphs in A and B is equivalent
+    2. when iterating through the matchable subgraphs of A and B in the same order, each
+       corresponding pair of base nodes is related.
+
+    This enables us to find the corresponding subgraphs between
+    graphs of related models.  For example, if we had two graphs such as:
+
+    graph_a: x0 -> conv_0 (type: nn.Conv2d) -> obs_0 -> x1
+             w  -/
+             b  -/
+
+    graph_b: x0 -> quant_0 -> qconv_0 (type: nnq.Conv2d) -> dequant_0 -> x1
+           packed_params_0 -/
+
+    This function will return the following result:
+    {
+        'conv_0': (  # the name of the node in graph_b
+          (conv_0, conv_0),  # (start_node_a, end_node_a)
+          (qconv_0, qconv_0),  # (start_node_b, end_node_b)
+        ),
+    }
+
+    Or, if we have a fusion pattern,
+
+    graph_a: x0 -> linear_0 -> relu_0 -> obs_0 -> x1
+             w  -/
+             b  -/
+
+    graph_b: x0 -> quant_0 -> linear_relu_0 -> dequant_0 -> x1
+           packed_params_0 -/
+
+    This function will return the following result:
+    {
+        'linear_relu_0': (  # the name of the node in graph_b
+          (linear_0, relu_0),  # (start_node_a, end_node_a)
+          (linear_relu_0, linear_relu_0),  # (start_node_b, end_node_b)
+        ),
+    }
+    """
+    if unmatchable_types_map is None:
+        unmatchable_types_map = get_unmatchable_types_map()
+    non_matchable_functions = unmatchable_types_map["funs_unmatchable"]
+    non_matchable_modules = unmatchable_types_map["mods_unmatchable"]
+    non_matchable_methods = unmatchable_types_map["meths_unmatchable"]
+
+    graph_a_iterator = _NSGraphMatchableSubgraphsIterator(
+        gm_a, non_matchable_functions, non_matchable_modules, non_matchable_methods
+    )
+    graph_b_iterator = _NSGraphMatchableSubgraphsIterator(
+        gm_b, non_matchable_functions, non_matchable_modules, non_matchable_methods
+    )
+    results = collections.OrderedDict()
+    if base_name_to_sets_of_related_ops is None:
+        base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops()
+    type_a_related_to_b = get_type_a_related_to_b(base_name_to_sets_of_related_ops)
+
+    existing_names_a: set[str] = set()
+    existing_names_b: set[str] = set()
+
+    while True:
+        # fetch the next subgraphs from a and b
+        cur_subgraph_a, cur_subgraph_b = None, None
+        try:
+            cur_subgraph_a = next(graph_a_iterator)
+        except StopIteration:
+            pass
+        try:
+            cur_subgraph_b = next(graph_b_iterator)
+        except StopIteration:
+            pass
+
+        # look up types of a and b for useful error messages
+        type_start_a, type_start_b = None, None
+        if cur_subgraph_a is not None:
+            type_start_a = _get_node_target_type(cur_subgraph_a.start_node, gm_a)
+        if cur_subgraph_b is not None:
+            type_start_b = _get_node_target_type(cur_subgraph_b.start_node, gm_b)
+
+        # check for results and determine what to do next
+        if cur_subgraph_a is not None and cur_subgraph_b is not None:
+            # both nodes were fetched, check for subgraph_relationship
+            # note: subgraph_relationship is checked on the start node, i.e.
+            # if a linear-relu pattern is checked, we would check for subgraph_relationship
+            # of the linear
+            subgraph_relationship = _get_subgraph_relationship_type(
+                cur_subgraph_a, cur_subgraph_b, gm_a, gm_b, type_a_related_to_b
+            )
+            if subgraph_relationship == SubgraphTypeRelationship.NOT_RELATED:
+                msg = f"""
+The subgraphs
+({cur_subgraph_a}, {type_start_a}) and
+({cur_subgraph_b}, {type_start_b})
+are not related. Please ensure that the two models you pass in have the same number
+of subgraphs, and each pair of subgraphs is related to each other."""
+                raise GraphMatchingException(msg)
+            elif subgraph_relationship == SubgraphTypeRelationship.EQUAL_BUT_UKNOWN:
+                # skip matching but unknown types
+                continue
+            key_name_a = _get_name_for_subgraph(
+                cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops, existing_names_a
+            )
+            key_name_b = _get_name_for_subgraph(
+                cur_subgraph_b, gm_b, base_name_to_sets_of_related_ops, existing_names_b
+            )
+            if key_name_a != key_name_b:
+                raise AssertionError(
+                    f"Subgraph names {key_name_a} and {key_name_b} do not match"
+                )
+            results[key_name_a] = (cur_subgraph_a, cur_subgraph_b)
+            continue
+        elif cur_subgraph_a is None and cur_subgraph_b is None:
+            # we reached the end of both graphs
+            break
+        else:
+            # only one node was fetched, no match possible, throw error
+            msg = f"""
+Attempting to match
+({cur_subgraph_a}, {type_start_a}) and
+({cur_subgraph_b}, {type_start_b}),
+one of which is empty. Please ensure that the two models you pass in have the same number
+of subgraphs."""
+            raise GraphMatchingException(msg)
+
+    # The subgraph pairs are originally created by traversing the two graphs
+    # from the outputs to the inputs. Reverse the results to return the
+    # subgraphs in their order of execution.
+    results = collections.OrderedDict(reversed(results.items()))
+
+    # pyrefly: ignore [bad-return]
+    return results
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py
new file mode 100644
index 0000000000000000000000000000000000000000..338db28ce41d96ec5d3de38591f5937543d65394
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/graph_passes.py
@@ -0,0 +1,1155 @@
+# mypy: allow-untyped-defs
+from collections.abc import Callable
+from typing import Any
+
+import torch
+from torch.ao.ns.fx.mappings import get_node_type_to_io_type_map
+from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
+from torch.ao.quantization.observer import _is_activation_post_process
+from torch.fx import GraphModule, map_arg
+from torch.fx.graph import Graph, Node
+
+from .ns_types import NSNodeTargetType, NSSingleResultValuesType, NSSubgraph
+from .utils import (
+    get_arg_indices_of_inputs_to_log,
+    get_node_first_input_and_output_type,
+    get_node_input_qparams,
+    get_normalized_nth_input,
+    get_number_of_non_param_args,
+    get_target_type_str,
+    getattr_from_fqn,
+    NodeInputOrOutputType,
+    op_type_supports_shadowing,
+    return_first_non_observer_node,
+)
+
+
+def _maybe_get_fqn(node: Node, gm: GraphModule) -> str | None:
+    fqn = None  # stays None when gm has no `_node_name_to_scope` attribute
+    if hasattr(gm, "_node_name_to_scope"):
+        # Observers have no fqn of their own, because they do not exist
+        # when the fqns are created during tracing. For an observer,
+        # look up the fqn of the node being observed (its input 0) instead.
+        node_to_use_for_fqn = node
+        if node.op == "call_module":
+            if not isinstance(node.target, str):
+                raise AssertionError(f"Expected str, got {type(node.target)}")
+            module = getattr_from_fqn(gm, node.target)
+            if _is_activation_post_process(module):
+                node_to_use_for_fqn = get_normalized_nth_input(node, gm, 0)
+        fqn = gm._node_name_to_scope[node_to_use_for_fqn.name][0]  # type: ignore[index]
+    return fqn  # type: ignore[return-value]
+
+
+def _insert_logger_after_node(
+    node: Node,
+    gm: GraphModule,
+    logger_cls: Callable,
+    logger_node_name_suffix: str,
+    ref_node_name: str,
+    model_name: str,
+    ref_name: str,
+    ref_node_target_type: str,
+    results_type: str,
+    index_within_arg: int,
+    index_of_arg: int,
+    fqn: str | None,
+) -> Node:
+    """
+    Creates a `logger_cls` object, attaches it to `gm` under a new attribute
+    name, and adds a `call_module` node for it to `node.graph` with `node` as
+    its only input:
+
+    prev_node -> node -> logger_obj
+
+    Consumers of `node` are not rewired here; the new logger node is returned
+    so that the caller can route them through it.
+    """
+    # build the attribute name from the node name and the suffix
+    logger_node_name = get_new_attr_name_with_prefix(
+        node.name + logger_node_name_suffix
+    )(gm)
+    target_type = get_target_type_str(node, gm)
+    # create the logger object; the arguments are passed positionally
+    logger_obj = logger_cls(
+        ref_node_name,
+        node.name,
+        model_name,
+        ref_name,
+        target_type,
+        ref_node_target_type,
+        results_type,
+        index_within_arg,
+        index_of_arg,
+        fqn,
+    )
+    # attach the logger object to the parent module
+    setattr(gm, logger_node_name, logger_obj)
+    logger_node = node.graph.create_node("call_module", logger_node_name, (node,), {})
+    return logger_node
+
+
+def add_loggers_to_model(
+    gm: GraphModule,
+    node_to_instrument_inputs_to_ref_node_name: dict[Node, tuple[str, str]],
+    node_to_instrument_outputs_to_ref_node_name: dict[Node, tuple[str, str]],
+    logger_cls: Callable,
+    model_name: str,
+) -> GraphModule:
+    """
+    Copies the graph of gm, adding a `logger_cls` logger on the inputs and/or the
+    output of each node found in the two instrumentation maps, whose values are
+    (ref_name, ref_node_type) tuples. Returns a GraphModule with the new graph.
+    """
+
+    new_graph = Graph()
+    env: dict[str, Any] = {}  # name of a node in gm.graph -> its stand-in in new_graph
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env[node.name])
+
+    for node in gm.graph.nodes:
+        if node.op == "output":
+            new_graph.output(map_arg(get_normalized_nth_input(node, gm, 0), load_arg))
+            continue
+
+        if (node in node_to_instrument_inputs_to_ref_node_name) or (
+            node in node_to_instrument_outputs_to_ref_node_name
+        ):
+            fqn = _maybe_get_fqn(node, gm)
+
+            if node in node_to_instrument_inputs_to_ref_node_name:
+                ref_name, ref_node_type = node_to_instrument_inputs_to_ref_node_name[
+                    node
+                ]
+                # Ops such add and mul are special because either
+                # one or two of the first two arguments can be tensors,
+                # and if one argument is a tensor it can be first or
+                # second (x + 1 versus 1 + x).
+                arg_indices_to_log = get_arg_indices_of_inputs_to_log(node)
+                for node_arg_idx in arg_indices_to_log:
+                    node_arg = get_normalized_nth_input(node, gm, node_arg_idx)
+                    if type(node_arg) is Node:
+                        # create a single input logger and make it the stand-in
+                        prev_node = env[node_arg.name]
+                        env[node_arg.name] = _insert_logger_after_node(
+                            prev_node,
+                            gm,
+                            logger_cls,
+                            "_ns_logger_",
+                            node.name,
+                            model_name,
+                            ref_name,
+                            ref_node_type,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0,
+                            index_of_arg=node_arg_idx,
+                            fqn=fqn,
+                        )
+                    elif (
+                        type(node_arg) is torch.fx.immutable_collections.immutable_list
+                    ):
+                        # create N input loggers, one for each node
+                        for arg_idx, arg in enumerate(node_arg):  # type: ignore[var-annotated, arg-type]
+                            prev_node = env[arg.name]
+                            env[arg.name] = _insert_logger_after_node(  # keyed by the original name, as above
+                                prev_node,
+                                gm,
+                                logger_cls,
+                                "_ns_logger_",
+                                node.name,
+                                model_name,
+                                ref_name,
+                                ref_node_type,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=arg_idx,
+                                index_of_arg=node_arg_idx,
+                                fqn=fqn,
+                            )
+
+            # ensure env is populated with base node
+            # Note: runs for both inputs and outputs
+            env[node.name] = new_graph.node_copy(node, load_arg)
+
+            if node in node_to_instrument_outputs_to_ref_node_name:
+                ref_name, ref_node_type = node_to_instrument_outputs_to_ref_node_name[
+                    node
+                ]
+                # add the logger after the base node; later consumers read the logger
+                env[node.name] = _insert_logger_after_node(
+                    env[node.name],
+                    gm,
+                    logger_cls,
+                    "_ns_logger_",
+                    node.name,
+                    model_name,
+                    ref_name,
+                    ref_node_type,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0,
+                    index_of_arg=0,
+                    fqn=fqn,
+                )
+
+        else:
+            env[node.name] = new_graph.node_copy(node, load_arg)  # not instrumented: plain copy
+
+    new_gm = GraphModule(gm, new_graph)
+    return new_gm
+
+
+def _insert_quantize_per_tensor_node(
+    prev_node_c: Node,
+    node_a: Node,
+    gm_b: GraphModule,
+    graph_c: Graph,
+    scale: torch.Tensor | float,
+    zero_point: torch.Tensor | int,
+    dtype_cast_name: str,
+) -> Node:
+    # store scale on gm_b under a new attribute name and read it via a get_attr node
+    scale_node_name = get_new_attr_name_with_prefix(node_a.name + "_input_scale_")(gm_b)
+    setattr(gm_b, scale_node_name, scale)
+    scale_node = graph_c.create_node(
+        "get_attr", scale_node_name, (), {}, scale_node_name
+    )
+    # same for zero_point
+    zero_point_node_name = get_new_attr_name_with_prefix(
+        node_a.name + "_input_zero_point_"
+    )(gm_b)
+    setattr(gm_b, zero_point_node_name, zero_point)
+    zero_point_node = graph_c.create_node(
+        "get_attr", zero_point_node_name, (), {}, zero_point_node_name
+    )
+    # create the quantize_per_tensor call on prev_node_c and return its node
+    return graph_c.create_node(
+        "call_function",
+        torch.quantize_per_tensor,
+        (prev_node_c, scale_node, zero_point_node, torch.quint8),  # dtype is fixed to quint8
+        {},
+        dtype_cast_name,
+    )
+
+
+def _insert_dtype_cast_after_node(
+    node_a: Node,
+    node_c: Node,
+    prev_node_c: Node | list[Node],
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    graph_c: Graph,
+    node_name_prefix: str,
+    logger_cls: Callable,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]],
+) -> Node | list[Node]:
+    """
+    Given a starting graph C (derived from graph B) of
+
+    ... -> prev_node_c -> node_c -> ...
+
+    and a related node_a, inserts a dtype cast node after prev_node_c which
+    casts into the input dtype expected by node_a, resulting in:
+
+                          dtype_cast
+                        /
+    ... -> prev_node_c -> node_c -> ...
+
+    For example, if node_c is an int8 op and node_a is an fp32 op, a dequant is
+    inserted. Returns the new cast node, or a list of them (one per element)
+    when prev_node_c is a list. Raises AssertionError for unsupported casts.
+    """
+    dtype_cast_op = None  # call_function target, when the cast is a function
+    dtype_cast_mod_cls = None  # module class, when the cast is a module
+    dtype_cast_method = None  # method name, when the cast is a method call
+    dtype_cast_method_dtype = None
+    dtype_cast_scale = None
+    dtype_cast_zero_point = None
+    node_input_type_a, _node_output_type_a = get_node_first_input_and_output_type(
+        node_a, gm_a, logger_cls, node_type_to_io_type_map
+    )
+    node_input_type_c, _node_output_type_c = get_node_first_input_and_output_type(
+        node_c, gm_b, logger_cls, node_type_to_io_type_map
+    )
+
+    if (
+        (
+            node_input_type_a == NodeInputOrOutputType.FP32
+            and node_input_type_c == NodeInputOrOutputType.INT8
+        )
+        or (
+            node_input_type_a == NodeInputOrOutputType.FP32
+            and node_input_type_c == NodeInputOrOutputType.FP16
+        )
+        or
+        # TODO(future PR): determine the actual dtype of node_c,
+        # the current code only works because dequantize works with
+        # multiple input dtypes.
+        (
+            node_input_type_a == NodeInputOrOutputType.FP32
+            and node_input_type_c == NodeInputOrOutputType.FP32_OR_INT8
+        )
+    ):
+        dtype_cast_op = torch.dequantize
+    elif (
+        node_input_type_a == node_input_type_c
+        and node_input_type_a != NodeInputOrOutputType.UNKNOWN
+    ):
+        dtype_cast_mod_cls = torch.nn.Identity  # same known dtype: pass-through module
+    elif (
+        node_input_type_a == NodeInputOrOutputType.INT8
+        and node_input_type_c == NodeInputOrOutputType.FP32
+    ):
+        # int8 shadows fp32, the dtype cast needs to quantize to int8
+        # with the right qparams.
+        node_a_input_qparams = get_node_input_qparams(
+            node_a, gm_a, node_type_to_io_type_map
+        )
+        if node_a_input_qparams is not None:  # else nothing is selected and the code below raises
+            dtype_cast_op = torch.quantize_per_tensor  # type: ignore[assignment]
+            dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams
+    elif (
+        node_input_type_a == NodeInputOrOutputType.FP16
+        and node_input_type_c == NodeInputOrOutputType.FP32
+    ):
+        dtype_cast_method = "to"
+        dtype_cast_method_dtype = torch.float16
+    else:
+        raise AssertionError(
+            f"dtype cast from {node_input_type_c} {node_c.format_node()} to "
+            + f"{node_input_type_a} {node_a.format_node()} needs to be implemented"
+        )
+
+    if isinstance(prev_node_c, Node):
+        new_dtype_cast_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+        if dtype_cast_op:
+            if dtype_cast_scale is not None and dtype_cast_zero_point is not None:
+                return _insert_quantize_per_tensor_node(
+                    prev_node_c,
+                    node_a,
+                    gm_b,
+                    graph_c,
+                    dtype_cast_scale,
+                    dtype_cast_zero_point,
+                    new_dtype_cast_name,
+                )
+            else:
+                return graph_c.create_node(
+                    "call_function",
+                    dtype_cast_op,
+                    (prev_node_c,),
+                    {},
+                    new_dtype_cast_name,
+                )
+        elif dtype_cast_method:
+            return graph_c.create_node(
+                "call_method",
+                dtype_cast_method,
+                (prev_node_c, dtype_cast_method_dtype),
+                {},
+                new_dtype_cast_name,
+            )
+        else:
+            if not dtype_cast_mod_cls:
+                raise AssertionError("Expected dtype_cast_mod_cls to be not None")
+            dtype_cast_mod = dtype_cast_mod_cls()
+            setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
+            return graph_c.create_node(
+                "call_module",
+                new_dtype_cast_name,
+                (prev_node_c,),
+                {},
+                new_dtype_cast_name,
+            )
+    elif isinstance(prev_node_c, list):
+        results = []
+        for prev_node_c_inner in prev_node_c:  # one cast node per list element
+            new_dtype_cast_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+            if dtype_cast_op:
+                # TODO(future PR): add handling for quantize_per_tensor
+                new_dtype_cast_node = graph_c.create_node(
+                    "call_function",
+                    dtype_cast_op,
+                    (prev_node_c_inner,),
+                    {},
+                    new_dtype_cast_name,
+                )
+                results.append(new_dtype_cast_node)
+            else:  # NOTE: the "to" method cast is not handled for lists and raises below
+                if not dtype_cast_mod_cls:
+                    raise AssertionError("Expected dtype_cast_mod_cls to be not None")
+                dtype_cast_mod = dtype_cast_mod_cls()
+                setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
+                new_dtype_cast_node = graph_c.create_node(
+                    "call_module",
+                    new_dtype_cast_name,
+                    (prev_node_c_inner,),
+                    {},
+                    new_dtype_cast_name,
+                )
+                results.append(new_dtype_cast_node)
+        return results
+    else:
+        raise AssertionError(f"type {type(prev_node_c)} is not handled")
+
+
+# TODO(future PR): look into using copy_node API instead
+def _copy_node_from_a_to_c(
+    node_a: Node,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    graph_c: Graph,
+) -> Node:
+    """
+    Copy a `get_attr`, `dequantize` or `to` node_a, plus the chain feeding it, to graph_c.
+    """
+    if node_a.op == "get_attr":
+        node_a_copy_name = get_new_attr_name_with_prefix(node_a.name + "_shadow_copy_")(
+            gm_b
+        )
+        node_a_obj = getattr_from_fqn(gm_a, node_a.target)  # type: ignore[arg-type]
+        if torch.is_tensor(node_a_obj):
+            node_a_obj = node_a_obj.detach()  # tensors are stored detached on gm_b
+        setattr(gm_b, node_a_copy_name, node_a_obj)
+        node_a_copy = graph_c.create_node(
+            node_a.op, node_a_copy_name, (), {}, node_a_copy_name
+        )
+        return node_a_copy
+    elif node_a.op == "call_method":
+        if node_a.target not in ("dequantize", "to"):
+            raise AssertionError(f"target {node_a.target} is not implemented")
+        if node_a.target == "dequantize":
+            arg_copy = _copy_node_from_a_to_c(  # recurse: copy input 0 first
+                get_normalized_nth_input(node_a, gm_a, 0), gm_a, gm_b, graph_c
+            )  # type: ignore[arg-type]
+            node_a_copy_name = get_new_attr_name_with_prefix(
+                node_a.name + "_shadow_copy_"
+            )(gm_b)
+            node_a_copy = graph_c.create_node(
+                node_a.op, node_a.target, (arg_copy,), {}, node_a_copy_name
+            )
+            return node_a_copy
+        else:  # to
+            arg_copy = _copy_node_from_a_to_c(  # recurse: copy input 0 first
+                get_normalized_nth_input(node_a, gm_a, 0), gm_a, gm_b, graph_c
+            )  # type: ignore[arg-type]
+            node_a_copy_name = get_new_attr_name_with_prefix(
+                node_a.name + "_shadow_copy_"
+            )(gm_b)
+            node_a_copy = graph_c.create_node(
+                node_a.op,
+                node_a.target,
+                (arg_copy, get_normalized_nth_input(node_a, gm_a, 1)),  # input 1 is passed through uncopied
+                {},
+                node_a_copy_name,
+            )
+            return node_a_copy
+
+    else:
+        raise AssertionError(
+            f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented"
+        )
+
+
+def _can_insert_copy_of_subgraph_a(
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    num_non_param_args_node_a: int,
+) -> bool:
+    """
+    This function returns `False` if the input subgraph cannot be copied by
+    `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means
+    that there is a corner case logic for which copy is not yet implemented.
+    """
+    # collect the subgraph's nodes by walking input 0 from end_node back to start_node
+    nodes = []
+    cur_node = subgraph_a.end_node
+    while cur_node != subgraph_a.start_node:
+        nodes.append(cur_node)
+        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
+    nodes.append(cur_node)
+    nodes.reverse()  # start_node first
+
+    def _can_insert(node_a_arg, gm_a):
+        if isinstance(node_a_arg, Node):
+            arg_a = return_first_non_observer_node(node_a_arg, gm_a)
+            if arg_a.op == "call_method":
+                return arg_a.target in ("dequantize", "to")
+            elif arg_a.op == "get_attr":
+                return True
+            else:
+                return False
+        elif isinstance(node_a_arg, (list, tuple)):
+            for el in node_a_arg:
+                if not isinstance(el, Node):  # NOTE(review): `_copy_arg` raises when el IS a Node - confirm polarity
+                    return False
+        return True
+
+    # For each node, check if we handle the copy behavior. This follows the
+    # logic in `_insert_copy_of_subgraph_a_after_input_node_c`.
+    for node_a in nodes:
+        local_num_non_param_args_node_a = (
+            num_non_param_args_node_a if node_a is nodes[0] else 1
+        )
+
+        norm_args_kwargs = node_a.normalized_arguments(
+            gm_a, normalize_to_only_use_kwargs=True
+        )
+        if norm_args_kwargs is not None:
+            norm_args, norm_kwargs = norm_args_kwargs
+        else:
+            norm_args, norm_kwargs = node_a.args, node_a.kwargs  # fall back to the raw args
+
+        cur_idx = 0  # running position across args, then kwargs
+
+        while cur_idx < len(norm_args):
+            if cur_idx == 0:
+                pass  # position 0 is never checked
+            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
+                pass  # position 1 is skipped too when there are two non-param args
+            else:
+                if not _can_insert(norm_args[cur_idx], gm_a):
+                    return False
+            cur_idx += 1
+
+        for kwarg_val in norm_kwargs.values():
+            # same position-based skipping as for the args above
+            if cur_idx == 0:
+                pass
+            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
+                pass
+            else:
+                if not _can_insert(kwarg_val, gm_a):
+                    return False
+            cur_idx += 1
+
+    return True
+
+
+def _insert_copy_of_subgraph_a_after_input_node_c(
+    input_node_c: Node | list[Node],
+    input_node_c_2: Node | list[Node] | None,
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    node_name_prefix: str,
+) -> Node:
+    """
+    Copies subgraph_a's nodes, start to end, after input_node_c; returns the last copy.
+    """
+    if not isinstance(input_node_c, (Node, list)):
+        raise AssertionError(f"Expected Node or list, got {type(input_node_c)}")
+
+    # create a sequential list of the subgraphs' nodes from start to end,
+    # because we need to add the nodes to graph C in non-reverse order
+    nodes_of_a = [subgraph_a.end_node]
+    cur_node = subgraph_a.end_node
+    while cur_node != subgraph_a.start_node:
+        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
+        nodes_of_a.insert(0, cur_node)
+
+    # go through nodes of a in order, and insert them into the graph of c
+    # sequentially; only the first node receives input_node_c_2
+    cur_node_a = nodes_of_a[0]
+    cur_node_c = _insert_copy_of_node_a_after_input_node_c(
+        input_node_c, input_node_c_2, cur_node_a, gm_a, gm_b, node_name_prefix
+    )
+    for cur_idx_a in range(1, len(nodes_of_a)):
+        cur_node_a = nodes_of_a[cur_idx_a]
+        prev_node_c = cur_node_c  # previous added node is the input to next node
+        cur_node_c = _insert_copy_of_node_a_after_input_node_c(
+            prev_node_c,
+            # TODO(future PR): enable multiple inputs for nodes which are not at start of subgraph
+            None,
+            cur_node_a,
+            gm_a,
+            gm_b,
+            node_name_prefix,
+        )
+    # return the last inserted node
+    return cur_node_c
+
+
+def _insert_copy_of_node_a_after_input_node_c(
+    input_node_c: Node | list[Node],
+    input_node_c_2: Node | list[Node] | None,
+    node_a: Node,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    node_name_prefix: str,
+) -> Node:
+    """
+    Assume that node_a from graph_a has
+      args (input, (input2)?, arg1, ...), and
+      kwargs {kw0: kwarg0, ...}
+
+    Note: input2 is optional. If it equals to None, we assume that the op
+    has a single non-param input.  If it is specified, we assume that the op
+    has two non-param inputs.
+
+    Copies the underlying values of arg1..argn and kwarg0..kwargn into gm_b,
+    and creates the corresponding nodes in graph_c. Note: observers are ignored,
+    so if an arg is an observer we navigate up until we find a non-observer parent.
+
+    If node_a is a call_module, points the module pointed to by node_a to gm_b.
+
+    Creates the copy of node_a in graph_c, with input as the first arg,
+    and all other args and kwargs pointing to the copies of the objects
+    in gm_b created above.
+
+    An example in pictures:
+
+    graph A:
+    ========
+
+    input -------------> node_a
+                         / / /
+    (input_2)?----------/ / /
+                         / /
+    weight -> weight_obs  /
+                         /
+    bias ----------------
+
+    graph C (derived from B):
+    =========================
+
+    input_node_c --> node_a_copy
+                     / / /
+    (input_node_c_2)? / /
+                     / /
+    weight_copy ----/ /
+                     /
+    bias_copy ------/
+    """
+    if isinstance(input_node_c, Node):
+        graph_c = input_node_c.graph
+    else:
+        if not isinstance(input_node_c, list):
+            raise AssertionError(f"Expected list, got {type(input_node_c)}")
+        graph_c = input_node_c[0].graph  # graph C is taken from the first list element
+
+    norm_args_kwargs = node_a.normalized_arguments(
+        gm_a, normalize_to_only_use_kwargs=True
+    )
+    if norm_args_kwargs is not None:
+        norm_args, norm_kwargs = norm_args_kwargs
+    else:
+        norm_args, norm_kwargs = node_a.args, node_a.kwargs  # fall back to the raw args
+
+    new_args = []
+    new_kwargs = {}
+
+    def _copy_arg(arg):
+        # Copy one non-input arg or kwarg value; only `arg` is inspected here.
+        if isinstance(arg, Node):
+            arg = return_first_non_observer_node(arg, gm_a)
+            arg = _copy_node_from_a_to_c(arg, gm_a, gm_b, graph_c)
+            return arg
+        elif isinstance(arg, (int, float, torch.dtype)):
+            return arg
+        elif isinstance(arg, (list, tuple)):
+            for el in arg:
+                if isinstance(el, Node):
+                    raise AssertionError(
+                        "handling of Node inside list is not implemented"
+                    )
+            return arg
+        else:
+            raise AssertionError(
+                f"handling for kwarg of type {type(arg)} is not implemented"
+            )
+
+    cur_idx = 0  # running position across args, then kwargs
+
+    while cur_idx < len(norm_args):
+        if cur_idx == 0:
+            new_arg = input_node_c
+        elif cur_idx == 1 and input_node_c_2 is not None:
+            new_arg = input_node_c_2
+        else:
+            new_arg = _copy_arg(norm_args[cur_idx])
+        new_args.append(new_arg)
+        cur_idx += 1
+
+    for kwarg_name, kwarg_val in norm_kwargs.items():
+        # stitch the inputs from base graph
+        if cur_idx == 0:
+            new_kwargs[kwarg_name] = input_node_c
+        elif cur_idx == 1 and input_node_c_2 is not None:
+            new_kwargs[kwarg_name] = input_node_c_2
+        else:
+            new_kwargs[kwarg_name] = _copy_arg(kwarg_val)
+        cur_idx += 1
+
+    new_args = tuple(new_args)  # type: ignore[assignment]
+
+    node_a_shadows_c_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+
+    if node_a.op == "call_module":
+        # if target is a module, we point to the module from gm_b
+        new_mod_copy_name = get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+        # fetch the corresponding module from gm_a
+        if not isinstance(node_a.target, str):
+            raise AssertionError(f"Expected str, got {type(node_a.target)}")
+        mod_a = getattr_from_fqn(gm_a, node_a.target)
+        setattr(gm_b, new_mod_copy_name, mod_a)  # the same module object is attached to gm_b
+        node_a_shadows_c = graph_c.create_node(
+            node_a.op,
+            new_mod_copy_name,
+            new_args,  # type: ignore[arg-type]
+            new_kwargs,  # type: ignore[arg-type]
+            node_a_shadows_c_name,
+        )
+        return node_a_shadows_c
+    else:
+        if node_a.op not in ("call_function", "call_method"):
+            raise AssertionError(f"Unexpected op: {node_a.op}")
+        node_a_shadows_c = graph_c.create_node(
+            node_a.op,
+            node_a.target,
+            new_args,  # type: ignore[arg-type]
+            new_kwargs,  # type: ignore[arg-type]
+            node_a_shadows_c_name,
+        )
+        return node_a_shadows_c
+
+
+def create_a_shadows_b(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    matched_subgraph_pairs: dict[str, tuple[NSSubgraph, NSSubgraph]],
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]] | None = None,
+) -> GraphModule:
+    """
+    Creates a new GraphModule consisting of the graph of C, with the meaningful
+    nodes of A shadowing the corresponding nodes of B.  For example,
+
+    Graph A:  a0 -> op0_fp32 -> a1 -> op1_fp32 -> a2
+    Graph B:  b0 -> op0_int8 -> b1 -> op1_int8 -> b2
+
+    matched_subgraph_pairs: {'op0': (op0_fp32, op0_int8), 'op1': (op1_fp32, op1_int8)}
+
+    Graph C (A shadows B):
+        / dequant0 -> op0_fp32 -> logger_a_0  / dequant_1 -> op1_fp32 -> logger_a_1
+       /                                     /
+    b0 -------------> op0_int8 -> logger_b_0 --------------> op1_int8 -> logger_b_1
+
+    For each matched pair this copies the necessary attributes and modules from
+    gm_a to gm_b (keeping names unique), adds a dtype cast op (dequant, quant,
+    etc), adds a copy of subgraph_a to graph C and adds loggers to the outputs of
+    both sides.  A pair which cannot be shadowed is reported with print() and its
+    node_b is copied without loggers.
+
+    name_a and name_b are forwarded to the loggers inserted for each side and
+    logger_cls is the logger class handed to the insertion helpers.  With
+    should_log_inputs, the first input of every start node is logged as well.
+    Returns a GraphModule built from gm_b and graph C.
+    """
+
+    if node_type_to_io_type_map is None:
+        node_type_to_io_type_map = get_node_type_to_io_type_map()
+
+    # graph_c is the graph created from copying the nodes of graph_b and inserting
+    # the shadows with the nodes copied from graph_a
+    graph_c = Graph()
+    env_c: dict[str, Any] = {}
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env_c[node.name])
+
+    start_node_b_to_matched_subgraph_a_and_name = {}
+    end_node_b_to_matched_subgraph_a_and_name = {}
+    for match_name, match in matched_subgraph_pairs.items():
+        subgraph_a, subgraph_b = match
+        ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
+        ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
+        start_node_b_to_matched_subgraph_a_and_name[subgraph_b.start_node] = (
+            subgraph_a,
+            match_name,
+            ref_node_type_a,
+            ref_node_type_b,
+        )
+        end_node_b_to_matched_subgraph_a_and_name[subgraph_b.end_node] = (
+            subgraph_a,
+            match_name,
+            ref_node_type_a,
+            ref_node_type_b,
+        )
+
+    for node_b in gm_b.graph.nodes:
+        if node_b.op == "output":
+            graph_c.output(map_arg(node_b.args[0], load_arg))
+            continue
+
+        # calculate the flags to determine what to do with this node
+        node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name
+        node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name
+
+        if node_b_is_start_node or node_b_is_end_node:
+            if node_b_is_start_node:
+                (
+                    subgraph_a,
+                    ref_name,
+                    ref_node_type_a,
+                    ref_node_type_b,
+                ) = start_node_b_to_matched_subgraph_a_and_name[node_b]
+            else:
+                if not node_b_is_end_node:
+                    raise AssertionError("Expected node_b_is_end_node to be not false")
+                (
+                    subgraph_a,
+                    ref_name,
+                    ref_node_type_a,
+                    ref_node_type_b,
+                ) = end_node_b_to_matched_subgraph_a_and_name[node_b]
+
+            all_op_types_support_shadowing = op_type_supports_shadowing(
+                subgraph_a.start_node
+            ) and op_type_supports_shadowing(node_b)
+            if not all_op_types_support_shadowing:
+                print(
+                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
+                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
+                    + ", unsupported"
+                )
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+
+            # For both start_node and end_node verify that we know how to do
+            # the dtype cast. If we do not, skip.
+            (
+                node_input_type_a,
+                node_output_type_a,
+            ) = get_node_first_input_and_output_type(
+                subgraph_a.start_node, gm_a, logger_cls, node_type_to_io_type_map
+            )
+            (
+                node_input_type_b,
+                node_output_type_b,
+            ) = get_node_first_input_and_output_type(
+                node_b, gm_b, logger_cls, node_type_to_io_type_map
+            )
+            node_io_types_known_a_and_b = (
+                node_input_type_a != NodeInputOrOutputType.UNKNOWN
+                and node_output_type_a != NodeInputOrOutputType.UNKNOWN
+                and node_input_type_b != NodeInputOrOutputType.UNKNOWN
+                and node_output_type_b != NodeInputOrOutputType.UNKNOWN
+            )
+            if not node_io_types_known_a_and_b:
+                print(
+                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
+                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
+                    + ", unknown dtype cast"
+                )
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+
+            # If we are shadowing from fp32 to int8, we need to insert
+            # quantize_per_tensor call with qparams from the previous node.
+            # Only do this if we are able to infer these qparams from the graph.
+            if (
+                node_input_type_a == NodeInputOrOutputType.INT8
+                and node_input_type_b == NodeInputOrOutputType.FP32
+            ):
+                node_a_input_qparams = get_node_input_qparams(
+                    subgraph_a.start_node, gm_a, node_type_to_io_type_map
+                )
+                if not node_a_input_qparams:
+                    print(
+                        f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
+                        + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
+                        + ", unknown input qparams"
+                    )
+                    env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                    continue
+
+            num_non_param_args_node_a = get_number_of_non_param_args(
+                subgraph_a.start_node, gm_a
+            )
+            if not _can_insert_copy_of_subgraph_a(
+                subgraph_a, gm_a, num_non_param_args_node_a
+            ):
+                print(
+                    f"skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}"
+                    + f", start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}"
+                    + ", unhandled logic in subgraph copy"
+                )
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+            # subgraph_b from the setup loop is stale (last pair); fetch this match's
+            subgraph_b = matched_subgraph_pairs[ref_name][1]
+            fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a)
+            fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b)
+            if node_b_is_start_node:
+                # if necessary, log the input of node_c
+                if should_log_inputs:
+                    prev_node_b = get_normalized_nth_input(node_b, gm_b, 0)
+                    if isinstance(prev_node_b, Node):
+                        prev_node_c = env_c[prev_node_b.name]
+                        env_c[prev_node_c.name] = _insert_logger_after_node(
+                            prev_node_c,
+                            gm_b,
+                            logger_cls,
+                            "_ns_logger_b_inp_",
+                            node_b.name,
+                            name_b,
+                            ref_name,
+                            ref_node_type_b,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0,
+                            index_of_arg=0,
+                            fqn=fqn_base_b,
+                        )
+                    elif isinstance(prev_node_b, list):
+                        # first, save the prev_node instances, because they
+                        # will be overwritten in the env after the first logger
+                        # is added
+                        prev_node_c_list = [env_c[arg.name] for arg in prev_node_b]
+
+                        for arg_idx, prev_node_c in enumerate(prev_node_c_list):
+                            env_c[prev_node_c.name] = _insert_logger_after_node(
+                                prev_node_c,
+                                gm_b,
+                                logger_cls,
+                                "_ns_logger_b_inp_",
+                                node_b.name,
+                                name_b,
+                                ref_name,
+                                ref_node_type_b,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=arg_idx,
+                                index_of_arg=0,
+                                fqn=fqn_base_b,
+                            )
+                    else:
+                        # logging of inputs which are not lists is not supported yet
+                        raise AssertionError(
+                            f"type {type(prev_node_b)} is not handled yet"
+                        )
+                # subgraph so far:
+                #
+                # (prev_node_c)+ -> (logger_c_input)?
+
+            # Note: this if statement is always True, spelling it out to clarify code
+            # intent.
+            if node_b_is_start_node or node_b_is_end_node:
+                # ensure env_c is populated with base node
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                node_c = env_c[node_b.name]
+
+                # after this point,
+                #
+                # node_a is the original node from graph_a, with parent module gm_a
+                # node_b is the original node from graph_b, with parent module gm_b
+                # node_c is the copy of node_b in graph_c
+                #
+                # subgraph so far:
+                #
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+            if node_b_is_start_node:
+                # cast dtype from the dtype of node_c's input to the dtype of
+                # node_a's input (dequant, etc)
+                # prev_node_c = node_c.args[0]
+                prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)  # type: ignore[possibly-undefined]
+                if should_log_inputs:
+                    # skip the input logger when inserting a dtype cast
+                    if isinstance(prev_node_c, Node):
+                        # pyrefly: ignore [unbound-name]
+                        prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)
+                    elif isinstance(prev_node_c, list):
+                        prev_node_c = [
+                            get_normalized_nth_input(arg, gm_b, 0)
+                            for arg in prev_node_c
+                        ]
+                dtype_cast_node = _insert_dtype_cast_after_node(
+                    subgraph_a.start_node,
+                    # pyrefly: ignore [unbound-name]
+                    node_c,
+                    prev_node_c,
+                    gm_a,
+                    gm_b,
+                    graph_c,
+                    node_b.name + "_dtype_cast_",
+                    logger_cls,
+                    node_type_to_io_type_map,
+                )
+                # note: not inserting to env_c because all nodes which use the dtype
+                #   casts are copied from graph_a
+                #
+                # subgraph so far:
+                #
+                #           (dtype_cast_node)+
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+                # if input logging is enabled, log the input to the subgraph
+                if should_log_inputs:
+                    # placeholder; the real name is patched in once the subgraph copy exists
+                    ref_node_name = ""
+                    if isinstance(dtype_cast_node, Node):
+                        dtype_cast_node = _insert_logger_after_node(
+                            dtype_cast_node,
+                            gm_b,
+                            logger_cls,
+                            "_ns_logger_a_inp_",
+                            ref_node_name,
+                            name_a,
+                            ref_name,
+                            ref_node_type_a,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0,
+                            index_of_arg=0,
+                            fqn=fqn_base_a,
+                        )
+                        input_logger: Node | list[Node] = dtype_cast_node
+                    else:
+                        if not isinstance(dtype_cast_node, list):
+                            raise AssertionError(
+                                f"Expected list, got {type(dtype_cast_node)}"
+                            )
+                        new_loggers = []
+                        for dtype_cast_idx, dtype_cast_node_inner in enumerate(
+                            dtype_cast_node
+                        ):
+                            dtype_cast_logger = _insert_logger_after_node(
+                                dtype_cast_node_inner,
+                                gm_b,
+                                logger_cls,
+                                "_ns_logger_a_inp_",
+                                ref_node_name,
+                                name_a,
+                                ref_name,
+                                ref_node_type_a,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=dtype_cast_idx,
+                                index_of_arg=0,
+                                fqn=fqn_base_a,
+                            )
+                            new_loggers.append(dtype_cast_logger)
+                        dtype_cast_node = new_loggers
+                        input_logger = dtype_cast_node
+                    # subgraph so far:
+                    #
+                    #       (dtype_cast_node)+ -> (logger_a_input)?
+                    #                  /
+                    # prev_node_c -> (logger_c_input)? -> node_start_c
+
+                # hook up the new mod_a copy to be in the graph, receiving the
+                # same inputs as mod_b does, with dtype cast to match a
+                # Some ops, such as LSTMs, have two non-param inputs. If we have
+                # such an op, pass the second param as well. Note: dtype casting
+                # for the second param is not implemented yet, it can be added
+                # later if there is a use case.
+                node_c_second_non_param_arg = None
+                num_non_param_args_node_a = get_number_of_non_param_args(
+                    subgraph_a.start_node, gm_a
+                )
+                if num_non_param_args_node_a == 2:
+                    # node_c_second_non_param_arg = node_c.args[1]
+                    node_c_second_non_param_arg = get_normalized_nth_input(
+                        # pyrefly: ignore [unbound-name]
+                        node_c,
+                        gm_b,
+                        1,
+                    )
+                node_a_shadows_c = _insert_copy_of_subgraph_a_after_input_node_c(
+                    dtype_cast_node,
+                    node_c_second_non_param_arg,
+                    subgraph_a,
+                    gm_a,
+                    gm_b,
+                    # pyrefly: ignore [unbound-name]
+                    node_c.name + "_shadow_copy_",
+                )
+                env_c[node_a_shadows_c.name] = node_a_shadows_c
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy(args/kwargs not shown)
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+                if should_log_inputs:
+                    # When we created the input logger, we left the ref_node_name
+                    # as an empty string, because the subgraph copy did not exist
+                    # yet. Now that the subgraph copy exists, we modify this name
+                    # to its true value.
+                    # Note: the alternative to this is to create the input logger
+                    # after creating the subgraph, which is slightly more
+                    # complicated. This is the lesser of two evils.
+                    # input_logger = env_c[dtype_cast_node.name]
+                    # Find the first node in the subgraph
+                    cur_node = node_a_shadows_c
+                    while get_normalized_nth_input(cur_node, gm_b, 0) != input_logger:  # type: ignore[possibly-undefined]
+                        cur_node = get_normalized_nth_input(cur_node, gm_b, 0)  # type: ignore[assignment]
+                    # pyrefly: ignore [unbound-name]
+                    if isinstance(input_logger, Node):
+                        # pyrefly: ignore [unbound-name]
+                        input_logger_mod = getattr(gm_b, input_logger.name)
+                        input_logger_mod.ref_node_name = cur_node.name
+                    else:
+                        # pyrefly: ignore [unbound-name]
+                        if not isinstance(input_logger, list):
+                            raise AssertionError(
+                                # pyrefly: ignore [unbound-name]
+                                f"Expected list, got {type(input_logger)}"
+                            )
+                        # pyrefly: ignore [unbound-name]
+                        for input_logger_inner in input_logger:
+                            input_logger_mod = getattr(gm_b, input_logger_inner.name)
+                            input_logger_mod.ref_node_name = cur_node.name
+
+                # hook up a logger to the mod_a copy
+                env_c[node_a_shadows_c.name] = _insert_logger_after_node(
+                    env_c[node_a_shadows_c.name],
+                    gm_b,
+                    logger_cls,
+                    "_ns_logger_a_",
+                    node_a_shadows_c.name,
+                    name_a,
+                    ref_name,
+                    ref_node_type_a,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0,
+                    index_of_arg=0,
+                    fqn=fqn_base_a,
+                )
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+            if node_b_is_end_node:
+                # hook up a logger to the mod_b copy
+                env_c[node_b.name] = _insert_logger_after_node(
+                    env_c[node_b.name],
+                    gm_b,
+                    logger_cls,
+                    "_ns_logger_b_",
+                    node_b.name,
+                    name_b,
+                    ref_name,
+                    ref_node_type_b,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0,
+                    index_of_arg=0,
+                    fqn=fqn_base_b,
+                )
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
+                #                  /
+                # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c
+                #
+                # Note: node_start_c may be the same node as node_end_c, or they
+                # may have nodes in between.
+
+        else:
+            env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+
+    gm_c = GraphModule(gm_b, graph_c)
+    return gm_c
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..275291789f1c5461af366038d7702801bf5fc303
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/mappings.py
@@ -0,0 +1,763 @@
+import operator
+from typing import TYPE_CHECKING
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.qat.dynamic as nnqatd
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.quantization.fx._lower_to_native_backend as _lower_to_native_backend
+import torch.ao.quantization.quantization_mappings as quantization_mappings
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.ao.quantization.backend_config import get_native_backend_config
+
+from .ns_types import NSNodeTargetType
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+toq = torch.ops.quantized
+
+
+def get_base_name_to_sets_of_related_ops() -> dict[str, set[NSNodeTargetType]]:
+    """
+    Build the mapping from base name to set of related ops.
+
+    The hand-written sets below are extended with pairs taken from the
+    native backend_config and from the default lowering / quantization
+    mapping tables.  Each pair is added to the first set which already
+    contains one of its members; a pair matching no set is ignored.  The
+    base name of a set is its position in the list, converted to a string.
+    """
+    sets_of_related_ops: list[set[NSNodeTargetType]] = [
+        # conv modules
+        {
+            nn.Conv1d,
+        },
+        {
+            nn.Conv2d,
+        },
+        {
+            nn.Conv3d,
+        },
+        # conv functionals
+        {
+            F.conv1d,
+        },
+        {
+            F.conv2d,
+        },
+        {
+            F.conv3d,
+        },
+        # linear modules
+        {
+            nn.Linear,
+        },
+        # linear functionals
+        {
+            F.linear,
+        },
+        # average pool
+        {
+            nn.AvgPool1d,
+            torch.avg_pool1d,
+        },
+        {
+            nn.AvgPool2d,
+            torch._C._nn.avg_pool2d,
+        },
+        {
+            nn.AvgPool3d,
+            torch._C._nn.avg_pool3d,
+        },
+        # adaptive average pool
+        {
+            nn.AdaptiveAvgPool1d,
+            F.adaptive_avg_pool1d,
+        },
+        {
+            nn.AdaptiveAvgPool2d,
+            F.adaptive_avg_pool2d,
+        },
+        {
+            nn.AdaptiveAvgPool3d,
+            F.adaptive_avg_pool3d,
+        },
+        # LSTM
+        {
+            nn.LSTM,
+        },
+        # add
+        {
+            torch.add,
+            operator.add,  # x + y
+        },
+        # cat
+        {
+            torch.cat,
+        },
+        # mul
+        {
+            torch.mul,
+            operator.mul,
+        },
+        # relu
+        {
+            F.relu,
+            nn.ReLU,
+            "relu",
+            "relu_",
+            torch.relu,
+        },
+        # maxpool
+        {
+            nn.MaxPool1d,
+            F.max_pool1d,
+        },
+        {
+            nn.MaxPool2d,
+            F.max_pool2d,
+        },
+        {
+            nn.MaxPool3d,
+            F.max_pool3d,
+        },
+        # sigmoid
+        {
+            torch.sigmoid,
+            "sigmoid",
+            "sigmoid_",
+            nn.Sigmoid,
+            F.sigmoid,
+        },
+        # BatchNorm
+        {
+            nn.BatchNorm2d,
+        },
+        {
+            nn.BatchNorm3d,
+        },
+        # ConvTranspose
+        {
+            nn.ConvTranspose1d,
+        },
+        {
+            nn.ConvTranspose2d,
+        },
+        {
+            nn.ConvTranspose3d,
+        },
+        # functional transposed conv
+        {
+            F.conv_transpose1d,
+        },
+        {
+            F.conv_transpose2d,
+        },
+        {
+            F.conv_transpose3d,
+        },
+        # ELU
+        {
+            nn.ELU,
+        },
+        # Embedding
+        {
+            nn.Embedding,
+        },
+        # EmbeddingBag
+        {
+            nn.EmbeddingBag,
+        },
+        # GroupNorm
+        {
+            nn.GroupNorm,
+        },
+        # Hardswish
+        {
+            nn.Hardswish,
+        },
+        # InstanceNorm
+        {
+            nn.InstanceNorm1d,
+        },
+        {
+            nn.InstanceNorm2d,
+        },
+        {
+            nn.InstanceNorm3d,
+        },
+        # LayerNorm
+        {
+            nn.LayerNorm,
+        },
+        # LeakyReLU
+        {
+            nn.LeakyReLU,
+        },
+        # ReLU6
+        {
+            nn.ReLU6,
+            F.relu6,
+        },
+        # F.elu
+        {
+            F.elu,
+        },
+        # F.hardswish
+        {
+            F.hardswish,
+        },
+        # F.group_norm
+        {
+            F.group_norm,
+        },
+        # F.instance_norm
+        {
+            F.instance_norm,
+        },
+        # F.layer_norm
+        {
+            F.layer_norm,
+        },
+        # F.leaky_relu
+        {
+            F.leaky_relu,
+        },
+        # F.silu
+        {
+            nn.SiLU,
+            F.silu,
+        },
+        # F.mish
+        {
+            nn.Mish,
+            F.mish,
+        },
+        # F.tanh
+        {
+            nn.Tanh,
+            F.tanh,
+            torch.tanh,
+            "tanh_",
+            "tanh",
+        },
+        # F.hardsigmoid
+        {
+            "hardsigmoid_",
+            "hardsigmoid",
+            F.hardsigmoid,
+            nn.Hardsigmoid,
+        },
+        # F.hardtanh
+        {
+            nn.Hardtanh,
+            F.hardtanh,
+            F.hardtanh_,
+        },
+        # floordiv
+        {
+            operator.floordiv,
+        },
+        # unsqueeze
+        {
+            torch.unsqueeze,
+        },
+        # stack
+        {
+            torch.stack,
+        },
+        # squeeze
+        {
+            torch.squeeze,
+        },
+        # sort
+        {
+            torch.sort,
+        },
+        # repeat_interleave
+        {
+            torch.repeat_interleave,
+        },
+        # min
+        {
+            torch.min,
+        },
+        # mean
+        {
+            torch.mean,
+        },
+        # max
+        {
+            torch.max,
+        },
+        # transpose
+        {
+            torch.transpose,
+        },
+        # flatten
+        {
+            torch.flatten,
+        },
+        # clamp
+        {
+            torch.clamp,
+        },
+        # chunk
+        {
+            torch.chunk,
+        },
+        # interpolate
+        {
+            torch.nn.functional.interpolate,
+        },
+        # dropout
+        {
+            nn.Dropout,
+        },
+        # F.dropout
+        {
+            F.dropout,
+        },
+        # matmul
+        {
+            torch.matmul,
+        },
+        # Softmax
+        {
+            nn.Softmax,
+        },
+        # PReLU
+        {
+            nn.PReLU,
+            nnq.PReLU,
+        },
+        # F.prelu
+        {
+            F.prelu,
+            toq.prelu,
+        },
+        # pixel shuffle
+        {
+            nn.PixelShuffle,
+        },
+        {
+            F.pixel_shuffle,
+        },
+        # pixel unshuffle
+        {
+            nn.PixelUnshuffle,
+        },
+        {
+            F.pixel_unshuffle,
+        },
+        # narrow
+        {
+            torch.narrow,
+        },
+    ]
+
+    # gather (op, related op) pairs; their order matters, because each pair is
+    # later merged into the first set which already contains one of its members
+    native_config = get_native_backend_config()
+
+    new_connections: list[tuple[Callable, Callable]] = [
+        # technical debt edge case
+        (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear),
+    ]
+
+    for pattern, config in native_config._pattern_complex_format_to_config.items():
+        # pattern format: (c, (b, a)); it is in reverse order, so keep taking
+        # the last element until a non-sequence is reached
+        root_op = pattern
+        while isinstance(root_op, (list, tuple)):
+            root_op = root_op[-1]
+
+        # case 1: pattern fuses a pattern of ops into an op
+        #   example: nn.Conv1d, nn.ReLU fused into nni.ConvReLU1d
+        # case 2: pattern swaps a module into a QAT module
+        #   example: nni.ConvReLU1d swapped into nniqat.ConvReLU1d
+        # case 3: reference version of floating point module, such as
+        #   nn.Conv2d and nnqr.Conv2d
+        for swapped_module in (
+            config.fused_module,
+            config.qat_module,
+            config.reference_quantized_module,
+        ):
+            if swapped_module is not None:
+                new_connections.append((root_op, swapped_module))
+
+    #
+    # Add reference module swaps from default lowering path
+    #
+
+    one_to_one_module_maps = (
+        _lower_to_native_backend.STATIC_LOWER_MODULE_MAP,
+        _lower_to_native_backend.DYNAMIC_LOWER_MODULE_MAP,
+        _lower_to_native_backend.WEIGHT_ONLY_LOWER_MODULE_MAP,
+        _lower_to_native_backend.SPECIAL_PATTERN_LOWER_MODULE_MAP,
+    )
+    for module_map in one_to_one_module_maps:
+        new_connections.extend(module_map.items())  # type: ignore[attr-defined]
+
+    one_to_two_module_maps = (
+        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP,
+        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP,
+        _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP,
+    )
+    for module_map in one_to_two_module_maps:
+        for src_mod, (dst_mod1, dst_mod2) in module_map.items():  # type: ignore[attr-defined]
+            new_connections += [(src_mod, dst_mod1), (src_mod, dst_mod2)]
+
+    #
+    # Add function swaps from default lowering path
+    #
+
+    functional_map = _lower_to_native_backend.STATIC_LOWER_FUNCTIONAL_MAP
+    for src_fn, (dst_fn1, dst_fn2) in functional_map.items():
+        # pyrefly: ignore [bad-argument-type]
+        new_connections += [(src_fn, dst_fn1), (src_fn, dst_fn2)]
+
+    one_to_one_op_maps = (
+        _lower_to_native_backend.QBIN_OP_MAPPING,
+        _lower_to_native_backend.QBIN_RELU_OP_MAPPING,
+        quantization_mappings.DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS,
+    )
+    for op_map in one_to_one_op_maps:
+        # pyrefly: ignore [bad-argument-type]
+        new_connections.extend(op_map.items())
+
+    #
+    # Add other swaps, ideally in the future this could be removed
+    # after the lowering code stops using these.
+    #
+    new_connections.extend(
+        quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.items()
+    )
+
+    # merge every pair into the first set that already holds one of its members;
+    # a pair that matches no set is left out
+    for first_op, second_op in new_connections:
+        for candidate_set in sets_of_related_ops:
+            if first_op in candidate_set or second_op in candidate_set:
+                candidate_set.update((first_op, second_op))
+                break
+
+    # the base name of a set is its position in the list, as a string
+    return {
+        str(position): related_ops
+        for position, related_ops in enumerate(sets_of_related_ops)
+    }
+
+
+def get_base_name_for_op(
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]],
+    op: NSNodeTargetType,
+) -> str | None:
+    """Return the base name of the first set containing ``op``, or None if absent."""
+    name_and_ops = base_name_to_sets_of_related_ops.items()
+    hits = (name for name, related_ops in name_and_ops if op in related_ops)
+    return next(hits, None)
+
+
+def add_op_to_sets_of_related_ops(
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]],
+    op: NSNodeTargetType,
+    related_op: NSNodeTargetType | None,
+) -> None:
+    if related_op is not None:
+        for set_of_related_ops in base_name_to_sets_of_related_ops.values():
+            if related_op in set_of_related_ops:
+                set_of_related_ops.add(op)
+                return
+        # if we got here, related_op was not found
+        raise AssertionError(f"{related_op} was not found")
+    else:
+        counter = 0
+        while str(counter) in base_name_to_sets_of_related_ops:
+            counter += 1
+        base_name_to_sets_of_related_ops[str(counter)] = {op}
+
+
+# TODO(future PR): clean this up
+def get_node_type_to_io_type_map() -> dict[str, set[NSNodeTargetType]]:
+    FUNS_IO_TYPE_FP32: set[NSNodeTargetType] = {
+        F.linear,
+        F.conv1d,
+        F.conv2d,
+        F.conv3d,
+        torch.cat,
+        F.elu,
+        F.hardswish,
+        F.instance_norm,
+        F.layer_norm,
+        F.leaky_relu,
+        F.dropout,
+        F.silu,
+        F.mish,
+        operator.add,
+        torch.add,
+        operator.mul,
+        torch.mul,
+        torch.sum,
+        F.prelu,
+    }
+
+    FUNS_IO_TYPE_FP16: set[NSNodeTargetType] = set()
+
+    FUNS_IO_TYPE_INT8: set[NSNodeTargetType] = {
+        toq.linear,
+        toq.linear_relu,
+        toq.conv1d,
+        toq.conv1d_relu,
+        toq.conv2d,
+        toq.conv2d_relu,
+        toq.conv3d,
+        toq.conv3d_relu,
+        toq.cat,
+        toq.elu,
+        toq.hardswish,
+        toq.instance_norm,
+        toq.layer_norm,
+        toq.leaky_relu,
+        toq.dropout,
+        toq.prelu,
+        # TODO(future PR): implement shadowing for binary ops and
+        # uncomment below
+        # toq.add,
+        # toq.mul,
+    }
+
+    FUNS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
+        F.relu,
+        F.tanh,
+        torch.tanh,
+        F.sigmoid,
+        torch.sigmoid,
+        F.hardsigmoid,
+        operator.floordiv,
+        torch.adaptive_avg_pool1d,
+        F.adaptive_avg_pool2d,
+        F.adaptive_avg_pool3d,
+        F.dropout,
+        F.hardtanh,
+        F.hardtanh_,
+        F.interpolate,
+        F.max_pool1d,
+        F.max_pool2d,
+        F.max_pool3d,
+        F.relu6,
+        F.pixel_shuffle,
+        F.pixel_unshuffle,
+        torch.avg_pool1d,
+        torch._C._nn.avg_pool2d,
+        torch._C._nn.avg_pool3d,
+        torch.cat,
+        torch.chunk,
+        torch.clamp,
+        torch.flatten,
+        torch.transpose,
+        torch.max,
+        torch.mean,
+        torch.min,
+        torch.narrow,
+        torch.repeat_interleave,
+        torch.sort,
+        torch.squeeze,
+        torch.stack,
+        torch.unsqueeze,
+        operator.add,
+    }
+
+    MODS_IO_TYPE_FP32: set[NSNodeTargetType] = {
+        nn.Linear,
+        nnqat.Linear,
+        nnqatd.Linear,
+        nnqd.Linear,
+        torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
+        nn.Conv1d,
+        nn.Conv2d,
+        nn.Conv3d,
+        nnqat.Conv1d,
+        nnqat.Conv2d,
+        nnqat.Conv3d,
+        nnqat.Embedding,
+        nnqat.EmbeddingBag,
+        nn.LSTM,
+        # note: nnqd.Linear is an instance of nnq.Linear, so this
+        # check has to happen before the int8 module check
+        nnqd.LSTM,
+        nn.BatchNorm2d,
+        nn.BatchNorm3d,
+        nn.Dropout,
+        nn.ConvTranspose1d,
+        nn.ConvTranspose2d,
+        nn.ConvTranspose3d,
+        nn.ELU,
+        nn.GroupNorm,
+        nn.InstanceNorm1d,
+        nn.InstanceNorm2d,
+        nn.InstanceNorm3d,
+        nn.LayerNorm,
+        nn.Hardswish,
+        nn.LeakyReLU,
+        nn.ReLU6,
+        nn.SiLU,
+        nn.Mish,
+        nn.Softmax,
+        nn.PReLU,
+        nni.BNReLU2d,
+        nni.BNReLU3d,
+        nni.ConvReLU1d,
+        nni.ConvReLU2d,
+        nni.ConvReLU3d,
+        nni.LinearReLU,
+        nni.LinearBn1d,
+        nni.ConvBn1d,
+        nni.ConvBn2d,
+        nni.ConvBn3d,
+        nniqat.ConvBn1d,
+        nniqat.ConvBn2d,
+        nniqat.ConvBn3d,
+        nniqat.ConvBnReLU1d,
+        nniqat.ConvBnReLU2d,
+        nniqat.ConvBnReLU3d,
+        nniqat.ConvReLU1d,
+        nniqat.ConvReLU2d,
+        nniqat.ConvReLU3d,
+        nniqat.LinearReLU,
+        nniqat.LinearBn1d,
+        nniqd.LinearReLU,
+        nni.LinearLeakyReLU,
+        nni.LinearTanh,
+        nni.ConvAdd2d,
+        nni.ConvAddReLU2d,
+    }
+
+    MODS_IO_TYPE_INT8: set[NSNodeTargetType] = {
+        nnq.Linear,
+        nnq.Conv1d,
+        nnq.Conv2d,
+        nnq.Conv3d,
+        nnq.BatchNorm2d,
+        nnq.BatchNorm3d,
+        nnq.Dropout,
+        nnq.ConvTranspose1d,
+        nnq.ConvTranspose2d,
+        nnq.ELU,
+        nnq.InstanceNorm1d,
+        nnq.InstanceNorm2d,
+        nnq.InstanceNorm3d,
+        nnq.LayerNorm,
+        nnq.Hardswish,
+        nnq.LeakyReLU,
+        nnq.Embedding,
+        nnq.EmbeddingBag,
+        nnq.Dropout,
+        nnq.Softmax,
+        nnq.PReLU,
+        nniq.BNReLU2d,
+        nniq.BNReLU3d,
+        nniq.ConvReLU1d,
+        nniq.ConvReLU2d,
+        nniq.ConvReLU3d,
+        nniq.LinearReLU,
+        nniq.LinearLeakyReLU,
+        nniq.LinearTanh,
+        nniq.ConvAdd2d,
+        nniq.ConvAddReLU2d,
+    }
+
+    MODS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
+        nn.ReLU,
+        nn.Tanh,
+        nn.Sigmoid,
+        nn.Hardsigmoid,
+        nn.AdaptiveAvgPool1d,
+        nn.AdaptiveAvgPool2d,
+        nn.AdaptiveAvgPool3d,
+        nn.AvgPool1d,
+        nn.AvgPool2d,
+        nn.AvgPool3d,
+        nn.Dropout,
+        nn.Hardtanh,
+        nn.Identity,
+        nn.MaxPool1d,
+        nn.MaxPool2d,
+        nn.MaxPool3d,
+        nn.PixelShuffle,
+        nn.PixelUnshuffle,
+        nn.ReLU6,
+    }
+
+    METHS_IO_TYPE_FP32_OR_INT8: set[NSNodeTargetType] = {
+        "sigmoid_",
+        "sigmoid",
+        "tanh_",
+        "tanh",
+        "hardsigmoid_",
+        "hardsigmoid",
+        "relu_",
+        "relu",
+    }
+
+    return {
+        "funs_io_type_fp32": FUNS_IO_TYPE_FP32,
+        "funs_io_type_fp16": FUNS_IO_TYPE_FP16,
+        "funs_io_type_int8": FUNS_IO_TYPE_INT8,
+        "funs_io_type_fp32_or_int8": FUNS_IO_TYPE_FP32_OR_INT8,
+        "mods_io_type_fp32": MODS_IO_TYPE_FP32,
+        "mods_io_type_int8": MODS_IO_TYPE_INT8,
+        "mods_io_type_fp32_or_int8": MODS_IO_TYPE_FP32_OR_INT8,
+        "meths_io_type_fp32_or_int8": METHS_IO_TYPE_FP32_OR_INT8,
+    }
+
+
+def get_unmatchable_types_map() -> dict[str, set[NSNodeTargetType]]:
+    FUNS_UNMATCHABLE: set[NSNodeTargetType] = {
+        torch.quantize_per_tensor,
+        operator.getitem,
+    }
+
+    MODS_UNMATCHABLE: set[NSNodeTargetType] = {
+        nn.Identity,
+    }
+
+    METHS_UNMATCHABLE: set[NSNodeTargetType] = {
+        "to",
+        "dequantize",
+        "reshape",
+        "view",
+        "unsqueeze_",
+        "unsqueeze",
+        "transpose",
+        "squeeze_",
+        "squeeze",
+        "size",
+        "shape",
+        "resize_",
+        "repeat_interleave",
+        "repeat",
+        "permute",
+        "numel",
+        "mean",
+        "detach_",
+        "detach",
+        "contiguous",
+        "clamp",
+        "chunk",
+    }
+
+    return {
+        "funs_unmatchable": FUNS_UNMATCHABLE,
+        "mods_unmatchable": MODS_UNMATCHABLE,
+        "meths_unmatchable": METHS_UNMATCHABLE,
+    }
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/n_shadows_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/n_shadows_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..95d467d9337ea24d676d282740df042d5bdd16f3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/n_shadows_utils.py
@@ -0,0 +1,1416 @@
+# mypy: allow-untyped-defs
+import collections
+import copy
+import operator
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.fx
+from torch.ao.ns.fx.graph_passes import _maybe_get_fqn
+from torch.ao.ns.fx.ns_types import NSResultsType, NSSingleResultValuesType
+from torch.ao.ns.fx.utils import (  # TODO(future PR): make this work correctly for methods
+    get_normalized_nth_input,
+    get_target_type_str,
+)
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.fx.match_utils import _MatchResult
+from torch.ao.quantization.qconfig import QConfigAny
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.fx import Graph, GraphModule, Node
+from torch.utils._pytree import tree_map
+
+
+# Prefix of the attribute names produced by `_get_attr_name`.
+SHADOW_NODE_NAME_PREFIX = "shadow"
+# Prefix of the attribute names produced by `_get_attr_wrapper_name`.
+SHADOW_WRAPPER_NODE_NAME_PREFIX = "shadow_wrapper"
+
+# TODO(future PR): reuse existing mapping instead of creating a new one
+# Binary add/mul callables. `create_submodule_from_subgraph` raises if any
+# node after the first node of a subgraph has one of these as its target.
+BINARY_FUNCTIONS = {
+    torch.add,
+    torch.Tensor.add,
+    operator.add,
+    torch.mul,
+    torch.Tensor.mul,
+    operator.mul,
+}
+
+
+def _get_attr_name(subgraph_idx, subgraph_candidate_idx):
+    return f"{SHADOW_NODE_NAME_PREFIX}_{subgraph_idx}_{subgraph_candidate_idx}"
+
+
+def _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx):
+    return f"{SHADOW_WRAPPER_NODE_NAME_PREFIX}_{subgraph_idx}_{subgraph_candidate_idx}"
+
+
+class OutputProp:
+    """
+    Output propagation (modeled from shape propagation).
+
+    Given a GraphModule and an example input, saves the output flowing
+    through each node on `node.traced_result`.
+
+    Code based on the example from
+    https://pytorch.org/docs/stable/fx.html#the-interpreter-pattern
+    """
+
+    def __init__(self, mod):
+        self.mod = mod
+        self.graph = mod.graph
+        self.modules = dict(self.mod.named_modules())
+
+    def propagate(self, *args):
+        args_iter = iter(args)
+        env: dict[str, Node] = {}
+
+        def load_arg(a):
+            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
+
+        def fetch_attr(target: str):
+            target_atoms = target.split(".")
+            attr_itr = self.mod
+            for i, atom in enumerate(target_atoms):
+                if not hasattr(attr_itr, atom):
+                    raise RuntimeError(
+                        f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
+                    )
+                attr_itr = getattr(attr_itr, atom)
+            return attr_itr
+
+        for node in self.graph.nodes:
+            if node.op == "placeholder":
+                result = next(args_iter)
+            elif node.op == "get_attr":
+                result = fetch_attr(node.target)
+            elif node.op == "call_function":
+                result = node.target(*load_arg(node.args), **load_arg(node.kwargs))
+            elif node.op == "call_method":
+                self_obj, *args = load_arg(node.args)
+                kwargs = load_arg(node.kwargs)
+                result = getattr(self_obj, node.target)(*args, **kwargs)
+            elif node.op == "call_module":
+                result = self.modules[node.target](
+                    *load_arg(node.args), **load_arg(node.kwargs)
+                )
+
+            if isinstance(result, torch.Tensor):  # type: ignore[possibly-undefined]
+                # pyrefly: ignore [unbound-name]
+                node.traced_result = result
+
+            # pyrefly: ignore [unsupported-operation]
+            # pyrefly: ignore [unbound-name]
+            env[node.name] = result
+
+        return None
+
+
+def _get_dedup_subgraphs(matches: dict[str, _MatchResult]) -> dict[str, list[Node]]:
+    # the original matches variable is unique by node, make it unique by subgraph
+    # instead
+    seen_nodes = set()
+    subgraphs_dedup = {}
+
+    # Dict items are not reversible until Python 3.8, so we hack it
+    # to be compatible with previous Python versions
+    # TODO(future PR): try reversed(list(matches.items()))
+    matches_items_reversed: list[tuple[str, _MatchResult]] = list(
+        reversed(matches.items())
+    )
+
+    # Note: the order is important.  `matches` currently provides the matches
+    # in reverse order.  We would like to process the matches in non-reverse
+    # order, so that we can create an intuitive naming scheme, such as
+    # naming the first op's submodules `shadow_0_0` through `shadow_0_(n-1)`
+    for name, cur_match in matches_items_reversed:  # type: ignore[call-overload]
+        was_seen = False
+        for node_or_tuple in cur_match[1]:
+            # Cur_match[1] has an unusual type. It says that it's a `List[Node]`,
+            # but it is really not. Furthermore, the contents of this field
+            # can change from match results of multiple nodes of the same pattern
+            #
+            # For example, for conv -> bn -> relu, we see
+            # match_results = {
+            #   'conv': (relu, [(bn, conv), relu], ...),
+            #   'bn': (relu, [(bn, conv), relu], ...),
+            #   'relu': (relu, [(bn, conv), relu], ...),
+            # }
+            #
+            # Ideally we should clean up the `find_matches` function to make
+            # this more intuitive. For the purposes of this prototype, we hack
+            # around it.
+
+            if isinstance(node_or_tuple, Node):
+                if node_or_tuple in seen_nodes:
+                    was_seen = True
+                seen_nodes.add(node_or_tuple)
+
+            else:
+                if not isinstance(node_or_tuple, tuple):
+                    raise AssertionError(f"Expected tuple, got {type(node_or_tuple)}")
+                for node in node_or_tuple:
+                    if not isinstance(node, Node):
+                        raise AssertionError(f"Expected Node, got {type(node)}")
+                    if node in seen_nodes:
+                        was_seen = True
+                    seen_nodes.add(node)
+
+        if was_seen:
+            continue
+
+        # Start with the unusual type, convert it to [op_0, ..., op_n]
+        list_of_nodes = []
+
+        if len(cur_match[1]) == 1:
+            list_of_nodes = cur_match[1]
+        else:
+            if len(cur_match[1]) != 2:
+                raise ValueError(
+                    f"Expected cur_match[1] to have length 2, got {len(cur_match[1])}"
+                )
+            # either (a, b), or ((a, b), c) or (c, (a, b))
+            # cannot make any assumptions on order, not clear what the
+            # _find_matches function is doing to populate this
+            # TODO(future PR): make this code less confusing,  see discussion
+            # in https://github.com/pytorch/pytorch/pull/80521/files#r975918836
+
+            def _order_nodes(node_a, node_b, node_c) -> list[Node]:
+                nodes = [node_a, node_b, node_c]
+                first_node = None
+                mid_node = None
+                last_node = None
+                for n in nodes:
+                    prev_n = n.args[0]
+                    next_n = next(iter(n.users))
+                    if prev_n not in nodes:
+                        first_node = n
+                    elif next_n not in nodes:
+                        last_node = n
+                    else:
+                        mid_node = n
+                if first_node is None or mid_node is None or last_node is None:
+                    raise AssertionError("Expected all nodes to be non-None")
+                if mid_node.args[0] is not first_node:
+                    raise AssertionError("Expected mid_node.args[0] to be first_node")
+                if last_node.args[0] is not mid_node:
+                    raise AssertionError("Expected last_node.args[0] to be mid_node")
+                return [last_node, mid_node, first_node]
+
+            if isinstance(cur_match[1][0], Node) and isinstance(cur_match[1][1], Node):
+                # (a, b)
+                list_of_nodes = cur_match[1]
+            elif isinstance(cur_match[1][0], tuple):
+                # ((a, b), c)
+                node_a, node_b = cur_match[1][0]
+                node_c = cur_match[1][1]
+                list_of_nodes = _order_nodes(node_a, node_b, node_c)
+            elif isinstance(cur_match[1][1], tuple):
+                # (a, (b, c))
+                node_a, node_b = cur_match[1][1]
+                node_c = cur_match[1][0]
+                list_of_nodes = _order_nodes(node_a, node_b, node_c)
+
+        # [node_n, ..., node_0], note that the order is reversed
+        # to make it chronological for simple subgraphs
+        list_of_nodes.reverse()
+        subgraphs_dedup[name] = list_of_nodes
+
+    return subgraphs_dedup
+
+
+def _get_logger_for_subgraph(
+    model: GraphModule,
+    first_node: Node,
+    last_node: Node,
+    subgraph_idx: int,
+    subgraph_candidate_idx: int,
+    qconfig_str: str,
+    logger_cls: Callable,
+    fqn: str | None,
+) -> torch.nn.Module:
+    """
+    Given a model and a linear subgraph starting from `first_node` and
+    ending with `last_node`, creates a logger for the end of this
+    subgraph.
+    """
+    if fqn is None:
+        fqn = ""
+    logger_mod_orig = logger_cls(
+        first_node.name,  # ref_node_name
+        last_node.name,  # prev_node_name
+        f"subgraph_{subgraph_idx}_{subgraph_candidate_idx}",  # model_name
+        "model",  # ref_name
+        get_target_type_str(last_node, model),  # prev_node_target_type
+        get_target_type_str(first_node, model),  # ref_node_target_type
+        NSSingleResultValuesType.NODE_OUTPUT.value,  # results_type
+        0,  # index_within_arg
+        0,  # index_of_arg
+        fqn,  # fqn
+        qconfig_str,
+    )
+    # Usually we expect the user to add loggers, then calibrate, then convert,
+    # and then populate loggers.  This is why the loggers start disabled.
+    # TODO(future PR): reconsider the design to make this more intuitive.
+    logger_mod_orig.enabled = False
+    return logger_mod_orig
+
+
+def create_submodule_from_subgraph(
+    model: torch.nn.Module,
+    first_node: Node,
+    last_node: Node,
+) -> GraphModule:
+    """
+    Input: a model, and a linear subgraph within the model from first_node to
+      last_node.
+
+    Output: a new submodule containing a copy of the subgraph, with the inputs
+      to the first node becoming the inputs to the submodule, and all other
+      nodes in the subgraph being copied.
+
+    Example inputs:
+
+    `model`: a module with graph
+
+      x0 -> op1 -> x1 -> op2 -> x2
+             |
+            arg1
+
+    `first_node`: op1
+    `last_node`: op2
+
+    Example output: a new module with graph
+
+      input1 -> op1_copy -> x1 -> op2_copy -> output1
+                   |
+                  arg1
+
+    Raises:
+      AssertionError: a non-first node targets one of `BINARY_FUNCTIONS`,
+        a non-first node has an extra positional arg of an unhandled type,
+        a node has an op other than call_module / call_function /
+        call_method, a node before `last_node` does not have exactly one
+        user, or more than `iteration_limit` steps are taken.
+    """
+
+    #
+    # create a blank GraphModule with an empty graph
+    #
+
+    class M(torch.nn.Module):
+        def forward(self, x):
+            pass
+
+    m = M()
+    gm = torch.fx.symbolic_trace(m)
+    g = gm.graph
+    # erase in reverse order so that users are removed before their inputs
+    for node in reversed(gm.graph.nodes):
+        g.erase_node(node)
+
+    #
+    # modify the graph to have a copy of our subgraph
+    #
+
+    # node of the original graph that is currently being copied
+    cur_node_orig = first_node
+
+    # running index used to name copied attributes `mod_<idx>`
+    cur_name_idx = 0
+
+    # safety net against walking the user chain without reaching last_node
+    iteration_limit = 100
+    cur_iteration = 0
+
+    while True:
+        if cur_node_orig is first_node:
+            # we are at the first node, we need to set up graph inputs
+            # TODO(future): some graphs could have placeholders which are unrelated
+            # to the first node, need to handle this
+            cur_args_copy = []
+            cur_kwargs_copy = {}
+            seen_names: set[str] = set()
+            old_name_to_new_node: dict[str, Node] = {}
+
+            def _add_placeholder(
+                g: Graph, node: Node, seen_names, old_name_to_new_node
+            ):
+                # Adds a placeholder named `<node.name>_<counter>` to `g`,
+                # using the first counter not yet in `seen_names`, and
+                # records it in `old_name_to_new_node` under `node.name`.
+                # note: for graphs starting with patterns such as `y = x + x`, we
+                # need to ensure we do not add multiple placeholders with the
+                # same name
+                counter = 0
+                while node.name + "_" + str(counter) in seen_names:
+                    counter += 1
+                cur_name = node.name + "_" + str(counter)
+                seen_names.add(cur_name)
+                placeholder = g.placeholder(cur_name)
+                old_name_to_new_node[node.name] = placeholder
+                return placeholder
+
+            # positional args: Nodes (also one level deep inside lists and
+            # tuples) become placeholders, everything else is passed as is
+            for arg in cur_node_orig.args:
+                if isinstance(arg, Node):
+                    p = _add_placeholder(g, arg, seen_names, old_name_to_new_node)
+                    cur_args_copy.append(p)
+                elif isinstance(arg, (list, tuple)):
+                    new_arg = []
+                    for inner_arg in arg:
+                        if isinstance(inner_arg, Node):
+                            new_arg.append(
+                                _add_placeholder(
+                                    g, inner_arg, seen_names, old_name_to_new_node
+                                )
+                            )
+                        else:
+                            new_arg.append(inner_arg)
+                    cur_args_copy.append(new_arg)
+                else:
+                    cur_args_copy.append(arg)
+
+            # TODO(future PR): handle non-normalized kwargs
+            for kwarg_name, kwarg in cur_node_orig.kwargs.items():
+                if isinstance(kwarg, Node):
+                    cur_kwargs_copy[kwarg_name] = _add_placeholder(
+                        g, kwarg, seen_names, old_name_to_new_node
+                    )
+                elif isinstance(kwarg, (list, tuple)):
+                    new_kwarg = []
+                    # NOTE(review): unlike the positional-arg branch above,
+                    # elements are not checked with isinstance(..., Node)
+                    # here; a non-Node element would reach `node.name` in
+                    # `_add_placeholder` - confirm this cannot happen
+                    for inner_kwarg in kwarg:
+                        p = _add_placeholder(
+                            g,
+                            inner_kwarg,  # type: ignore[arg-type]
+                            seen_names,
+                            old_name_to_new_node,
+                        )
+                        new_kwarg.append(p)
+                    cur_kwargs_copy[kwarg_name] = new_kwarg
+                else:
+                    cur_kwargs_copy[kwarg_name] = kwarg
+
+            cur_args_copy = tuple(cur_args_copy)  # type: ignore[assignment]
+        else:
+            # we are not at first node, first arg is from the previous node,
+            # and all other args are copied
+
+            # the current implementation is simplistic and cannot handle
+            # ops with two or more arguments which need to be passed from
+            # the previous op, so we assert them out
+            if cur_node_orig.target in BINARY_FUNCTIONS:
+                raise AssertionError(
+                    f"Unexpected binary function target: {cur_node_orig.target}"
+                )
+
+            # at this point in the code, cur_node_copy is pointing to the copy
+            # of the previous node
+            # TODO(future PR): this is not handling complicated graphs correctly, need to
+            # look at actual relationships instead of assuming sequential graph
+            # TODO(future PR): this is ignoring kwargs, will need to support kwargs
+            # for any fusion pattern which has them for a node that is not the
+            # first node.
+            # NOTE(review): `cur_kwargs_copy` is only assigned in the
+            # first-node branch, so the node created below for this iteration
+            # receives the kwargs built for the first node - confirm intended
+            cur_args_copy = [cur_node_copy]  # type: ignore[has-type, possibly-undefined]  # noqa: F821
+
+            if len(cur_node_orig.args) > 1:
+                for arg in cur_node_orig.args[1:]:
+                    if isinstance(arg, torch.nn.Parameter):
+                        new_arg = arg.detach().clone()  # type: ignore[assignment]
+                        mod_name = f"mod_{cur_name_idx}"
+                        cur_name_idx += 1
+                        setattr(gm, mod_name, new_arg)
+                        # NOTE(review): this is called on `gm`, while every
+                        # other node in this function is created through
+                        # `g` - confirm that `gm.placeholder` exists
+                        new_arg_placeholder = gm.placeholder(mod_name)  # type: ignore[operator]
+                        # pyrefly: ignore [missing-attribute]
+                        cur_args_copy.append(new_arg_placeholder)
+                    elif isinstance(arg, (float, int, torch.dtype)):
+                        # pyrefly: ignore [missing-attribute]
+                        cur_args_copy.append(arg)
+                    else:
+                        raise AssertionError(f"arg of type {type(arg)} not handled yet")
+            cur_args_copy = tuple(cur_args_copy)  # type: ignore[assignment]
+
+        # copy the node
+        if cur_node_orig.op == "call_module":
+            # deep-copy the called module and attach the copy to `gm`
+            # under a fresh `mod_<idx>` name
+            orig_mod = getattr_from_fqn(model, cur_node_orig.target)  # type: ignore[arg-type]
+            orig_mod_copy = copy.deepcopy(orig_mod)
+            mod_name = f"mod_{cur_name_idx}"
+            setattr(gm, mod_name, orig_mod_copy)
+            cur_name_idx += 1
+            cur_node_copy = g.call_module(mod_name, cur_args_copy, cur_kwargs_copy)  # type: ignore[possibly-undefined,arg-type]
+
+        elif cur_node_orig.op == "call_function":
+            cur_node_copy = g.call_function(
+                cur_node_orig.target,  # type: ignore[arg-type]
+                cur_args_copy,  # type: ignore[arg-type]
+                cur_kwargs_copy,  # type: ignore[possibly-undefined]
+            )
+
+        elif cur_node_orig.op == "call_method":
+            cur_node_copy = g.call_method(
+                cur_node_orig.target,  # type: ignore[arg-type]
+                cur_args_copy,  # type: ignore[arg-type]
+                cur_kwargs_copy,  # type: ignore[possibly-undefined]
+            )
+
+        else:
+            raise AssertionError(f"{cur_node_orig.op} not supported yet")
+
+        if cur_node_orig is last_node:
+            break
+
+        # go to next node
+        # the walk follows the single user of the current node
+        if len(cur_node_orig.users.keys()) != 1:
+            raise AssertionError(
+                f"{cur_node_orig} has more than 1 users, not supported yet"
+            )
+        cur_node_orig = next(iter(cur_node_orig.users.keys()))
+        cur_iteration += 1
+        if cur_iteration > iteration_limit:
+            raise AssertionError("iteration limit exceeded")
+
+    # set up outputs
+    # the copy of last_node is the only output of the new graph
+    g.output(cur_node_copy)
+
+    gm.recompile()
+    return gm
+
+
+def create_one_transformed_and_logged_copy_of_subgraph(
+    mt: GraphModule,
+    subgraph_idx: int,
+    subgraph_candidate_idx: int,
+    first_node: Node,
+    last_node: Node,
+    fqn: str | None,
+    list_of_node_name_to_qconfig: list[dict[str, QConfigAny]],
+    example_inputs: Any,
+    last_added_shadow_node_list: list[Node | None],
+    custom_prepare_fn: Callable | None = None,
+    custom_prepare_kwargs: dict[str, Any] | None = None,
+) -> None:
+    """
+    Given a subgraph in `mt` and a subgraph candidate idx, inserts the
+    subgraph candidate copy and instruments it with loggers.
+
+    If subgraph_candidate_idx is 0, this is the baseline fp32 subgraph and we just
+    add a logger to the end.
+
+    If subgraph_candidate_idx is not 0, we create a copy of the subgraph and
+    prepare it with `prepare_fx`.
+
+    Args:
+      mt: the model to modify in place; new submodules are attached as
+        attributes and new nodes are added to `mt.graph`.
+      subgraph_idx, subgraph_candidate_idx: used to build the attribute
+        names of the added modules and the logger's model name.
+      first_node, last_node: first and last node of the subgraph in `mt`.
+      fqn: forwarded to the logger.
+      list_of_node_name_to_qconfig: candidate `i` (for `i` > 0) reads its
+        qconfig from entry `i - 1`, keyed by `first_node.name`.
+      example_inputs: forwarded to the prepare function.
+      last_added_shadow_node_list: one-element list used as a mutable
+        holder; element 0 is read as the insertion point for candidates
+        > 0 and is overwritten with the logger node added by this call.
+      custom_prepare_fn: if given, called instead of `prepare_fx`.
+      custom_prepare_kwargs: extra keyword arguments for
+        `custom_prepare_fn`.
+
+    Returns None. If the qconfig of a candidate > 0 is None, returns early
+    without modifying `mt`.
+
+    Raises:
+      AssertionError: `mt` already has an attribute with one of the names
+        to be added, or `custom_prepare_kwargs` contains `example_inputs`,
+        `prepare_custom_config` or `qconfig_mapping`.
+    """
+
+    # TODO(future PR): move logger classes to utils to remove circular dependency
+    from torch.ao.ns._numeric_suite_fx import OutputComparisonLogger, OutputLogger
+
+    if subgraph_candidate_idx == 0:
+        # idx = 0 is the floating point (original) version of the subgraph
+        # We keep the subgraph as is, and add a logger at the end
+
+        qconfig_str = ""
+        logger_mod_orig = _get_logger_for_subgraph(
+            mt,
+            first_node,
+            last_node,
+            subgraph_idx,
+            subgraph_candidate_idx,
+            qconfig_str,
+            OutputLogger,
+            fqn,
+        )
+
+        attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
+        if hasattr(mt, attr_name):
+            raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}")
+        setattr(mt, attr_name, logger_mod_orig)
+        with mt.graph.inserting_after(last_node):
+            new_node = mt.graph.call_module(attr_name, args=(last_node,), kwargs={})
+            # later candidates of this subgraph are inserted after this node
+            last_added_shadow_node_list[0] = new_node
+
+    else:
+        # idx > 0 means we have a candidate qconfig to try, so we need
+        # to make a copy of the subgraph, feed it with the right inputs,
+        # and add a logger at the end
+
+        # get the qconfig
+        # subtract one because the first candidate is the floating point
+        # version of the subgraph
+        node_name_to_qconfig = list_of_node_name_to_qconfig[subgraph_candidate_idx - 1]
+        qconfig = node_name_to_qconfig[first_node.name]
+
+        # if no quantization is requested, skip
+        # TODO(future PR): deduplicate equivalent qconfigs that come from
+        #   different qconfig mapping objects
+        if qconfig is None:
+            return
+
+        # the copied subgraph is prepared with this single qconfig applied
+        # globally
+        qconfig_mapping = QConfigMapping().set_global(qconfig)
+
+        # create a copy of the submodule, wrapped in a separate module
+        orig_mod_copy_wrapped = create_submodule_from_subgraph(
+            mt, first_node, last_node
+        )
+
+        # add a call to prepare_fx on the wrapper module
+        if custom_prepare_fn is None:
+            orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx(
+                orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs
+            )
+        else:
+            if custom_prepare_kwargs is None:
+                custom_prepare_kwargs = {}
+            # these keys are rejected in user-supplied kwargs
+            for kwarg_name in [
+                "example_inputs",
+                "prepare_custom_config",
+                "qconfig_mapping",
+            ]:
+                if kwarg_name in custom_prepare_kwargs:
+                    raise AssertionError(
+                        f"cannot specify {kwarg_name} in custom_prepare_kwargs"
+                    )
+            prepare_kwargs: dict[str, Any] = {
+                "example_inputs": example_inputs,
+                "qconfig_mapping": qconfig_mapping,
+            }
+            prepare_kwargs.update(custom_prepare_kwargs)
+            orig_mod_copy_wrapped = custom_prepare_fn(
+                orig_mod_copy_wrapped, **prepare_kwargs
+            )
+
+        # attach the wrapper to the model
+        attr_name = _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx)
+        if hasattr(mt, attr_name):
+            raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}")
+        setattr(mt, attr_name, orig_mod_copy_wrapped)
+
+        # add a call to the wrapper module from the parent graph
+        insert_after_node = last_added_shadow_node_list[0]
+        with mt.graph.inserting_after(insert_after_node):
+            # TODO(future PR): handle fusion patterns where non-first nodes
+            # need inputs
+
+            # pass in all node args and kwargs
+
+            # only Node args are forwarded; a list/tuple whose first element
+            # is a Node is flattened into its Node elements
+            new_args = []
+            for arg in first_node.args:
+                if isinstance(arg, Node):
+                    new_args.append(arg)
+                elif (
+                    isinstance(arg, (list, tuple))
+                    and len(arg)
+                    and isinstance(arg[0], Node)
+                ):
+                    new_args.extend(
+                        inner_arg for inner_arg in arg if isinstance(inner_arg, Node)
+                    )
+
+            new_kwargs = {}
+            for name, old_kwarg in first_node.kwargs.items():
+                if isinstance(old_kwarg, Node):
+                    new_kwargs[name] = old_kwarg
+                elif isinstance(old_kwarg, (list, tuple)) and len(old_kwarg):
+                    # TODO(future PR): clarify why we are adding kwargs to args
+                    new_args.extend(old_kwarg)  # type: ignore[arg-type]
+
+            new_args = tuple(new_args)  # type: ignore[assignment]
+
+            new_node = mt.graph.call_module(attr_name, args=new_args, kwargs=new_kwargs)  # type: ignore[arg-type]
+
+        # add a logger to parent graph to observe the shadow wrapper
+        logger_mod_orig = _get_logger_for_subgraph(
+            mt,
+            first_node,
+            last_node,
+            subgraph_idx,
+            subgraph_candidate_idx,
+            str(qconfig),
+            OutputComparisonLogger,
+            fqn,
+        )
+
+        attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
+        if hasattr(mt, attr_name):
+            raise AssertionError(f"Unexpected attribute '{attr_name}' found in {mt}")
+        setattr(mt, attr_name, logger_mod_orig)
+        with mt.graph.inserting_after(new_node):
+            # the comparison logger receives the wrapper's output and the
+            # output of the original subgraph
+            logger = mt.graph.call_module(
+                attr_name, args=(new_node, last_node), kwargs={}
+            )
+            last_added_shadow_node_list[0] = logger
+
+    mt.recompile()
+
+
+def create_n_transformed_and_logged_copies_of_subgraph(
+    mt: GraphModule,
+    subgraph_idx: int,
+    match_name: str,
+    nodes_in_this_subgraph: list[Any],
+    qconfig_mappings: list[QConfigMapping],
+    list_of_node_name_to_qconfig: list[dict[str, QConfigAny]],
+    custom_prepare_fn: Callable | None = None,
+    custom_prepare_kwargs: dict[str, Any] | None = None,
+) -> None:
+    """
+    Given a model `mt` and a subgraph_idx, creates the needed copies
+    of the subgraph for all qconfigs, and instruments them with loggers.
+
+    Args:
+        mt: graph module containing the subgraph; it is forwarded to the
+            per-candidate helper for every candidate index.
+        subgraph_idx: index of this subgraph, forwarded to the helper.
+        match_name: accepted for interface compatibility; unused here.
+        nodes_in_this_subgraph: entries of the subgraph in order; the first
+            and last entries are used as its first and last node.
+        qconfig_mappings: one entry per shadow copy to create (N entries).
+        list_of_node_name_to_qconfig: per-mapping dict from node name to
+            qconfig, indexed in parallel with `qconfig_mappings`.
+        custom_prepare_fn: forwarded unchanged to the helper.
+        custom_prepare_kwargs: forwarded unchanged to the helper.
+
+    Returns early without creating anything when the subgraph contains a
+    non-Node entry, when no example input can be found for the first node,
+    or when none of the mappings has a qconfig for the first node.
+    """
+    # for now, assume that
+    # 1. the first node has one input
+    # 2. the last node has one output
+
+    # for now, ignore all subgraphs that contain non-nodes (tuples, etc)
+    # TODO(future PR): implement this
+    if any(not isinstance(node, Node) for node in nodes_in_this_subgraph):
+        return
+
+    first_node = nodes_in_this_subgraph[0]
+    last_node = nodes_in_this_subgraph[-1]
+    # We used output propagation to populate example values on each
+    # node. Use the example values from the previous node as the input
+    # to the current node.
+    prev_node = get_normalized_nth_input(first_node, mt, 0)
+    if isinstance(prev_node, list):
+        example_inputs = [x.traced_result for x in prev_node]
+    elif isinstance(prev_node, tuple):
+        # Materialize eagerly: the same `example_inputs` object is handed to
+        # every candidate below, and a bare generator would be exhausted
+        # after its first consumer.
+        example_inputs = tuple(x.traced_result for x in prev_node)  # type: ignore[assignment]
+    elif hasattr(prev_node, "traced_result"):
+        example_inputs = (prev_node.traced_result,)  # type: ignore[attr-defined, assignment]
+    else:
+        # currently some customer models do not have a traced_result in
+        # every node, so we have to guard for this case since we cannot
+        # quantize without an example input
+        # TODO(future PR): add a test case for this once we have an easy
+        # repro, see https://github.com/pytorch/pytorch/pull/80521/files#r975940489
+        # for additional context
+        print(
+            "unable to get example input for node "
+            + f"{first_node.format_node()}, skipping"
+        )
+        return
+
+    # If there are no quantization configs for this subgraph, skip adding
+    # loggers. This reduces memory usage for models where not all layers are
+    # quantized.
+    # TODO(future): consider making this configurable
+    #
+    # Candidate index 0 is the fp32 baseline and needs no qconfig, so only
+    # the N shadow candidates are checked; shadow candidate `i + 1` reads
+    # its qconfig from `list_of_node_name_to_qconfig[i]`.
+    found_at_least_one_qconfig = any(
+        list_of_node_name_to_qconfig[mapping_idx][first_node.name] is not None
+        for mapping_idx in range(len(qconfig_mappings))
+    )
+    if not found_at_least_one_qconfig:
+        print(
+            "unable to find at least one qconfig for node "
+            + f"{first_node.format_node()}, skipping"
+        )
+        return
+
+    fqn = _maybe_get_fqn(first_node, mt)
+
+    # We want the results to contain the subgraphs in natural order,
+    # and the graph to also contain shadow wrappers and shadow loggers
+    # in natural order.
+    # If we just iterate in reverse, the graph will be in natural
+    # order but the eventual results will be in reverse order.
+    # So, we keep track of the last shadow logger we added and
+    # always insert after it.
+    last_added_shadow_node_list: list[Node | None] = [None]
+    for subgraph_candidate_idx in range(len(qconfig_mappings) + 1):
+        create_one_transformed_and_logged_copy_of_subgraph(
+            mt,
+            subgraph_idx,
+            subgraph_candidate_idx,
+            first_node,
+            last_node,
+            fqn,
+            list_of_node_name_to_qconfig,
+            example_inputs,
+            last_added_shadow_node_list,
+            custom_prepare_fn,
+            custom_prepare_kwargs,
+        )
+
+
+def create_add_loggers_graph(
+    model: GraphModule,
+    subgraphs_dedup: dict[str, list[Node]],
+    qconfig_mapping: QConfigMapping,
+    node_name_to_qconfig: dict[str, QConfigAny],
+) -> None:
+    r"""
+    Given a model, a model graph partition (currently a set of matched
+    subgraphs) and instructions how to transform each subgraph
+    (currently quantizing it according to qconfig_mapping), modifies
+    the model graph to create an alternate path through the original graph,
+    with each of the subgraphs quantized.  This is useful to compare
+    propagation error of a transformation such as quantization.
+
+    For example, given layer op0 and op1, there are four cases when handling op1:
+    1. op0 and op1 quantized
+    2. op0 and op1 unquantized
+    3. op0 quantized, op1 unquantized
+    4. op0 unquantized, op1 quantized
+
+    Example input, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \          \                 \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog    op1_1 -> x2_1 ----> clog
+
+    Example output, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \        # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_1 -> x2_1 ----> clog
+
+    Args:
+        model: graph module to instrument; its graph is mutated in place
+            and recompiled.
+        subgraphs_dedup: matched subgraphs, each an ordered list of nodes.
+        qconfig_mapping: mapping forwarded (as a single-element list) when
+            a shadow wrapper is created for a subgraph.
+        node_name_to_qconfig: qconfig per node name; a subgraph whose first
+            node maps to ``None`` gets a plain node copy instead of a
+            shadow wrapper.
+
+    Raises:
+        AssertionError: if an expected shadow wrapper node cannot be found,
+            a logger attribute name is already taken on ``model``, or a
+            non-last node of a copied subgraph does not have exactly one
+            user.
+    """
+    # TODO(future PR): move logger classes to utils to remove circular dependency
+    from torch.ao.ns._numeric_suite_fx import OutputComparisonLogger, OutputLogger
+
+    # Linear scan over the matched subgraphs; returns the first subgraph
+    # whose node list contains `node`, or None if the node is unmatched.
+    def _get_subgraph_containing_node(node, subgraphs_dedup):
+        for subgraph in subgraphs_dedup.values():
+            if node in subgraph:
+                return subgraph
+        return None
+
+    # First, we need to create shadow branches, going from
+    #
+    #   x0 -> op0 -> x1 -> ...
+    #
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog
+    #
+    # Later, the outputs of each shadow will be rerouted to calculate
+    # propagation error.
+
+    # Note: we cannot iterate over matched subgraphs because some nodes
+    # may not be matched. So, we iterate over nodes in the graph, and
+    # associate them to matched subgraphs if possible.
+
+    nodes_to_skip = set()
+    # for each subgraph, save a mapping from first node of subgraph
+    # to first and last node of the shadow of this subgraph
+    orig_first_node_to_shadow_in_node = {}
+    orig_first_node_to_shadow_out_node = {}
+    # need to record original list because we will mutate the graph as we go
+    orig_nodes = list(model.graph.nodes)  # type: ignore[union-attr, arg-type]
+    cur_subgraph_idx = 0
+    for n in orig_nodes:
+        if n.op in ("placeholder", "get_attr", "output") or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        insert_submodule_copy = False
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            # the whole subgraph is handled now, so its remaining nodes
+            # must not start a new iteration of their own
+            nodes_to_skip.update(maybe_subgraph)
+            qconfig = node_name_to_qconfig[first_node.name]
+            if qconfig is not None:
+                insert_submodule_copy = True
+        else:
+            # unmatched node: treat it as a single-node subgraph
+            first_node, last_node = n, n
+
+        # Two strategies: a matched subgraph with a qconfig gets a shadow
+        # wrapper module; everything else gets an FX-node-only copy.
+        if insert_submodule_copy:
+            match_name = first_node.name
+            create_n_transformed_and_logged_copies_of_subgraph(
+                model,
+                cur_subgraph_idx,
+                match_name,
+                # pyrefly: ignore [bad-argument-type]
+                maybe_subgraph,
+                [qconfig_mapping],
+                [node_name_to_qconfig],
+                None,
+                None,  # type: ignore[arg-type]
+            )
+            # find the created shadow module and record it so we
+            # can find it easily in step 2
+            # (candidate index 1: exactly one qconfig mapping was passed
+            # above; NOTE(review): presumably this string matches what the
+            # wrapper-name helper generates -- confirm)
+            expected_shadow_target = f"shadow_wrapper_{cur_subgraph_idx}_1"
+            new_shadow_mod = None
+            for maybe_shadow_mod in model.graph.nodes:
+                if (
+                    maybe_shadow_mod.op == "call_module"
+                    and maybe_shadow_mod.target == expected_shadow_target
+                ):
+                    new_shadow_mod = maybe_shadow_mod
+                    break
+            if new_shadow_mod is None:
+                raise AssertionError("Expected new_shadow_mod to be non-None")
+            # the wrapper is a single call_module node, so it is both the
+            # entry and the exit of the shadow branch
+            orig_first_node_to_shadow_in_node[first_node] = new_shadow_mod
+            orig_first_node_to_shadow_out_node[first_node] = new_shadow_mod
+
+        else:
+            # create a copy of the subgraph by only copying FX nodes
+            # but not copying any parameters, to minimize memory usage
+            subgraph_to_use = (
+                maybe_subgraph if maybe_subgraph is not None else [first_node]
+            )
+
+            # add a regular logger after last_node
+            qconfig_str = ""
+            subgraph_candidate_idx = 0
+            fqn = _maybe_get_fqn(first_node, model)
+            logger_mod_orig = _get_logger_for_subgraph(
+                model,
+                first_node,
+                last_node,
+                cur_subgraph_idx,
+                subgraph_candidate_idx,
+                qconfig_str,
+                OutputLogger,
+                fqn,
+            )
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            if hasattr(model, attr_name):
+                raise AssertionError(
+                    f"Unexpected attribute '{attr_name}' found in {model}"
+                )
+            setattr(model, attr_name, logger_mod_orig)
+            insertion_point = last_node
+            with model.graph.inserting_after(insertion_point):
+                logger = model.graph.call_module(
+                    attr_name, args=(last_node,), kwargs={}
+                )
+                insertion_point = logger
+
+            # create a copy of the subgraph
+            cur_node_orig = first_node
+            cur_node_copy = None
+            first_node_copy = None
+            # Walk the original chain node by node; each copy is inserted
+            # right after the previously inserted node.
+            # pyrefly: ignore [bad-assignment]
+            while cur_node_orig in subgraph_to_use:
+                # TODO(future PR): make this support all possible args/kwargs
+                if cur_node_orig is first_node:
+                    # the first copy keeps the original inputs for now; they
+                    # are remapped to shadow outputs in step 2 below
+                    new_args = cur_node_orig.args
+                    new_kwargs = cur_node_orig.kwargs
+                else:
+                    # later copies consume the previous copy as first arg
+                    first_arg_for_copy: Node | None = cur_node_copy
+                    new_args = (first_arg_for_copy, *cur_node_orig.args[1:])
+                    new_kwargs = cur_node_orig.kwargs
+                # make a copy of cur_node_orig
+                with model.graph.inserting_after(insertion_point):
+                    cur_node_copy = model.graph.create_node(
+                        cur_node_orig.op,
+                        cur_node_orig.target,
+                        new_args,
+                        new_kwargs,
+                        # cur_node_orig.name,  # TODO(future PR): set name explicitly
+                    )
+                    if first_node_copy is None:
+                        first_node_copy = cur_node_copy
+                # since now only linear subgraphs are supported, all nodes
+                # except the last one must have only one user
+                if cur_node_orig != last_node:
+                    if len(cur_node_orig.users.keys()) != 1:
+                        raise AssertionError(
+                            f"Expected exactly 1, but got {len(cur_node_orig.users)}"
+                        )
+                # advance to the first user of the current node
+                # NOTE(review): for last_node this relies on its first user
+                # lying outside `subgraph_to_use` so the loop ends -- confirm
+                cur_node_orig = next(iter(cur_node_orig.users.keys()))
+                if cur_node_orig.name.startswith(SHADOW_NODE_NAME_PREFIX):
+                    raise AssertionError(
+                        "cur_node_orig should not start with SHADOW_NODE_NAME_PREFIX"
+                    )
+                insertion_point = cur_node_copy
+
+            # add a comparison logger after last_node's copy
+            subgraph_candidate_idx = 1
+            logger_mod_orig = _get_logger_for_subgraph(
+                model,
+                first_node,
+                last_node,
+                cur_subgraph_idx,
+                subgraph_candidate_idx,
+                qconfig_str,
+                OutputComparisonLogger,
+                fqn,
+            )
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            if hasattr(model, attr_name):
+                raise AssertionError(
+                    f"Unexpected attribute '{attr_name}' found in {model}"
+                )
+            setattr(model, attr_name, logger_mod_orig)
+            with model.graph.inserting_after(insertion_point):
+                # the comparison logger receives the copy's output and the
+                # original last node's output
+                logger = model.graph.call_module(
+                    attr_name, args=(cur_node_copy, last_node), kwargs={}
+                )
+
+            # save the final node so we can use it in step 2
+            orig_first_node_to_shadow_in_node[first_node] = first_node_copy
+            orig_first_node_to_shadow_out_node[first_node] = cur_node_copy
+
+        cur_subgraph_idx += 1
+
+    model.recompile()
+
+    # Now, we go from
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> x1 -> op1_0 -> ...
+    #    \                     \       \
+    #      -> op0_1 -> x1_1 -> clog      -> op1_1 -> ...
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log --> x1_0 -> op1_0 -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog -> x1_1 -> op1_1 -> ...
+    #
+    # sample values of key internal variables for the example above:
+    #
+    #   orig_first_node_to_shadow_in_node = {op0_0: op0_1, op1_0: op1_1}
+    #   orig_first_node_to_shadow_out_node = {op0_0: op0_1, op1_0: op1_1}
+    #
+    # note: for subgraphs with more than one node, in_node will be different
+    # compared to out_node
+
+    nodes_to_skip = set()
+    for n in orig_nodes:
+        if n.op in ("placeholder", "get_attr", "output") or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            nodes_to_skip.update(maybe_subgraph)
+        else:
+            first_node, last_node = n, n
+
+        # This helper is re-created on every iteration but only closes over
+        # `subgraphs_dedup` and `orig_first_node_to_shadow_out_node`, not
+        # over any per-iteration variable.
+        def maybe_remap_node_to_shadow(node):
+            """
+            If unshadowed `node` has a shadow version, return that. If not,
+            return `node`.
+            """
+            if not isinstance(node, Node):
+                # handle scalars
+                return node
+
+            if node.op in ("placeholder", "get_attr"):
+                return node
+
+            # Find the shadowed version of this arg from the previous
+            # subgraph. For this, we need to:
+            # 1. navigate to the first node of the previous subgraph
+            # 2. get the output of the shadow wrapper which has (1) as an input
+
+            # For now, assume the arg is in matched subgraphs. In the
+            # future we may have to handle the case where this is not true.
+            prev_subgraph = _get_subgraph_containing_node(node, subgraphs_dedup)
+            if prev_subgraph is None:
+                prev_subgraph = [node]
+            prev_first_node = prev_subgraph[0]
+            prev_shadow_output = orig_first_node_to_shadow_out_node[prev_first_node]
+            return prev_shadow_output
+
+        cur_shadow_input = orig_first_node_to_shadow_in_node[first_node]
+        if cur_shadow_input is None:
+            raise AssertionError("Expected cur_shadow_input to be non-None")
+        # rewire the shadow branch entry so it consumes the shadow outputs
+        # of its producers instead of the original (unshadowed) nodes
+        cur_shadow_input.args = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.args
+        )
+        cur_shadow_input.kwargs = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.kwargs
+        )
+
+        # NOTE(review): this sits inside the loop, so the module is
+        # recompiled once per processed subgraph
+        model.recompile()
+
+
+def _get_weight_info_from_shadow_wrapper(shadow_wrapper: torch.nn.Module):
+    """
+    Looks up how a shadow wrapper quantizes its weight input.
+
+    Returns a pair `(quantize_fn, quantize_fn_args)`, where
+    `quantize_fn_args` are the arguments of the quantize call with the
+    leading weight dropped and scale / zero point resolved to their values.
+    Returns `None` if the wrapper graph has fewer than two placeholders.
+
+    Raises AssertionError if the weight placeholder does not have exactly
+    one user, or if that user is neither `torch.quantize_per_channel` nor
+    `torch.quantize_per_tensor`.
+    """
+    # For now, assume that the weight is the second input
+    # to the shadow module. If that changes, we can fix it later.
+    placeholder_iter = (
+        node
+        for node in shadow_wrapper.graph.nodes  # type: ignore[union-attr]
+        if node.op == "placeholder"
+    )
+    next(placeholder_iter, None)  # skip the first placeholder
+    weight_placeholder = next(placeholder_iter, None)
+    if weight_placeholder is None:
+        return None
+
+    # the subgraph looks like
+    #
+    #   _input_scale_1 = self._input_scale_1
+    #   _input_zero_point_1 = self._input_zero_point_1
+    #   quantize_per_channel = torch.quantize_per_channel(
+    #       w2_0, _input_scale_1, _input_zero_point_1,
+    #       0, torch.qint8)
+    #
+    #  starting from `w2_0`, step to its single user (the quantize call)
+    #  and read `_input_scale_1` and `_input_zero_point_1` off its args
+    num_users = len(weight_placeholder.users)
+    if num_users != 1:
+        raise AssertionError(f"Expected exactly 1, got {num_users}")
+    (quant_node,) = weight_placeholder.users
+
+    quant_fn = quant_node.target
+    trailing_args: Any = None
+    if quant_fn is torch.quantize_per_channel:
+        _weight, scale_node, zp_node, axis, dtype = quant_node.args
+        trailing_args = (axis, dtype)
+    else:
+        if quant_fn != torch.quantize_per_tensor:
+            raise AssertionError(
+                f"Expected torch.quantize_per_tensor, but got {quant_fn}"
+            )
+        _weight, scale_node, zp_node, dtype = quant_node.args
+        trailing_args = (dtype,)
+
+    # scale and zero point are referenced by attribute name in the graph;
+    # resolve them to the objects stored on the wrapper
+    scale_val = getattr_from_fqn(shadow_wrapper, scale_node.target)
+    zp_val = getattr_from_fqn(shadow_wrapper, zp_node.target)
+    return (quant_fn, (scale_val, zp_val, *trailing_args))
+
+
+def extract_weight_comparison(m: GraphModule) -> NSResultsType:
+    """
+    Collects fp32-vs-quantized weight records for the functional linear
+    calls in `m` that have a shadow wrapper attached to their first input.
+
+    For every such call two single-element result lists are stored under
+    `results["model"][<weight key>]`, named `subgraph_<idx>_0` (original
+    weight) and `subgraph_<idx>_1` (weight quantized the way the shadow
+    wrapper quantizes it); both carry the same precomputed SQNR comparison.
+    """
+    # example graph:
+    #
+    #   w1 = self.w1
+    #   b1 = self.b1
+    #   linear = torch._C._nn.linear(x, w1, b1)
+    #   shadow_0_0 = self.shadow_0_0(linear)
+    #   shadow_wrapper_0_1 = self.shadow_wrapper_0_1(x, w1, b1)
+    #   shadow_0_1 = self.shadow_0_1(shadow_wrapper_0_1, linear)
+    #
+    # algorithm:
+    # 1. for each call_function node matching our allowlist:
+    # 2.   if corresponding shadow wrapper exists, extract the weight pair
+    #
+    # Note: this is not super robust, but that's ok because this is
+    # just for legacy customers who depend on the previous two-model version
+    # of this API. TBD if we need to make this robust.
+    # Note: modules are not supported, since existing customers only
+    # use functions.
+
+    # TODO(future PR): move this to config
+    weighted_ops = {
+        torch.nn.functional.linear,
+    }
+
+    weight_key = NSSingleResultValuesType.WEIGHT.value
+    weight_results = {}
+    results: NSResultsType = {"model": {weight_key: weight_results}}
+
+    for n in m.graph.nodes:  # type: ignore[union-attr]
+        if n.op != "call_function" or n.target not in weighted_ops:
+            continue
+
+        # Check if we have a corresponding shadow wrapper
+        # TODO(future PR, if needed): support kwargs
+        # TODO(future PR, if needed): support multiple shadow users
+        # TODO(before land): fix string match
+        shadow_wrapper_node = next(
+            (
+                user
+                for user in n.args[0].users
+                if user.op == "call_module"
+                and user.target.startswith("shadow_wrapper")
+            ),
+            None,
+        )
+        if shadow_wrapper_node is None:
+            continue
+
+        shadow_wrapper = getattr_from_fqn(m, shadow_wrapper_node.target)  # type: ignore[arg-type]
+        weight_info = _get_weight_info_from_shadow_wrapper(shadow_wrapper)
+        if weight_info is None:
+            continue
+
+        # original weight, and the same weight run through the wrapper's
+        # quantize function
+        w_obj = getattr_from_fqn(m, n.args[1].target).detach()
+        quant_fn, quant_fn_args_except_first = weight_info
+        w_obj_q = quant_fn(w_obj, *quant_fn_args_except_first)
+
+        # the node is both "prev" and "ref" node of the comparison
+        node_name = n.name
+        node_type = get_target_type_str(n, m)
+        fqn = (
+            m._node_name_to_scope[n.name][0]  # type: ignore[index]
+            if hasattr(m, "_node_name_to_scope")
+            else None
+        )
+        comparison = torch.ao.ns.fx.utils.compute_sqnr(w_obj, w_obj_q)
+
+        # the two records are identical except for their "values" entry;
+        # each one is built from scratch so no containers are shared
+        result_fp32, result_q = (
+            {
+                "res_type": weight_key,
+                "values": [weight],
+                "prev_node_name": node_name,
+                "prev_node_target_type": node_type,
+                "ref_node_name": node_name,
+                "ref_node_target_type": node_type,
+                "index_within_arg": 0,
+                "index_of_arg": 0,
+                "fqn": fqn,
+                "qconfig_str": "",
+                "comparisons": [comparison],
+                "comparison_fn_name": "sqnr",
+            }
+            for weight in (w_obj, w_obj_q)
+        )
+
+        # go from subgraph_n_1 to subgraph_n_0
+        _, _, node_idx, _ = shadow_wrapper_node.target.split("_")
+        weight_results[f"subgraph_{node_idx}_0"] = [result_fp32]
+        weight_results[f"subgraph_{node_idx}_1"] = [result_q]
+
+    return results
+
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def group_results_by_subgraph(results: NSResultsType) -> Any:
+    """
+    Regroups flat per-candidate results into one entry per subgraph.
+
+    Only the first results type found under `results['model']` (for example
+    'node_output' or 'weight') is read. Every name in it has the form
+    `subgraph_<m>_<n>` and maps to a list of result records, of which only
+    the first one is used.
+
+    Input:
+
+    {
+      'model': {
+        'node_output': {
+          'subgraph_0_0': [
+            {
+              'values': [torch.tensor(...), ...],
+              'ref_node_name': ...,
+              'ref_node_target_type': ...,
+              'qconfig_str': ...,
+              'comparisons': [],
+              'comparison_fn_name': '',
+              'fqn': '...',
+            },
+          ],
+          'subgraph_0_1': [
+            {
+              'values': [torch.tensor(...), ...],
+              'ref_node_name': ...,
+              'ref_node_target_type': ...,
+              'qconfig_str': ...,
+              'comparisons': [torch.tensor(...), ...],
+              'comparison_fn_name': '...',
+              'fqn': '...',
+            },
+          ],
+          ...
+        },
+      },
+    }
+
+    Output (keyed by `subgraph_<m>`, then by the string `<n>`):
+
+    {
+      'subgraph_0': {
+        '0': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'fqn': '...',
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': ...,
+          'comparisons': [...],
+          'comparison_fn_name': '...',
+        },
+        '1': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'fqn': '...',
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '...',
+          'comparisons': [torch.tensor(...), ...],
+          'comparison_fn_name': '...',
+        },
+      },
+    }
+
+    """
+    # fields carried over from the first record of every candidate
+    copied_fields = (
+        "ref_node_name",
+        "ref_node_target_type",
+        "fqn",
+        "values",
+        "qconfig_str",
+        "comparisons",
+        "comparison_fn_name",
+    )
+
+    # node_output or weight
+    results_for_first_type = next(iter(results["model"].values()))
+
+    grouped: Any = {}
+    for name_with_idx, candidate_results in results_for_first_type.items():
+        # convert from `subgraph_m_n` to `subgraph_m` and `n`
+        prefix, subgraph_idx, candidate_idx = name_with_idx.split("_")
+        first_record = candidate_results[0]
+        summary = {field: first_record[field] for field in copied_fields}
+        grouped.setdefault(f"{prefix}_{subgraph_idx}", {})[candidate_idx] = summary
+
+    return grouped
+
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def create_results_comparison(
+    results_grouped,
+) -> Any:
+    """
+    Summarizes the precomputed comparisons of every non-baseline candidate.
+
+    Each subgraph entry must contain the baseline candidate '0'; it supplies
+    the node name, node type and fqn of the summary. Every other candidate
+    contributes its stacked comparisons and their mean.
+
+    Input:
+
+    {
+      'subgraph_0': {
+        '0': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '',
+          'comparisons': [],
+          'comparison_fn_name': '',
+          'fqn': '...',
+        },
+        '1': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '...',
+          'comparisons': [torch.tensor(...), ...],
+          'comparison_fn_name': 'sqnr',
+          'fqn': '...',
+        },
+      },
+    }
+
+    Output:
+    {
+      'subgraph_0': {
+        'ref_node_name': '...',
+        'ref_node_target_type': '...',
+        'fqn': '...',
+        'candidates': {
+          '1': {
+            'qconfig_str': ...,
+            'comparison_fn_name': 'sqnr',
+            'cmp_raw': [..., ...],
+            'cmp_mean': ...,
+          },
+          ...,
+        },
+      },
+    }
+    """
+
+    def _summarize_candidate(candidate_result):
+        # we expect the comparisons to be precalculated from
+        # calibration, so we just fetch them here
+        stacked = torch.stack(candidate_result["comparisons"])
+        return {
+            "qconfig_str": candidate_result["qconfig_str"],
+            "comparison_fn_name": candidate_result["comparison_fn_name"],
+            "cmp_raw": stacked,
+            "cmp_mean": torch.mean(stacked),
+        }
+
+    comparison_by_subgraph = {}
+    for subgraph_name, per_candidate in results_grouped.items():
+        # candidate "0" is the baseline; comparing it to itself is skipped
+        candidate_summaries = {
+            candidate_name: _summarize_candidate(candidate_result)
+            for candidate_name, candidate_result in per_candidate.items()
+            if candidate_name != "0"
+        }
+        baseline = per_candidate["0"]
+        comparison_by_subgraph[subgraph_name] = {
+            "ref_node_name": baseline["ref_node_name"],
+            "ref_node_target_type": baseline["ref_node_target_type"],
+            "fqn": baseline["fqn"],
+            "candidates": candidate_summaries,
+        }
+
+    return comparison_by_subgraph
+
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def print_n_shadows_summary(
+    results_comparison,
+) -> None:
+    """
+    Prints a table with one row per subgraph and one column per candidate,
+    holding the mean comparison value of that candidate.
+
+    Input:
+
+    {
+      'subgraph_0': {
+        'ref_node_name': 'linear1',
+        'ref_node_target_type': '...',
+        'fqn': '...',
+        'candidates': {
+          '1': {
+            'qconfig_str': ...,
+            'comparison_fn_name': ...,
+            'cmp_raw': [45.0, 55.0],
+            'cmp_mean': 50.0,
+          },
+          ...,
+        },
+      },
+    }
+
+    Prints:
+
+    node_name | node_type | fqn | 0    | 1    | ...
+    linear1   | ...       | ... | 45.0 | 50.0 | ...
+
+    The candidate columns are numbered from 0 in the iteration order of
+    each subgraph's 'candidates' dict; the number of columns is the largest
+    candidate count of any subgraph. If the `tabulate` library is not
+    installed, a hint is printed instead of the table.
+    """
+
+    try:
+        from tabulate import tabulate
+    except ImportError:
+        print(
+            "`print_n_shadows_summary` relies on the library `tabulate`, "
+            "which could not be found on this machine. Run `pip "
+            "install tabulate` to install the library."
+        )
+        return
+
+    results = []
+    max_num_candidates = 0
+    for subgraph_data in results_comparison.values():
+        mean_all_candidates = [
+            candidate["cmp_mean"]
+            for candidate in subgraph_data["candidates"].values()
+        ]
+        # Count candidates here rather than measuring a finished row:
+        # row[1] is the node type string, whose length says nothing about
+        # how many candidate columns exist.
+        max_num_candidates = max(max_num_candidates, len(mean_all_candidates))
+
+        results.append(
+            [
+                subgraph_data["ref_node_name"],
+                subgraph_data["ref_node_target_type"],
+                subgraph_data["fqn"],
+                *mean_all_candidates,
+            ]
+        )
+
+    candidate_idx_headers = [str(idx) for idx in range(max_num_candidates)]
+
+    headers = ["node_name", "node_type", "fqn", *candidate_idx_headers]
+    print(tabulate(results, headers=headers))
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/ns_types.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/ns_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..134fd485130e0069ab992197ea6e176e1e1e216b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/ns_types.py
@@ -0,0 +1,66 @@
+import enum
+from collections.abc import Callable
+from typing import Any, NamedTuple, Union
+
+from torch.fx.graph import Node
+
+
+class NSSingleResultValuesType(str, enum.Enum):  # str mixin: members are also plain strings
+    WEIGHT = "weight"
+    NODE_OUTPUT = "node_output"
+    NODE_INPUT = "node_input"
+
+
+class NSSubgraph(NamedTuple):  # triple of fx nodes; field order is the tuple order
+    start_node: Node
+    end_node: Node
+    base_op_node: Node
+
+
+# TODO(future PR): see if we can use typing_extensions's TypedDict instead
+# to properly type the various keys
+# {
+#   # one of NSSingleResultValuesType
+#   'type': 'weight',
+#   # the values of type specified above
+#   'values': [torch.tensor(...), ...],
+#   # name of the node directly before the logger
+#   'prev_node_name': 'linear1',
+#   # type of the underlying function or module
+#   'prev_node_target_type': torch.nn.functional.linear  # or torch.nn.Linear, etc
+#   # name of the node responsible for adding this logger
+#   # Note: this may differ from prev_node_name if we are logging inputs
+#   'ref_node_name': 'linear1',
+#   # index of this node within the arg of the input/output node
+#   # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1
+#   'index_within_arg': 0,
+#   # index of this node within the args of the input/output node
+#   # for example, in add(x1, x2), x2 would have index_of_arg == 1
+#   'index_of_arg': 0,
+#   # precomputed comparisons of logger values to reference values
+#   'comparisons': [torch.tensor(...), ...]
+#   # name of function used for precomputed comparisons
+#   'comparison_fn_name': 'sqnr',
+#   # string representation of qconfig responsible for creating this logger
+#   'qconfig_str': 'QConfig(...)',
+# }
+NSSingleResultType = dict[str, Any]
+
+# {
+#   'layer_name_1': {  # subgraph name
+#     'node_output': {  # results type (node_output, node_input, weight)
+#       'model_name_a':  # model name
+#          [NSSingleResultType, ...],  # results, ordered by index_within_arg
+#       'model_name_b':
+#          [NSSingleResultType, ...],
+#     },
+#   },
+# }
+#
+NSResultsType = dict[str, dict[str, dict[str, list[NSSingleResultType]]]]
+
+# Defines the underlying target type of a node, for example:
+# `F.conv1d` for a `call_function` conv node
+# `nn.Conv1d` for a `call_module` node calling the forward of a `nn.Conv1d` module
+# `'sigmoid'` for a `call_method` node calling `x.sigmoid()`
+NSNodeTargetType = Union[Callable, str]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d10fdd39da9080144d3f6ef577d3ca5aca313538
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/pattern_utils.py
@@ -0,0 +1,214 @@
+from collections.abc import Callable
+from typing import Any, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.ao.quantization import FakeQuantizeBase, ObserverBase
+from torch.ao.quantization.backend_config import get_native_backend_config
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from .ns_types import NSNodeTargetType
+
+
+toq = torch.ops.quantized
+
+
+def get_type_a_related_to_b(
+    base_name_to_sets_of_related_ops: dict[str, set[NSNodeTargetType]],
+) -> set[tuple[NSNodeTargetType, NSNodeTargetType]]:
+    """Return every ordered pair (a, b), including (a, a), of ops sharing a set."""
+    # TODO(future PR): allow customizations
+    # TODO(future PR): reuse existing quantization mappings
+    # TODO(future PR): add the rest of modules and ops here
+    related_pairs: set[tuple[NSNodeTargetType, NSNodeTargetType]] = set()
+
+    for related_ops in base_name_to_sets_of_related_ops.values():
+        ops = list(related_ops)
+        # pair each op with itself and with every later op, in both directions
+        for pos, op_a in enumerate(ops):
+            for op_b in ops[pos:]:
+                related_pairs.update(((op_a, op_b), (op_b, op_a)))
+
+    return related_pairs
+
+
+NSFusionElType = Union[
+    Callable,  # call_function or call_module type, example: F.linear or nn.Conv2d
+    str,  # call_method name, example: "dequantize"
+    tuple[
+        str, Any
+    ],  # call_method name and first argument, example: ("to", torch.float16)
+]
+NSFusionType = Union[
+    tuple[NSFusionElType, NSFusionElType],
+    tuple[NSFusionElType, NSFusionElType, NSFusionElType, NSFusionElType],
+]
+
+
+def get_reversed_fusions() -> list[tuple[NSFusionType, int]]:
+    """
+    List of potential fusions, in reverse order.  The order is reversed
+    to match how fusion patterns are defined in quantization code.
+
+    Fusion format:
+    ((fusion_op_0, fusion_op_1), base_op_idx)
+
+    Where base_op_idx is the idx of the op we should use to match other related
+    ops. Note: base_op_idx is specified in non-reverse order, i.e. a base_op_idx
+    of 0 represents the first op in regular (non-reverse) order, 1 represents the
+    second op, etc.
+    """
+    results: list[tuple[NSFusionType, int]] = []
+
+    # Possible syntaxes:
+    # * single op: torch.nn.Conv2d
+    # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d)
+    # For fusions, we only care about patterns composed of multiple ops.
+    # TODO(future PR): allow customizations from default patterns.
+    all_quant_patterns = _get_pattern_to_quantize_handlers(get_native_backend_config())
+
+    default_base_op_idx = 0
+    for quant_pattern in all_quant_patterns:
+        # TODO: this is a temporary hack to flatten the patterns from quantization so
+        # that it works with the ns matcher function, maybe we should use `_is_match`
+        # in torch.ao.quantization.fx.match_utils to match the patterns
+        if (
+            isinstance(quant_pattern, tuple)
+            and len(quant_pattern) == 2
+            and isinstance(quant_pattern[1], tuple)
+            and len(quant_pattern[1]) == 2
+        ):
+            # flatten the pattern with form (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d))
+            quant_pattern = (quant_pattern[0], quant_pattern[1][0], quant_pattern[1][1])
+
+        # Only patterns of multiple ops are fusions, ignore
+        # patterns which contain a single op (they get matched
+        # without caring about fusions).
+        if isinstance(quant_pattern, tuple):
+            results.append((quant_pattern, default_base_op_idx))  # type: ignore[arg-type]
+
+        # For each pattern, add additional patterns with observers and
+        # fake quants at the end.
+        # TODO(future PR): if needed, implement matching for a node
+        #   having multiple output observers.
+        for cls in (ObserverBase, FakeQuantizeBase):
+            if isinstance(quant_pattern, tuple):
+                new_pattern = (cls, *quant_pattern)
+            else:
+                new_pattern = (cls, quant_pattern)
+            results.append((new_pattern, default_base_op_idx))  # type: ignore[arg-type]
+
+    # After this point, results contains values such as
+    # [..., ((torch.nn.ReLU, torch.nn.Conv2d), 0), ...]
+
+    # Patterns for matching fp16 emulation are not specified in the quantization
+    # fusion mappings.  For now, define them here.
+    fp16_em_base_op_idx = 1
+    patterns_to_add = [
+        # linear-relu fp16 emulation:
+        # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16
+        (
+            (("to", torch.float16), F.relu, F.linear, "dequantize"),
+            fp16_em_base_op_idx,
+        ),
+        # Conv-BN fusion (this happens outside of quantization patterns,
+        # which is why it is defined separately here).
+        ((nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
+        ((nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
+        ((nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
+    ]
+    for p in patterns_to_add:
+        results.append(p)  # type: ignore[arg-type]
+        results.append(((ObserverBase, *p[0]), p[1]))  # type: ignore[arg-type]
+        results.append(((FakeQuantizeBase, *p[0]), p[1]))  # type: ignore[arg-type]
+
+    return results
+
+
+def end_node_matches_reversed_fusion(
+    end_node: Node,
+    reversed_fusion: NSFusionType,
+    gm: GraphModule,
+    seen_nodes: set[Node],
+) -> bool:
+    """
+    Returns True if walking backwards from `end_node` through each node's first
+    arg matches every element of `reversed_fusion` without touching `seen_nodes`.
+    """
+    cur_node = end_node
+    for fusion_idx in range(len(reversed_fusion)):
+        # each node can only belong to one matched pattern
+        if cur_node in seen_nodes:
+            return False
+
+        cur_fusion_el = reversed_fusion[fusion_idx]
+
+        if cur_node.op == "call_function":
+            fusion_el_is_fun = (not isinstance(cur_fusion_el, str)) and (
+                not isinstance(cur_fusion_el, type)
+            )
+            if fusion_el_is_fun:
+                if cur_node.target != cur_fusion_el:
+                    return False
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]  # continue the walk at the first arg
+                else:
+                    return False  # no Node first arg, even after the last element
+            else:
+                return False
+
+        elif cur_node.op == "call_module":
+            fusion_el_is_mod = isinstance(cur_fusion_el, type)
+            if fusion_el_is_mod:
+                if not isinstance(cur_node.target, str):
+                    raise AssertionError(f"Expected str, got {type(cur_node.target)}")
+                target_mod = getattr_from_fqn(gm, cur_node.target)
+                if not isinstance(cur_fusion_el, type):  # repeats fusion_el_is_mod
+                    return False
+                if not isinstance(target_mod, cur_fusion_el):
+                    return False
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]
+                else:
+                    return False
+            else:
+                return False
+
+        elif cur_node.op == "call_method":
+            fusion_el_is_meth_with_second_arg = (
+                isinstance(cur_fusion_el, tuple) and len(cur_fusion_el) == 2
+            )
+            fusion_el_is_meth_without_args = isinstance(cur_fusion_el, str)
+            if fusion_el_is_meth_without_args or fusion_el_is_meth_with_second_arg:
+                if fusion_el_is_meth_without_args:
+                    if cur_node.target != cur_fusion_el:
+                        return False
+                else:
+                    if not isinstance(cur_fusion_el, tuple):
+                        raise AssertionError(
+                            f"Expected tuple, got {type(cur_fusion_el)}"
+                        )
+                    if cur_node.target != cur_fusion_el[0]:
+                        return False
+                    elif len(cur_node.args) < 2:
+                        return False
+                    elif cur_node.args[1] != cur_fusion_el[1]:  # e.g. ("to", torch.float16)
+                        return False
+
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]
+                else:
+                    return False
+            else:
+                return False
+        else:
+            return False  # any other node op never matches
+
+    return True
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..d36914b46929d7eb8311097cd6b5d0fdc0c82f12
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py
@@ -0,0 +1,251 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import copy
+from typing import Any, TYPE_CHECKING
+
+import torch
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig_mapping import _QCONFIG_STYLE_ORDER
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.ao.quantization.qconfig import QConfigAny
+
+__all__ = ["QConfigMultiMapping"]
+
+_QCONFIG_STYLE_TO_METHOD: dict[str, str] = {
+    "global_qconfig": "set_global",
+    "object_type_qconfigs": "set_object_type",
+    "module_name_regex_qconfigs": "set_module_name_regex",
+    "module_name_qconfigs": "set_module_name",
+    "module_name_object_type_order_qconfigs": "set_module_name_object_type_order",
+}
+
+
+def _remove_duplicates_and_none(qconfig_list: list[QConfigAny]) -> None:
+    """Remove, in place, every ``None`` and every repeat of an earlier qconfig."""
+    equals = torch.ao.quantization.qconfig_equals
+    to_remove = []
+    for index, cur_qconfig in enumerate(qconfig_list):
+        # Do not stop at the first None: later entries still need checking.
+        if cur_qconfig is None or any(
+            equals(cur_qconfig, seen) for seen in qconfig_list[:index]
+        ):
+            to_remove.append(index)
+    for index in reversed(to_remove):  # pop from the back so indices stay valid
+        qconfig_list.pop(index)
+
+
+class QConfigMultiMapping:
+    """
+    This class, used with the prepare_n_shadows_model API, stores a list of :class:`torch.ao.quantization.QConfigMapping`s
+    so that multiple QConfigs can be specified for each QConfig matching style.
+
+    The user can specify QConfigs using the following methods (in increasing match priority):
+
+        ``set_global`` : sets the global (default) QConfigs
+
+        ``set_object_type`` : sets the QConfigs for a given module type, function, or method name
+
+        ``set_module_name_regex`` : sets the QConfigs for modules matching the given regex string
+
+        ``set_module_name`` : sets the QConfigs for modules matching the given module name
+
+        ``set_module_name_object_type_order`` : sets the QConfigs for modules matching a combination
+        of the given module name, object type, and the index at which the module appears
+
+    Note: Usage of set methods is the same as in QConfigMapping except with a passed in list of QConfigs rather than a
+    single QConfig.
+
+    Example usage::
+
+        qconfig_mapping = (QConfigMultiMapping()
+            .set_global([qconfig1, qconfig2])
+            .set_object_type(torch.nn.Linear, [qconfig2, qconfig3])
+            .set_object_type(torch.nn.ReLU, [qconfig1])
+            .set_module_name_regex("foo.*bar.*conv[0-9]+", [qconfig2])
+            .set_module_name_regex("foo.*", [qconfig1, qconfig2, qconfig3])
+            .set_module_name("module1", [None])
+            .set_module_name("module2", [qconfig2])
+            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, [qconfig3]))
+
+    """
+
+    def __init__(self) -> None:
+        # initialize this with 1 QConfigMapping to avoid corner cases
+        self.qconfig_mappings_list: list[QConfigMapping] = [QConfigMapping()]
+
+    def _handle_list_size_mismatch(
+        self, qconfig_list: list[QConfigAny], style: str
+    ) -> None:
+        # this method handles cases where the size of qconfig_list does not match
+        # the size of qconfig_mappings_list.
+        # Issue: Consider a user inserting global_qconfig A and B first, then inserting
+        # qconfig C as an object_type_qconfig for conv ops. If we internally store
+        # 1 QConfigMapping with A and C and another with just B, then the
+        # second QConfigMapping will match B to conv ops (which is not wanted), since B is global.
+
+        # we avoid this by maintaining the invariant that if any QConfigMapping
+        # has a qconfig style+key with a qconfig in it, all QConfigMappings must
+        # have either a qconfig or None for that same style+key. In the above
+        # example, a None qconfig would prevent the unwanted match in the
+        # second QConfigMapping
+
+        if len(qconfig_list) > len(self.qconfig_mappings_list):
+            # Case: we have more qconfigs (in qconfig_list) than QConfigMappings
+
+            # Add new QConfigMappings (initialized so we maintain the `invariant`)
+
+            new_qconfig_mapping = QConfigMapping()
+            # searches other QConfigMappings for qconfig style+keys
+            # that need to be inserted as `None` into the new QConfigMapping
+            for qconfig_mapping in self.qconfig_mappings_list:
+                # global_qconfig has None by default
+                for check_style in _QCONFIG_STYLE_ORDER[1:]:
+                    qconfigs_dict = getattr(qconfig_mapping, check_style)
+                    target_qconfigs_dict = getattr(new_qconfig_mapping, check_style)
+                    for key in qconfigs_dict:
+                        target_qconfigs_dict[key] = None
+                break  # by the invariant above, the first mapping has every key
+
+            # insert copies of this new QConfigMapping until all entries
+            # in qconfig_list can fit among the QConfigMappings
+            while len(qconfig_list) > len(self.qconfig_mappings_list):
+                self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping))
+        else:
+            # Case: we have fewer qconfigs in qconfig_list than QConfigMappings
+
+            # pad qconfig_list with `None` until length is same
+            while len(qconfig_list) < len(self.qconfig_mappings_list):
+                qconfig_list.append(None)
+
+    # this function applies the insertion method across each QConfigMapping
+    def _insert_qconfig_list(
+        self,
+        style: str,
+        args: list[str | int | Callable],
+        qconfig_list: list[QConfigAny],
+    ) -> None:
+        # work on a copy so the caller's list is never mutated, then remove
+        # duplicates and None to make the ordering of qconfigs deterministic.
+        qconfig_list = list(qconfig_list)
+        _remove_duplicates_and_none(qconfig_list)
+        self._handle_list_size_mismatch(qconfig_list, style)
+        method_name = _QCONFIG_STYLE_TO_METHOD[style]
+        for qconfig_mapping, qconfig in zip(self.qconfig_mappings_list, qconfig_list):
+            # uses QConfigMapping set method to insert qconfig
+            set_method = getattr(qconfig_mapping, method_name)
+            set_method(*args, qconfig)
+
+    def set_global(self, global_qconfig_list: list[QConfigAny]) -> QConfigMultiMapping:
+        """
+        Set global QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_global()` for more info
+        """
+        self._insert_qconfig_list("global_qconfig", [], global_qconfig_list)
+        return self
+
+    def set_object_type(
+        self, object_type: Callable | str, qconfig_list: list[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set object type QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_object_type()` for more info
+        """
+        self._insert_qconfig_list("object_type_qconfigs", [object_type], qconfig_list)
+        return self
+
+    def set_module_name_regex(
+        self, module_name_regex: str, qconfig_list: list[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name_regex QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_regex()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_regex_qconfigs", [module_name_regex], qconfig_list
+        )
+        return self
+
+    def set_module_name(
+        self, module_name: str, qconfig_list: list[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name()` for more info
+        """
+        self._insert_qconfig_list("module_name_qconfigs", [module_name], qconfig_list)
+        return self
+
+    def set_module_name_object_type_order(
+        self,
+        module_name: str,
+        object_type: Callable,
+        index: int,
+        qconfig_list: list[QConfigAny],
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name_object_type_order QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_object_type_order()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_object_type_order_qconfigs",
+            [module_name, object_type, index],
+            qconfig_list,
+        )
+        return self
+
+    def __repr__(self):
+        return (
+            self.__class__.__name__
+            + " ["
+            + "".join(
+                f"\n{qconfig_mapping!r},"
+                for qconfig_mapping in self.qconfig_mappings_list
+            )
+            + "\n]"
+        )
+
+    @classmethod
+    def from_list_qconfig_mapping(
+        cls, qconfig_mapping_list: list[QConfigMapping]
+    ) -> QConfigMultiMapping:
+        """
+        Creates a QConfigMultiMapping from a list of QConfigMappings
+        """
+        new_qconfig_multi_mapping = cls()
+
+        new_qconfig_multi_mapping.qconfig_mappings_list = copy.deepcopy(
+            qconfig_mapping_list
+        )
+
+        # we need to avoid the issue described in _handle_list_size_mismatch,
+        # so we reinsert all the qconfigs using the QConfigMultiMapping
+        # set methods
+
+        # go through all qconfig styles
+        # note: global can be ignored since it is None by default
+        for style in _QCONFIG_STYLE_ORDER[1:]:
+            # gather all key+qconfigs for current style
+            # into qconfig_dict_list
+            qconfig_dict_list: dict[Any, list[QConfigAny]] = {}
+            for qconfig_mapping in qconfig_mapping_list:
+                qconfig_dict = getattr(qconfig_mapping, style)
+                for key, qconfig in qconfig_dict.items():
+                    if key not in qconfig_dict_list:
+                        qconfig_dict_list[key] = []
+                    qconfig_dict_list[key].append(qconfig)
+
+            # reinsert all gathered key+qconfigs
+            set_method_name = _QCONFIG_STYLE_TO_METHOD[style]
+            set_method = getattr(new_qconfig_multi_mapping, set_method_name)
+            for key, qconfig_list in qconfig_dict_list.items():
+                if isinstance(key, tuple):
+                    set_method(*key, qconfig_list)
+                else:
+                    set_method(key, qconfig_list)
+
+        return new_qconfig_multi_mapping
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e72ae2fd4b64ae1b529e06bb8af988a747f690
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/utils.py
@@ -0,0 +1,579 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import enum
+import operator
+from collections.abc import Callable
+
+import torch
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.quantized as nnq
+import torch.nn as nn
+from torch.ao.quantization import FakeQuantizeBase, ObserverBase
+from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from .ns_types import NSNodeTargetType, NSResultsType
+
+
+toq = torch.ops.quantized
+
+
+# TODO(future PR): consider deleting this enum and using the torch types
+# directly.  This might be tricky because it is not a one to one mapping.
+class NodeInputOrOutputType(enum.Enum):  # coarse dtype category of a node's input/output
+    FP32 = enum.auto()  # torch.float
+    INT8 = enum.auto()  # torch.qint8 or torch.quint8
+    FP16 = enum.auto()  # torch.float16
+    UNKNOWN = enum.auto()  # we cannot determine input/output dtype
+    # TODO(future PR): while these functions can support multiple dtypes,
+    #   for the purposes of numerical debugging we want to get the actual
+    #   dtype used in the model. We will likely need some kind of dtype
+    #   propagation to estimate this.
+    FP32_OR_INT8 = enum.auto()  # either torch.float or torch.quint8 or torch.qint8
+    # TODO(future PRs): dynamic quant, fake quant, etc
+
+
+def get_node_first_input_and_output_type(
+    node: Node,
+    gm: GraphModule,
+    logger_cls: Callable,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]],
+) -> tuple[NodeInputOrOutputType, NodeInputOrOutputType]:
+    """Return the (first input, output) dtype categories of `node`, recursing
+    into its first argument for ops whose dtype follows that of their input."""
+    # TODO(future PR): clean this up
+    FUNS_IO_TYPE_FP32 = node_type_to_io_type_map["funs_io_type_fp32"]
+    FUNS_IO_TYPE_FP16 = node_type_to_io_type_map["funs_io_type_fp16"]
+    FUNS_IO_TYPE_INT8 = node_type_to_io_type_map["funs_io_type_int8"]
+    FUNS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["funs_io_type_fp32_or_int8"]
+    MODS_IO_TYPE_FP32 = node_type_to_io_type_map["mods_io_type_fp32"]
+    MODS_IO_TYPE_INT8 = node_type_to_io_type_map["mods_io_type_int8"]
+    MODS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["mods_io_type_fp32_or_int8"]
+    METHS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["meths_io_type_fp32_or_int8"]
+
+    if node.op == "call_function":
+        if node.target in FUNS_IO_TYPE_FP32:
+            return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32)
+        if node.target in FUNS_IO_TYPE_FP16:
+            return (NodeInputOrOutputType.FP16, NodeInputOrOutputType.FP16)
+        elif node.target in FUNS_IO_TYPE_INT8:
+            return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8)
+        elif node.target in FUNS_IO_TYPE_FP32_OR_INT8:
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            if not isinstance(first_arg, Node):
+                raise AssertionError(f"Expected Node, got {type(first_arg)}")
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+        else:
+            return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+    elif node.op == "call_module":
+        if not isinstance(node.target, str):
+            raise AssertionError(f"Expected str, but got {type(node.target)}")
+
+        mod = getattr_from_fqn(gm, node.target)
+        is_known_fp32_or_int8_input_module = any(
+            isinstance(mod, target_type)  # type: ignore[arg-type]
+            for target_type in MODS_IO_TYPE_FP32_OR_INT8
+        )
+        if (
+            isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase))  # type: ignore[arg-type]
+            or is_known_fp32_or_int8_input_module
+        ):
+            # A logger or observer's input and output type is the output
+            # type of the preceding node.
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            if not isinstance(first_arg, Node):
+                raise AssertionError(f"Expected Node, got {type(first_arg)}")
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+        is_known_fp32_input_module = any(
+            isinstance(mod, target_type)  # type: ignore[arg-type]
+            for target_type in MODS_IO_TYPE_FP32
+        )
+        is_known_int8_input_module = any(
+            isinstance(mod, target_type)  # type: ignore[arg-type]
+            for target_type in MODS_IO_TYPE_INT8
+        )
+        if is_known_fp32_input_module:
+            return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32)
+        elif is_known_int8_input_module:
+            return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8)
+        else:
+            return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+    elif node.op == "call_method":
+        if node.target == "dequantize":
+            # Dequantize is a special node because it allows multiple input types.
+            # So, we look up the output type of the previous node and return that
+            # as the input type of this node instance.
+            prev_node = get_normalized_nth_input(node, gm, 0)
+            if not isinstance(prev_node, Node):
+                raise AssertionError(f"Expected Node, got {type(prev_node)}")
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                prev_node, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, NodeInputOrOutputType.FP32)
+
+        elif node.target == "to":
+            # to is a special node because it allows multiple input types.
+            # So, we look up the output type of the previous node and return that
+            # as the input type of this node instance. We also look up the target
+            # of to and return the correct output type.
+            prev_node = get_normalized_nth_input(node, gm, 0)
+            if not isinstance(prev_node, Node):
+                raise AssertionError(f"Expected Node, got {type(prev_node)}")
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                prev_node, gm, logger_cls, node_type_to_io_type_map
+            )
+
+            cur_node_dtype_target = get_normalized_nth_input(node, gm, 1)
+            if cur_node_dtype_target is not torch.float16:
+                raise AssertionError(
+                    f"{cur_node_dtype_target} handling needs to be added"
+                )
+
+            return (prev_node_output_type, NodeInputOrOutputType.FP16)
+
+        elif node.target in METHS_IO_TYPE_FP32_OR_INT8:
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            if not isinstance(first_arg, Node):
+                raise AssertionError(f"Expected Node, got {type(first_arg)}")
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+
+        return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+    else:
+        return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+
+def get_node_input_qparams(
+    node: Node,
+    gm: GraphModule,
+    node_type_to_io_type_map: dict[str, set[NSNodeTargetType]],
+) -> tuple[torch.Tensor | float, torch.Tensor | int] | None:
+    """
+    Returns the qparams (scale, zero_point) of the first input to `node`,
+    if they can be inferred from the graph.
+    """
+    prev_node = get_normalized_nth_input(node, gm, 0)
+
+    if not isinstance(prev_node, Node):
+        return None
+
+    MODS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["mods_io_type_fp32_or_int8"]
+
+    def _get_scale_zp_from_function_args(node, gm, scale_arg_idx, zp_arg_idx):
+        scale_node = get_normalized_nth_input(node, gm, scale_arg_idx)
+        zp_node = get_normalized_nth_input(node, gm, zp_arg_idx)
+        if not isinstance(scale_node, Node):
+            raise AssertionError(f"Expected Node, got {type(scale_node)}")
+        if not isinstance(scale_node.target, str):
+            raise AssertionError(f"Expected str, got {type(scale_node.target)}")
+        if not isinstance(zp_node, Node):
+            raise AssertionError(f"Expected Node, got {type(zp_node)}")
+        if not isinstance(zp_node.target, str):
+            raise AssertionError(f"Expected str, got {type(zp_node.target)}")
+        scale_obj = getattr_from_fqn(gm, scale_node.target)
+        zp_obj = getattr_from_fqn(gm, zp_node.target)
+        return (scale_obj, zp_obj)
+
+    if prev_node.op == "call_function":
+        # quantize - read the args directly
+        if prev_node.target is torch.quantize_per_tensor:
+            return _get_scale_zp_from_function_args(prev_node, gm, 1, 2)
+        elif prev_node.target in (toq.add, toq.add_relu, toq.mul, toq.mul_relu):
+            return _get_scale_zp_from_function_args(prev_node, gm, 2, 3)
+
+        return None
+        # TODO(future PR): handle more functionals
+        # TODO(future PR): handle functional ops which inherit qparams from input
+
+    elif prev_node.op == "call_module":
+        # get type of the module
+        if not isinstance(prev_node.target, str):
+            raise AssertionError(f"Expected str, got {type(prev_node.target)}")
+        module_obj = getattr_from_fqn(gm, prev_node.target)
+        if isinstance(
+            module_obj,
+            (
+                nnq.Linear,
+                nnq.Conv1d,
+                nnq.Conv2d,
+                nniq.ConvReLU2d,
+                nnq.Conv3d,
+                nnq.BatchNorm2d,
+                nnq.BatchNorm3d,
+                nnq.ConvTranspose1d,
+                nnq.ConvTranspose2d,
+                nnq.ELU,
+                nnq.GroupNorm,
+                nnq.InstanceNorm1d,
+                nnq.InstanceNorm2d,
+                nnq.InstanceNorm3d,
+                nnq.LayerNorm,
+                nnq.Hardswish,
+                nnq.LeakyReLU,
+                nnq.ReLU6,
+                nniq.BNReLU2d,
+                nniq.BNReLU3d,
+                nniq.ConvReLU1d,
+                nniq.ConvReLU2d,
+                nniq.ConvReLU3d,
+                nniq.LinearReLU,
+            ),
+        ):
+            return (module_obj.scale, module_obj.zero_point)  # type: ignore[return-value]
+
+        is_known_fp32_or_int8_input_module = any(
+            isinstance(module_obj, target_type)  # type: ignore[arg-type]
+            for target_type in MODS_IO_TYPE_FP32_OR_INT8
+        )
+        if is_known_fp32_or_int8_input_module:
+            return get_node_input_qparams(prev_node, gm, node_type_to_io_type_map)
+
+    return None
+
+
+def return_first_non_observer_node(
+    node: Node,
+    gm: GraphModule,
+) -> Node:
+    """
+    If node is not an observer, returns it.  If node is an observer,
+    navigates up the graph and returns the first parent which is not an
+    observer.  For example,
+
+    graph: (node_non_obs), node = node_non_obs : returns node_non_obs
+    graph: (node_non_obs -> obs0), node = obs0 : returns node_non_obs
+    graph: (node_non_obs -> obs0 -> fq0), node = fq0 : returns node_non_obs
+    """
+    if node.op == "call_module":
+        node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
+        if _is_activation_post_process(node_obj):
+            if len(node.args) != 1:
+                raise AssertionError(
+                    f"Expected node.args to have length 1, got {len(node.args)}"
+                )
+            if not isinstance(node.args[0], Node):
+                raise AssertionError(f"Expected Node, got {type(node.args[0])}")
+            node = node.args[0]
+            # code duplication intended, not worth refactoring
+            if not isinstance(node.target, str):
+                raise AssertionError(f"Expected str, got {type(node.target)}")
+            node_obj = getattr_from_fqn(gm, node.target)
+            if _is_activation_post_process(node_obj):
+                if len(node.args) != 1:
+                    raise AssertionError(
+                        f"Expected node.args to have length 1, got {len(node.args)}"
+                    )
+                if not isinstance(node.args[0], Node):
+                    raise AssertionError(f"Expected Node, got {type(node.args[0])}")
+                node = node.args[0]
+    return node
+
+
+def get_number_of_non_param_args(
+    node: Node,
+    gm: GraphModule,
+) -> int:
+    """
+    Assumes that all non-param args occur first. Returns the number of
+    non-param args expected for a node.  For example, for
+
+      F.linear(x, weight, bias)
+
+    Returns 1, because x is a non-param arg and weight and bias are params.
+    For
+
+      lstm_mod(x, hid)
+
+    Returns 2, because both x and hid are non-param args.
+    """
+    if node.op == "call_module":
+        node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
+        if isinstance(node_obj, nn.LSTM):
+            return 2
+
+    # default is 1
+    return 1
+
+
+def get_arg_indices_of_inputs_to_log(node: Node) -> list[int]:
+    """
+    Returns the indices of args of the node which we should attach
+    loggers to, if input logging is enabled.
+
+    For example,
+    * for (x + y), returns [0, 1]
+    * for (1 + y), returns [1]
+    * for (x + 1), returns [0]
+    * for (linear(x, w, b)) returns [0]
+    * by default, returns [0]
+    """
+    if len(node.args) == 0:
+        return []
+    if node.op == "call_function" and (
+        # TODO(future PR): use relationship map instead of hardcoding
+        node.target in (torch.add, torch.ops.quantized.add, operator.add)
+        or node.target in (torch.mul, torch.ops.quantized.mul, operator.mul)
+    ):
+        result = [i for i in range(2) if type(node.args[i]) is Node]
+        return result
+    return [0]
+
+
+def get_target_type_str(node: Node, gm: GraphModule) -> str:
+    """
+    Returns a string representation of the type of the function or module
+    pointed to by this node, or '' for other node types.
+    """
+    target_type = ""
+    if node.op in ("call_function", "call_method"):
+        target_type = torch.typename(node.target)
+    elif node.op == "call_module":
+        if not isinstance(node.target, str):
+            raise AssertionError(f"Expected str, got {type(node.target)}")
+        target_mod = getattr_from_fqn(gm, node.target)
+        target_type = torch.typename(target_mod)
+    return target_type
+
+
+def rekey_logger_info_on_node_name_of_model(
+    results: NSResultsType,
+    model_name: str,
+) -> NSResultsType:
+    """
+    Rekeys the layer name of a results dictionary to use node names
+    from `model_name`.
+
+    For example, transforms
+
+        {'base_op_1_0': {'node_output': {'model_a':
+          [{'ref_node_name': 'linear1', ...}]}}}
+
+    into
+
+        {'linear1': {'node_output': {'model_a':
+          [{'ref_node_name': 'linear1', ...}]}}}
+
+    Note: we cannot use these node names directly because they are not
+    guaranteed to be consistent across models. This is why we extract
+    the results first and rekey afterwards.
+    """
+    new_results = {}
+    for old_layer_name, result_type_to_results in results.items():
+        new_layer_name = None
+        for model_name_to_results in result_type_to_results.values():
+            for cur_model_name, list_of_results in model_name_to_results.items():
+                if cur_model_name == model_name:
+                    if len(list_of_results) == 0:
+                        raise AssertionError("Expected list_of_results to be not empty")
+                    new_layer_name = list_of_results[0]["ref_node_name"]
+                else:
+                    continue
+        if new_layer_name is not None:
+            new_results[new_layer_name] = result_type_to_results
+        else:
+            new_results[old_layer_name] = result_type_to_results
+    return new_results
+
+
+def maybe_add_missing_fqns(results: NSResultsType) -> None:
+    """
+    If `fqn` entries are filled in for one of the models in `results`, copies
+    them over to any models which do not have them filled out.
+
+    A common use case benefitting from this is comparing a model prepared by
+    quantization to a quantized model. In this case, the model prepared by
+    quantization would have `fqn` entries, and the quantized model would not.
+    """
+
+    # Check in the first result to find any model with fqn entries defined.
+    model_name_with_fqns = None
+    for result_type_to_results in results.values():
+        for model_name_to_results in result_type_to_results.values():
+            for model_name, model_results in model_name_to_results.items():
+                if len(model_results) > 0:
+                    if model_results[0]["fqn"] is not None:
+                        model_name_with_fqns = model_name
+                        break
+            break
+        break
+
+    if model_name_with_fqns:
+        for result_type_to_results in results.values():
+            for model_name_to_results in result_type_to_results.values():
+                ref_model_results = model_name_to_results[model_name_with_fqns]
+                for model_name, model_results in model_name_to_results.items():
+                    if model_name == model_name_with_fqns:
+                        continue
+
+                    for i in range(len(model_results)):
+                        fqn = ref_model_results[i]["fqn"]
+                        model_results[i]["fqn"] = fqn
+
+
+def maybe_dequantize_first_two_tensor_args_and_handle_tuples(f):
+    def inner(*args, **kwargs):
+        a0, a1, *a_other = args
+
+        if (isinstance(a0, tuple) and isinstance(a1, tuple)) or (
+            isinstance(a0, list) and isinstance(a1, list)
+        ):
+            results = []
+            for el0, el1 in zip(a0, a1):
+                new_args = (el0, el1, *a_other)
+                results.append(inner(*new_args, **kwargs))
+            return results
+
+        elif isinstance(a0, torch.Tensor) and isinstance(a1, torch.Tensor):
+            if a0.is_quantized:
+                a0 = a0.dequantize()
+            if a1.is_quantized:
+                a1 = a1.dequantize()
+
+        # for the purposes of this util, only handle floats
+        if a0.dtype != torch.float or a1.dtype != torch.float:
+            return None
+
+        new_args = (a0, a1, *a_other)
+        return f(*new_args, **kwargs)
+
+    return inner
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_sqnr(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the SQNR between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    Ps = torch.norm(x)
+    Pn = torch.norm(x - y)
+    return 20 * torch.log10(Ps / Pn)
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_normalized_l2_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the normalized L2 error between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    # pyrefly: ignore [unsupported-operation]
+    return torch.sqrt(((x - y) ** 2).sum() / (x**2).sum())
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the cosine similarity between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    # For convolutions, the shape of the quantized weight has one additional
+    # dimension compared to the shape of the fp32 weight. Match the shapes
+    # to enable cosine similarity comparison.
+    x = x.reshape(1, -1)
+    y = y.reshape(1, -1)
+    return torch.nn.functional.cosine_similarity(x, y)
+
+
+def op_type_supports_shadowing(node: Node) -> bool:
+    if node.op == "call_function":
+        if node.target in (
+            torch.add,
+            torch.mul,
+            operator.add,
+            operator.mul,
+            torch.cat,
+            torch.stack,
+        ):
+            # shadowing for ops with multiple tensor inputs is not implemented yet
+            return False
+    return True
+
+
+def get_normalized_nth_input(node: Node, gm: GraphModule, idx: int) -> Node:
+    """
+    Given a node, gets the n'th input to that node, normalizing
+    args and kwargs to the best of its ability.
+    """
+    try:
+        norm_args_and_kwargs = node.normalized_arguments(
+            gm, normalize_to_only_use_kwargs=True
+        )
+        if norm_args_and_kwargs is not None:
+            norm_args, norm_kwargs = norm_args_and_kwargs
+            if len(norm_args) + len(norm_kwargs) <= idx:
+                raise AssertionError(
+                    f"Index {idx} out of range: total = {len(norm_args) + len(norm_kwargs)}"
+                )
+            if idx < len(norm_args):
+                return norm_args[idx]
+            else:
+                # note: in Python 3.7+ dicts are ordered
+                return list(norm_kwargs.values())[idx]
+        else:
+            if len(node.args) + len(node.kwargs) <= idx:
+                raise AssertionError(
+                    f"Index {idx} out of range: total = {len(node.args) + len(node.kwargs)}"
+                )
+            if idx < len(node.args):
+                return node.args[idx]  # type: ignore[return-value]
+            else:
+                kwargs_idx = idx + len(node.args)
+                return list(node.kwargs.values())[kwargs_idx]  # type: ignore[return-value]
+    except RuntimeError:
+        # this RuntimeError happens when node argument normalization
+        # requires typehints to proceed, such as for torch.add where
+        # either the first, second or both arguments could be tensors
+        if len(node.args) + len(node.kwargs) <= idx:
+            raise AssertionError(
+                f"Index {idx} out of range: total = {len(node.args) + len(node.kwargs)}"
+            ) from None
+        if idx < len(node.args):
+            return node.args[idx]  # type: ignore[return-value]
+        else:
+            kwargs_idx = idx + len(node.args)
+            return list(node.kwargs.values())[kwargs_idx]  # type: ignore[return-value]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bff44215e46174856918883f35aac92b4491c25
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/ns/fx/weight_utils.py
@@ -0,0 +1,302 @@
+from collections.abc import Callable
+
+import torch
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from .ns_types import NSSingleResultType, NSSingleResultValuesType
+from .utils import get_target_type_str, getattr_from_fqn, return_first_non_observer_node
+
+
+toq = torch.ops.quantized
+
+
+def mod_weight_detach(mod: nn.Module) -> torch.Tensor:
+    return mod.weight.detach()  # type: ignore[operator]
+
+
+def mod_0_weight_detach(mod: nn.Module) -> torch.Tensor:
+    return mod[0].weight.detach()  # type: ignore[index]
+
+
+def mod_weight_bias_0(mod: nn.Module) -> torch.Tensor:
+    return mod._weight_bias()[0]  # type: ignore[operator]
+
+
+def get_lstm_weight(mod: nn.Module) -> list[torch.Tensor]:
+    res = []
+    for idx, param_name in enumerate(mod._flat_weights_names):  # type: ignore[arg-type]
+        if "weight_ih_l" in param_name or "weight_hh_l" in param_name:
+            param_value = mod._flat_weights[idx].detach()  # type: ignore[index,union-attr]
+            res.append(param_value)
+    return res
+
+
+def get_qlstm_weight(mod: nn.Module) -> list[torch.Tensor]:
+    res = []
+    for weight_value in mod._all_weight_values:  # type: ignore[union-attr]
+        res.append(weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0])
+        res.append(weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0])
+    return res
+
+
+def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor:
+    if isinstance(mod, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        return mod.weight.detach()
+    elif isinstance(mod, (nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d)):
+        return mod[0].weight.detach()  # type: ignore[operator]
+    else:
+        return mod._weight_bias()[0]  # type: ignore[operator]
+
+
+def get_linear_mod_weight(mod: nn.Module) -> torch.Tensor:
+    if isinstance(mod, nn.Linear):
+        return mod.weight.detach()
+    elif isinstance(mod, nni.LinearReLU):
+        return mod[0].weight.detach()  # type: ignore[operator]
+    else:
+        return mod._weight_bias()[0]  # type: ignore[operator]
+
+
+def get_lstm_mod_weights(mod: nn.Module) -> list[torch.Tensor]:
+    # TODO(future PR): make more generic, handle everything
+    if isinstance(mod, nn.LSTM):
+        res = []
+        for idx, param_name in enumerate(mod._flat_weights_names):
+            if "weight_ih_l" in param_name or "weight_hh_l" in param_name:
+                param_value = mod._flat_weights[idx].detach()  # type: ignore[index,union-attr]
+                res.append(param_value)
+        return res
+    else:
+        if not isinstance(mod, nnqd.LSTM):
+            raise AssertionError(f"type {type(mod)} not handled yet")
+        res = []
+        for weight_value in mod._all_weight_values:
+            res.append(
+                weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0]  # type: ignore[index]
+            )
+            res.append(
+                weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0]  # type: ignore[index]
+            )
+        return res
+
+
+def get_conv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # traverse backwards from the weight arg, accounting for any observers
+    weight_arg_node = node.args[1]
+    if not isinstance(weight_arg_node, Node):
+        raise AssertionError(f"Expected Node, got {type(weight_arg_node)}")
+    weight_node = return_first_non_observer_node(weight_arg_node, gm)
+    if not isinstance(weight_node, Node):
+        raise AssertionError(f"Expected Node, got {type(weight_node)}")
+    if weight_node.op != "get_attr":
+        raise AssertionError(f"Expected get_attr, got {weight_node.op}")
+    weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+    return weight.detach()
+
+
+def get_qconv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # qconv state is arg 1
+    qconv_state_node = node.args[1]
+    if not isinstance(qconv_state_node, Node):
+        raise AssertionError(f"Expected Node, got {type(qconv_state_node)}")
+    if qconv_state_node.op != "get_attr":
+        raise AssertionError(f"Expected get_attr, got {qconv_state_node.op}")
+    qconv_state_obj = getattr_from_fqn(gm, qconv_state_node.target)  # type: ignore[arg-type]
+    return qconv_state_obj.weight()
+
+
+def get_linear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # traverse backwards from the weight arg, accounting for any observers
+    # supported patterns:
+    # weight -> obs -> linear
+    # weight -> to(torch.float16) -> dequantize -> linear
+    linear_second_arg = node.args[1]
+    if not isinstance(linear_second_arg, Node):
+        raise AssertionError(f"Expected Node, got {type(linear_second_arg)}")
+
+    if linear_second_arg.op == "call_module":
+        # weight -> obs -> linear
+        weight_arg_node = node.args[1]
+        if not isinstance(weight_arg_node, Node):
+            raise AssertionError(f"Expected Node, got {type(weight_arg_node)}")
+        weight_node = weight_arg_node.args[0]
+        if not isinstance(weight_node, Node):
+            raise AssertionError(f"Expected Node, got {type(weight_node)}")
+        if weight_node.op != "get_attr":
+            raise AssertionError(f"Expected get_attr, got {weight_node.op}")
+        weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+        return weight.detach()
+    elif linear_second_arg.op == "call_method":
+        # weight -> to(torch.float16) -> dequantize -> linear
+        if linear_second_arg.op != "call_method":
+            raise AssertionError(f"Expected call_method, got {linear_second_arg.op}")
+        dequant_node = node.args[1]
+        if not isinstance(dequant_node, Node):
+            raise AssertionError(f"Expected Node, got {type(dequant_node)}")
+        to_fp16_node = dequant_node.args[0]
+        if not isinstance(to_fp16_node, Node):
+            raise AssertionError(f"Expected Node, got {type(to_fp16_node)}")
+        # extract the dtype, so we can cast to it before returning
+        target_dtype = to_fp16_node.args[1]
+        weight_node = to_fp16_node.args[0]
+        if not isinstance(weight_node, Node):
+            raise AssertionError(f"Expected Node, got {type(weight_node)}")
+        if weight_node.op != "get_attr":
+            raise AssertionError(f"Expected get_attr, got {weight_node.op}")
+        weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+        # return the weight with fp16 cast
+        return weight.detach().to(target_dtype)
+    else:
+        if linear_second_arg.op != "get_attr":
+            raise AssertionError(f"Expected get_attr, got {linear_second_arg.op}")
+        weight = getattr_from_fqn(gm, linear_second_arg.target)  # type: ignore[arg-type]
+        return weight.detach()
+
+
+def get_qlinear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # packed weight is arg 1
+    packed_weight_node = node.args[1]
+    if not isinstance(packed_weight_node, Node):
+        raise AssertionError(f"Expected Node, got {type(packed_weight_node)}")
+    if packed_weight_node.op != "get_attr":
+        raise AssertionError(f"Expected get_attr, got {packed_weight_node.op}")
+    packed_weight = getattr_from_fqn(gm, packed_weight_node.target)  # type: ignore[arg-type]
+    # TODO(future PR): why does packed_weight.unpack() not work?
+    (weight, _bias), _name = packed_weight.__getstate__()
+    return weight
+
+
+def get_op_to_type_to_weight_extraction_fn() -> dict[str, dict[Callable, Callable]]:
+    op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]] = {
+        "call_module": {
+            # Conv1d
+            nn.Conv1d: mod_weight_detach,
+            nni.ConvReLU1d: mod_0_weight_detach,
+            nnq.Conv1d: mod_weight_bias_0,
+            nnqat.Conv1d: mod_weight_detach,
+            nniqat.ConvBn1d: mod_weight_detach,
+            nniqat.ConvBnReLU1d: mod_weight_detach,
+            nniqat.ConvReLU1d: mod_weight_detach,
+            nniq.ConvReLU1d: mod_weight_bias_0,
+            # Conv2d
+            nn.Conv2d: mod_weight_detach,
+            nni.ConvReLU2d: mod_0_weight_detach,
+            nnq.Conv2d: mod_weight_bias_0,
+            nnqat.Conv2d: mod_weight_detach,
+            nniqat.ConvBn2d: mod_weight_detach,
+            nniqat.ConvBnReLU2d: mod_weight_detach,
+            nniqat.ConvReLU2d: mod_weight_detach,
+            nniq.ConvReLU2d: mod_weight_bias_0,
+            # Conv3d
+            nn.Conv3d: mod_weight_detach,
+            nni.ConvReLU3d: mod_0_weight_detach,
+            nnq.Conv3d: mod_weight_bias_0,
+            nnqat.Conv3d: mod_weight_detach,
+            nniqat.ConvBn3d: mod_weight_detach,
+            nniqat.ConvBnReLU3d: mod_weight_detach,
+            nniqat.ConvReLU3d: mod_weight_detach,
+            nniq.ConvReLU3d: mod_weight_bias_0,
+            # Linear
+            nn.Linear: mod_weight_detach,
+            nnq.Linear: mod_weight_bias_0,
+            nni.LinearReLU: mod_0_weight_detach,
+            nniq.LinearReLU: mod_weight_bias_0,
+            nnqat.Linear: mod_weight_detach,
+            nnqd.Linear: mod_weight_bias_0,
+            nniqat.LinearReLU: mod_weight_detach,
+            nniqat.LinearBn1d: mod_weight_detach,
+            nn.modules.linear.NonDynamicallyQuantizableLinear: mod_weight_detach,
+            # LSTM
+            nn.LSTM: get_lstm_weight,
+            nnqd.LSTM: get_qlstm_weight,
+        },
+        "call_function": {
+            # Conv
+            F.conv1d: get_conv_fun_weight,
+            F.conv2d: get_conv_fun_weight,
+            F.conv3d: get_conv_fun_weight,
+            toq.conv1d: get_qconv_fun_weight,
+            toq.conv2d: get_qconv_fun_weight,
+            toq.conv3d: get_qconv_fun_weight,
+            toq.conv1d_relu: get_qconv_fun_weight,
+            toq.conv2d_relu: get_qconv_fun_weight,
+            toq.conv3d_relu: get_qconv_fun_weight,
+            # Linear
+            F.linear: get_linear_fun_weight,
+            toq.linear: get_qlinear_fun_weight,
+            toq.linear_relu: get_qlinear_fun_weight,
+        },
+    }
+
+    return op_to_type_to_weight_extraction_fn
+
+
+def extract_weight_from_node(
+    node: Node,
+    gm: GraphModule,
+    op_to_type_to_weight_extraction_fn: dict[str, dict[Callable, Callable]]
+    | None = None,
+) -> NSSingleResultType | None:
+    res_type = NSSingleResultValuesType.WEIGHT.value
+
+    # Not all graphmodules have _node_name_to_scope, so only fill it
+    # out if it exists.
+    fqn = None
+    if hasattr(gm, "_node_name_to_scope"):
+        fqn = gm._node_name_to_scope[node.name][0]  # type: ignore[index]
+
+    if op_to_type_to_weight_extraction_fn is None:
+        op_to_type_to_weight_extraction_fn = get_op_to_type_to_weight_extraction_fn()
+
+    ref_node_type = get_target_type_str(node, gm)
+    # for extracting weights, these are always the same
+    prev_node_type = ref_node_type
+
+    if node.op == "call_function":
+        function_mapping = op_to_type_to_weight_extraction_fn["call_function"]
+        for target_fn_type, weight_extraction_fn in function_mapping.items():
+            if node.target == target_fn_type:
+                weight = weight_extraction_fn(node, gm)
+                return {
+                    "type": res_type,
+                    "values": [weight],
+                    "prev_node_name": node.name,
+                    "prev_node_target_type": prev_node_type,
+                    "ref_node_name": node.name,
+                    "ref_node_target_type": ref_node_type,
+                    "index_within_arg": 0,
+                    "index_of_arg": 0,
+                    "fqn": fqn,
+                }
+
+    elif node.op == "call_module":
+        # for call_module, we need to look up the modules to do the type check
+        if not isinstance(node.target, str):
+            raise AssertionError(f"Expected str, got {type(node.target)}")
+        mod = getattr_from_fqn(gm, node.target)
+        module_mapping = op_to_type_to_weight_extraction_fn["call_module"]
+        for target_mod_type, weight_extraction_fn in module_mapping.items():
+            if type(mod) is target_mod_type:
+                weight = weight_extraction_fn(mod)
+                return {
+                    "type": res_type,
+                    "values": [weight],
+                    "prev_node_name": node.name,
+                    "prev_node_target_type": prev_node_type,
+                    "ref_node_name": node.name,
+                    "ref_node_target_type": ref_node_type,
+                    "index_within_arg": 0,
+                    "index_of_arg": 0,
+                    "fqn": fqn,
+                }
+
+    return None
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..52fc301befd34642d51f1c27e07600a1f3ef26ff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/__init__.py
@@ -0,0 +1,23 @@
+# Variables
+from ._mappings import (
+    get_dynamic_sparse_quantized_mapping,
+    get_static_sparse_quantized_mapping,
+)
+
+# Scheduler
+from .scheduler.base_scheduler import BaseScheduler
+from .scheduler.cubic_scheduler import CubicSL
+from .scheduler.lambda_scheduler import LambdaSL
+
+# Sparsifier
+from .sparsifier.base_sparsifier import BaseSparsifier
+from .sparsifier.nearly_diagonal_sparsifier import NearlyDiagonalSparsifier
+
+# Parametrizations
+from .sparsifier.utils import (
+    FakeSparsity,
+    fqn_to_module,
+    get_arg_info_from_tensor_fqn,
+    module_to_fqn,
+)
+from .sparsifier.weight_norm_sparsifier import WeightNormSparsifier
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f421363b5dfaa2c3b89e4bfee4ac5635f0edbefe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..646a778f3102a5f8aec972bb027ba221e3b189bf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21080ae3c00beb650088da033db1f22beff9c9ca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a628bcfc8a34c8bcca7f336dba82815663bf49d1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e80dac2c4f07b80fec598edae866e965ef28d09
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fc2c4f10aef5585072f36116282a2048965197a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/_mappings.py
@@ -0,0 +1,23 @@
+# mypy: allow-untyped-defs
+__all__ = [
+    "get_static_sparse_quantized_mapping",
+    "get_dynamic_sparse_quantized_mapping",
+]
+
+
+def get_static_sparse_quantized_mapping():
+    import torch.ao.nn.sparse
+
+    _static_sparse_quantized_mapping = {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.Linear,
+    }
+    return _static_sparse_quantized_mapping
+
+
+def get_dynamic_sparse_quantized_mapping():
+    import torch.ao.nn.sparse
+
+    _dynamic_sparse_quantized_mapping = {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.dynamic.Linear,
+    }
+    return _dynamic_sparse_quantized_mapping
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07a54dc243ba800ec67e21234d63332ff759aa09
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5f3934407f43c0f3ec7f09ee74185e912f525b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5aab3d7fb0d011c21c1e8ed428e176f008be6f43
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5705d37eaf3391f31c77d9fc48d365cf91eac65f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63ba0376ccd1419ea024c840e86a863b85bdc8ab
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba65948903f8312e624210e7324512a310fa82ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3659b40d55baa55bd7773ef4abb61bcc4dc64763
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4383c7f0a73b5ea7ce5605a5f49165367e727255
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd37ddb155117d3a4bfeabe2c0cf4d24eacf36bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2efc24081b0c13d94b7ab256f635eafce8614543
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__init__.py
@@ -0,0 +1,247 @@
+# mypy: allow-untyped-defs
+
+import sys
+from collections.abc import Callable
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+
+from .fake_quantize import *  # noqa: F403
+from .fuse_modules import fuse_modules, fuse_modules_qat  # noqa: F403
+from .fuser_method_mappings import *  # noqa: F403
+from .observer import *  # noqa: F403
+from .pt2e._numeric_debugger import (  # noqa: F401
+    compare_results,
+    CUSTOM_KEY,
+    extract_results_from_loggers,
+    generate_numeric_debug_handle,
+    NUMERIC_DEBUG_HANDLE_KEY,
+    prepare_for_propagation_comparison,
+)
+from .pt2e.export_utils import (
+    _allow_exported_model_train_eval as allow_exported_model_train_eval,
+    _move_exported_model_to_eval as move_exported_model_to_eval,
+    _move_exported_model_to_train as move_exported_model_to_train,
+)
+
+# pyrefly: ignore [deprecated]
+from .qconfig import *  # noqa: F403
+from .qconfig_mapping import *  # noqa: F403
+from .quant_type import *  # noqa: F403
+from .quantization_mappings import *  # noqa: F403 # type: ignore[no-redef]
+from .quantize import *  # noqa: F403
+from .quantize_jit import *  # noqa: F403
+from .stubs import *  # noqa: F403
+
+
+# ensure __module__ is set correctly for public APIs
+if sys.version_info < (3, 12):
+    ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase]
+    ObserverOrFakeQuantize.__module__ = "torch.ao.quantization"
+else:
+    from typing import TypeAliasType
+
+    ObserverOrFakeQuantize = TypeAliasType(
+        "ObserverOrFakeQuantize", ObserverBase | FakeQuantizeBase
+    )
+
+for _f in [
+    compare_results,
+    extract_results_from_loggers,
+    generate_numeric_debug_handle,
+    prepare_for_propagation_comparison,
+]:
+    _f.__module__ = "torch.ao.quantization"
+
+__all__ = [
+    "DeQuantStub",
+    "FakeQuantize",
+    "FakeQuantizeBase",
+    "FixedQParamsFakeQuantize",
+    "FixedQParamsObserver",
+    "FusedMovingAvgObsFakeQuantize",
+    "HistogramObserver",
+    "MatchAllNode",
+    "MinMaxObserver",
+    "MovingAverageMinMaxObserver",
+    "MovingAveragePerChannelMinMaxObserver",
+    "NoopObserver",
+    "ObserverBase",
+    "ObserverOrFakeQuantize",
+    "Pattern",
+    "PerChannelMinMaxObserver",
+    "PlaceholderObserver",
+    "QConfig",
+    "QConfigAny",
+    "QConfigDynamic",
+    "QConfigMapping",
+    "QuantStub",
+    "QuantType",
+    "QuantWrapper",
+    "RecordingObserver",
+    "ReuseInputObserver",
+    "UniformQuantizationObserverBase",
+    "add_quant_dequant",
+    "convert",
+    "convert_dynamic_jit",
+    "convert_jit",
+    "default_affine_fixed_qparams_fake_quant",
+    "default_affine_fixed_qparams_observer",
+    "default_debug_observer",
+    "default_dynamic_fake_quant",
+    "default_dynamic_quant_observer",
+    "default_embedding_fake_quant",
+    "default_embedding_fake_quant_4bit",
+    "default_eval_fn",
+    "default_fake_quant",
+    "default_fixed_qparams_range_0to1_fake_quant",
+    "default_fixed_qparams_range_0to1_observer",
+    "default_fixed_qparams_range_neg1to1_fake_quant",
+    "default_fixed_qparams_range_neg1to1_observer",
+    "default_float_qparams_observer",
+    "default_float_qparams_observer_4bit",
+    "default_fused_act_fake_quant",
+    "default_fused_per_channel_wt_fake_quant",
+    "default_fused_wt_fake_quant",
+    "default_histogram_fake_quant",
+    "default_histogram_observer",
+    "default_observer",
+    "default_per_channel_weight_fake_quant",
+    "default_per_channel_weight_observer",
+    "default_placeholder_observer",
+    "default_reuse_input_observer",
+    "default_symmetric_fixed_qparams_fake_quant",
+    "default_symmetric_fixed_qparams_observer",
+    "default_weight_fake_quant",
+    "default_weight_observer",
+    "disable_fake_quant",
+    "disable_observer",
+    "enable_fake_quant",
+    "enable_observer",
+    "fuse_conv_bn",
+    "fuse_conv_bn_jit",
+    "fuse_conv_bn_relu",
+    "fuse_convtranspose_bn",
+    "fuse_linear_bn",
+    "fuse_modules",
+    "fuse_modules_qat",
+    "fused_per_channel_wt_fake_quant_range_neg_127_to_127",
+    "fused_wt_fake_quant_range_neg_127_to_127",
+    "get_combined_dict",
+    "get_default_compare_output_module_list",
+    "get_default_custom_config_dict",
+    "get_default_dynamic_quant_module_mappings",
+    "get_default_dynamic_sparse_quant_module_mappings",
+    "get_default_float_to_quantized_operator_mappings",
+    "get_default_qat_module_mappings",
+    "get_default_qat_qconfig",
+    "get_default_qat_qconfig_dict",
+    "get_default_qat_qconfig_mapping",
+    "get_default_qconfig",
+    "get_default_qconfig_dict",
+    "get_default_qconfig_mapping",
+    "get_default_qconfig_propagation_list",
+    "get_default_static_quant_module_mappings",
+    "get_default_static_quant_reference_module_mappings",
+    "get_default_static_sparse_quant_module_mappings",
+    "get_dynamic_quant_module_class",
+    "get_embedding_qat_module_mappings",
+    "get_embedding_static_quant_module_mappings",
+    "get_fuser_method",
+    "get_fuser_method_new",
+    "get_observer_state_dict",
+    "get_quantized_operator",
+    "get_static_quant_module_class",
+    "load_observer_state_dict",
+    "move_exported_model_to_eval",
+    "move_exported_model_to_train",
+    "allow_exported_model_train_eval",
+    "no_observer_set",
+    "per_channel_weight_observer_range_neg_127_to_127",
+    "prepare",
+    "prepare_dynamic_jit",
+    "prepare_jit",
+    "prepare_qat",
+    "propagate_qconfig_",
+    "qconfig_equals",
+    "quantize",
+    "quantize_dynamic",
+    "quantize_dynamic_jit",
+    "quantize_jit",
+    "quantize_qat",
+    "script_qconfig",
+    "script_qconfig_dict",
+    "swap_module",
+    "weight_observer_range_neg_127_to_127",
+    "generate_numeric_debug_handle",
+    "CUSTOM_KEY",
+    "NUMERIC_DEBUG_HANDLE_KEY",
+    "prepare_for_propagation_comparison",
+    "extract_results_from_loggers",
+    "compare_results",
+    # from torchao, should be merged with torchao
+    # in the future
+    "AffineQuantizedObserverBase",
+    "Granularity",
+    "MappingType",
+    "PerAxis",
+    "PerBlock",
+    "PerGroup",
+    "PerRow",
+    "PerTensor",
+    "PerToken",
+    "TorchAODType",
+    "ZeroPointDomain",
+    "get_block_size",
+]
+
+
+def default_eval_fn(model, calib_data):
+    r"""Define the default evaluation function.
+
+    Default evaluation function takes a torch.utils.data.Dataset or a list of
+    input Tensors and run the model on the dataset
+    """
+    for data, _target in calib_data:
+        model(data)
+
+
+class _DerivedObserverOrFakeQuantize(ObserverBase):
+    r"""This observer is used to describe an observer whose quantization parameters
+    are derived from other observers
+    """
+
+    def __init__(
+        self,
+        dtype: torch.dtype,
+        obs_or_fqs: list[ObserverOrFakeQuantize],
+        derive_qparams_fn: Callable[
+            [list[ObserverOrFakeQuantize]], tuple[Tensor, Tensor]
+        ],
+        quant_min: int | None = None,
+        quant_max: int | None = None,
+        qscheme: torch.qscheme | None = None,
+        ch_axis: int | None = None,
+    ):
+        super().__init__(dtype)
+        self.obs_or_fqs = obs_or_fqs
+        self.derive_qparams_fn = derive_qparams_fn
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.qscheme = qscheme
+        self.ch_axis = ch_axis
+
+        from .utils import is_per_channel
+
+        if is_per_channel(self.qscheme):
+            if self.ch_axis is None:
+                raise AssertionError(
+                    "Must provide a valid ch_axis if qscheme is per channel"
+                )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x
+
+    def calculate_qparams(self):  # type:ignore[override]
+        return self.derive_qparams_fn(self.obs_or_fqs)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90d024f87660250d10128e8042312fe55d37560a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50c4b8feb6400d8a6b68f0ff80c7fafef3b556a4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f29e5a6cbe8a8adeeeb299289305d20778040733
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/observer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e47f8810eaebcec5d6b00fe50f30bef2621abcb0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71c09c388aa7bbf7059b8b5f343f58b54811303c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3e89847b98e212ce24f1d86e400e54ca28f4ee2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0025f83a91e50570d030b1de3decd357d1ed9990
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..081e0f9e3569670ae81232a6ce487149d77731f7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..4309e4530cb72bd6620a69527cbe87e2a533c323
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_correct_bias.py
@@ -0,0 +1,156 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.ao.nn.quantized as nnq
+import torch.ao.ns._numeric_suite as ns
+import torch.ao.quantization
+import torch.nn as nn
+
+
+__all__ = [
+    "get_module",
+    "parent_child_names",
+    "get_param",
+    "MeanShadowLogger",
+    "bias_correction",
+]
+
+_supported_modules = {nn.Linear, nn.Conv2d}
+_supported_modules_quantized = {nnq.Linear, nnq.Conv2d}
+
+
+def get_module(model, name):
+    """Given name of submodule, this function grabs the submodule from given model."""
+    return dict(model.named_modules())[name]
+
+
+def parent_child_names(name):
+    """Split full name of submodule into parent submodule's full name and submodule's name."""
+    split_name = name.rsplit(".", 1)
+    if len(split_name) == 1:
+        return "", split_name[0]
+    else:
+        return split_name[0], split_name[1]
+
+
+def get_param(module, attr):
+    """Get the parameter given a module and attribute.
+
+    Sometimes the weights/bias attribute gives you the raw tensor, but sometimes
+    gives a function that will give you the raw tensor, this function takes care of that logic
+    """
+    param = getattr(module, attr, None)
+    if callable(param):
+        return param()
+    else:
+        return param
+
+
+class MeanShadowLogger(ns.Logger):
+    """Mean Logger for a Shadow module.
+
+    A logger for a Shadow module whose purpose is to record the rolling mean
+    of the data passed to the floating point and quantized models
+    """
+
+    def __init__(self):
+        """Set up initial values for float and quantized stats, count, float sum, and quant sum."""
+        super().__init__()
+        self.stats["float"] = None
+        self.stats["quantized"] = None
+        self.count = 0
+        self.float_sum = None
+        self.quant_sum = None
+
+    def forward(self, x, y):  # type: ignore[override]
+        """Compute the average of quantized and floating-point data from modules.
+
+        The inputs x,y are output data from the quantized and floating-point modules.
+        x is for the quantized module, y is for the floating point module
+        """
+        if x.is_quantized:
+            x = x.dequantize()
+
+        self.count += 1
+        if self.stats["quantized"] is None:
+            self.stats["quantized"] = x
+            self.quant_sum = x
+        else:
+            self.quant_sum += x
+            self.stats["quantized"] = self.quant_sum / self.count
+
+        if self.stats["float"] is None:
+            self.stats["float"] = y
+            self.float_sum = y
+        else:
+            self.float_sum += y
+            self.stats["float"] = self.float_sum / self.count
+
+    def clear(self):
+        self.stats["float"] = None
+        self.stats["quantized"] = None
+        self.count = 0
+        self.float_sum = None
+        self.quant_sum = None
+
+
+def bias_correction(
+    float_model,
+    quantized_model,
+    img_data,
+    target_modules=_supported_modules_quantized,
+    neval_batches=None,
+):
+    """Perform bias correction on a module.
+
+    Using numeric suite shadow module, the expected output of the floating point and quantized modules
+    is recorded. Using that data the bias of supported modules is shifted to compensate for the drift caused
+    by quantization
+    Paper reference: https://arxiv.org/pdf/1906.04721.pdf (Section 4.2)
+
+    Args:
+        float_model: a trained model that serves as a reference to what bias correction should aim for
+        quantized_model: quantized form of float_model that bias correction is to applied to
+        img_data: calibration data to estimate the expected output (used to find quantization error)
+        target_modules: specifies what submodules in quantized_model need bias correction (can be extended to
+                unquantized submodules)
+        neval_batches: a cap to the number of batches you want to be used for estimating the expected output
+    """
+    ns.prepare_model_with_stubs(
+        float_model, quantized_model, _supported_modules, MeanShadowLogger
+    )
+
+    uncorrected_modules = {
+        name: submodule
+        for name, submodule in quantized_model.named_modules()
+        if type(submodule) in target_modules
+    }
+
+    for uncorrected_module in uncorrected_modules:
+        quantized_submodule = get_module(quantized_model, uncorrected_module)
+        bias = get_param(quantized_submodule, "bias")
+        if bias is not None:
+            for count, data in enumerate(img_data, start=1):
+                quantized_model(data[0])
+                if count == neval_batches:
+                    break
+            ob_dict = ns.get_logger_dict(quantized_model)
+            parent_name, _ = parent_child_names(uncorrected_module)
+
+            float_data = ob_dict[parent_name + ".stats"]["float"]
+            quant_data = ob_dict[parent_name + ".stats"]["quantized"]
+
+            # math for expected_error
+            quantization_error = quant_data - float_data
+            dims = list(range(quantization_error.dim()))
+            # Note: we don't want to take the mean over the output channel dimension
+            dims.remove(1)
+            expected_error = torch.mean(quantization_error, dims)
+
+            updated_bias = bias.data - expected_error
+
+            bias.data = updated_bias
+
+            # Resets the data contained in the loggers
+            for submodule in quantized_model.modules():
+                if isinstance(submodule, MeanShadowLogger):
+                    submodule.clear()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ff327f285aa4c17f05a9cbf61b7323a0536a12
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_equalize.py
@@ -0,0 +1,279 @@
+# mypy: allow-untyped-defs
+import copy
+from itertools import chain
+from typing import Any
+
+import torch
+
+
+__all__ = [
+    "set_module_weight",
+    "set_module_bias",
+    "has_bias",
+    "get_module_weight",
+    "get_module_bias",
+    "max_over_ndim",
+    "min_over_ndim",
+    "channel_range",
+    "get_name_by_module",
+    "cross_layer_equalization",
+    "process_paired_modules_list_to_name",
+    "expand_groups_in_paired_modules_list",
+    "equalize",
+    "converged",
+]
+
+_supported_types = {torch.nn.Conv2d, torch.nn.Linear, torch.nn.Conv1d}
+_supported_intrinsic_types = {
+    torch.ao.nn.intrinsic.ConvReLU2d,
+    torch.ao.nn.intrinsic.LinearReLU,
+    torch.ao.nn.intrinsic.ConvReLU1d,
+}
+_all_supported_types = _supported_types.union(_supported_intrinsic_types)
+
+
+def set_module_weight(module, weight) -> None:
+    if type(module) in _supported_types:
+        module.weight = torch.nn.Parameter(weight)
+    else:
+        module[0].weight = torch.nn.Parameter(weight)
+
+
+def set_module_bias(module, bias) -> None:
+    if type(module) in _supported_types:
+        module.bias = torch.nn.Parameter(bias)
+    else:
+        module[0].bias = torch.nn.Parameter(bias)
+
+
+def has_bias(module) -> bool:
+    if type(module) in _supported_types:
+        return module.bias is not None
+    else:
+        return module[0].bias is not None
+
+
+def get_module_weight(module):
+    if type(module) in _supported_types:
+        return module.weight
+    else:
+        return module[0].weight
+
+
+def get_module_bias(module):
+    if type(module) in _supported_types:
+        return module.bias
+    else:
+        return module[0].bias
+
+
+def max_over_ndim(input, axis_list, keepdim=False):
+    """Apply 'torch.max' over the given axes."""
+    axis_list.sort(reverse=True)
+    for axis in axis_list:
+        input, _ = input.max(axis, keepdim)
+    return input
+
+
+def min_over_ndim(input, axis_list, keepdim=False):
+    """Apply 'torch.min' over the given axes."""
+    axis_list.sort(reverse=True)
+    for axis in axis_list:
+        input, _ = input.min(axis, keepdim)
+    return input
+
+
+def channel_range(input, axis=0):
+    """Find the range of weights associated with a specific channel."""
+    size_of_tensor_dim = input.ndim
+    axis_list = list(range(size_of_tensor_dim))
+    axis_list.remove(axis)
+
+    mins = min_over_ndim(input, axis_list)
+    maxs = max_over_ndim(input, axis_list)
+
+    if mins.size(0) != input.size(axis):
+        raise AssertionError(
+            "Dimensions of resultant channel range does not match size of requested axis"
+        )
+    return maxs - mins
+
+
+def get_name_by_module(model, module):
+    """Get the name of a module within a model.
+
+    Args:
+        model: a model (nn.module) that equalization is to be applied on
+        module: a module within the model
+
+    Returns:
+        name: the name of the module within the model
+    """
+    for name, m in model.named_modules():
+        if m is module:
+            return name
+    raise ValueError("module is not in the model")
+
+
+def cross_layer_equalization(module1, module2, output_axis=0, input_axis=1):
+    """Scale the range of Tensor1.output to equal Tensor2.input.
+
+    Given two adjacent tensors', the weights are scaled such that
+    the ranges of the first tensors' output channel are equal to the
+    ranges of the second tensors' input channel
+    """
+    if (
+        type(module1) not in _all_supported_types
+        or type(module2) not in _all_supported_types
+    ):
+        raise ValueError(
+            "module type not supported:", type(module1), " ", type(module2)
+        )
+
+    bias = get_module_bias(module1) if has_bias(module1) else None
+
+    weight1 = get_module_weight(module1)
+    weight2 = get_module_weight(module2)
+
+    if weight1.size(output_axis) != weight2.size(input_axis):
+        raise TypeError(
+            "Number of output channels of first arg do not match \
+        number input channels of second arg"
+        )
+
+    weight1_range = channel_range(weight1, output_axis)
+    weight2_range = channel_range(weight2, input_axis)
+
+    # producing scaling factors to applied
+    weight2_range += 1e-9
+    scaling_factors = torch.sqrt(weight1_range / weight2_range)
+    inverse_scaling_factors = torch.reciprocal(scaling_factors)
+
+    if bias is not None:
+        bias = bias * inverse_scaling_factors
+
+    # formatting the scaling (1D) tensors to be applied on the given argument tensors
+    # pads axis to (1D) tensors to then be broadcasted
+    size1 = [1] * weight1.ndim
+    size1[output_axis] = weight1.size(output_axis)
+    size2 = [1] * weight2.ndim
+    size2[input_axis] = weight2.size(input_axis)
+
+    scaling_factors = torch.reshape(scaling_factors, size2)
+    inverse_scaling_factors = torch.reshape(inverse_scaling_factors, size1)
+
+    weight1 = weight1 * inverse_scaling_factors
+    weight2 = weight2 * scaling_factors
+
+    set_module_weight(module1, weight1)
+    if bias is not None:
+        set_module_bias(module1, bias)
+    set_module_weight(module2, weight2)
+
+
+def process_paired_modules_list_to_name(model, paired_modules_list):
+    """Processes a list of paired modules to a list of names of paired modules."""
+
+    for group in paired_modules_list:
+        for i, item in enumerate(group):
+            if isinstance(item, torch.nn.Module):
+                group[i] = get_name_by_module(model, item)
+            elif not isinstance(item, str):
+                raise TypeError("item must be a nn.Module or a string")
+    return paired_modules_list
+
+
+def expand_groups_in_paired_modules_list(paired_modules_list):
+    """Expands module pair groups larger than two into groups of two modules."""
+    new_list = []
+
+    for group in paired_modules_list:
+        if len(group) == 1:
+            raise ValueError("Group must have at least two modules")
+        elif len(group) == 2:
+            new_list.append(group)
+        elif len(group) > 2:
+            new_list.extend([group[i], group[i + 1]] for i in range(len(group) - 1))
+
+    return new_list
+
+
+def equalize(model, paired_modules_list, threshold=1e-4, inplace=True):
+    """Equalize modules until convergence is achieved.
+
+    Given a list of adjacent modules within a model, equalization will
+    be applied between each pair, this will repeated until convergence is achieved
+
+    Keeps a copy of the changing modules from the previous iteration, if the copies
+    are not that different than the current modules (determined by converged_test),
+    then the modules have converged enough that further equalizing is not necessary
+
+    Reference is section 4.1 of this paper https://arxiv.org/pdf/1906.04721.pdf
+
+    Args:
+        model: a model (nn.Module) that equalization is to be applied on
+            paired_modules_list (List(List[nn.module || str])): a list of lists
+            where each sublist is a pair of two submodules found in the model,
+            for each pair the two modules have to be adjacent in the model,
+            with only piece-wise-linear functions like a (P)ReLU or LeakyReLU in between
+            to get expected results.
+            The list can contain either modules, or names of modules in the model.
+            If you pass multiple modules in the same list, they will all be equalized together.
+            threshold (float): a number used by the converged function to determine what degree
+            of similarity between models is necessary for them to be called equivalent
+        inplace (bool): determines if function is inplace or not
+    """
+
+    paired_modules_list = process_paired_modules_list_to_name(
+        model, paired_modules_list
+    )
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    paired_modules_list = expand_groups_in_paired_modules_list(paired_modules_list)
+
+    name_to_module: dict[str, torch.nn.Module] = {}
+    previous_name_to_module: dict[str, Any] = {}
+    name_set = set(chain.from_iterable(paired_modules_list))
+
+    for name, module in model.named_modules():
+        if name in name_set:
+            name_to_module[name] = module
+            previous_name_to_module[name] = None
+    while not converged(name_to_module, previous_name_to_module, threshold):
+        for pair in paired_modules_list:
+            previous_name_to_module[pair[0]] = copy.deepcopy(name_to_module[pair[0]])
+            previous_name_to_module[pair[1]] = copy.deepcopy(name_to_module[pair[1]])
+
+            cross_layer_equalization(name_to_module[pair[0]], name_to_module[pair[1]])
+
+    return model
+
+
+def converged(curr_modules, prev_modules, threshold=1e-4):
+    """Test whether modules are converged to a specified threshold.
+
+    Tests for the summed norm of the differences between each set of modules
+    being less than the given threshold
+
+    Takes two dictionaries mapping names to modules, the set of names for each dictionary
+    should be the same, looping over the set of names, for each name take the difference
+    between the associated modules in each dictionary
+
+    """
+    if curr_modules.keys() != prev_modules.keys():
+        raise ValueError(
+            "The keys to the given mappings must have the same set of names of modules"
+        )
+
+    summed_norms = torch.tensor(0.0)
+    if None in prev_modules.values():
+        return False
+    for name in curr_modules:
+        curr_weight = get_module_weight(curr_modules[name])
+        prev_weight = get_module_weight(prev_modules[name])
+
+        difference = curr_weight.sub(prev_weight)
+        summed_norms += torch.norm(difference)
+    return bool(summed_norms < threshold)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b824f8d1ecfe2086576eb3a4c16c4321e9e892
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/_learnable_fake_quantize.py
@@ -0,0 +1,199 @@
+# mypy: allow-untyped-defs
+
+import torch
+from torch.nn.parameter import Parameter
+
+
+__all__: list[str] = []
+
+
+class _LearnableFakeQuantize(torch.ao.quantization.FakeQuantizeBase):
+    r"""Generalized extension of the FakeQuantize module in fake_quantize.py.
+
+    This is an extension of the FakeQuantize module in fake_quantize.py, which
+    supports more generalized lower-bit quantization and supports learning of the scale
+    and zero point parameters through backpropagation.
+
+    In addition to the attributes in the original FakeQuantize module, the _LearnableFakeQuantize
+    module also includes the following attributes to support quantization parameter learning.
+
+    * :attr:`channel_len` defines the length of the channel when initializing scale and zero point
+      for the per channel case.
+
+    * :attr:`use_grad_scaling` defines the flag for whether the gradients for scale and zero point are
+      normalized by the constant, which is proportional to the square root of the number of
+      elements in the tensor. The related literature justifying the use of this particular constant
+      can be found here: https://openreview.net/pdf?id=rkgO66VKDS.
+
+    * :attr:`fake_quant_enabled` defines the flag for enabling fake quantization on the output.
+
+    * :attr:`static_enabled` defines the flag for using observer's static estimation for
+      scale and zero point.
+
+    * :attr:`learning_enabled` defines the flag for enabling backpropagation for scale and zero point.
+    """
+
+    def __init__(
+        self,
+        observer,
+        quant_min=0,
+        quant_max=255,
+        scale=1.0,
+        zero_point=0.0,
+        channel_len=-1,
+        use_grad_scaling=False,
+        **observer_kwargs,
+    ):
+        """Build the observer and the learnable scale / zero-point parameters.
+
+        Args:
+            observer: callable invoked as ``observer(**observer_kwargs)`` to create
+                ``activation_post_process``.
+            quant_min: lower bound of the quantized range; must be strictly less
+                than ``quant_max`` and not below the minimum of the observer's dtype.
+            quant_max: upper bound of the quantized range; must not exceed the
+                maximum of the observer's dtype.
+            scale: initial value for every element of the ``scale`` parameter.
+            zero_point: initial value for every element of the ``zero_point`` parameter.
+            channel_len: ``-1`` creates single-element parameters; any other value
+                must be a positive ``int`` and sets the parameter length.
+            use_grad_scaling: when true, ``forward`` uses a gradient factor of
+                ``1 / sqrt(X.numel() * quant_max)`` instead of ``1.0``.
+            **observer_kwargs: extra keyword arguments forwarded to ``observer``;
+                ``quant_min`` and ``quant_max`` are always added to them.
+
+        Raises:
+            AssertionError: if ``quant_min >= quant_max``, if ``channel_len`` is
+                neither ``-1`` nor a positive int, or if the range does not fit
+                the observer's dtype.
+        """
+        super().__init__()
+        if quant_min >= quant_max:
+            raise AssertionError("quant_min must be strictly less than quant_max.")
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        # also pass quant_min and quant_max to observer
+        observer_kwargs["quant_min"] = quant_min
+        observer_kwargs["quant_max"] = quant_max
+        self.use_grad_scaling = use_grad_scaling
+        if channel_len == -1:
+            # Per-tensor case: one scale and one zero point.
+            self.scale = Parameter(torch.tensor([scale]))
+            self.zero_point = Parameter(torch.tensor([zero_point]))
+        else:
+            if not (isinstance(channel_len, int) and channel_len > 0):
+                raise AssertionError("Channel size must be a positive integer.")
+            # Per-channel case: one scale / zero point per channel, all starting equal.
+            self.scale = Parameter(torch.tensor([scale] * channel_len))
+            self.zero_point = Parameter(torch.tensor([zero_point] * channel_len))
+
+        self.activation_post_process = observer(**observer_kwargs)
+        # The range can only be validated once the observer exists, because the
+        # dtype is read from the constructed observer.
+        if torch.iinfo(self.activation_post_process.dtype).min > quant_min:
+            raise AssertionError("quant_min out of bound")
+        if quant_max > torch.iinfo(self.activation_post_process.dtype).max:
+            raise AssertionError("quant_max out of bound")
+        self.dtype = self.activation_post_process.dtype
+        self.qscheme = self.activation_post_process.qscheme
+        # Observers without a channel axis are given the placeholder value -1.
+        self.ch_axis = (
+            self.activation_post_process.ch_axis
+            if hasattr(self.activation_post_process, "ch_axis")
+            else -1
+        )
+        # Initial mode: fake quantization on, observer-driven estimates on,
+        # parameter learning off.
+        self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer("static_enabled", torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer("learning_enabled", torch.tensor([0], dtype=torch.uint8))
+
+        bitrange = torch.tensor(quant_max - quant_min + 1).double()
+        # int() truncates, so a range that is not an exact power of two rounds down.
+        self.bitwidth = int(torch.log2(bitrange).item())
+        # Lower bound applied to ``scale`` wherever it is clamped.
+        self.register_buffer("eps", torch.tensor([torch.finfo(torch.float32).eps]))
+
+    @torch.jit.export
+    def enable_param_learning(self):
+        r"""Enable parameter learning over static observer estimates.
+
+        Enables learning of quantization parameters and
+        disables static observer estimates. Forward path returns fake quantized X.
+
+        Returns ``self``.
+        """
+        self.toggle_qparam_learning(enabled=True).toggle_fake_quant(
+            enabled=True
+        ).toggle_observer_update(enabled=False)
+        return self
+
+    @torch.jit.export
+    def enable_static_estimate(self):
+        """Enable static estimates of quantization parameters.
+
+        Enables static observer estimates and disables learning of
+        quantization parameters. Forward path returns fake quantized X.
+
+        Unlike ``enable_param_learning``, this method returns ``None``.
+        """
+        self.toggle_qparam_learning(enabled=False).toggle_fake_quant(
+            enabled=True
+        ).toggle_observer_update(enabled=True)
+
+    @torch.jit.export
+    def enable_static_observation(self):
+        """Enable accumulation of data without updating quantization parameters.
+
+        Enables static observer accumulating data from input but doesn't
+        update the quantization parameters. Forward path returns the original X.
+
+        Unlike ``enable_param_learning``, this method returns ``None``.
+        """
+        self.toggle_qparam_learning(enabled=False).toggle_fake_quant(
+            enabled=False
+        ).toggle_observer_update(enabled=True)
+
+    @torch.jit.export
+    def toggle_observer_update(self, enabled=True):
+        """Write ``int(enabled)`` into the ``static_enabled`` flag and return ``self``."""
+        self.static_enabled[0] = int(enabled)  # type: ignore[operator]
+        return self
+
+    @torch.jit.export
+    def enable_observer(self, enabled=True):
+        """Delegate to ``toggle_observer_update(enabled)``; returns ``None``."""
+        self.toggle_observer_update(enabled)
+
+    @torch.jit.export
+    def toggle_qparam_learning(self, enabled=True):
+        """Set the ``learning_enabled`` flag and ``requires_grad`` of both parameters.
+
+        Returns ``self`` so calls can be chained.
+        """
+        self.learning_enabled[0] = int(enabled)  # type: ignore[operator]
+        self.scale.requires_grad = enabled
+        self.zero_point.requires_grad = enabled
+        return self
+
+    @torch.jit.export
+    def toggle_fake_quant(self, enabled=True):
+        """Write ``int(enabled)`` into the ``fake_quant_enabled`` flag and return ``self``."""
+        self.fake_quant_enabled[0] = int(enabled)
+        return self
+
+    @torch.jit.export
+    def observe_quant_params(self):
+        """Print the detached ``scale`` and ``zero_point`` parameters."""
+        print(f"_LearnableFakeQuantize Scale: {self.scale.detach()}")
+        print(f"_LearnableFakeQuantize Zero Point: {self.zero_point.detach()}")
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        """Return ``(scale, zero_point)`` derived from the learnable parameters.
+
+        ``scale`` is first clamped in place to be at least ``eps`` and then returned
+        detached. ``zero_point`` is detached, rounded, clamped to
+        ``[quant_min, quant_max]`` and converted to ``long``; the stored
+        ``zero_point`` parameter itself is not modified.
+        """
+        self.scale.data.clamp_(min=self.eps.item())  # type: ignore[operator]
+        scale = self.scale.detach()
+        zero_point = (
+            self.zero_point.detach()
+            .round()
+            .clamp(self.quant_min, self.quant_max)
+            .long()
+        )
+        return scale, zero_point
+
+    def forward(self, X):
+        """Optionally refresh the qparams from the observer, then fake quantize ``X``.
+
+        When ``static_enabled`` is set, the observer is run on ``X.detach()`` and its
+        computed scale / zero point overwrite the parameter data. Otherwise only
+        ``scale`` is clamped to be at least ``eps``. When ``fake_quant_enabled`` is
+        set, ``X`` is passed through the learnable per-channel or per-tensor fake
+        quantize op; otherwise ``X`` is returned unchanged.
+        """
+        if self.static_enabled[0] == 1:  # type: ignore[index]
+            self.activation_post_process(X.detach())
+            _scale, _zero_point = self.activation_post_process.calculate_qparams()
+            _scale = _scale.to(self.scale.device)
+            _zero_point = _zero_point.to(self.zero_point.device)
+            # Written through ``.data`` so the copy is not recorded by autograd.
+            self.scale.data.copy_(_scale)
+            self.zero_point.data.copy_(_zero_point)
+        else:
+            self.scale.data.clamp_(min=self.eps.item())  # type: ignore[operator]
+
+        if self.fake_quant_enabled[0] == 1:
+            # Symmetric schemes force the stored zero point to zero.
+            if self.qscheme in (
+                torch.per_channel_symmetric,
+                torch.per_tensor_symmetric,
+            ):
+                self.zero_point.data.zero_()
+
+            if self.use_grad_scaling:
+                grad_factor = 1.0 / (X.numel() * self.quant_max) ** 0.5
+            else:
+                grad_factor = 1.0
+            if self.qscheme in (torch.per_channel_symmetric, torch.per_channel_affine):
+                X = torch._fake_quantize_learnable_per_channel_affine(
+                    X,
+                    self.scale,
+                    self.zero_point,
+                    self.ch_axis,
+                    self.quant_min,
+                    self.quant_max,
+                    grad_factor,
+                )
+            else:
+                X = torch._fake_quantize_learnable_per_tensor_affine(
+                    X,
+                    self.scale,
+                    self.zero_point,
+                    self.quant_min,
+                    self.quant_max,
+                    grad_factor,
+                )
+
+        return X
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4a380946c8a06dd884680fc52cf1350f49772f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fake_quantize.py
@@ -0,0 +1,663 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+"""Implements modules  used to perform fake quantization."""
+
+import re
+from abc import ABC, abstractmethod
+from typing import Any
+
+import torch
+from torch.ao.quantization.observer import (
+    _with_args,
+    default_fixed_qparams_range_0to1_observer,
+    default_fixed_qparams_range_neg1to1_observer,
+    FixedQParamsObserver,
+    HistogramObserver,
+    MovingAverageMinMaxObserver,
+    MovingAveragePerChannelMinMaxObserver,
+)
+from torch.nn import Module
+
+
+__all__ = [
+    "FakeQuantizeBase",
+    "FakeQuantize",
+    "FixedQParamsFakeQuantize",
+    "FusedMovingAvgObsFakeQuantize",
+    "disable_fake_quant",
+    "disable_observer",
+    "enable_fake_quant",
+    "enable_observer",
+    "default_fake_quant",
+    "default_weight_fake_quant",
+    "default_dynamic_fake_quant",
+    "default_fixed_qparams_range_neg1to1_fake_quant",
+    "default_fixed_qparams_range_0to1_fake_quant",
+    "default_symmetric_fixed_qparams_fake_quant",
+    "default_affine_fixed_qparams_fake_quant",
+    "default_per_channel_weight_fake_quant",
+    "default_embedding_fake_quant",
+    "default_embedding_fake_quant_4bit",
+    "default_histogram_fake_quant",
+    "default_fused_act_fake_quant",
+    "default_fused_wt_fake_quant",
+    "default_fused_per_channel_wt_fake_quant",
+    "fused_wt_fake_quant_range_neg_127_to_127",
+    "fused_per_channel_wt_fake_quant_range_neg_127_to_127",
+]
+
+
+def _is_per_channel(qscheme: "torch.qscheme") -> bool:
+    return qscheme in [
+        torch.per_channel_symmetric,
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+    ]
+
+
+def _is_per_tensor(qscheme: "torch.qscheme") -> bool:
+    return qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]
+
+
+def _is_symmetric_quant(qscheme: "torch.qscheme") -> bool:
+    return qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]
+
+
+def _is_float_qparams(qscheme: "torch.qscheme") -> bool:
+    return qscheme == torch.per_channel_affine_float_qparams
+
+
+class FakeQuantizeBase(ABC, Module):
+    r"""Base fake quantize module.
+
+    Base fake quantize module
+    Any fake quantize implementation should derive from this class.
+
+    Concrete fake quantize module should follow the same API. In forward, they will update
+    the statistics of the observed Tensor and fake quantize the input. They should also provide a
+    `calculate_qparams` function that computes the quantization parameters given
+    the collected statistics.
+
+    """
+
+    # Single-element flag buffers registered in __init__; 1 means enabled, 0 disabled.
+    fake_quant_enabled: torch.Tensor
+    observer_enabled: torch.Tensor
+
+    def __init__(self) -> None:
+        """Set fake_quant_enabled and observer_enabled."""
+        super().__init__()
+        # fake_quant_enabled and observer_enabled are buffers to support their
+        # replication in DDP. Data type is uint8 because NCCL does not support
+        # bool tensors.
+        self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.uint8))
+
+    @abstractmethod
+    def forward(self, x):
+        """Process ``x``; must be implemented by every concrete subclass."""
+        pass
+
+    @abstractmethod
+    def calculate_qparams(self, **kwargs):
+        """Compute the quantization parameters; must be implemented by every concrete subclass."""
+        pass
+
+    @torch.jit.export
+    def enable_fake_quant(self, enabled: bool = True) -> None:
+        """Write 1 into ``fake_quant_enabled`` when ``enabled`` is true, otherwise 0."""
+        self.fake_quant_enabled[0] = 1 if enabled else 0
+
+    @torch.jit.export
+    def disable_fake_quant(self):
+        """Shorthand for ``enable_fake_quant(False)``."""
+        self.enable_fake_quant(False)
+
+    @torch.jit.export
+    def enable_observer(self, enabled: bool = True) -> None:
+        """Write 1 into ``observer_enabled`` when ``enabled`` is true, otherwise 0."""
+        self.observer_enabled[0] = 1 if enabled else 0
+
+    @torch.jit.export
+    def disable_observer(self):
+        """Shorthand for ``enable_observer(False)``."""
+        self.enable_observer(False)
+
+    @classmethod
+    def with_args(cls, **kwargs):
+        """Return the result of ``_with_args(cls, **kwargs)`` with its ``__module__`` overridden."""
+        fake_quant_constructor = _with_args(cls, **kwargs)
+        # need to assign the correct module to fake_quantize
+        # constructors to satisfy public v private requirements
+        fake_quant_constructor.__module__ = "torch.ao.quantization.fake_quantize"
+        return fake_quant_constructor
+
+
+class FakeQuantize(FakeQuantizeBase):
+    r"""Simulate the quantize and dequantize operations in training time.
+
+    The output of this module is given by::
+
+        x_out = (
+            clamp(round(x / scale + zero_point), quant_min, quant_max) - zero_point
+        ) * scale
+
+    * :attr:`is_dynamic` indicates whether the fake quantize is a placeholder for dynamic quantization
+      operators (choose_qparams -> q -> dq) or static quantization operators (q -> dq)
+
+    * :attr:`scale` defines the scale factor used for quantization.
+
+    * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps to
+
+    * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors, note that
+      statistics can still be updated.
+
+    * :attr:`observer_enabled` controls statistics collection on tensors
+
+    * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization,
+        allowable values are torch.qint8 and torch.quint8.
+
+    Args:
+
+        observer (module): Module for observing statistics on input tensors and calculating scale
+          and zero-point.
+        observer_kwargs (optional): Arguments for the observer module
+
+    Attributes:
+        activation_post_process (Module): User provided module that collects statistics on the input tensor and
+          provides a method to calculate scale and zero-point.
+
+    """
+
+    # Buffers registered in __init__ and refreshed from the observer in forward().
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(
+        self,
+        observer=MovingAverageMinMaxObserver,
+        quant_min=None,
+        quant_max=None,
+        is_dynamic=False,
+        **observer_kwargs,
+    ):
+        """Construct the observer and register the ``scale`` / ``zero_point`` buffers.
+
+        Args:
+            observer: callable invoked as ``observer(**observer_kwargs)`` to create
+                ``activation_post_process``.
+            quant_min: optional lower bound of the quantized range.
+            quant_max: optional upper bound of the quantized range. The bounds are
+                validated and forwarded to the observer only when both are given.
+            is_dynamic: always forwarded to the observer as ``is_dynamic``.
+            **observer_kwargs: extra keyword arguments forwarded to ``observer``.
+
+        Raises:
+            AssertionError: if ``quant_min > quant_max``, if either bound falls
+                outside the range of the resolved dtype, or if the observer's
+                qscheme is neither per tensor nor per channel.
+        """
+        super().__init__()
+        # Populate quant_min/quant_max to observer_kwargs if valid
+        if quant_min is not None and quant_max is not None:
+            if quant_min > quant_max:
+                raise AssertionError(
+                    "quant_min must be less than or equal to quant_max"
+                )
+            # dtype used for the bounds check: explicit kwarg first, quint8 otherwise.
+            dtype = observer_kwargs.get("dtype", torch.quint8)
+            if hasattr(observer, "p"):
+                # In case observer is _PartialWrapper, dtype can be stored in
+                # observer.p.keywords["dtype"]
+                dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get(
+                    "dtype", dtype
+                )
+            # pyrefly: ignore [bad-argument-type]
+            if torch.iinfo(dtype).min > quant_min:
+                raise AssertionError("quant_min out of bound")
+            # pyrefly: ignore [bad-argument-type]
+            if quant_max > torch.iinfo(dtype).max:
+                raise AssertionError("quant_max out of bound")
+            observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max})
+        observer_kwargs["is_dynamic"] = is_dynamic
+        self.activation_post_process = observer(**observer_kwargs)
+        # TODO: keeping self.quant_min/max for BC; remove after a couple releases
+        # Users should use self.activation_post_process.quant_min
+        self.quant_min = self.activation_post_process.quant_min
+        self.quant_max = self.activation_post_process.quant_max
+        self.is_dynamic = self.activation_post_process.is_dynamic
+        # Float-qparams schemes keep a floating point zero point; all others use int.
+        if _is_float_qparams(self.activation_post_process.qscheme):
+            zero_point_dtype = torch.float
+        else:
+            zero_point_dtype = torch.int
+        self.register_buffer("scale", torch.tensor([1.0], dtype=torch.float))
+        self.register_buffer("zero_point", torch.tensor([0], dtype=zero_point_dtype))
+        self.dtype = self.activation_post_process.dtype
+        self.qscheme = self.activation_post_process.qscheme
+        # Observers without a channel axis are given the placeholder value -1.
+        self.ch_axis = (
+            self.activation_post_process.ch_axis
+            if hasattr(self.activation_post_process, "ch_axis")
+            else -1
+        )
+        if not (_is_per_channel(self.qscheme) or _is_per_tensor(self.qscheme)):
+            raise AssertionError(
+                "Only per channel and per tensor quantization are supported in fake quantize"
+                + " got qscheme: "
+                + str(self.qscheme)
+            )
+        self.is_per_channel = _is_per_channel(self.qscheme)
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        """Return whatever the wrapped observer's ``calculate_qparams()`` returns."""
+        return self.activation_post_process.calculate_qparams()
+
+    def forward(self, X):
+        """Update the qparams from the observer if enabled, then fake quantize ``X``.
+
+        When ``observer_enabled`` is set, the observer is run on ``X.detach()`` and
+        the recomputed scale / zero point are copied into the buffers (resizing
+        them first if the scale shape changed). When ``fake_quant_enabled`` is set,
+        ``X`` is replaced by its per-channel or per-tensor fake-quantized version;
+        otherwise it is returned unchanged.
+        """
+        if self.observer_enabled[0] == 1:
+            self.activation_post_process(X.detach())
+            _scale, _zero_point = self.calculate_qparams()
+            _scale, _zero_point = (
+                _scale.to(self.scale.device),
+                _zero_point.to(self.zero_point.device),
+            )
+            # Both buffers are resized together, keyed on the scale shape only.
+            if self.scale.shape != _scale.shape:
+                self.scale.resize_(_scale.shape)
+                self.zero_point.resize_(_zero_point.shape)
+            self.scale.copy_(_scale)
+            self.zero_point.copy_(_zero_point)
+
+        if self.fake_quant_enabled[0] == 1:
+            if self.is_per_channel:
+                X = torch.fake_quantize_per_channel_affine(
+                    X,
+                    self.scale,
+                    self.zero_point,
+                    self.ch_axis,
+                    self.activation_post_process.quant_min,
+                    self.activation_post_process.quant_max,
+                )
+            else:
+                X = torch.fake_quantize_per_tensor_affine(
+                    X,
+                    self.scale,
+                    self.zero_point,
+                    self.activation_post_process.quant_min,
+                    self.activation_post_process.quant_max,
+                )
+        return X
+
+    @torch.jit.export
+    def extra_repr(self):
+        """Return a string listing the enable flags, range, dtype, qscheme, axis and qparams."""
+        return (
+            f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, "
+            f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, "
+            f"dtype={self.dtype}, qscheme={self.qscheme}, ch_axis={self.ch_axis}, "
+            f"scale={self.scale}, zero_point={self.zero_point}"
+        )
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        """Save the module state, then write ``scale`` and ``zero_point`` explicitly."""
+        # We cannot currently register scalar values as buffers, so need to manually
+        # specify serialization here.
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "scale"] = self.scale
+        destination[prefix + "zero_point"] = self.zero_point
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """Resize ``scale`` / ``zero_point`` to the saved shapes, then defer to the parent loader.
+
+        Keys missing from ``state_dict`` are appended to ``missing_keys`` when
+        ``strict`` is true.
+        """
+        # Removing this function throws an error that the size of the loaded tensor does not match the original size
+        # i.e., These buffers start out with numel 0 and become numel 1 once they have their first forward pass.
+        # NOTE(review): __init__ in this class registers both buffers with one
+        # element, so the "numel 0" wording above may be stale - confirm.
+        local_state = ["scale", "zero_point"]
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                # Custom handling to allow loading scale and zero_point
+                # of size N into uninitialized buffers of size 0. The
+                # buffers are resized here, and the values are copied in
+                # the default state_dict loading code of the parent.
+                if name == "scale":
+                    self.scale.resize_(val.shape)
+                else:
+                    if name != "zero_point":
+                        raise AssertionError(
+                            "Expected 'zero_point' but got different state key"
+                        )
+                    self.zero_point.resize_(val.shape)
+                # For torchscript module we need to update the attributes here since we do not
+                # call the `_load_from_state_dict` function defined module.py
+                if torch.jit.is_scripting():
+                    if name == "scale":
+                        self.scale.copy_(val)
+                    else:
+                        if name != "zero_point":
+                            raise AssertionError(
+                                "Expected 'zero_point' but got different state key"
+                            )
+                        self.zero_point.copy_(val)
+            elif strict:
+                missing_keys.append(key)
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class FixedQParamsFakeQuantize(FakeQuantize):
+    """Simulate quantize and dequantize in training time.
+
+    Simulate quantize and dequantize with fixed quantization
+    parameters in training time. Only per tensor quantization
+    is supported.
+    """
+
+    # TODO: rename observer to observer_ctr
+    def __init__(self, observer):
+        """Build the module from ``observer`` and adopt the observer's fixed qparams.
+
+        Args:
+            observer: callable passed to ``FakeQuantize.__init__``; the object it
+                produces must be exactly a ``FixedQParamsObserver``.
+
+        Raises:
+            AssertionError: if the constructed observer is not exactly a
+                ``FixedQParamsObserver`` or its qscheme is not per tensor.
+        """
+        super().__init__(observer=observer)
+        # Exact type comparison: subclasses of FixedQParamsObserver are rejected too.
+        if type(self.activation_post_process) is not FixedQParamsObserver:
+            raise AssertionError(
+                f"{self.__class__.__name__}'s observer must be a {FixedQParamsObserver.__name__}"
+            )
+        self._observer_ctr = observer
+        # Take scale and zero_point directly from the observer instance.
+        self.scale = self.activation_post_process.scale
+        self.zero_point = self.activation_post_process.zero_point
+        if not _is_per_tensor(self.qscheme):
+            raise AssertionError(
+                "Only per tensor quantization is supported"
+                + " FixedQParamsFakeQuantize module, got qscheme:"
+                + str(self.qscheme)
+            )
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        """Return the stored ``(scale, zero_point)`` without consulting the observer."""
+        return self.scale, self.zero_point
+
+    @torch.jit.export
+    def extra_repr(self):
+        """Define a string representation of the object's attributes."""
+        return (
+            f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, "
+            f"scale={self.scale}, zero_point={self.zero_point}, "
+            f"dtype={self.dtype}, quant_min={self.activation_post_process.quant_min}, "
+            f"quant_max={self.activation_post_process.quant_max}, qscheme={self.qscheme}"
+        )
+
+
+class FusedMovingAvgObsFakeQuantize(FakeQuantize):
+    r"""Define a fused module to observe the tensor.
+
+    Fused module that is used to observe the input tensor (compute min/max), compute
+    scale/zero_point and fake_quantize the tensor.
+    This module uses a calculation similar to MovingAverageMinMaxObserver for the inputs,
+    to compute the min/max values in order to compute the scale/zero_point.
+    The qscheme input in the observer is used to differentiate between symmetric/affine
+    quantization scheme.
+
+    The output of this module is given by
+    x_out = (clamp(round(x/scale + zero_point), quant_min, quant_max)-zero_point)*scale
+
+    Similar to :class:`~torch.ao.quantization.FakeQuantize`, and accepts the same attributes as the
+    base class.
+
+    """
+
+    def __init__(
+        self,
+        observer: Any = MovingAverageMinMaxObserver,
+        quant_min: int = 0,
+        quant_max: int = 255,
+        **observer_kwargs: Any,
+    ) -> None:
+        """Build the module and check that the observer is a moving-average min/max observer.
+
+        Args:
+            observer: callable forwarded to ``FakeQuantize.__init__``.
+            quant_min: lower bound of the quantized range.
+            quant_max: upper bound of the quantized range.
+            **observer_kwargs: extra keyword arguments forwarded to the observer.
+
+        Raises:
+            AssertionError: if the constructed observer is neither a
+                ``MovingAverageMinMaxObserver`` nor a
+                ``MovingAveragePerChannelMinMaxObserver``.
+        """
+        super().__init__(observer, quant_min, quant_max, **observer_kwargs)
+        if not isinstance(
+            self.activation_post_process,
+            (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver),
+        ):
+            raise AssertionError(
+                "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver"
+            )
+        # Re-register the enable flags as int64 tensors under the same buffer names.
+        # NOTE(review): presumably the fused op called in forward() expects long
+        # tensors for these flags - confirm.
+        self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long))
+        self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long))
+        self.is_symmetric_quant = _is_symmetric_quant(
+            self.activation_post_process.qscheme
+        )
+
+    @torch.jit.export
+    def calculate_qparams(self) -> tuple[torch.Tensor, torch.Tensor]:  # type: ignore[override]
+        """Return whatever the wrapped observer's ``calculate_qparams()`` returns."""
+        return self.activation_post_process.calculate_qparams()
+
+    @torch.jit.export
+    def extra_repr(self) -> str:
+        """Return a string listing the enable flags, qparams, dtype, range, qscheme and reduce_range."""
+        return (
+            f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, "
+            f"scale={self.scale}, zero_point={self.zero_point}, dtype={self.dtype}, "
+            f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, "
+            f"qscheme={self.qscheme}, reduce_range={self.activation_post_process.reduce_range}"
+        )
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """Hand ``X`` and the module's state to ``torch.fused_moving_avg_obs_fake_quant``.
+
+        The enable flags, the observer's ``min_val`` / ``max_val``, and the
+        ``scale`` / ``zero_point`` buffers are passed as tensors, so the single
+        fused call covers both observation and fake quantization.
+        """
+        return torch.fused_moving_avg_obs_fake_quant(
+            X,
+            self.observer_enabled,
+            self.fake_quant_enabled,
+            self.activation_post_process.min_val,
+            self.activation_post_process.max_val,
+            self.scale,
+            self.zero_point,
+            self.activation_post_process.averaging_constant,
+            self.activation_post_process.quant_min,
+            self.activation_post_process.quant_max,
+            self.ch_axis,
+            self.is_per_channel,
+            self.is_symmetric_quant,
+        )
+
+
+default_fake_quant = FakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=0,
+    quant_max=255,
+    dtype=torch.quint8,
+    qscheme=torch.per_tensor_affine,
+    reduce_range=True,
+)
+"""
+Default fake_quant for activations.
+"""
+
+default_weight_fake_quant = FakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=-128,
+    quant_max=127,
+    dtype=torch.qint8,
+    qscheme=torch.per_tensor_symmetric,
+    reduce_range=False,
+)
+"""
+Default fake_quant for weights.
+Note: averaging_constant is not overridden here, so the observer's own default applies.
+"""
+
+default_dynamic_fake_quant = FakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=0,
+    quant_max=255,
+    is_dynamic=True,
+    dtype=torch.quint8,
+    averaging_constant=1,
+)
+"""
+Default dynamic fake_quant for activations.
+"""
+
+default_fixed_qparams_range_neg1to1_fake_quant = FixedQParamsFakeQuantize.with_args(
+    observer=default_fixed_qparams_range_neg1to1_observer
+)
+default_fixed_qparams_range_0to1_fake_quant = FixedQParamsFakeQuantize.with_args(
+    observer=default_fixed_qparams_range_0to1_observer
+)
+# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases
+default_symmetric_fixed_qparams_fake_quant = (
+    default_fixed_qparams_range_neg1to1_fake_quant
+)
+default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant
+
+default_per_channel_weight_fake_quant = FakeQuantize.with_args(
+    observer=MovingAveragePerChannelMinMaxObserver,
+    quant_min=-128,
+    quant_max=127,
+    dtype=torch.qint8,
+    qscheme=torch.per_channel_symmetric,
+    reduce_range=False,
+    ch_axis=0,
+)
+"""
+Default fake_quant for per-channel weights.
+Note: averaging_constant is not overridden here, so the observer's own default applies.
+"""
+default_embedding_fake_quant = FakeQuantize.with_args(
+    observer=MovingAveragePerChannelMinMaxObserver,
+    qscheme=torch.per_channel_affine_float_qparams,
+    dtype=torch.quint8,
+    quant_min=0,
+    quant_max=255,
+    ch_axis=0,
+    averaging_constant=1,
+)
+"""
+Default fake_quant for embeddings.
+Observer is memoryless since averaging_constant is 1.
+"""
+
+default_embedding_fake_quant_4bit = FakeQuantize.with_args(
+    observer=MovingAveragePerChannelMinMaxObserver,
+    qscheme=torch.per_channel_affine_float_qparams,
+    ch_axis=0,
+    dtype=torch.quint4x2,
+    averaging_constant=1,
+)
+
+default_histogram_fake_quant = FakeQuantize.with_args(
+    observer=HistogramObserver,
+    quant_min=0,
+    quant_max=255,
+    dtype=torch.quint8,
+    qscheme=torch.per_tensor_affine,
+    reduce_range=True,
+)
+"""
+Fake_quant for activations using a histogram.
+"""
+
+
+default_fused_act_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=0,
+    quant_max=255,
+    dtype=torch.quint8,
+)
+
+"""
+Fused version of `default_fake_quant`, with improved performance.
+"""
+
+
+default_fused_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=-128,
+    quant_max=127,
+    dtype=torch.qint8,
+    qscheme=torch.per_tensor_symmetric,
+)
+"""
+Fused version of `default_weight_fake_quant`, with improved performance.
+"""
+
+default_fused_per_channel_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(
+    observer=MovingAveragePerChannelMinMaxObserver,
+    quant_min=-128,
+    quant_max=127,
+    dtype=torch.qint8,
+    qscheme=torch.per_channel_symmetric,
+)
+"""
+Fused version of `default_per_channel_weight_fake_quant`, with improved performance.
+"""
+
+fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver,
+    quant_min=-127,
+    quant_max=127,
+    dtype=torch.qint8,
+    qscheme=torch.per_tensor_symmetric,
+    eps=2**-12,
+)
+"""
+Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+fused_per_channel_wt_fake_quant_range_neg_127_to_127 = (
+    FusedMovingAvgObsFakeQuantize.with_args(
+        observer=MovingAveragePerChannelMinMaxObserver,
+        quant_min=-127,
+        quant_max=127,
+        dtype=torch.qint8,
+        qscheme=torch.per_channel_symmetric,
+        eps=2**-12,
+    )
+)
+
+"""
+Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+
+def _is_fake_quant_script_module(mod):
+    """Return true if given mod is an instance of FakeQuantize script module."""
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        # qualified name looks like '__torch__.torch.ao.quantization.fake_quantize.___torch_mangle_2.FakeQuantize'
+        suffix = mod._c.qualified_name.split(".", 1)[1]
+        name = re.sub(r"\.___torch_mangle_\d+", "", suffix)
+        return (
+            name == "torch.ao.quantization.fake_quantize.FakeQuantize"
+            or name
+            == "torch.ao.quantization.fake_quantize.FusedMovingAvgObsFakeQuantize"
+        )
+    return False
+
+
+def disable_fake_quant(mod):
+    """Disable fake quantization for the module.
+
+    Disable fake quantization for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.disable_fake_quant)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.disable_fake_quant()
+
+
+def enable_fake_quant(mod):
+    """Enable fake quantization for the module.
+
+    Enable fake quantization for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.enable_fake_quant)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.enable_fake_quant()
+
+
+def disable_observer(mod):
+    """Disable observation for this module.
+
+    Disable observation for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.disable_observer)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.disable_observer()
+
+
+def enable_observer(mod):
+    """Enable observation for this module.
+
+    Enable observation for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.enable_observer)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.enable_observer()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f664c699144917d3314eee7bdf5dd92f9697108
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuse_modules.py
@@ -0,0 +1,215 @@
+# mypy: allow-untyped-defs
+import copy
+
+import torch.nn as nn
+
+# for backward compatibility
+from torch.ao.quantization.fuser_method_mappings import (  # noqa: F401  # noqa: F401
+    fuse_conv_bn,
+    fuse_conv_bn_relu,
+    get_fuser_method,
+)
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+
+__all__ = [
+    "fuse_known_modules",
+    "fuse_modules",
+    "fuse_modules_qat",
+]
+
+
+# Generalization of getattr: resolves a dotted key such as "a.b.c" one attribute at a time
+def _get_module(model, submodule_key):
+    tokens = submodule_key.split(".")
+    cur_mod = model
+    for s in tokens:
+        cur_mod = getattr(cur_mod, s)
+    return cur_mod
+
+
+# Generalization of setattr: walks a dotted key to the parent object, then sets the last name
+def _set_module(model, submodule_key, module):
+    tokens = submodule_key.split(".")
+    sub_tokens = tokens[:-1]
+    cur_mod = model
+    for s in sub_tokens:
+        cur_mod = getattr(cur_mod, s)
+
+    setattr(cur_mod, tokens[-1], module)
+
+
+def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None):
+    r"""Return a list of known fuse modules.
+
+    Returns a list of modules that fuse the operations specified
+    in the input module list.
+
+    Fuses sequences that have a registered fuser method, for example:
+    conv, bn
+    conv, bn, relu
+    conv, relu
+    linear, bn
+    linear, relu
+    For these sequences, the first element in the output module list performs
+    the fused operation. The rest of the elements are set to nn.Identity()
+    """
+    types = tuple(type_before_parametrizations(m) for m in mod_list)
+    fuser_method = get_fuser_method(types, additional_fuser_method_mapping)
+    if fuser_method is None:
+        raise NotImplementedError(f"Cannot fuse modules: {types}")
+    new_mod: list[nn.Module | None] = [None] * len(mod_list)
+    fused = fuser_method(is_qat, *mod_list)
+    # NOTE: forward hooks not processed in the two following for loops will be lost after the fusion
+    # Move pre forward hooks of the base module to resulting fused module
+    for pre_hook_fn in mod_list[0]._forward_pre_hooks.values():
+        fused.register_forward_pre_hook(pre_hook_fn)
+    mod_list[0]._forward_pre_hooks.clear()
+    # Move post forward hooks of the last module to resulting fused module
+    for hook_fn in mod_list[-1]._forward_hooks.values():
+        fused.register_forward_hook(hook_fn)
+    mod_list[-1]._forward_hooks.clear()
+    new_mod[0] = fused
+
+    for i in range(1, len(mod_list)):
+        identity = nn.Identity()
+        identity.training = mod_list[0].training
+        new_mod[i] = identity
+
+    return new_mod
+
+
+def _fuse_modules_helper(
+    model,
+    modules_to_fuse,
+    is_qat,
+    fuser_func=fuse_known_modules,
+    fuse_custom_config_dict=None,
+):
+    if fuse_custom_config_dict is None:
+        fuse_custom_config_dict = {}
+    additional_fuser_method_mapping = fuse_custom_config_dict.get(
+        "additional_fuser_method_mapping", {}
+    )
+    mod_list = [_get_module(model, item) for item in modules_to_fuse]
+
+    # Fuse list of modules; the result is indexed in parallel with modules_to_fuse
+    new_mod_list = fuser_func(mod_list, is_qat, additional_fuser_method_mapping)
+
+    # Replace each original module, looked up by name, with its entry in the fused list
+    for i, item in enumerate(modules_to_fuse):
+        _set_module(model, item, new_mod_list[i])
+
+
+def _fuse_modules(
+    model,
+    modules_to_fuse,
+    is_qat,
+    inplace=False,
+    fuser_func=fuse_known_modules,
+    fuse_custom_config_dict=None,
+):
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    if all(isinstance(module_element, str) for module_element in modules_to_fuse):
+        # modules_to_fuse is a single flat list of module names (every element is a str)
+        _fuse_modules_helper(
+            model, modules_to_fuse, is_qat, fuser_func, fuse_custom_config_dict
+        )
+    else:
+        # modules_to_fuse is a list of lists: fuse each inner list in turn
+        for module_list in modules_to_fuse:
+            _fuse_modules_helper(
+                model, module_list, is_qat, fuser_func, fuse_custom_config_dict
+            )
+    return model
+
+
+def fuse_modules(
+    model,
+    modules_to_fuse,
+    inplace=False,
+    fuser_func=fuse_known_modules,
+    fuse_custom_config_dict=None,
+):
+    r"""Fuse a list of modules into a single module.
+
+    Fuses only the following sequence of modules:
+    conv, bn
+    conv, bn, relu
+    conv, relu
+    linear, relu
+    bn, relu
+    With the default fuser_func, any other sequence raises an error.
+    For these sequences, replaces the first item in the list
+    with the fused module, replacing the rest of the modules
+    with identity.
+
+    Args:
+        model: Model containing the modules to be fused
+        modules_to_fuse: list of list of module names to fuse. Can also be a list
+                         of strings if there is only a single list of modules to fuse.
+        inplace: bool specifying if fusion happens in place on the model, by default
+                 a new model is returned
+        fuser_func: Function called as fuser_func(mod_list, is_qat, additional_fuser_method_mapping)
+                    that returns a list of fused modules of the same length. For example,
+                    for [convModule, BNModule] it returns the list [ConvBNModule, nn.Identity()]
+                    Defaults to torch.ao.quantization.fuse_known_modules
+        fuse_custom_config_dict: custom configuration for fusion
+
+    .. code-block:: python
+
+       # Example of fuse_custom_config_dict
+       fuse_custom_config_dict = {
+           # Additional fuser_method mapping
+           "additional_fuser_method_mapping": {
+               (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn
+           },
+       }
+
+    Returns:
+        model with fused modules. A new copy is created unless inplace=True.
+
+    Examples::
+
+            >>> # xdoctest: +SKIP
+            >>> m = M().eval()
+            >>> # m is a module containing the sub-modules below
+            >>> modules_to_fuse = [ ['conv1', 'bn1', 'relu1'], ['submodule.conv', 'submodule.relu']]
+            >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse)
+            >>> output = fused_m(input)
+
+            >>> m = M().eval()
+            >>> # Alternately provide a single list of modules to fuse
+            >>> modules_to_fuse = ['conv1', 'bn1', 'relu1']
+            >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse)
+            >>> output = fused_m(input)
+
+    """
+    return _fuse_modules(
+        model,
+        modules_to_fuse,
+        is_qat=False,
+        inplace=inplace,
+        fuser_func=fuser_func,
+        fuse_custom_config_dict=fuse_custom_config_dict,
+    )
+
+
+def fuse_modules_qat(
+    model,
+    modules_to_fuse,
+    inplace=False,
+    fuser_func=fuse_known_modules,
+    fuse_custom_config_dict=None,
+):
+    """QAT version of `fuse_modules`: same arguments, but fuses with ``is_qat=True``."""
+    return _fuse_modules(
+        model,
+        modules_to_fuse,
+        is_qat=True,
+        inplace=inplace,
+        fuser_func=fuser_func,
+        fuse_custom_config_dict=fuse_custom_config_dict,
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..d72a3579438bc3e5e2687982ab4b550c680d2110
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/fuser_method_mappings.py
@@ -0,0 +1,314 @@
+# mypy: allow-untyped-defs
+import itertools
+from collections.abc import Callable
+from typing import Any
+
+import torch.ao.nn.intrinsic as nni
+import torch.nn as nn
+from torch.ao.quantization.utils import get_combined_dict, MatchAllNode, Pattern
+
+
+__all__ = [
+    "fuse_conv_bn",
+    "fuse_conv_bn_relu",
+    "fuse_linear_bn",
+    "fuse_convtranspose_bn",
+    "get_fuser_method",
+    "get_fuser_method_new",
+]
+
+
+def fuse_conv_bn(is_qat, conv, bn):
+    r"""Return the fused conv and bn modules.
+    Given the conv and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+        or post training quantization fusion
+        conv: Module instance of type conv1d/conv2d/conv3d
+        bn: Spatial BN instance that needs to be fused with the conv
+
+    Examples::
+
+        >>> m1 = nn.Conv2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_conv_bn(True, m1, b1)  # is_qat=True
+    """
+    if conv.training != bn.training:
+        raise AssertionError(
+            "Conv and BN both must be in the same mode (train or eval)."
+        )
+
+    fused_module_class_map = {
+        nn.Conv1d: nni.ConvBn1d,
+        nn.Conv2d: nni.ConvBn2d,
+        nn.Conv3d: nni.ConvBn3d,
+    }
+
+    if is_qat:
+        if bn.num_features != conv.out_channels:
+            raise AssertionError(
+                "Output channel of Conv2d must match num_features of BatchNorm2d."
+            )
+        if not bn.affine:
+            raise AssertionError(
+                "Only support fusing BatchNorm2d with affine set to True"
+            )
+        if not bn.track_running_stats:
+            raise AssertionError(
+                "Only support fusing BatchNorm2d with tracking_running_stats set to True"
+            )
+        fused_module_class = fused_module_class_map.get((type(conv)), None)
+        if fused_module_class is not None:
+            return fused_module_class(conv, bn)
+        else:
+            raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn)}")
+    else:
+        return nn.utils.fuse_conv_bn_eval(conv, bn)
+
+
+def fuse_conv_bn_relu(is_qat, conv, bn, relu):
+    r"""Return the fused conv, bn and relu modules.
+
+    Given the conv, bn and relu modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: True for QAT fusion, False for post training quantization fusion
+        conv: Module instance of type conv1d/conv2d/conv3d
+        bn: Spatial BN instance that needs to be fused with the conv
+        relu: ReLU instance passed to the fused module after the conv (and bn)
+
+    Examples::
+
+        >>> m1 = nn.Conv2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> r1 = nn.ReLU(inplace=False)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_conv_bn_relu(True, m1, b1, r1)  # is_qat=True
+    """
+    if not (conv.training == bn.training == relu.training):
+        raise AssertionError(
+            "Conv and BN both must be in the same mode (train or eval)."
+        )
+    fused_module: type[nn.Sequential] | None = None
+    if is_qat:
+        map_to_fused_module_train = {
+            nn.Conv1d: nni.ConvBnReLU1d,
+            nn.Conv2d: nni.ConvBnReLU2d,
+            nn.Conv3d: nni.ConvBnReLU3d,
+        }
+        if bn.num_features != conv.out_channels:
+            raise AssertionError(
+                "Output channel of Conv2d must match num_features of BatchNorm2d"
+            )
+        if not bn.affine:
+            raise AssertionError(
+                "Only support fusing BatchNorm2d with affine set to True"
+            )
+        if not bn.track_running_stats:
+            raise AssertionError(
+                "Only support fusing BatchNorm2d with tracking_running_stats set to True"
+            )
+        fused_module = map_to_fused_module_train.get(type(conv), None)
+        if fused_module is not None:
+            return fused_module(conv, bn, relu)
+        else:
+            raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, relu)}")
+    else:
+        map_to_fused_module_eval = {
+            nn.Conv1d: nni.ConvReLU1d,
+            nn.Conv2d: nni.ConvReLU2d,
+            nn.Conv3d: nni.ConvReLU3d,
+        }
+        fused_module = map_to_fused_module_eval.get(type(conv), None)
+        if fused_module is not None:
+            fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+            return fused_module(fused_conv, relu)
+        else:
+            raise NotImplementedError(f"Cannot fuse eval modules: {(conv, bn, relu)}")
+
+
+def fuse_linear_bn(is_qat, linear, bn):
+    r"""Return the fused linear and bn modules.
+    Given the linear and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+        or post training quantization fusion
+        linear: Module instance of type Linear
+        bn: BatchNorm1d instance that needs to be fused with the linear layer
+
+    Examples::
+
+        >>> m1 = nn.Linear(20, 10)
+        >>> b1 = nn.BatchNorm1d(10)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_linear_bn(True, m1, b1)  # is_qat=True
+    """
+    if linear.training != bn.training:
+        raise AssertionError(
+            "Linear and BN both must be in the same mode (train or eval)."
+        )
+
+    if is_qat:
+        if bn.num_features != linear.out_features:
+            raise AssertionError(
+                "Output features of Linear must match num_features of BatchNorm1d"
+            )
+        if not bn.affine:
+            raise AssertionError(
+                "Only support fusing BatchNorm1d with affine set to True"
+            )
+        if not bn.track_running_stats:
+            raise AssertionError(
+                "Only support fusing BatchNorm1d with tracking_running_stats set to True"
+            )
+        return nni.LinearBn1d(linear, bn)
+    else:
+        return nn.utils.fusion.fuse_linear_bn_eval(linear, bn)
+
+
+def fuse_convtranspose_bn(is_qat, convt, bn):
+    r"""Return the fused ConvTranspose and bn modules.
+    Given ConvTranspose and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: must be False; QAT fusion is not supported here and raises
+        convt: Module instance of type ConvTransposeNd
+        bn: BatchNormNd instance to fuse with convt; its N should match convt's N
+
+    Examples::
+
+        >>> m1 = nn.ConvTranspose2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_convtranspose_bn(False, m1.eval(), b1.eval())  # is_qat=False
+    """
+    if convt.training != bn.training:
+        raise AssertionError(
+            "ConvTranspose and BN both must be in the same mode (train or eval)."
+        )
+
+    if is_qat:
+        raise Exception(  # noqa: TRY002
+            "Fusing ConvTranspose+BatchNorm not yet supported in QAT."
+        )
+    else:
+        return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True)
+
+
+def _sequential_wrapper2(sequential):
+    """Return a fuser method that wraps a two-module sequential class.
+    Given a sequential class for two modules, return a function that takes
+    is_qat, and then two modules as argument, that ignores the is_qat flag
+    and always returns the sequential that combines the two input modules
+    """
+
+    def fuser_method(is_qat, m1, m2):
+        return sequential(m1, m2)
+
+    return fuser_method
+
+
+_DEFAULT_OP_LIST_TO_FUSER_METHOD: dict[tuple, nn.Sequential | Callable] = {
+    (nn.Conv1d, nn.BatchNorm1d): fuse_conv_bn,
+    (nn.Conv1d, nn.BatchNorm1d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv2d, nn.BatchNorm2d): fuse_conv_bn,
+    (nn.Conv2d, nn.BatchNorm2d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv3d, nn.BatchNorm3d): fuse_conv_bn,
+    (nn.Conv3d, nn.BatchNorm3d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv1d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU1d),
+    (nn.Conv2d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU2d),
+    (nn.Conv3d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU3d),
+    (nn.Linear, nn.BatchNorm1d): fuse_linear_bn,
+    (nn.Linear, nn.ReLU): _sequential_wrapper2(nni.LinearReLU),
+    (nn.BatchNorm2d, nn.ReLU): _sequential_wrapper2(nni.BNReLU2d),
+    (nn.BatchNorm3d, nn.ReLU): _sequential_wrapper2(nni.BNReLU3d),
+    (nn.ConvTranspose1d, nn.BatchNorm1d): fuse_convtranspose_bn,
+    (nn.ConvTranspose2d, nn.BatchNorm2d): fuse_convtranspose_bn,
+    (nn.ConvTranspose3d, nn.BatchNorm3d): fuse_convtranspose_bn,
+}
+
+
+def get_fuser_method(op_list, additional_fuser_method_mapping=None):
+    """Get fuser method for the given list of module types.
+
+    Looks up ``op_list`` in the default mapping combined with
+    ``additional_fuser_method_mapping``; raises AssertionError if no entry exists.
+    """
+    if additional_fuser_method_mapping is None:
+        additional_fuser_method_mapping = {}
+    all_mappings = get_combined_dict(
+        _DEFAULT_OP_LIST_TO_FUSER_METHOD, additional_fuser_method_mapping
+    )
+    fuser_method = all_mappings.get(op_list, None)
+    if fuser_method is None:
+        raise AssertionError(f"did not find fuser method for: {op_list} ")
+    return fuser_method
+
+
+def _reverse2(f):  # wrap f so that its two arguments after is_qat are swapped
+    def reversed(is_qat, x, y):
+        return f(is_qat, y, x)
+
+    return reversed
+
+
+def _reverse3(f):  # wrap f so that (x, (y, z)) is passed on as f(is_qat, z, y, x)
+    def reversed(is_qat, x, w):
+        y, z = w  # w must unpack into exactly two items
+        return f(is_qat, z, y, x)
+
+    return reversed
+
+
+def _get_valid_patterns(op_pattern):
+    """Return a list of valid patterns generated from the op_pattern.
+
+    Returns a list of valid patterns generated from the op_pattern,
+    since MatchAllNode can match all types of nodes,
+    e.g. pattern (torch.nn.Conv2d, torch.add) should also be able to match keys like
+    (MatchAllNode, torch.add) and (torch.nn.Conv2d, MatchAllNode)
+
+    Example Input:
+    (torch.add, (torch.nn.ReLU, torch.nn.Conv2d))
+
+    Example Output:
+    [(torch.add, (torch.nn.ReLU, torch.nn.Conv2d)),
+     (torch.add, (torch.nn.ReLU, MatchAllNode)),
+     (torch.add, (MatchAllNode, torch.nn.Conv2d)),
+     (torch.add, (MatchAllNode, MatchAllNode)),
+     (MatchAllNode, (torch.nn.ReLU, torch.nn.Conv2d)),
+     (MatchAllNode, (torch.nn.ReLU, MatchAllNode)),
+     (MatchAllNode, (MatchAllNode, torch.nn.Conv2d)),
+     (MatchAllNode, (MatchAllNode, MatchAllNode)),
+    ]
+    """
+    result: list[Any]
+    if isinstance(op_pattern, (tuple, list)):
+        sub_combs = [_get_valid_patterns(sub_pattern) for sub_pattern in op_pattern]
+        result = list(itertools.product(*sub_combs))  # one tuple per combination
+    else:
+        result = [op_pattern, MatchAllNode]  # leaf: the op itself, then the wildcard
+    return result
+
+
+def get_fuser_method_new(
+    op_pattern: Pattern,
+    fuser_method_mapping: dict[Pattern, nn.Sequential | Callable],
+):
+    """Get the fuser method for op_pattern, also trying its MatchAllNode variants.
+
+    This will be made default after we deprecate the get_fuser_method
+    Raises AssertionError naming the requested op_pattern when nothing matches.
+    """
+    op_patterns = _get_valid_patterns(op_pattern)
+    fuser_method = None
+    for candidate in op_patterns:  # separate name: keep op_pattern for the error below
+        fuser_method = fuser_method_mapping.get(candidate)
+        if fuser_method is not None:
+            break
+    if fuser_method is None:
+        raise AssertionError(f"did not find fuser method for: {op_pattern} ")
+    return fuser_method
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py
new file mode 100644
index 0000000000000000000000000000000000000000..abb81c2a54d0091e16ff7cbbf6ef6bb2112485de
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/observer.py
@@ -0,0 +1,2155 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+# temporarily skip RUF for this file for now, we can re-enable
+# after move the affine quantization related things to torchao
+# noqa: RUF
+"""
+This module implements observers which are used to collect statistics about
+the values observed during calibration (PTQ) or training (QAT).
+"""
+
+import operator
+import re
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+from functools import partial
+from typing import Any
+
+import torch
+import torch.nn as nn
+from torch.ao.quantization.utils import (
+    calculate_qmin_qmax,
+    check_min_max_valid,
+    is_per_channel,
+    is_per_tensor,
+    validate_qmin_qmax,
+)
+from torch.fx import Node
+
+
+__all__ = [
+    "default_affine_fixed_qparams_observer",
+    "default_debug_observer",
+    "default_dynamic_quant_observer",
+    "default_fixed_qparams_range_0to1_observer",
+    "default_fixed_qparams_range_neg1to1_observer",
+    "default_float_qparams_observer",
+    "default_float_qparams_observer_4bit",
+    "default_histogram_observer",
+    "default_observer",
+    "default_per_channel_weight_observer",
+    "default_placeholder_observer",
+    "default_reuse_input_observer",
+    "default_symmetric_fixed_qparams_observer",
+    "default_weight_observer",
+    "get_observer_state_dict",
+    "load_observer_state_dict",
+    "per_channel_weight_observer_range_neg_127_to_127",
+    "weight_observer_range_neg_127_to_127",
+    "FixedQParamsObserver",
+    "HistogramObserver",
+    "MinMaxObserver",
+    "MovingAverageMinMaxObserver",
+    "MovingAveragePerChannelMinMaxObserver",
+    "NoopObserver",
+    "ObserverBase",
+    "PerChannelMinMaxObserver",
+    "PlaceholderObserver",
+    "RecordingObserver",
+    "ReuseInputObserver",
+    "UniformQuantizationObserverBase",
+    "AffineQuantizedObserverBase",
+    "Granularity",
+    "MappingType",
+    "PerAxis",
+    "PerBlock",
+    "PerGroup",
+    "PerRow",
+    "PerTensor",
+    "PerToken",
+    "TorchAODType",
+    "ZeroPointDomain",
+    "get_block_size",
+]
+
+
+class _PartialWrapper:
+    def __init__(self, p):
+        self.p = p
+        self.callable_args = {}
+
+    def __call__(self, *args, **keywords):
+        # call each factory in callable_args and pass its result on as a keyword argument
+        # skip if arg_name is already in keywords so it's possible to override it
+        for arg_name in self.callable_args:
+            if arg_name not in keywords:
+                keywords = {**keywords, arg_name: self.callable_args[arg_name]()}
+        return self.p(*args, **keywords)
+
+    def __repr__(self):
+        return self.p.__repr__() + self.callable_args.__repr__()
+
+    def with_args(self, **kwargs):
+        return _with_args(self, **kwargs)
+
+    def with_callable_args(self, **kwargs):
+        result = _PartialWrapper(p=self.p)
+        result.callable_args = {**self.callable_args, **kwargs}
+        return result
+
+
+def _with_args(cls_or_self, **kwargs):
+    r"""Wrapper that allows creation of class factories.
+
+    This can be useful when there is a need to create classes with the same
+    constructor arguments, but different instances. Can be used in conjunction with
+    _with_callable_args
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined vars")
+        >>> Foo.with_args = classmethod(_with_args)
+        >>> foo_builder = Foo.with_args(a=3, b=4).with_args(answer=42)
+        >>> foo_instance1 = foo_builder()
+        >>> foo_instance2 = foo_builder()
+        >>> id(foo_instance1) == id(foo_instance2)
+        False
+    """
+    r = _PartialWrapper(partial(cls_or_self, **kwargs))
+    return r
+
+
+def _with_callable_args(cls_or_self, **kwargs):
+    r"""Wrapper that allows creation of class factories with args that need to be
+    called at construction time.
+
+    This can be useful when there is a need to create classes with the same
+    constructor arguments, but different instances and those arguments should only
+    be calculated at construction time. Can be used in conjunction with _with_args
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined vars")
+        >>> Foo.with_callable_args = classmethod(_with_callable_args)
+        >>> Foo.with_args = classmethod(_with_args)
+        >>> foo_builder = Foo.with_callable_args(cur_time=get_time_func).with_args(name="dan")
+        >>> foo_instance1 = foo_builder()
+        >>> # wait 50
+        >>> foo_instance2 = foo_builder()
+        >>> id(foo_instance1.creation_time) == id(foo_instance2.creation_time)
+        False
+    """
+    r = _PartialWrapper(partial(cls_or_self))
+    return r.with_callable_args(**kwargs)
+
+
+ABC: Any = ABCMeta("ABC", (object,), {})  # compatible with Python 2 *and* 3:
+
+
+class ObserverBase(ABC, nn.Module):
+    r"""Base observer Module.
+    Any observer implementation should derive from this class.
+
+    Concrete observers should follow the same API. In forward, they will update
+    the statistics of the observed Tensor. And they should provide a
+    `calculate_qparams` function that computes the quantization parameters given
+    the collected statistics.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        is_dynamic: indicator for whether the observer is a placeholder for dynamic quantization
+                    or static quantization
+    """
+
+    def __init__(self, dtype, is_dynamic: bool = False):
+        super().__init__()
+        self.dtype = dtype
+        self.is_dynamic = is_dynamic
+
+    @abstractmethod
+    def forward(self, x):
+        pass
+
+    @abstractmethod
+    def calculate_qparams(self, **kwargs):
+        pass
+
+    with_args = classmethod(_with_args)
+    with_callable_args = classmethod(_with_callable_args)
+
+
+class UniformQuantizationObserverBase(ObserverBase):
+    r"""Common base for all observers using uniform quantization to calculate
+    scale and zero_point.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used.
+        reduce_range: Reduces the range of the quantized data type by 1 bit.
+                      This is sometimes required to avoid instruction overflow.
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    .. warning::
+
+        :attr:`dtype` can only take ``torch.qint8`` or ``torch.quint8``.
+               or `torch.int8` or `torch.uint8`
+
+    .. warning::
+
+        :attr:`qscheme` can only take one of the following options:
+
+        - ``torch.per_tensor_affine``
+        - ``torch.per_tensor_symmetric``
+        - ``torch.per_channel_affine``
+        - ``torch.per_channel_symmetric``
+    """
+
+    # Note: the version is shared by all observer types
+    #
+    # Version 1/None
+    #   self
+    #
+    # Version 2 (base class only, does not include child class buffers)
+    #   self
+    #   |--- eps : Tensor
+    #
+    # Version 3
+    #   for HistogramObserver only, changed the shape of uninitialized
+    #   min_val and max_val buffers from torch.Size([0]) to torch.Size([])
+    #   for PerChannelObservers, changed the name of the buffers from min_vals
+    #   to min_val and from max_vals to max_val.
+    _version = 3
+
+    eps: torch.Tensor
+
+    def __init__(
+        self,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs)
+        self.qscheme = qscheme
+        if reduce_range:
+            warnings.warn(
+                "Please use quant_min and quant_max to specify the range for observers. \
+                    reduce_range will be deprecated in a future release of PyTorch.",
+                stacklevel=2,
+            )
+        self.reduce_range = reduce_range
+        self.register_buffer("eps", torch.tensor([eps], **factory_kwargs))
+        if self.qscheme not in (
+            torch.per_tensor_affine,
+            torch.per_tensor_symmetric,
+            torch.per_channel_affine,
+            torch.per_channel_symmetric,
+            torch.per_channel_affine_float_qparams,
+        ):
+            raise AssertionError(
+                "Default Observer only works for per_tensor_affine, per_tensor_symmetric, "
+                "per_channel_affine, per_channel_symmetric and per_channel_float_qparams quantization scheme"
+            )
+
+        _ALLOWED_DTYPES = (
+            torch.qint8,
+            torch.quint8,
+            torch.quint4x2,
+            torch.qint32,
+            torch.int8,
+            torch.uint8,
+            torch.int16,
+            torch.int32,
+            torch.float8_e5m2,
+            torch.float8_e4m3fn,
+            torch.uint16,
+        )
+
+        if self.dtype not in _ALLOWED_DTYPES:
+            raise AssertionError(
+                f"Default Observer only works for {_ALLOWED_DTYPES} data type"
+            )
+        self.has_customized_qrange = (quant_min is not None) and (quant_max is not None)
+        if self.has_customized_qrange:
+            # pyrefly: ignore [bad-argument-type]
+            validate_qmin_qmax(quant_min, quant_max)
+        self.quant_min, self.quant_max = calculate_qmin_qmax(
+            # pyrefly: ignore [bad-argument-type]
+            quant_min,
+            # pyrefly: ignore [bad-argument-type]
+            quant_max,
+            self.has_customized_qrange,
+            self.dtype,
+            self.reduce_range,
+        )
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version == 1:
+            # eps was moved to a buffer in version 2
+            eps = torch.tensor([torch.finfo(torch.float32).eps])
+            state_dict[prefix + "eps"] = eps
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @torch.jit.export
+    def _validate_qmin_qmax(self, quant_min: int, quant_max: int) -> None:
+        r"""Validates that the user-specified quantization range is properly initialized
+        and within the given bound supported by the observer dtype.
+
+        To accommodate lower-bit quantization with respect to the existing torch.qint8 and
+        torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
+        in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax
+        values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
+        fake quantization. These estimates are compared against parameters learned through backpropagation.
+        The related literatures for scale and zero point via backpropagation are as follows:
+
+        Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
+        Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf
+        """
+        # The variable names are prefixed with "initial" because their values (qmin and qmax) might be adjusted
+        # based on whether quantization range is reduced and the datatype (signed/unsigned) used by the observer.
+        if not quant_min <= 0 <= quant_max:
+            raise AssertionError("Used-specified quantization range must include 0.")
+        if quant_min >= quant_max:
+            raise AssertionError(
+                "qmin must be strictly less than qmax for user-specified quantization range."
+            )
+
+    @torch.jit.export
+    def _calculate_qparams(
+        self, min_val: torch.Tensor, max_val: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        r"""Calculates the quantization parameters, given min and max
+        value tensors. Works for both per tensor and per channel cases
+
+        Args:
+            min_val: Minimum values per channel
+            max_val: Maximum values per channel
+
+        Returns:
+            scales: Scales tensor of shape (#channels,)
+            zero_points: Zero points tensor of shape (#channels,)
+
+        If ``check_min_max_valid`` rejects the inputs, a scale of ``[1.0]`` and a
+        zero point of ``[0]`` are returned instead.
+        """
+        # Functionally equivalent to 'determine_qparams' in utils.py. Observers must be torchscriptable however and qscheme
+        # as far as I can tell is not allowed to be passed as a parameter in torchscript functions. This makes refactoring observer
+        # to use this utility a massive pain and very gross. For now I'm opting just to duplicate as this code
+        # seems unlikely to change (last update over 1 year ago) and when torchscript is fully deprecated we can refactor.
+        # TODO(jakeszwe, jerryzh168)
+        if not check_min_max_valid(min_val, max_val):
+            return torch.tensor([1.0], device=min_val.device.type), torch.tensor(
+                [0], device=min_val.device.type
+            )
+
+        quant_min, quant_max = self.quant_min, self.quant_max
+        # Extend the observed range so that it always contains zero.
+        min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+        max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+
+        device = min_val_neg.device
+        scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device)
+        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
+
+        if (
+            self.qscheme == torch.per_tensor_symmetric
+            or self.qscheme == torch.per_channel_symmetric
+        ):
+            # Symmetric: the range is centred on zero, so only the larger
+            # magnitude of the two bounds matters.
+            max_val_pos = torch.max(-min_val_neg, max_val_pos)
+            scale = max_val_pos / (float(quant_max - quant_min) / 2)
+            scale = torch.max(scale, self.eps)
+            if self.dtype in [torch.quint8, torch.uint8]:
+                if self.has_customized_qrange:
+                    # When customized quantization range is used, down-rounded midpoint of the range is chosen.
+                    zero_point = zero_point.new_full(
+                        zero_point.size(), (quant_min + quant_max) // 2
+                    )
+                else:
+                    zero_point = zero_point.new_full(zero_point.size(), 128)
+            elif self.dtype == torch.uint16:
+                zero_point = zero_point.new_full(zero_point.size(), 2**15)
+        elif self.qscheme == torch.per_channel_affine_float_qparams:
+            scale = (max_val - min_val) / float(quant_max - quant_min)
+            scale = torch.where(scale > self.eps, scale, torch.ones_like(scale))
+            # We use the quantize function
+            # xq = Round(Xf * inv_scale + zero_point),
+            # setting zero_point to (-1 * min *inv_scale) we get
+            # Xq = Round((Xf - min) * inv_scale)
+            zero_point = -1 * min_val / scale
+        else:
+            scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+            scale = torch.max(scale, self.eps)
+            zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
+            zero_point = torch.clamp(zero_point, quant_min, quant_max)
+
+        # For scalar values, cast them to Tensors of size 1 to keep the shape
+        # consistent with default values in FakeQuantize.
+        if len(scale.shape) == 0:
+            # TODO: switch to scale.item() after adding JIT support
+            scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device)
+        if len(zero_point.shape) == 0:
+            # TODO: switch to zero_point.item() after adding JIT support
+            if self.qscheme == torch.per_channel_affine_float_qparams:
+                # Float qparams carry a fractional zero point; converting it
+                # through int() first would truncate it.
+                zero_point = torch.tensor(
+                    [float(zero_point)], dtype=zero_point.dtype, device=device
+                )
+            else:
+                zero_point = torch.tensor(
+                    [int(zero_point)], dtype=zero_point.dtype, device=device
+                )
+
+        return scale, zero_point
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        """Reset hook that this observer does not implement.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        message = "Cannot reset min/max values in the given observer."
+        raise NotImplementedError(message)
+
+
+# `UniformQuantizationObserverBase` was originally called `_ObserverBase`. The old
+# name is kept as an alias for backwards compatibility.
+# TODO(after v1.13): delete this
+_ObserverBase = UniformQuantizationObserverBase
+
+
+class MinMaxObserver(UniformQuantizationObserverBase):
+    r"""Observer module for computing the quantization parameters based on the
+    running min and max values.
+
+    This observer uses the tensor min/max statistics to compute the quantization
+    parameters. The module records the running minimum and maximum of incoming
+    tensors, and uses this statistic to compute the quantization parameters.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    The running minimum/maximum :math:`x_\text{min/max}` is computed as:
+
+    .. math::
+
+        \begin{array}{ll}
+        x_\text{min} &= \begin{cases}
+            \min(X) & \text{if~}x_\text{min} = \text{None} \\
+            \min\left(x_\text{min}, \min(X)\right) & \text{otherwise}
+        \end{cases}\\
+        x_\text{max} &= \begin{cases}
+            \max(X) & \text{if~}x_\text{max} = \text{None} \\
+            \max\left(x_\text{max}, \max(X)\right) & \text{otherwise}
+        \end{cases}\\
+        \end{array}
+
+    where :math:`X` is the observed tensor.
+
+    Given the running min/max :math:`x_\text{min}` and :math:`x_\text{max}`,
+    the scale :math:`s` and zero point :math:`z` are then computed as:
+
+    .. math::
+
+        \begin{aligned}
+            \text{if Symmetric:}&\\
+            &s = 2 \max(|x_\text{min}|, x_\text{max}) /
+                \left( Q_\text{max} - Q_\text{min} \right) \\
+            &z = \begin{cases}
+                0 & \text{if dtype is qint8} \\
+                128 & \text{otherwise}
+            \end{cases}\\
+            \text{Otherwise:}&\\
+                &s = \left( x_\text{max} - x_\text{min}  \right ) /
+                    \left( Q_\text{max} - Q_\text{min} \right ) \\
+                &z = Q_\text{min} - \text{round}(x_\text{min} / s)
+        \end{aligned}
+
+    where :math:`Q_\text{min}` and :math:`Q_\text{max}` are the minimum and
+    maximum of the quantized data type.
+
+    .. warning:: :attr:`dtype` can only take ``torch.qint8`` or ``torch.quint8``.
+
+    .. note:: If the running minimum equals to the running maximum, the scale
+              and zero_point are set to 1.0 and 0.
+    """
+
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        """Set up the observer and its ``min_val`` / ``max_val`` buffers.
+
+        Raises:
+            NotImplementedError: If ``qscheme`` is not a per-tensor scheme, or
+                if ``reduce_range`` is requested for symmetric ``quint8``.
+        """
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "MinMaxObserver's qscheme only support torch.per_tensor_symmetric "
+                "and torch.per_tensor_affine."
+            )
+        # TODO: MinMaxObserver by itself doesn't support dynamic quantization, but
+        # if it's inherited by MovingAverageObserver, and averaging_constant is 1, it
+        # supports dynamic quantization, we may need to better error checking here
+
+        # For x86 quantized kernels, we need to ensure that the vpmaddubsw
+        # instruction does not overflow. We allow for a reduce_range argument to
+        # observers that reduces the quantized range to (0,127) or (-64, 63).
+        # For more details see aten/src/ATen/native/quantized/cpu/qconv.cpp
+        # This is not an optimal choice for non x86 backends as it loses a bit
+        # of precision for activations.
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        # +inf / -inf mark the "nothing observed yet" state, so the first
+        # observed tensor always replaces them.
+        self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs))
+        if (
+            self.qscheme == torch.per_tensor_symmetric
+            and self.reduce_range
+            and self.dtype == torch.quint8
+        ):
+            raise NotImplementedError(
+                "Cannot reduce range for symmetric quantization for quint8"
+            )
+
+    def forward(self, x_orig):
+        r"""Records the running minimum and maximum of ``x``.
+
+        Empty tensors are ignored. The input is returned unchanged.
+        """
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val_cur, max_val_cur = torch.aminmax(x)
+        min_val = torch.min(min_val_cur, self.min_val)
+        max_val = torch.max(max_val_cur, self.max_val)
+        # Update the buffers in place so the registered tensors keep their identity.
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        r"""Calculates the quantization parameters."""
+        return self._calculate_qparams(self.min_val, self.max_val)
+
+    @torch.jit.export
+    def extra_repr(self):
+        """Return the current running min/max for the module's repr."""
+        return f"min_val={self.min_val}, max_val={self.max_val}"
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        """Resets the min/max values."""
+        self.min_val.copy_(torch.tensor(float("inf")))
+        self.max_val.copy_(torch.tensor(float("-inf")))
+
+
+class MovingAverageMinMaxObserver(MinMaxObserver):
+    r"""Observer module for computing the quantization parameters based on the
+    moving average of the min and max values.
+
+    This observer computes the quantization parameters based on the moving
+    averages of minimums and maximums of the incoming tensors. The module
+    records the average minimum and maximum of incoming tensors, and uses this
+    statistic to compute the quantization parameters.
+
+    Args:
+        averaging_constant: Averaging constant for min/max.
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    The moving average min/max is computed as follows
+
+    .. math::
+
+        \begin{array}{ll}
+                x_\text{min} = \begin{cases}
+                    \min(X) & \text{if~}x_\text{min} = \text{None} \\
+                    (1 - c) x_\text{min} + c \min(X) & \text{otherwise}
+                \end{cases}\\
+                x_\text{max} = \begin{cases}
+                    \max(X) & \text{if~}x_\text{max} = \text{None} \\
+                    (1 - c) x_\text{max} + c \max(X) & \text{otherwise}
+                \end{cases}\\
+        \end{array}
+
+    where :math:`x_\text{min/max}` is the running average min/max, :math:`X`
+    is the incoming tensor, and :math:`c` is the ``averaging_constant``.
+
+    The scale and zero point are then computed as in
+    :class:`~torch.ao.quantization.observer.MinMaxObserver`.
+
+    .. note:: Only works with per-tensor quantization schemes
+              (``torch.per_tensor_affine`` and ``torch.per_tensor_symmetric``).
+
+    .. note:: If the running minimum equals to the running maximum, the scale
+              and zero_point are set to 1.0 and 0.
+    """
+
+    def __init__(
+        self,
+        averaging_constant=0.01,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        """Validate the configuration and initialise the parent observer.
+
+        Raises:
+            NotImplementedError: If ``qscheme`` is not a per-tensor scheme, or
+                if ``is_dynamic`` is set while ``averaging_constant`` is not 1.
+        """
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "MovingAverageMinMaxObserver's qscheme only support "
+                "torch.per_tensor_symmetric and torch.per_tensor_affine. "
+                f"but got: {qscheme}"
+            )
+        if is_dynamic and averaging_constant != 1:
+            raise NotImplementedError(
+                "MovingAverageMinMaxObserver doesn't support dynamic quantization for "
+                f"averaging constant of {averaging_constant}"
+            )
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        # Assigned only after the parent initialiser has run, so no attribute
+        # is set on the module before it is fully constructed.
+        self.averaging_constant = averaging_constant
+
+    def forward(self, x_orig):
+        """Update the moving-average min/max with ``x_orig`` and return it unchanged.
+
+        Empty tensors are ignored. The first observed tensor seeds the
+        statistics directly; later ones are blended in with
+        ``averaging_constant``.
+        """
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val = self.min_val
+        max_val = self.max_val
+        # +inf / -inf are the initial buffer values, i.e. nothing observed yet.
+        if min_val == float("inf") and max_val == float("-inf"):
+            min_val, max_val = torch.aminmax(x)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(x)
+            min_val = min_val + self.averaging_constant * (min_val_cur - min_val)
+            max_val = max_val + self.averaging_constant * (max_val_cur - max_val)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+
+class PerChannelMinMaxObserver(UniformQuantizationObserverBase):
+    r"""Observer module for computing the quantization parameters based on the
+    running per channel min and max values.
+
+    This observer uses the tensor min/max statistics to compute the per channel
+    quantization parameters. The module records the running minimum and maximum
+    of incoming tensors, and uses this statistic to compute the quantization
+    parameters.
+
+    Args:
+        ch_axis: Channel axis
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    The quantization parameters are computed the same way as in
+    :class:`~torch.ao.quantization.observer.MinMaxObserver`, with the difference
+    that the running min/max values are stored per channel.
+    Scales and zero points are thus computed per channel as well.
+
+    .. note:: If the running minimum equals to the running maximum, the scales
+              and zero_points are set to 1.0 and 0.
+    """
+
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        ch_axis=0,
+        dtype=torch.quint8,
+        qscheme=torch.per_channel_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        """Set up the observer with empty per-channel ``min_val`` / ``max_val`` buffers.
+
+        Raises:
+            NotImplementedError: If ``qscheme`` is not a per-channel scheme, if
+                ``is_dynamic`` is set, or if ``reduce_range`` is requested for
+                symmetric ``quint8``.
+        """
+        if not is_per_channel(qscheme):
+            raise NotImplementedError(
+                "PerChannelMinMaxObserver's qscheme only support "
+                "torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "PerChannelMinMaxObserver doesn't support dynamic quantization"
+            )
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        self.ch_axis = ch_axis
+        # Empty buffers mean "nothing observed yet"; they are resized on the
+        # first forward pass or when a state dict is loaded.
+        self.register_buffer("min_val", torch.tensor([], **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor([], **factory_kwargs))
+        if (
+            self.qscheme == torch.per_channel_symmetric
+            and self.reduce_range
+            and self.dtype == torch.quint8
+        ):
+            raise NotImplementedError(
+                "Cannot reduce range for symmetric quantization for quint8"
+            )
+
+    def forward(self, x_orig):
+        """Record per-channel min/max of ``x_orig`` and return it unchanged."""
+        return self._forward(x_orig)
+
+    def _forward(self, x_orig):
+        """Update the running per-channel min/max from ``x_orig``.
+
+        Empty tensors are ignored. The input is returned unchanged.
+        """
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        min_val = self.min_val
+        max_val = self.max_val
+        x_dim = x.size()
+
+        # Swap the channel axis with axis 0 so that flattening the remaining
+        # axes yields one row per channel.
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(new_axis_list)
+        # Need to match dtype of min/max because the updates to buffers
+        # are done in place and types need to match for comparisons
+        y = y.to(self.min_val.dtype)
+        y = torch.flatten(y, start_dim=1)
+        if min_val.numel() == 0 or max_val.numel() == 0:
+            min_val, max_val = torch.aminmax(y, dim=1)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+            min_val = torch.min(min_val_cur, min_val)
+            max_val = torch.max(max_val_cur, max_val)
+        self.min_val.resize_(min_val.shape)
+        self.max_val.resize_(max_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        """Calculates the per-channel quantization parameters."""
+        return self._calculate_qparams(self.min_val, self.max_val)
+
+    def extra_repr(self):
+        """Return the current running min/max for the module's repr."""
+        return f"min_val={self.min_val}, max_val={self.max_val}"
+
+    def _load_from_state_dict(
+        self,
+        state_dict: dict[str, Any],
+        prefix: str,
+        local_metadata: dict[str, torch.Tensor],
+        strict: bool,
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
+    ):
+        """Load ``min_val`` / ``max_val``, resizing the buffers to fit first.
+
+        State dicts whose metadata version is below 3 use the key names
+        ``min_vals`` / ``max_vals``; otherwise ``min_val`` / ``max_val`` are
+        expected.
+        """
+        version = local_metadata.get("version")
+        if version is not None and version < 3:
+            local_state = ["min_vals", "max_vals"]
+            expected_min_name = "min_vals"
+            expected_max_name = "max_vals"
+        else:
+            local_state = ["min_val", "max_val"]
+            expected_min_name = "min_val"
+            expected_max_name = "max_val"
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                # Custom handling to allow loading min_val or max_val
+                # of size N into uninitialized buffers of size 0. The
+                # buffers are resized here, and the values are copied in
+                # the default state_dict loading code of the parent.
+                if name == expected_min_name:
+                    self.min_val.resize_(val.shape)
+                elif name == expected_max_name:
+                    self.max_val.resize_(val.shape)
+                else:
+                    warnings.warn(
+                        f"Observer load_from_state_dict got unexpected name {name}",
+                        stacklevel=2,
+                    )
+                # For torchscript module we need to update the attributes here since we do not
+                # call the `_load_from_state_dict` function defined module.py
+                if torch.jit.is_scripting():
+                    if name == expected_min_name:
+                        self.min_val.copy_(val)
+                    elif name == expected_max_name:
+                        self.max_val.copy_(val)
+                    else:
+                        warnings.warn(
+                            f"Observer load_from_state_dict got unexpected name {name}",
+                            stacklevel=2,
+                        )
+            elif strict:
+                missing_keys.append(key)
+
+        if not torch.jit.is_scripting():
+            # Missing keys were already recorded above, so the parent is
+            # called with strict=False.
+            super()._load_from_state_dict(
+                state_dict,
+                prefix,
+                local_metadata,
+                False,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+
+    def _load_from_state_dict_script(
+        self,
+        state_dict: dict[str, Any],
+        prefix: str,
+        local_metadata: dict[str, torch.Tensor],
+        strict: bool,
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
+    ):
+        """Forward all arguments to ``_load_from_state_dict``."""
+        self._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        """Resets the min/max values to empty tensors.
+
+        The replacement tensors keep the dtype and device of the current
+        buffers, so a reset does not move the observer's statistics back to
+        the default dtype/device.
+        """
+        # This used to be torch.ones but that does not work because
+        # JIT compiler can optimize it via common subexpression elimination
+        # in which case both min_val and max_val point to the same tensor.
+        self.min_val = torch.rand(
+            0, dtype=self.min_val.dtype, device=self.min_val.device
+        )
+        self.max_val = torch.rand(
+            0, dtype=self.max_val.dtype, device=self.max_val.device
+        )
+
+
+class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver):
+    r"""Observer module for computing the quantization parameters based on the
+    running per channel min and max values.
+
+    This observer uses the tensor min/max statistics to compute the per channel
+    quantization parameters. The module records the running minimum and maximum
+    of incoming tensors, and uses this statistic to compute the quantization
+    parameters.
+
+    Args:
+        averaging_constant: Averaging constant for min/max.
+        ch_axis: Channel axis
+        dtype: Quantized data type
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    The quantization parameters are computed the same way as in
+    :class:`~torch.ao.quantization.observer.MovingAverageMinMaxObserver`, with the
+    difference that the running min/max values are stored per channel.
+    Scales and zero points are thus computed per channel as well.
+
+    .. note:: If the running minimum equals to the running maximum, the scales
+              and zero_points are set to 1.0 and 0.
+    """
+
+    def __init__(
+        self,
+        averaging_constant=0.01,
+        ch_axis=0,
+        dtype=torch.quint8,
+        qscheme=torch.per_channel_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        """Validate the configuration and initialise the parent observer.
+
+        Raises:
+            NotImplementedError: If ``qscheme`` is not a per-channel scheme or
+                if ``is_dynamic`` is set.
+        """
+        if not is_per_channel(qscheme):
+            raise NotImplementedError(
+                "MovingAveragePerChannelMinMaxObserver's qscheme only support "
+                "torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "MovingAveragePerChannelMinMaxObserver doesn't support dynamic quantization"
+            )
+        super().__init__(
+            ch_axis=ch_axis,
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        self.averaging_constant = averaging_constant
+
+    def forward(self, x_orig):
+        """Update the per-channel moving-average min/max and return ``x_orig`` unchanged.
+
+        Empty tensors are ignored. The first observed tensor seeds the
+        statistics directly; later ones are blended in with
+        ``averaging_constant``.
+        """
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val = self.min_val
+        max_val = self.max_val
+        x_dim = x.size()
+
+        # Swap the channel axis with axis 0 so that flattening the remaining
+        # axes yields one row per channel.
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(new_axis_list)
+        y = torch.flatten(y, start_dim=1)
+        if min_val.numel() == 0 or max_val.numel() == 0:
+            min_val, max_val = torch.aminmax(y, dim=1)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+            min_val = min_val + self.averaging_constant * (min_val_cur - min_val)
+            max_val = max_val + self.averaging_constant * (max_val_cur - max_val)
+        self.min_val.resize_(min_val.shape)
+        self.max_val.resize_(max_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+
+class HistogramObserver(UniformQuantizationObserverBase):
+    r"""
+    The module records the running histogram of tensor values along with
+    min/max values. ``calculate_qparams`` will calculate scale and zero_point.
+
+    Args:
+        bins: Number of bins to use for the histogram
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`.
+
+    The scale and zero point are computed as follows:
+
+    1. Create the histogram of the incoming inputs.
+        The histogram is computed continuously, and the ranges per bin change
+        with every new tensor observed.
+    2. Search the distribution in the histogram for optimal min/max values.
+        The search for the min/max values ensures the minimization of the
+        quantization error with respect to the floating point model.
+    3. Compute the scale and zero point the same way as in the
+        :class:`~torch.ao.quantization.MinMaxObserver`
+    """
+
+    histogram: torch.Tensor
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        bins: int = 2048,
+        dtype: torch.dtype = torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        """Set up the histogram and the running min/max buffers.
+
+        Raises:
+            NotImplementedError: If ``qscheme`` is not a per-tensor scheme or
+                if ``is_dynamic`` is set.
+        """
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "HistogramObserver's qscheme only support torch.per_tensor_symmetric "
+                "and torch.per_tensor_affine."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "HistogramObserver doesn't support dynamic quantization"
+            )
+        # bins: The number of bins used for histogram calculation.
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        self.bins = bins
+        self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs))
+        # +inf / -inf are the initial values of the running min/max buffers.
+        self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs))
+        # Number of destination buckets: one per value representable in the
+        # bit width of the target dtype.
+        self.dst_nbins = 2 ** torch.iinfo(self.dtype).bits
+        self.upsample_rate = (
+            16  # used to reduce quantization errors when upscaling histogram
+        )
+
+    def _get_norm(
+        self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor
+    ) -> torch.Tensor:
+        r"""
+        Compute the norm of values uniformly distributed between
+        delta_begin and delta_end, weighted by density.
+        Only the L2 norm is currently supported.
+
+        norm = density * (integral_{begin, end} x^2)
+             = density * (end^3 - begin^3) / 3
+        """
+        end_cubed = delta_end * delta_end * delta_end
+        begin_cubed = delta_begin * delta_begin * delta_begin
+        integral = (end_cubed - begin_cubed) / 3
+        return density * integral
+
+    def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int):
+        r"""
+        Estimate the quantization error incurred when the source bins
+        ``next_start_bin`` .. ``next_end_bin`` (inclusive) are used as the
+        min/max range and that range is split into ``self.dst_nbins`` buckets.
+
+        Returns the summed error as a Python float, or 0.0 when the
+        destination bucket width is zero.
+        """
+        device = self.histogram.device
+        src_width = (self.max_val.item() - self.min_val.item()) / self.bins
+
+        dst_width = src_width * (next_end_bin - next_start_bin + 1) / self.dst_nbins
+        if dst_width == 0.0:
+            return 0.0
+        half_dst = dst_width / 2
+        last_dst_bin = self.dst_nbins - 1
+
+        # Offsets of every source bin's two edges, measured from the start of
+        # the first destination bucket.
+        src_index = torch.arange(self.bins, device=device)
+        src_lo = (src_index - next_start_bin) * src_width
+        src_hi = src_lo + src_width
+
+        # Destination bucket (clamped to the valid range) that each edge lands
+        # in, and the centre of that bucket.
+        dst_of_lo = torch.clamp(
+            torch.div(src_lo, dst_width, rounding_mode="floor"),
+            0,
+            last_dst_bin,
+        )
+        dst_of_hi = torch.clamp(
+            torch.div(src_hi, dst_width, rounding_mode="floor"),
+            0,
+            last_dst_bin,
+        )
+        center_of_lo = (dst_of_lo + 0.5) * dst_width
+        center_of_hi = dst_of_hi * dst_width + half_dst
+
+        density = self.histogram / src_width
+        error = torch.zeros(self.bins, device=device)
+
+        # Part of the source bin inside the bucket that holds its lower edge.
+        error += self._get_norm(
+            src_lo - center_of_lo,
+            torch.ones(self.bins, device=device) * half_dst,
+            density,
+        )
+
+        # Buckets lying strictly between the two edge buckets, each covered
+        # over its full width.
+        error += (dst_of_hi - dst_of_lo - 1) * self._get_norm(
+            torch.tensor(-half_dst), torch.tensor(half_dst), density
+        )
+
+        # Part of the source bin inside the bucket that holds its upper edge.
+        error += self._get_norm(
+            torch.tensor(-half_dst), src_hi - center_of_hi, density
+        )
+
+        return error.sum().item()
+
+    def _non_linear_param_search(self) -> tuple[torch.Tensor, torch.Tensor]:
+        r"""Non-linear parameter search.
+
+        An approximation for L2 error minimization for selecting min/max.
+        By selecting new min/max, we filter out outliers in input distribution.
+        This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in
+        caffe2/quantization/server/norm_minimization.cc
+
+        The search starts from the full histogram range and, on every step,
+        tightens either the lower quantile bound (``alpha``) or the upper
+        quantile bound (``beta``) by ``stepsize``. Whichever side would skip
+        more bins is the one that moves. The search stops as soon as the
+        quantization error reported by ``_compute_quantization_error`` grows,
+        or when the two quantile bounds meet.
+
+        Returns:
+            A ``(new_min, new_max)`` tuple computed from ``self.min_val``, the
+            bin width and the selected ``start_bin`` / ``end_bin``.
+
+        Raises:
+            AssertionError: if the histogram length differs from ``self.bins``.
+        """
+        if self.histogram.size()[0] != self.bins:
+            raise AssertionError("bins mismatch")
+        bin_width = (self.max_val - self.min_val) / self.bins
+
+        # cumulative sum
+        total = torch.sum(self.histogram).item()
+        cSum = torch.cumsum(self.histogram, dim=0)
+
+        stepsize = 1e-5  # granularity
+        alpha = 0.0  # lower bound
+        beta = 1.0  # upper bound
+        start_bin = 0
+        end_bin = self.bins - 1
+        # smallest error seen so far; any first candidate beats +inf
+        norm_min = float("inf")
+
+        while alpha < beta:
+            # Find the next step
+            next_alpha = alpha + stepsize
+            next_beta = beta - stepsize
+
+            # find the left and right bins between the quantile bounds
+            l = start_bin
+            r = end_bin
+            # advance l while the cumulative mass is still below the lower quantile
+            while l < end_bin and cSum[l] < next_alpha * total:
+                l = l + 1
+            # retreat r while the cumulative mass is still above the upper quantile
+            while r > start_bin and cSum[r] > next_beta * total:
+                r = r - 1
+
+            # decide the next move
+            next_start_bin = start_bin
+            next_end_bin = end_bin
+            # move the side that would skip more bins; ties move the end bin
+            if (l - start_bin) > (end_bin - r):
+                # move the start bin
+                next_start_bin = l
+                alpha = next_alpha
+            else:
+                # move the end bin
+                next_end_bin = r
+                beta = next_beta
+
+            # Nothing changed bin-wise; alpha/beta were still advanced above,
+            # so the loop keeps making progress without recomputing the error.
+            if next_start_bin == start_bin and next_end_bin == end_bin:
+                continue
+
+            # calculate the quantization error using next_start_bin and next_end_bin
+            norm = self._compute_quantization_error(next_start_bin, next_end_bin)
+
+            # stop at the first candidate that makes the error worse and keep
+            # the previously accepted start_bin / end_bin
+            if norm > norm_min:
+                break
+            norm_min = norm
+            start_bin = next_start_bin
+            end_bin = next_end_bin
+
+        # start_bin maps to its left edge, end_bin to its right edge (hence + 1)
+        new_min = self.min_val + bin_width * start_bin
+        new_max = self.min_val + bin_width * (end_bin + 1)
+        return new_min, new_max
+
+    def _upscale_histogram(
+        self,
+        histogram: torch.Tensor,
+        orig_min: torch.Tensor,
+        orig_max: torch.Tensor,
+        update_min: torch.Tensor,
+        update_max: torch.Tensor,
+    ):
+        """Re-bin ``histogram`` from ``[orig_min, orig_max]`` into ``[update_min, update_max]``.
+
+        Each original bin is split into ``self.upsample_rate`` equal sub-bins
+        that share its count evenly. The mid-point of every sub-bin is then
+        assigned to one of ``self.bins`` buckets spanning the new range, and
+        the sub-bin counts are accumulated per bucket.
+
+        Args:
+            histogram: histogram to re-bin.
+            orig_min: lower edge of the range ``histogram`` currently covers.
+            orig_max: upper edge of the range ``histogram`` currently covers.
+            update_min: lower edge of the target range.
+            update_max: upper edge of the target range.
+
+        Returns:
+            The ``torch.bincount`` result with at least ``self.bins`` entries,
+            expressed over the target range.
+        """
+        # this turns the histogram into a finer-grained histogram to reduce
+        # bin quantization errors
+        histogram = histogram.repeat_interleave(self.upsample_rate) / self.upsample_rate
+        # width of one upsampled sub-bin in the original range
+        bin_size = (orig_max - orig_min) / (self.bins * self.upsample_rate)
+        # left edges of all sub-bins (the last linspace point is dropped),
+        # shifted by half a sub-bin to obtain the mid-points
+        mid_points_histogram = (
+            torch.linspace(
+                orig_min,
+                orig_max,
+                self.bins * self.upsample_rate + 1,
+                device=orig_min.device,
+            )[:-1].to(histogram.device)
+            + 0.5 * bin_size
+        )
+        # bin edges of the target histogram
+        boundaries_new_histogram = torch.linspace(
+            update_min, update_max, self.bins + 1, device=update_min.device
+        ).to(histogram.device)
+        # this maps the mid-points of the histogram to the new histogram's space
+        bucket_assignments = (
+            torch.bucketize(mid_points_histogram, boundaries_new_histogram, right=True)
+            - 1
+        )
+        # this then maps the histogram mid-points in the new space, weighted by the original histogram's values
+        # this is just the old histogram in the new histogram's space
+
+        # In case due to numerical issues the values land higher/lower than the maximum/minimum
+        bucket_assignments[bucket_assignments >= self.bins] = self.bins - 1
+        bucket_assignments[bucket_assignments < 0] = 0
+
+        # sum the sub-bin counts that fell into each target bucket
+        update_histogram = torch.bincount(
+            bucket_assignments, weights=histogram, minlength=self.bins
+        )
+        return update_histogram
+
+    def _combine_histograms(
+        self,
+        orig_hist: torch.Tensor,
+        orig_min: torch.Tensor,
+        orig_max: torch.Tensor,
+        update_hist: torch.Tensor,
+        update_min: torch.Tensor,
+        update_max: torch.Tensor,
+    ) -> torch.Tensor:
+        # If the new min and max are the same as the current min and max,
+        # we can just add the new histogram to the original histogram
+        if update_min == orig_min and update_max == orig_max:
+            return orig_hist + update_hist
+
+        # If the orig hist only has one value (i.e., the min and max are the same)
+        # we can just add it into new histogram
+        if orig_min == orig_max:
+            bin_value = torch.sum(orig_hist)
+            transformed_orig_hist = (
+                torch.histc(orig_min, bins=self.bins, min=update_min, max=update_max)  # type: ignore[arg-type]
+                * bin_value
+            )
+            return transformed_orig_hist + update_hist
+
+        # We assume the update_hist is already in the target range, we will map the orig_max to it
+        if update_min > orig_min:
+            raise AssertionError("update_min must be <= orig_min")
+        if update_max < orig_max:
+            raise AssertionError("update_max must be >= orig_max")
+
+        # Now we need to turn the old_histogram, into the range of the new histogram
+        transformed_orig_hist = self._upscale_histogram(
+            orig_hist,
+            orig_min,
+            orig_max,
+            update_min,
+            update_max,
+        )
+
+        return update_hist + transformed_orig_hist
+
+    def reset_histogram(
+        self, x: torch.Tensor, min_val: torch.Tensor, max_val: torch.Tensor
+    ) -> None:
+        self.min_val.resize_(min_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.resize_(max_val.shape)
+        self.max_val.copy_(max_val)
+        if min_val.numel() != 1 or max_val.numel() != 1:
+            raise AssertionError("histogram min/max values must be scalar.")
+        new_histogram = torch.histc(x, self.bins, min=min_val, max=max_val)  # type: ignore[arg-type]
+        self.histogram.detach_().resize_(new_histogram.shape)
+        self.histogram.copy_(new_histogram)
+
+    def forward(self, x_orig: torch.Tensor) -> torch.Tensor:  # pyre-ignore[14]
+        """Fold the values of ``x_orig`` into the running histogram.
+
+        Empty inputs are returned untouched. Infinite entries are dropped
+        (with a warning) before statistics are computed. On the first
+        observation the histogram is built directly from the input; afterwards
+        the stored range is widened to include the new data and the old and
+        new histograms are merged.
+
+        Args:
+            x_orig: tensor to observe.
+
+        Returns:
+            ``x_orig`` itself, unchanged.
+        """
+        if x_orig.numel() == 0:
+            return x_orig
+        # statistics are collected on a detached view of the input
+        x = x_orig.detach()
+        x_min, x_max = torch.aminmax(x)
+        # want to ignore torch.inf since we don't actually
+        # want to make our quantization range infinite
+        # and in practice those values will be clamped
+        if x_min == -torch.inf or x_max == torch.inf:
+            warnings.warn(
+                "torch.inf detected in input tensor, ignoring input", stacklevel=2
+            )
+            # keep only the finite-magnitude entries; bail out if none remain
+            x = x[x.abs() != torch.inf]
+            if x.numel() == 0:
+                return x_orig
+            x_min, x_max = torch.aminmax(x)
+
+        current_min = self.min_val
+        current_max = self.max_val
+
+        # NOTE(review): this uses `or`, while calculate_qparams tests the same
+        # sentinels with `and` - confirm the difference is intentional.
+        is_uninitialized = self.min_val == float("inf") or self.max_val == float("-inf")
+        if is_uninitialized:
+            self.reset_histogram(x, x_min, x_max)
+        else:
+            update_min, update_max = x_min, x_max
+            # the new range is the union of the stored range and the input range
+            new_min = torch.min(current_min, update_min)
+            new_max = torch.max(current_max, update_max)
+
+            # TODO: For some reason, this is required for it to pass torchscript test
+            # new_min and new_max should already have requires_grad set to False
+            new_min, new_max = new_min.detach(), new_max.detach()
+            # histogram of the new data, already expressed over the widened range
+            update_histogram = torch.histc(
+                x,
+                self.bins,
+                min=new_min,  # type: ignore[arg-type]
+                max=new_max,  # type: ignore[arg-type]
+            ).to(self.histogram.device)
+            if new_min == current_min and new_max == current_max:
+                # range unchanged: bins line up, so a plain sum is enough
+                combined_histogram = self.histogram + update_histogram
+                self.histogram.detach_().resize_(combined_histogram.shape)
+                self.histogram.copy_(combined_histogram)
+            else:
+                # range grew: re-bin the stored histogram before summing, then
+                # record the widened range
+                combined_histogram = self._combine_histograms(
+                    self.histogram,
+                    current_min,
+                    current_max,
+                    update_histogram,
+                    new_min,
+                    new_max,
+                )
+                self.histogram.detach_().resize_(combined_histogram.shape)
+                self.histogram.copy_(combined_histogram)
+                self.min_val.detach_().resize_(new_min.shape)
+                self.min_val.copy_(new_min)
+                self.max_val.detach_().resize_(new_max.shape)
+                self.max_val.copy_(new_max)
+
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        is_uninitialized = self.min_val == float("inf") and self.max_val == float(
+            "-inf"
+        )
+        if is_uninitialized:
+            warnings.warn(
+                "must run observer before calling calculate_qparams.\
+                                    Returning default scale and zero point ",
+                stacklevel=2,
+            )
+            return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor(
+                [0], device=self.min_val.device.type
+            )
+        if self.bins != len(self.histogram):
+            raise AssertionError(
+                "The number of bins in histogram should be equal to the number of bins "
+                "supplied while making this observer"
+            )
+
+        new_min, new_max = self._non_linear_param_search()
+
+        return self._calculate_qparams(new_min, new_max)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "min_val"] = self.min_val
+        destination[prefix + "max_val"] = self.max_val
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 3:
+            # if min_val and max_val are not initialized, update their shape
+            # to account for the differences between v2 and v3
+            min_val_name, max_val_name = prefix + "min_val", prefix + "max_val"
+            if min_val_name in state_dict:
+                if state_dict[min_val_name].shape == torch.Size([0]):
+                    state_dict[min_val_name] = torch.tensor(float("inf"))
+            if max_val_name in state_dict:
+                if state_dict[max_val_name].shape == torch.Size([0]):
+                    state_dict[max_val_name] = torch.tensor(float("-inf"))
+
+        local_state = ["min_val", "max_val"]
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                setattr(self, name, val)
+            elif strict:
+                missing_keys.append(key)
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def extra_repr(self):
+        return f"min_val={self.min_val}, max_val={self.max_val}"
+
+
+class FixedQParamsObserver(ObserverBase):
+    r"""
+    Observer that simulates quantize and dequantize with fixed
+    quantization parameters in training time. Only per tensor
+    quantization is supported.
+
+    The observer never inspects its input: ``forward`` returns the tensor
+    unchanged and ``calculate_qparams`` returns the values given at
+    construction time.
+
+    Args:
+        `scale` (float): fixed scale for the observer
+        `zero_point` (int): fixed zero point for the observer
+        `dtype`, `qscheme`, `quant_min`, `quant_max`: stored on the instance
+        `is_dynamic`: must be False; dynamic quantization is rejected
+
+    Raises:
+        NotImplementedError: if `is_dynamic` is True.
+    """
+
+    # buffers registered in __init__, each holding a single element
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(
+        self,
+        scale,
+        zero_point,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        quant_min=0,
+        quant_max=255,
+        is_dynamic=False,
+        **kwargs,
+    ):
+        if is_dynamic:
+            raise NotImplementedError(
+                "FixedQParamsObserver doesn't support dynamic quantization"
+            )
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs)
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        # stored as one-element float / int tensors
+        self.register_buffer("scale", torch.tensor([scale], dtype=torch.float))
+        self.register_buffer("zero_point", torch.tensor([zero_point], dtype=torch.int))
+        self.dtype = dtype
+        self.qscheme = qscheme
+
+    def forward(self, X):
+        # pass-through: nothing is recorded
+        return X
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        # returns the fixed (scale, zero_point) buffers
+        return self.scale, self.zero_point
+
+
+class PlaceholderObserver(ObserverBase):
+    r"""
+    Observer that doesn't do anything and just passes its configuration to the
+    quantized module's ``.from_float()``.
+
+    Can be used for quantization to float16 which doesn't require determining
+    ranges.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        quant_min: minimum value in quantized domain (TODO: align behavior with other observers)
+        quant_max: maximum value in quantized domain
+        custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
+                        (Can be used in Graph Mode Passes for special case ops).
+        compute_dtype (deprecated): if set, marks the future quantize function to use
+                       dynamic quantization instead of static quantization.
+                       This field is deprecated, use `is_dynamic=True` instead.
+        is_dynamic: if True, the `quantize` function in the reference model
+                    representation taking stats from this observer instance will
+                    use dynamic quantization.
+    """
+
+    def __init__(
+        self,
+        dtype=torch.float32,
+        custom_op_name="",
+        compute_dtype=None,
+        quant_min=None,
+        quant_max=None,
+        qscheme=None,
+        eps=None,
+        is_dynamic=False,
+    ) -> None:
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic)
+        if qscheme is None:
+            qscheme = torch.per_tensor_affine
+        if eps is None:
+            eps = torch.finfo(torch.float32).eps
+
+        # dtype of input of the target operator, e.g. for dynamic quantization
+        # ops, the dtype will be float32
+        self.dtype = dtype
+        self.qscheme = qscheme
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.eps = eps
+        self.custom_op = custom_op_name
+        # used for configuration of computation type for dynamic quantization
+        if compute_dtype:
+            is_dynamic = True
+            warnings.warn(
+                "Please use `is_dynamic` instead of `compute_dtype`. \
+                    `compute_dtype` will be deprecated in a future release \
+                    of PyTorch.",
+                stacklevel=2,
+            )
+
+    def forward(self, x):
+        return x
+
+    @torch.jit.export
+    def extra_repr(self):
+        return f"dtype={self.dtype}, is_dynamic={self.is_dynamic}"
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        raise Exception(  # noqa: TRY002
+            "calculate_qparams should not be called for PlaceholderObserver"
+        )
+
+
+class RecordingObserver(ObserverBase):
+    r"""
+    The module is mainly for debug and records the tensor values during runtime.
+
+    Every ``forward`` call appends a clone of its input to ``tensor_val``; the
+    list is never cleared by this class.
+
+    Args:
+        dtype: Quantized data type
+    """
+
+    __annotations__ = {"tensor_val": list[torch.Tensor | None]}
+
+    def __init__(self, dtype=torch.quint8):
+        super().__init__(dtype=dtype, is_dynamic=False)
+        # one entry per forward call, in call order
+        self.tensor_val = []
+
+    def forward(self, x):
+        # store a copy so later in-place changes to x do not alter the record
+        self.tensor_val.append(x.clone())
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        # recording only: there are no quantization parameters to compute
+        raise Exception(  # noqa: TRY002
+            "calculate_qparams should not be called for RecordingObserver"
+        )
+
+    @torch.jit.export
+    def get_tensor_value(self):
+        # returns the live list, not a copy
+        return self.tensor_val
+
+
+class NoopObserver(ObserverBase):
+    r"""
+    Observer that doesn't do anything and just passes its configuration to the
+    quantized module's ``.from_float()``.
+
+    Primarily used for quantization to float16 which doesn't require determining
+    ranges.
+
+    ``forward`` returns its input unchanged and ``calculate_qparams`` always
+    raises.
+
+    Args:
+        dtype: Quantized data type
+        custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
+                        (Can be used in Graph Mode Passes for special case ops).
+    """
+
+    def __init__(self, dtype=torch.float16, custom_op_name="") -> None:
+        super().__init__(dtype=dtype, is_dynamic=False)
+        self.dtype = dtype
+        self.custom_op = custom_op_name
+
+    def forward(self, x):
+        # pass-through: nothing is recorded
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        # this observer has no statistics to derive parameters from
+        raise Exception(  # noqa: TRY002
+            "calculate_qparams should not be called for NoopObserver"
+        )
+
+
+class ReuseInputObserver(ObserverBase):
+    r"""This observer is used when we want to reuse the observer from the operator
+    that produces the input Tensor, typically used for operators like reshape, e.g.
+    ```
+    x0 = ...
+    x1 = x0.reshape()
+    ```
+    if we configure x0 to be observed by some observer, let's say MinMaxObserver,
+    and reshape is configured with ReuseInputObserver, we'll reuse the observer instance
+    for x0 for x1 (output of reshape). If x0 is not observed, we also won't observe x1.
+
+    The class itself only acts as a marker: ``forward`` returns its input
+    unchanged and ``calculate_qparams`` always raises.
+
+    Note: this is only enabled in FX Graph Mode Quantization
+    """
+
+    def __init__(self) -> None:
+        # the dtype passed to the base class is fixed to torch.quint8
+        super().__init__(torch.quint8, is_dynamic=False)
+
+    def forward(self, x):
+        # pass-through: nothing is recorded
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):  # type: ignore[override]
+        # this observer has no statistics to derive parameters from
+        raise Exception(  # noqa: TRY002
+            "calculate_qparams should not be called for ReuseInputObserver"
+        )
+
+
+"""
+# Experimental Affine Quantization Feature START
+We plan to merge the following with torchao repo after we move pt2e flow to torchao
+copied from https://github.com/pytorch/ao/blob/main/torchao/quantization/observer.py
+"""
+from dataclasses import dataclass
+from enum import auto, Enum
+
+
+class MappingType(Enum):
+    """How a floating point number is mapped to an integer number.
+
+    Symmetric mapping means the floating point range is symmetrically mapped to the integer range.
+    Let's say we have floating point range (-3.5, 10.2) and integer range (-8, 7) (int4):
+    we'll use (-10.2, 10.2) as the range for floating point and map that to (-8, 7),
+    e.g. scale = (10.2 - (-10.2)) / (7 - (-8))
+
+    SYMMETRIC_NO_CLIPPING_ERR is a variant of symmetric mapping, where the scale is the max of smin
+    and smax, where smin = min_val_neg / quant_min, and smax = max_val_pos / quant_max. By calculating
+    smin and smax individually, there can be less rounding error on negative values, and no floating
+    point value falls out of range.
+
+    Asymmetric mapping means we just directly map the floating point range to the integer range;
+    for the above example, we will map (-3.5, 10.2) to (-8, 7) and calculate the quantization parameters
+    based on this mapping,
+    e.g. scale = (10.2 - (-3.5)) / (7 - (-8))
+    """
+
+    SYMMETRIC = auto()
+    SYMMETRIC_NO_CLIPPING_ERR = auto()
+    ASYMMETRIC = auto()
+
+
+class ZeroPointDomain(Enum):
+    """Enum that indicates whether zero_point is in the integer domain or the floating point domain.
+
+    integer domain: quantized_val = (float_val / scale) (integer) + zero_point (integer)
+    float domain: quantized_val = (float_val - (zero_point (float) - scale * mid_point)) / scale
+    none domain: quantized_val = (float_val / scale)
+    """
+
+    INT = auto()
+    FLOAT = auto()
+    NONE = auto()
+
+
+class TorchAODType(Enum):
+    """
+    Placeholder for dtypes that do not exist in PyTorch core yet.
+
+    Members cover the sub-byte integer widths INT1 through INT7.
+    """
+
+    # torch.int1 to torch.int7 will be added to PyTorch 2.6
+    # These will remain here for BC with older PyTorch versions
+    INT1 = auto()
+    INT2 = auto()
+    INT3 = auto()
+    INT4 = auto()
+    INT5 = auto()
+    INT6 = auto()
+    INT7 = auto()
+
+
+@dataclass(frozen=True)
+class Granularity:
+    """
+    Base class for representing the granularity of quantization.
+
+    This class serves as a parent for specific granularity types used in
+    quantization operations, such as per-tensor or per-axis quantization.
+    It declares no fields of its own and is a frozen dataclass.
+    """
+
+
+@dataclass(frozen=True)
+class PerBlock(Granularity):
+    """
+    Represents per-block granularity in quantization. See
+    :func:`~torchao.quantization.quant_primitives.quantize_affine` for docs for
+    `block_size`
+
+    Attributes:
+        block_size (tuple[int, ...]): The size of each quantization group
+    """
+
+    block_size: tuple[int, ...]
+
+
+@dataclass(frozen=True)
+class PerTensor(Granularity):
+    """
+    Represents per-tensor granularity in quantization.
+
+    This granularity type calculates the quantization parameters
+    based on the entire tensor.
+
+    """
+
+
+@dataclass(frozen=True)
+class PerAxis(Granularity):
+    """
+    Represents per-axis granularity in quantization.
+
+    This granularity type calculates different quantization parameters
+    along a specified axis of the tensor.
+
+    For example, if the input tensor has shape [8, 16] and axis=0, then
+    the quantization parameters are calculated for each row of the tensor,
+    giving a total of 8 quantization parameters.
+
+    Attributes:
+        axis (int): The axis along which reduction is performed.
+    """
+
+    axis: int
+
+
+@dataclass(frozen=True)
+class PerGroup(Granularity):
+    """
+    Represents per-channel group granularity in quantization.
+
+    This granularity type calculates different quantization parameters
+    for each group of <group_size> elements.
+
+    For example, if the input tensor has shape [8, 16] and the group size is 4, then
+    the 128 elements form 8 * 16 / 4 = 32 groups (i.e. the tensor is viewed as [32, 4]);
+    quantization parameters are calculated for each group of 4 elements,
+    giving a total of 32 quantization parameters.
+
+    Attributes:
+        group_size (int): The size of each quantization group
+
+    """
+
+    group_size: int
+
+
+# Not re-decorated with @dataclass: it adds no fields to the Granularity base.
+class PerRow(Granularity):
+    """
+    Represents row-wise granularity in quantization.
+
+    This is a special case of per-axis quantization and is unique to Float8 matmuls
+    where the input is quantized with a block_size of (1, ..., input.shape[-1]). And the weight
+    is quantized with a block_size of (1, weight.shape[1]).
+    """
+
+
+# Not re-decorated with @dataclass: it adds no fields to the Granularity base.
+class PerToken(Granularity):
+    """
+    Represents per-token granularity in quantization.
+
+    This granularity type calculates a different set of quantization parameters
+    for each token, where a token is one slice along the last dimension of the tensor.
+
+    For example, if the input tensor has shape [2, 3, 4], then there are 6 tokens
+    with 4 elements each, and we will calculate 6 sets of quantization parameters,
+    one for each token.
+
+    If the input tensor has only two dimensions, e.g. [8, 16], then this is
+    equivalent to `PerAxis(axis=0)`, which yields 8 sets of quantization parameters.
+    """
+
+
+def get_block_size(
+    input_shape: tuple[int, ...], granularity: Granularity
+) -> tuple[int, ...]:
+    """Get the block size based on the input shape and granularity type.
+
+    Args:
+        input_shape: The input tensor shape possibly more than 2 dimensions
+        granularity: The granularity type of the quantization
+    """
+    if not isinstance(granularity, Granularity):
+        raise AssertionError(
+            "Please provide an instance of Granularity, not subclass of it"
+        )
+    if isinstance(granularity, PerTensor):
+        return input_shape
+    elif isinstance(granularity, PerAxis):
+        block_size = list(input_shape)
+        block_size[granularity.axis] = 1
+        return tuple(block_size)
+    elif isinstance(granularity, PerRow):
+        return (1,) * (len(input_shape) - 1) + (input_shape[-1],)
+    elif isinstance(granularity, PerGroup):
+        if len(input_shape) != 2:
+            raise AssertionError(
+                f"Expecting input shape dim to be 2 for per group quantization, gotinput shape: {input_shape}"
+            )
+        return (1, granularity.group_size)
+    elif isinstance(granularity, PerToken):
+        block_size = [1] * len(input_shape)
+        block_size[-1] = input_shape[-1]
+        return tuple(block_size)
+    raise ValueError(f"Unsupported Granularity: {granularity}")
+
+
+class AffineQuantizedObserverBase(ABC, torch.nn.Module):
+    """Observer module for affine quantization (https://github.com/pytorch/ao/tree/main/torchao/quantization#affine-quantization)
+
+    Subclasses implement ``forward`` (collect statistics) and
+    ``calculate_qparams`` (turn them into scale / zero point). ``convert``
+    rewrites an observer node of an FX graph into a quantize / dequantize pair.
+
+    Args:
+      `granularity`: The granularity of the quantization; required (None is rejected).
+        Current supported granularity type are `PerTensor` and `PerAxis`
+      other args: please see `:class:torchao.dtypes.AffineQuantizedTensor`
+
+    Raises:
+      AssertionError: if `granularity` is None.
+    """
+
+    with_args = classmethod(_with_args)
+
+    def __init__(
+        self,
+        mapping_type: MappingType,
+        target_dtype: torch.dtype,
+        granularity: Granularity,
+        quant_min: int | None = None,
+        quant_max: int | None = None,
+        eps: float | None = None,
+        scale_dtype: torch.dtype | None = None,
+        zero_point_dtype: torch.dtype | None = None,
+        preserve_zero: bool = True,
+        zero_point_domain: ZeroPointDomain | None = ZeroPointDomain.INT,
+        # there could be some extra args that's ignored
+        **kwargs,
+    ):
+        super().__init__()
+        if granularity is None:
+            raise AssertionError("granularity is None")
+        self.mapping_type = mapping_type
+        self.target_dtype = target_dtype
+        self.granularity = granularity
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.eps = eps
+        self.scale_dtype = scale_dtype
+        self.zero_point_dtype = zero_point_dtype
+        self.preserve_zero = preserve_zero
+        # NOTE(review): the annotation allows None, but convert() reads
+        # `self.zero_point_domain.name` unconditionally - confirm None is
+        # never passed in practice.
+        self.zero_point_domain = zero_point_domain
+        # populated during forward (presumably by subclasses); convert()
+        # refuses to run while either is still None
+        self.block_size = None
+        self.original_dtype = None
+
+    @abstractmethod
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """forward function should take the input tensor
+        and updates internal stats and return the original input Tensor
+        """
+
+    @abstractmethod
+    def calculate_qparams(self) -> tuple[torch.Tensor, torch.Tensor]:
+        """Calculate quantization parameter based on the stats attached to the observer module
+        and returns a tuple of scale and zero_point Tensor
+        """
+
+    def convert(self, model: torch.fx.GraphModule, observer_node: Node):
+        """
+        Converts the observer node in the graph into its quantized representation
+
+        The observer node is replaced by a ``quantize_affine`` node followed by
+        a ``dequantize_affine`` node. Scale and zero point come either from a
+        ``choose_qparams_affine`` node inserted into the graph (when the
+        observer has a truthy ``is_dynamic`` attribute) or from
+        ``calculate_qparams()`` evaluated now and attached to the model.
+
+        Args:
+            model: graph module to convert the observer node in
+            observer_node: the observer node to convert
+
+        Raises:
+            AssertionError: if ``block_size`` or ``original_dtype`` has not
+                been populated yet.
+        """
+        # imported here rather than at module level
+        from torch.ao.quantization.fx.utils import create_getattr_from_value
+
+        with model.graph.inserting_before(observer_node):
+            if self.block_size is None:
+                raise AssertionError("Expecting block_size to be populated")
+            if self.original_dtype is None:
+                raise AssertionError("Expecting original_dtype to be populated")
+            if hasattr(self, "is_dynamic") and self.is_dynamic:
+                # dynamic: qparams are computed inside the graph from the
+                # observed tensor (first argument of the observer node)
+                choose_qparams_affine = model.graph.call_function(
+                    torch.ops.pt2e_quant.choose_qparams_affine,
+                    (
+                        observer_node.args[0],
+                        self.mapping_type.name,
+                        self.block_size,
+                        self.target_dtype,
+                        self.quant_min,
+                        self.quant_max,
+                        self.eps,
+                        self.scale_dtype,
+                        self.zero_point_dtype,
+                        self.preserve_zero,
+                        self.zero_point_domain.name,
+                    ),
+                )
+                # the op's result is indexed: item 0 is scale, item 1 is zero point
+                scale_node = model.graph.call_function(
+                    operator.getitem, (choose_qparams_affine, 0)
+                )
+                zero_point_node = model.graph.call_function(
+                    operator.getitem, (choose_qparams_affine, 1)
+                )
+            else:
+                # static: compute qparams now and expose them to the graph
+                # through get_attr nodes
+                scale, zero_point = self.calculate_qparams()
+                scale_node = create_getattr_from_value(
+                    model,
+                    model.graph,
+                    "_scale",
+                    scale,
+                    scale.device if isinstance(scale, torch.Tensor) else None,
+                )
+                zero_point_node = create_getattr_from_value(
+                    model,
+                    model.graph,
+                    "_zero_point",
+                    zero_point,
+                    zero_point.device if isinstance(zero_point, torch.Tensor) else None,
+                )
+
+            q_node = model.graph.call_function(
+                torch.ops.pt2e_quant.quantize_affine,
+                (
+                    observer_node.args[0],
+                    self.block_size,
+                    scale_node,
+                    zero_point_node,
+                    self.target_dtype,
+                    self.quant_min,
+                    self.quant_max,
+                    self.zero_point_domain.name,
+                ),
+                {},
+            )
+            # dequantize back to the dtype recorded in original_dtype
+            dq_node = model.graph.call_function(
+                torch.ops.pt2e_quant.dequantize_affine,
+                (
+                    q_node,
+                    self.block_size,
+                    scale_node,
+                    zero_point_node,
+                    self.target_dtype,
+                    self.quant_min,
+                    self.quant_max,
+                    self.zero_point_domain.name,
+                ),
+                {"output_dtype": self.original_dtype},
+            )
+            # reroute consumers of the observer to the dequantized value,
+            # then drop the now-unused observer node
+            observer_node.replace_all_uses_with(dq_node)
+            model.graph.erase_node(observer_node)
+
+
+def _is_observer_script_module(mod, obs_type_name):
+    """Returns true if given mod is an instance of Observer script module."""
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        # qualified name looks like '__torch__.torch.ao.quantization.observer.___torch_mangle_2.MinMaxObserver'
+        suffix = mod._c.qualified_name.split(".", 1)[1]
+        name = re.sub(r"\.___torch_mangle_\d+", "", suffix)
+        return obs_type_name in name
+    return False
+
+
+# Experimental Affine Quantization Feature END
+
+
+def _is_activation_post_process(module):
+    return isinstance(
+        module,
+        (
+            torch.ao.quantization.ObserverBase,
+            torch.ao.quantization.FakeQuantizeBase,
+            AffineQuantizedObserverBase,
+        ),
+    ) or _is_observer_script_module(module, "quantization.observer")
+
+
+def _is_per_channel_script_obs_instance(module):
+    if isinstance(module, torch.jit.RecursiveScriptModule):
+        return _is_observer_script_module(
+            module, "quantization.observer.PerChannelMinMaxObserver"
+        ) or _is_observer_script_module(
+            module, "quantization.observer.MovingAveragePerChannelMinMaxObserver"
+        )
+    return False
+
+
+def get_observer_state_dict(mod):
+    r"""
+    Returns the state dict corresponding to the observer stats.
+    Traverse the model state_dict and extract out the stats.
+    """
+    od = OrderedDict()
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        for k, v in mod.state_dict().items():
+            if "observer" in k:
+                od[k] = v
+    else:
+        # path for GraphModule and nn.Module (eager mode)
+        for k, v in mod.state_dict().items():
+            if "activation_post_process" in k:
+                od[k] = v
+    od._metadata = mod.state_dict()._metadata  # type: ignore[attr-defined]
+    return od
+
+
+def load_observer_state_dict(mod, obs_dict):
+    r"""
+    Given input model and a state_dict containing model observer stats,
+    load the stats back into the model. The observer state_dict can be saved
+    using torch.ao.quantization.get_observer_state_dict
+    """
+    missing_keys: list[str] = []
+    unexpected_keys: list[str] = []
+    for name, module in mod.named_modules():
+        prefix = name + "."
+        if _is_activation_post_process(module):
+            if _is_per_channel_script_obs_instance(module):
+                # For per-channel observers we need to call a custom load_from_state_dict to resize the tensor.
+                # However this is not called when the module is scripted and we end up calling the default one in module.py
+                module._load_from_state_dict_script(
+                    obs_dict, prefix, {}, True, missing_keys, unexpected_keys, []
+                )
+            else:
+                module._load_from_state_dict(
+                    obs_dict, prefix, {}, False, missing_keys, unexpected_keys, []
+                )
+    for k in missing_keys:
+        if "observer" in k or "activation_post_process" in k:
+            raise Exception(  # noqa: TRY002
+                f"Missing keys for observer {k} in state_dict"
+            )
+    for k in unexpected_keys:
+        if "observer" in k or "activation_post_process" in k:
+            raise Exception(  # noqa: TRY002
+                f"Unexpected keys for observer {k} in state_dict"
+            )
+
+
+# Restrict activations to be in the inclusive range [0, 127]
+default_observer = MinMaxObserver.with_args(quant_min=0, quant_max=127)
+"""
+Default observer for static quantization, usually used for debugging.
+"""
+
+default_placeholder_observer = PlaceholderObserver
+"""
+Default placeholder observer, usually used for quantization to torch.float16.
+"""
+
+default_debug_observer = RecordingObserver
+"""
+Default debug-only observer.
+"""
+
+default_weight_observer = MinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_tensor_symmetric
+)
+"""
+Default weight observer.
+"""
+
+weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args(
+    dtype=torch.qint8,
+    qscheme=torch.per_tensor_symmetric,
+    quant_min=-127,
+    quant_max=127,
+    eps=2**-12,
+)
+"""
+Symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+default_histogram_observer = HistogramObserver.with_args(quant_min=0, quant_max=127)
+"""
+Default histogram observer, usually used for PTQ.
+"""
+
+default_per_channel_weight_observer = PerChannelMinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_channel_symmetric
+)
+"""
+Default per-channel weight observer, usually used on backends where per-channel
+weight quantization is supported, such as `fbgemm`.
+"""
+
+per_channel_weight_observer_range_neg_127_to_127 = PerChannelMinMaxObserver.with_args(
+    dtype=torch.qint8,
+    qscheme=torch.per_channel_symmetric,
+    quant_min=-127,
+    quant_max=127,
+    eps=2**-12,
+)
+"""
+Per-channel, symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+default_dynamic_quant_observer = PlaceholderObserver.with_args(
+    dtype=torch.quint8,
+    quant_min=0,
+    quant_max=255,
+    is_dynamic=True,
+)
+"""
+Default observer for dynamic quantization.
+"""
+
+default_float_qparams_observer = PerChannelMinMaxObserver.with_args(
+    dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0
+)
+"""
+Default observer for a floating point zero-point.
+"""
+
+default_float_qparams_observer_4bit = PerChannelMinMaxObserver.with_args(
+    dtype=torch.quint4x2, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0
+)
+"""
+Default observer for a floating point zero-point and 4 bit activations.
+"""
+
+# TODO(future PR): remove these defaults and enforce activation functions
+# to explicitly specify their output range
+default_fixed_qparams_range_neg1to1_observer = FixedQParamsObserver.with_args(
+    scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255
+)
+default_fixed_qparams_range_0to1_observer = FixedQParamsObserver.with_args(
+    scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255
+)
+# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases
+default_symmetric_fixed_qparams_observer = default_fixed_qparams_range_neg1to1_observer
+default_affine_fixed_qparams_observer = default_fixed_qparams_range_0to1_observer
+
+"""
+Default observers for fixed qparams operations.
+"""
+
+default_reuse_input_observer = ReuseInputObserver
+"""
+Default observer for operators like reshape that reuse the observer of the input to
+the operator
+"""
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b34d3417ce1c5ae055003725be7918a679d03f88
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48238f38746a62273e1001fab81e2f50ca0f803d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8876d439feb41929ca9b64f3f023db499eac007b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/__init__.py
@@ -0,0 +1,6 @@
+from .rewrite import reference_representation_rewrite
+
+
+__all__ = [
+    "reference_representation_rewrite",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py
new file mode 100644
index 0000000000000000000000000000000000000000..52084784f5036a92a909ad7f044d733677e48618
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py
@@ -0,0 +1,825 @@
+# mypy: allow-untyped-defs
+from collections.abc import Callable
+from dataclasses import dataclass
+from functools import partial
+from typing import Any
+
+import torch
+from torch._export.utils import _disable_aten_to_metadata_assertions
+from torch._higher_order_ops.out_dtype import out_dtype
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
+from torch.ao.quantization.pt2e.export_utils import _WrapperModule
+from torch.ao.quantization.pt2e.utils import (
+    _get_aten_graph_module_for_pattern,
+    _replace_literals_with_existing_placeholders,
+    _replace_literals_with_new_placeholders,
+    remove_tensor_overload_for_qdq_ops,
+)
+from torch.fx import GraphModule
+from torch.fx.subgraph_rewriter import replace_pattern
+
+
+__all__ = [
+    "reference_representation_rewrite",
+]
+
+
+def _qdq_quantized_linear(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8
+    )
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8,
+        weight_scale,
+        weight_zero_point,
+        weight_quant_min,
+        weight_quant_max,
+        torch.int8,
+    )
+    out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantized_linear(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    # without using quant_min/max in clamp, the traced graph will not have quant_min/max args.
+    # This results in failure to match the pattern.
+    # Therefore, we call a torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max)
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation can work for the case
+    # no matter if bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.linear.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None,
+    )
+    # TODO: change to mul.Scalar
+    # Note: we are quantizing bias with these scales without signal from user, but it might be OK
+    bias_scale = x_scale * weight_scale
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    acc_i32 = acc_i32 + bias_i32
+    # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. Scalar values
+    acc_i32 = (
+        out_dtype(
+            torch.ops.aten.mul.Tensor,
+            torch.int32,
+            acc_i32,
+            x_scale * weight_scale / out_scale,
+        )
+        + out_zero_point
+    )
+    out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8)
+    return out_i8
+
+
+def _qdq_dynamic_quantized_linear(
+    x_fp32,
+    x_quant_min,
+    x_quant_max,
+    x_eps,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+):
+    x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams(
+        x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8
+    )
+    x_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        x_fp32, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8
+    )
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8
+    )
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8,
+        weight_scale,
+        weight_zero_point,
+        weight_quant_min,
+        weight_quant_max,
+        torch.int8,
+    )
+    out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32)
+    return out_fp32
+
+
+def _reference_dynamic_quantized_linear(
+    x_fp32,
+    x_quant_min,
+    x_quant_max,
+    x_eps,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+):
+    x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams(
+        x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8
+    )
+    # decomposed representation for quantize_per_tensor
+    # TODO: use out_dtype(mul, ...) here when the op is ready
+    x_fp32 = x_fp32 / x_scale  # fp32
+    # round modes might be different here
+    # pytorch is rounding to even, which is also common for most of the backends
+    x_fp32 = torch.round(x_fp32)  # fp32
+    x_i32 = x_fp32.to(dtype=torch.int32)  # int32
+    x_i32 = x_i32 + x_zero_point  # int32
+    # clamp works for fp32, int32 and int8 dtypes
+    x_i32 = torch.clamp(x_i32, x_quant_min, x_quant_max)  # int32
+    x_i8 = x_i32.to(dtype=torch.int8)
+
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation can work for the case
+    # no matter if bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.linear.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None,
+    )
+    bias_scale = x_scale * weight_scale
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    acc_i32 = acc_i32 + bias_i32
+    out_fp32 = acc_i32 * (x_scale * weight_scale)
+    return out_fp32
+
+
+def _qdq_quantized_conv2d(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    stride = [1, 1]
+    padding = [0, 0]
+    dilation = [1, 1]
+    transposed = False
+    output_padding = [0, 0]
+    groups = 1
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8
+    )
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8,
+        weight_scale,
+        weight_zero_point,
+        weight_quant_min,
+        weight_quant_max,
+        torch.int8,
+    )
+    out_fp32 = torch.ops.aten.convolution.default(
+        x_fp32,
+        weight_fp32,
+        bias_fp32,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+    )
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantized_conv2d(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    weight_i8,
+    weight_scale,
+    weight_zero_point,
+    weight_quant_min,
+    weight_quant_max,
+    bias_fp32,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    stride = [1, 1]
+    padding = [0, 0]
+    dilation = [1, 1]
+    transposed = False
+    output_padding = [0, 0]
+    groups = 1
+    # without using quant_min/max in clamp, the traced graph will not have quant_min/max args.
+    # This results in failure to match the pattern.
+    # Therefore, we call a torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max)
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation can work for the case
+    # no matter if bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.convolution.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+    )
+    # Note: we are quantizing bias with these scales without signal from user, but it might be OK
+    bias_scale = x_scale * weight_scale
+    # bias quantization to int32 uses bias_scale = x_scale * weight_scale because:
+    # Take the linear calculation as an example
+    # Out_(i, j)_fp32 = Sum_(over k)[X_(i, k)_fp32 * W_(j, k)_fp32] + bias_(j)_fp32
+    # Represent X, W fp32 as their dequant transforms
+    # A_fp32 = (A_q - A_zero_point) * A_scale
+    # Out_(i, j)_fp32 = Sum_(over k)[(X_(i, k)_q - X_zp) * X_scale * (W_(j, k)_q - W_zp) * W_scale] + bias_(j)_fp32
+    # Factor out X_scale and W_scale
+    # Out_(i, j)_fp32 = ((X_scale * W_scale) * Sum_(over k)[(X_(i, k)_q - X_zp) * (W_(j, k)_q - W_zp)]) + bias_(j)_fp32
+    # In order to move the addition of bias_(j)_fp32 inside the parentheses, we must do
+    # Out_(i, j)_fp32 = (X_scale * W_scale) * (Sum_(over k)[(X_(i, k)_q - X_zp) * (W_(j, k)_q - W_zp)] + (1 / (X_scale * W_scale)) * bias_(j)_fp32)  # noqa: B950
+    # Note we had to divide bias_fp32 by X_scale * W_scale = bias_scale
+    # Thus bias quantization to int32 must be with X_scale * W_scale
+
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    # Unsqueeze to match broadcast dims
+    # Unfortunately I cannot do bias_i32.unsqueeze(0) due to literal matching nightmare
+    # in graph pattern replacement
+    bias_i32 = bias_i32.unsqueeze(-1)
+    bias_i32 = bias_i32.unsqueeze(-1)
+    acc_i32 = acc_i32 + bias_i32
+    # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. Scalar values
+    acc_i32 = (
+        out_dtype(
+            torch.ops.aten.mul.Tensor,
+            torch.int32,
+            acc_i32,
+            x_scale * weight_scale / out_scale,
+        )
+        + out_zero_point
+    )
+    out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8)
+    return out_i8
+
+
+def _qdq_quantized_add_relu(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    y_i8,
+    y_scale,
+    y_zero_point,
+    out_scale,
+    out_zero_point,
+    quant_min,
+    quant_max,
+):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8
+    )
+    y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8
+    )
+    out_fp32 = x_fp32 + y_fp32
+    out_fp32 = torch.ops.aten.relu(out_fp32)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantized_add_relu(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    y_i8,
+    y_scale,
+    y_zero_point,
+    out_scale,
+    out_zero_point,
+    quant_min,
+    quant_max,
+):
+    """
+    See comments for `_reference_quantized_add` for more information on
+    how to derive the formula for out_i8 based on x_i8 and y_i8
+    """
+    x_i32 = x_i8.to(torch.int32)
+    y_i32 = y_i8.to(torch.int32)
+    # TODO: change this to mul.Scalar?
+    x_i32 = out_dtype(
+        torch.ops.aten.mul.Tensor,
+        torch.int32,
+        (x_i32 - x_zero_point),
+        (x_scale / out_scale),
+    )
+    y_i32 = out_dtype(
+        torch.ops.aten.mul.Tensor,
+        torch.int32,
+        (y_i32 - y_zero_point),
+        (y_scale / out_scale),
+    )
+    out_i32 = x_i32 + y_i32 + out_zero_point
+    # out_i32 = torch.ops.aten.clamp(out_i32, out_zero_point)
+    out_i8 = torch.ops.aten.clamp(out_i32, out_zero_point, quant_max).to(torch.int8)
+    return out_i8
+
+
+def _qdq_quantized_add(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    y_i8,
+    y_scale,
+    y_zero_point,
+    out_scale,
+    out_zero_point,
+    quant_min,
+    quant_max,
+):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8
+    )
+    y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8
+    )
+    out_fp32 = x_fp32 + y_fp32
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantized_add(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    y_i8,
+    y_scale,
+    y_zero_point,
+    out_scale,
+    out_zero_point,
+    quant_min,
+    quant_max,
+):
+    """
+        # How to Derive the formula for out_i8 based on x_i8 and y_i8
+        # (since quantized add takes x_i8, y_i8 and their quantization parameters, and produces an out_i8)
+
+        # out_i8 is quantized output, we can write down the formula for it first:
+        out_i8 = out_fp32 / out_scale + out_zero_point           (1)
+
+        # then out_fp32 is computed from x_fp32 + y_fp32, and the x_fp32 and y_fp32 are the dequantized x_i8 and y_i8
+        out_fp32 = x_fp32 + y_fp32           (2)
+        x_fp32 = (x_i8 - x_zero_point) * x_scale         (3)
+        y_fp32 = (y_i8 - y_zero_point) * y_scale         (4)
+
+        # applying the above formula to the out_i8 equation we can get the following:
+        out_i8 = out_fp32 / out_scale + out_zero_point             # (1)
+           = (x_fp32 + y_fp32) / out_scale + out_zero_point      # applying (2) to substitute out_fp32 with x_fp32 + y_fp32
+           = ((x_i8 - x_zero_point) * x_scale + (y_i8 - y_zero_point) * y_scale) / out_scale + out_zero_point  # apply (3) and (4)
+    """
+    x_i32 = x_i8.to(torch.int32)
+    y_i32 = y_i8.to(torch.int32)
+    # TODO: use out_dtype op
+    x_i32 = torch.round((x_scale / out_scale) * (x_i32 - x_zero_point)).to(torch.int32)
+    y_i32 = torch.round((y_scale / out_scale) * (y_i32 - y_zero_point)).to(torch.int32)
+    out_i32 = x_i32 + y_i32 + out_zero_point
+    quant_min = -128
+    quant_max = 127
+    out_i8 = torch.ops.aten.clamp(out_i32, quant_min, quant_max).to(torch.int8)
+    return out_i8
+
+
+def _qdq_quantized_max_pool2d(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    kernel_size = 1
+    stride = 1
+    padding = 0
+    dilation = 1
+    ceil_mode = False
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8
+    )
+    out_fp32, _ = torch.ops.aten.max_pool2d_with_indices.default(
+        x_fp32, kernel_size, stride, padding, dilation, ceil_mode
+    )
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantized_max_pool2d(
+    x_i8,
+    x_scale,
+    x_zero_point,
+    x_quant_min,
+    x_quant_max,
+    out_scale,
+    out_zero_point,
+    out_quant_min,
+    out_quant_max,
+):
+    kernel_size = 1
+    stride = 1
+    padding = 0
+    dilation = 1
+    ceil_mode = False
+    # to preserve x_quant_min, x_quant_max in the graph for pattern matching
+    x_i8 = torch.clamp(x_i8, x_quant_min, x_quant_max)
+    x_i32 = x_i8.to(torch.int32)
+    out_i32, _ = torch.ops.aten.max_pool2d_with_indices.default(
+        x_i32 - x_zero_point, kernel_size, stride, padding, dilation, ceil_mode
+    )
+    out_fp32 = out_i32 * (x_scale / out_scale) + out_zero_point
+    out_fp32 = torch.clamp(out_fp32, out_quant_min, out_quant_max)
+    out_i8 = out_fp32.to(torch.int8)
+    return out_i8
+
+
+def _quantize_per_tensor_int8(x_fp32, scale, zero_point, quant_min, quant_max):
+    x = torch.ops.quantized_decomposed.quantize_per_tensor(
+        x_fp32, scale, zero_point, quant_min, quant_max, torch.int8
+    )
+    return x
+
+
+def _reference_quantize_per_tensor_int8(
+    x_fp32, scale, zero_point, quant_min, quant_max
+):
+    # TODO: use out_dtype(mul, ...) here when the op is ready
+    x = x_fp32 / scale  # fp32
+    # round modes might be different here
+    # pytorch is rounding to even, which is also common for most of the backends
+    x = torch.round(x)  # fp32
+    x = x.to(dtype=torch.int32)  # int32
+    x = x + zero_point  # int32
+    # clamp works for fp32, int32 and int8 dtypes
+    x = torch.clamp(x, quant_min, quant_max)  # int32
+    x = x.to(dtype=torch.int8)
+    return x
+
+
+def _dequantize_per_tensor_int8(x_i8, scale, zero_point, quant_min, quant_max):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, scale, zero_point, quant_min, quant_max, torch.int8
+    )
+    return x_fp32
+
+
+def _reference_dequantize_per_tensor_int8(
+    x_i8, scale, zero_point, quant_min, quant_max
+):
+    # without using quant_min/max in clamp, the traced graph will not have quant_min/max args.
+    # This results in failure to match the pattern.
+    # Therefore, we call a torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max)
+    # TODO: use out_dtype op
+    # note: x_i8.to(torch.int32) does not work here
+    # TODO: debug the implementation later when torchdynamo time out issue is resolved
+    return ((x_i8.to(torch.float32) - zero_point) * scale).to(dtype=torch.float32)
+
+
+def _quantize_per_channel_int8(
+    x_fp32, scales, zero_points, ch_axis, quant_min, quant_max
+):
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_channel(
+        x_fp32, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+
+def _reference_quantize_per_channel_int8(
+    x_fp32, scales, zero_points, ch_axis, quant_min, quant_max
+):
+    x_fp32 = torch.transpose(x_fp32, ch_axis, -1)
+    out_i32 = torch.ops.aten.clamp(
+        torch.round(x_fp32 / scales).to(torch.int32) + zero_points, quant_min, quant_max
+    )
+    out_i32 = torch.transpose(out_i32, ch_axis, -1)
+    return out_i32.to(torch.int8)
+
+
+def _dequantize_per_channel_int8(
+    x_i8, scales, zero_points, ch_axis, quant_min, quant_max
+):
+    # the following will be replaced as placeholders
+    out_fp32 = torch.ops.quantized_decomposed.dequantize_per_channel(
+        x_i8, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8
+    )
+    return out_fp32
+
+
+def _reference_dequantize_per_channel_int8(
+    x_i8, scales, zero_points, ch_axis, quant_min, quant_max
+):
+    # the following will be replaced as placeholders
+    # in order to preserve the quant_min/quant_max args for pattern matching (e.g. matching for int4 quantized ops)
+    # we call a torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max)
+    x_i8 = torch.transpose(x_i8, ch_axis, -1)
+    x_i32 = x_i8.to(torch.int32)
+    out_fp32 = (x_i32 - zero_points).to(torch.float) * scales
+    out_fp32 = torch.transpose(out_fp32, ch_axis, -1)
+    return out_fp32
+
+
+def _replace_ph_qdq_per_channel_replacement(gm: torch.fx.GraphModule):
+    return _replace_literals_with_existing_placeholders(
+        gm, exclude_literals=[-1], literal_to_ph_idx={1: 3, -128: 4, 127: 5}
+    )
+
+
+@dataclass
+class _RewriteInfo:
+    """Data needed for rewrite, this includes example inputs, pattern and replacement functions
+    and post transformation functions for the exported pattern and replacement GraphModule
+    """
+
+    # example inputs used for exporting the pattern into GraphModule
+    example_inputs: tuple[Any, ...]
+    pattern: Callable
+    replacement: Callable
+    # post transformation on the exported pattern and replacement GraphModule
+    pattern_post_trans: Callable[[GraphModule], GraphModule] | None = None
+    replacement_post_trans: Callable[[GraphModule], GraphModule] | None = None
+
+
+def reference_representation_rewrite(model: GraphModule) -> GraphModule:
+    _QUANTIZED_LINEAR_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (2, 5), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randint(-128, 127, (5, 5), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-127], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randn(1, dtype=torch.float),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS = (
+        torch.randn((2, 5), dtype=torch.float),
+        -128,
+        127,
+        torch.finfo(torch.float32).eps,
+        torch.randint(-128, 127, (5, 5), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-127], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randn(1, dtype=torch.float),
+    )
+
+    _QUANTIZED_CONV2d_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-127], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randn(1, dtype=torch.float),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = (
+        torch.randn(1, 3, 3, 3, dtype=torch.float),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(1, dtype=torch.float),
+        torch.zeros(1, dtype=torch.int),
+        torch.tensor([-128], dtype=torch.int),
+        torch.tensor([127], dtype=torch.int),
+    )
+
+    _QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = (
+        torch.randn(1, 3, 3, 3, dtype=torch.float),
+        torch.randn(3, dtype=torch.float),
+        torch.zeros(3, dtype=torch.int),
+        1,
+        -128,
+        127,
+    )
+
+    _DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = (
+        torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+        torch.randn(3, dtype=torch.float),
+        torch.zeros(3, dtype=torch.int),
+        1,
+        -128,
+        127,
+    )
+
+    _REWRITE_INFO_LIST = [
+        _RewriteInfo(
+            _DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_dynamic_quantized_linear),
+            _WrapperModule(_reference_dynamic_quantized_linear),
+            partial(
+                _replace_literals_with_existing_placeholders,
+                literal_to_ph_idx={-128: 1, 127: 2, torch.finfo(torch.float32).eps: 3},
+            ),
+            partial(
+                _replace_literals_with_existing_placeholders,
+                literal_to_ph_idx={-128: 1, 127: 2, torch.finfo(torch.float32).eps: 3},
+            ),
+        ),
+        _RewriteInfo(
+            _QUANTIZED_LINEAR_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_quantized_linear),
+            _WrapperModule(_reference_quantized_linear),
+            _replace_literals_with_new_placeholders,
+            _replace_literals_with_new_placeholders,
+        ),
+        _RewriteInfo(
+            _QUANTIZED_CONV2d_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_quantized_conv2d),
+            _WrapperModule(_reference_quantized_conv2d),
+            partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]),
+            partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]),
+        ),
+        _RewriteInfo(
+            _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_quantized_add_relu),
+            _WrapperModule(_reference_quantized_add_relu),
+        ),
+        _RewriteInfo(
+            _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_quantized_add),
+            _WrapperModule(_reference_quantized_add),
+        ),
+        _RewriteInfo(
+            _QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS,
+            _WrapperModule(_qdq_quantized_max_pool2d),
+            _WrapperModule(_reference_quantized_max_pool2d),
+            _replace_literals_with_new_placeholders,
+            _replace_literals_with_new_placeholders,
+        ),
+        _RewriteInfo(
+            _QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS,
+            _WrapperModule(_quantize_per_tensor_int8),
+            _WrapperModule(_reference_quantize_per_tensor_int8),
+        ),
+        _RewriteInfo(
+            _DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS,
+            _WrapperModule(_dequantize_per_tensor_int8),
+            _WrapperModule(_reference_dequantize_per_tensor_int8),
+        ),
+        _RewriteInfo(
+            _QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS,
+            _WrapperModule(_quantize_per_channel_int8),
+            _WrapperModule(_reference_quantize_per_channel_int8),
+            _replace_ph_qdq_per_channel_replacement,
+            _replace_ph_qdq_per_channel_replacement,
+        ),
+        _RewriteInfo(
+            _DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS,
+            _WrapperModule(_dequantize_per_channel_int8),
+            _WrapperModule(_reference_dequantize_per_channel_int8),
+            _replace_ph_qdq_per_channel_replacement,
+            _replace_ph_qdq_per_channel_replacement,
+        ),
+    ]
+
+    remove_tensor_overload_for_qdq_ops(model)
+
+    with _disable_aten_to_metadata_assertions():
+        for rewrite_info in _REWRITE_INFO_LIST:
+            example_inputs = rewrite_info.example_inputs
+            pattern = rewrite_info.pattern
+            replacement = rewrite_info.replacement
+            pattern_post_trans = rewrite_info.pattern_post_trans
+            replacement_post_trans = rewrite_info.replacement_post_trans
+            pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs)  # type: ignore[arg-type, assignment]
+            remove_tensor_overload_for_qdq_ops(pattern)  # type: ignore[arg-type]
+            replacement = _get_aten_graph_module_for_pattern(  # type: ignore[assignment]
+                replacement,
+                example_inputs,  # type: ignore[arg-type]
+            )
+            remove_tensor_overload_for_qdq_ops(replacement)  # type: ignore[arg-type]
+            if pattern_post_trans:
+                pattern = pattern_post_trans(pattern)
+            if replacement_post_trans:
+                replacement = replacement_post_trans(replacement)
+            pattern.recompile()  # type: ignore[attr-defined]
+            replacement.recompile()  # type: ignore[attr-defined]
+            replace_pattern(model, pattern, replacement)
+
+    return model
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff5d1f341751a3b0ea4f720978d3c380e26ccc41
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig.py
@@ -0,0 +1,715 @@
+# mypy: allow-untyped-defs
+import copy
+import sys
+import warnings
+from collections import namedtuple
+from typing import Any, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+import torch.nn as nn
+from torch.ao.quantization.fake_quantize import (
+    default_dynamic_fake_quant,
+    default_embedding_fake_quant,
+    default_embedding_fake_quant_4bit,
+    default_fake_quant,
+    default_fused_act_fake_quant,
+    default_fused_per_channel_wt_fake_quant,
+    default_fused_wt_fake_quant,
+    default_per_channel_weight_fake_quant,
+    default_weight_fake_quant,
+    FakeQuantize,
+    FakeQuantizeBase,
+    fused_per_channel_wt_fake_quant_range_neg_127_to_127,
+    fused_wt_fake_quant_range_neg_127_to_127,
+    FusedMovingAvgObsFakeQuantize,
+)
+
+from .observer import (
+    _PartialWrapper,
+    default_debug_observer,
+    default_dynamic_quant_observer,
+    default_float_qparams_observer,
+    default_float_qparams_observer_4bit,
+    default_observer,
+    default_per_channel_weight_observer,
+    default_placeholder_observer,
+    default_reuse_input_observer,
+    default_weight_observer,
+    HistogramObserver,
+    MinMaxObserver,
+    MovingAverageMinMaxObserver,
+    NoopObserver,
+    ObserverBase,
+    per_channel_weight_observer_range_neg_127_to_127,
+    PlaceholderObserver,
+    ReuseInputObserver,
+    weight_observer_range_neg_127_to_127,
+)
+
+
+__all__ = [
+    "QConfig",
+    # TODO: deprecated, remove
+    "QConfigDynamic",
+    "default_qconfig",
+    "default_debug_qconfig",
+    "default_per_channel_qconfig",
+    "default_dynamic_qconfig",
+    "float16_dynamic_qconfig",
+    "float16_static_qconfig",
+    "per_channel_dynamic_qconfig",
+    "float_qparams_weight_only_qconfig",
+    "float_qparams_weight_only_qconfig_4bit",
+    "default_quint8_weight_qconfig",
+    "default_qat_qconfig",
+    "default_dynamic_qat_qconfig",
+    "default_weight_only_qconfig",
+    "default_activation_only_qconfig",
+    "default_qat_qconfig_v2",
+    "default_reuse_input_qconfig",
+    "default_symmetric_qnnpack_qconfig",
+    "default_per_channel_symmetric_qnnpack_qconfig",
+    "default_symmetric_qnnpack_qat_qconfig",
+    "default_per_channel_symmetric_qnnpack_qat_qconfig",
+    "default_embedding_qat_qconfig",
+    "default_embedding_qat_qconfig_4bit",
+    "get_default_qconfig",
+    "get_default_qat_qconfig",
+    "get_default_qconfig_dict",
+    "get_default_qat_qconfig_dict",
+    "QConfigAny",
+    "qconfig_equals",
+]
+
+
+# pyrefly: ignore [invalid-inheritance]
+class QConfig(namedtuple("QConfig", ["activation", "weight"])):
+    """
+    Describes how to quantize a layer or a part of the network by providing
+    settings (observer classes) for activations and weights respectively.
+
+
+    Note that QConfig needs to contain observer **classes** (like MinMaxObserver) or a callable that returns
+    instances on invocation, not the concrete observer instances themselves.
+    Quantization preparation function will instantiate observers multiple times for each of the layers.
+
+
+    Observer classes have usually reasonable default arguments, but they can be overwritten with `with_args`
+    method (that behaves like functools.partial)::
+
+      my_qconfig = QConfig(
+          activation=MinMaxObserver.with_args(dtype=torch.qint8),
+          weight=default_observer.with_args(dtype=torch.qint8),
+      )
+
+    """
+
+    __slots__ = ()
+
+    def __new__(cls, activation, weight):
+        # catch common mistakes
+        if isinstance(activation, nn.Module) or isinstance(weight, nn.Module):
+            raise ValueError(
+                "QConfig received observer instance, please pass observer class instead. "
+                + "Use MyObserver.with_args(x=1) to override arguments to constructor if needed"
+            )
+        return super().__new__(cls, activation, weight)
+
+
+@deprecated(
+    "`QConfigDynamic` is going to be deprecated in PyTorch 1.12, please use `QConfig` instead",
+    category=FutureWarning,
+)
+# pyrefly: ignore [invalid-inheritance]
+class QConfigDynamic(namedtuple("QConfigDynamic", ["activation", "weight"])):
+    """
+    Describes how to dynamically quantize a layer or a part of the network by providing
+    settings (observer classes) for weights.
+
+    It's like QConfig, but for dynamic quantization.
+
+    Note that QConfigDynamic needs to contain observer **classes** (like MinMaxObserver) or a callable that returns
+    instances on invocation, not the concrete observer instances themselves.
+    Quantization function will instantiate observers multiple times for each of the layers.
+
+    Observer classes have usually reasonable default arguments, but they can be overwritten with `with_args`
+    method (that behaves like functools.partial)::
+
+      my_qconfig = QConfigDynamic(weight=default_observer.with_args(dtype=torch.qint8))
+    """
+
+    __slots__ = ()
+
+    def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity):
+        # catch common mistakes
+        if isinstance(weight, nn.Module):
+            raise ValueError(
+                "QConfigDynamic received observer instance, please pass observer class instead. "
+                + "Use MyObserver.with_args(x=1) to override arguments to constructor if needed"
+            )
+        return super().__new__(cls, activation, weight)
+
+
+default_qconfig = QConfig(activation=default_observer, weight=default_weight_observer)
+"""
+Default qconfig configuration.
+"""
+
+default_debug_qconfig = QConfig(
+    weight=default_weight_observer, activation=default_debug_observer
+)
+"""
+Default qconfig configuration for debugging.
+"""
+
+default_per_channel_qconfig = QConfig(
+    activation=default_observer, weight=default_per_channel_weight_observer
+)
+"""
+Default qconfig configuration for per channel weight quantization.
+"""
+
+default_dynamic_qconfig = QConfig(
+    activation=default_dynamic_quant_observer, weight=default_weight_observer
+)
+"""
+Default dynamic qconfig.
+"""
+
+float16_dynamic_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.float16, is_dynamic=True),
+    weight=PlaceholderObserver.with_args(dtype=torch.float16),
+)
+"""
+Dynamic qconfig with weights quantized to `torch.float16`.
+"""
+
+float16_static_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.float16),
+    weight=PlaceholderObserver.with_args(dtype=torch.float16),
+)
+"""
+Static qconfig with both activations and weights quantized to `torch.float16`.
+"""
+
+per_channel_dynamic_qconfig = QConfig(
+    activation=default_dynamic_quant_observer,
+    weight=default_per_channel_weight_observer,
+)
+"""
+Dynamic qconfig with weights quantized per channel.
+"""
+
+float_qparams_weight_only_qconfig = QConfig(
+    activation=default_placeholder_observer, weight=default_float_qparams_observer
+)
+"""
+Dynamic qconfig with weights quantized with a floating point zero_point.
+"""
+
+float_qparams_weight_only_qconfig_4bit = QConfig(
+    activation=default_placeholder_observer, weight=default_float_qparams_observer_4bit
+)
+
+default_qat_qconfig = QConfig(
+    activation=default_fake_quant, weight=default_weight_fake_quant
+)
+"""
+Default qconfig for QAT.
+"""
+
+default_dynamic_qat_qconfig = QConfig(
+    activation=default_dynamic_fake_quant, weight=default_weight_fake_quant
+)
+"""
+Default qconfig for dynamic QAT.
+"""
+
+default_weight_only_qconfig = QConfig(
+    activation=torch.nn.Identity, weight=default_weight_fake_quant
+)
+"""
+Default qconfig for quantizing weights only.
+"""
+
+default_activation_only_qconfig = QConfig(
+    activation=default_fake_quant, weight=torch.nn.Identity
+)
+"""
+Default qconfig for quantizing activations only.
+"""
+
+# QAT config that uses a fused observer + fake quant modules for optimized training performance.
+# to modify the activation/weight observers, the default entries in fake_quantize.py can be modified.
+default_qat_qconfig_v2 = QConfig(
+    activation=default_fused_act_fake_quant, weight=default_fused_wt_fake_quant
+)
+"""
+Fused version of `default_qat_qconfig`, has performance benefits.
+"""
+
+default_reuse_input_qconfig = QConfig(
+    activation=default_reuse_input_observer, weight=NoopObserver
+)
+"""
+Default qconfig for operators that reuse the observers from input Tensor, e.g. reshape
+"""
+
+
+def get_default_qconfig(backend="x86", version=0):
+    """
+    Returns the default PTQ qconfig for the specified backend.
+
+    Args:
+      * `backend` (str): a string representing the target backend. Currently supports
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
+
+    Return:
+        qconfig
+    """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: "
+            + str(backend)
+            + f" not supported. backend must be one of {supported_backends}"
+        )
+
+    if version == 0:
+        if backend == "fbgemm":
+            qconfig = QConfig(
+                activation=HistogramObserver.with_args(reduce_range=True),
+                weight=default_per_channel_weight_observer,
+            )
+        elif backend == "qnnpack":
+            # TODO: make this compatible with xnnpack constraints
+            qconfig = QConfig(
+                activation=HistogramObserver.with_args(reduce_range=False),
+                weight=default_weight_observer,
+            )
+        elif backend == "onednn":
+            if not torch.cpu._is_vnni_supported():
+                warnings.warn(
+                    "Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues "
+                    "on CPU without Vector Neural Network Instruction support.",
+                    stacklevel=2,
+                )
+            qconfig = QConfig(
+                activation=HistogramObserver.with_args(reduce_range=False),
+                weight=default_per_channel_weight_observer,
+            )
+        elif backend == "x86":
+            qconfig = QConfig(
+                activation=HistogramObserver.with_args(reduce_range=True),
+                weight=default_per_channel_weight_observer,
+            )
+        else:
+            # won't reach
+            qconfig = default_qconfig
+    else:
+        raise AssertionError(
+            "Version number: "
+            + str(version)
+            + " in get_default_qconfig is not supported. Version number must be 0"
+        )
+
+    return qconfig
+
+
+"""
+Default, symmetric PTQ qconfig for the specified backend. And a per_channel
+variant of the same.
+
+Symmetric here applies to signed weights with zero point = 0, and additional
+value restrictions. The activations are also signed 8-bit integers with this
+qconfig.
+
+    * Once this change is merged [as of 3/17/22], with backend or qengine =
+    'qnnpack', some quantized operators with this symmetric qconfig may use
+    operators from xnnpack library.
+
+        ** Support to use xnnpack ops with `qnnpack` backend for asymmetric
+        qconfig (returned by get_default_qconfig()) is not available yet.
+
+    * This qconfig uses signed activations and weights. Weights have added
+    restrictions such as zero point is forced to be 0, making the weights
+    symmetric, hence the name. And the 8-bit quantized values are
+    restricted to [-127, +127], excluding -128.
+
+    * xnnpack has a requantization scale value restriction, 0x1p-32 <=
+    requantization_scale < 256.0 where, `requantization_scale = (input_scale
+    * kernel_scale) / (output_scale)`. Using this eps (w/ assumed max value
+    of 256) is to prevent requantization_scale to go below xnnpack lower
+    threshold.
+"""
+default_symmetric_qnnpack_qconfig = QConfig(
+    activation=HistogramObserver.with_args(
+        dtype=torch.qint8, reduce_range=False, eps=2**-12
+    ),
+    weight=weight_observer_range_neg_127_to_127,
+)
+
+default_per_channel_symmetric_qnnpack_qconfig = QConfig(
+    activation=HistogramObserver.with_args(
+        dtype=torch.qint8, reduce_range=False, eps=2**-12
+    ),
+    weight=per_channel_weight_observer_range_neg_127_to_127,
+)
+
+default_embedding_qat_qconfig = QConfig(
+    activation=NoopObserver.with_args(dtype=torch.float32),
+    weight=default_embedding_fake_quant,
+)
+
+default_embedding_qat_qconfig_4bit = QConfig(
+    activation=NoopObserver.with_args(dtype=torch.float32),
+    weight=default_embedding_fake_quant_4bit,
+)
+
+default_quint8_weight_qconfig = QConfig(
+    activation=HistogramObserver, weight=MinMaxObserver
+)
+
+
+def get_default_qat_qconfig(backend="x86", version=1):
+    """
+    Returns the default QAT qconfig for the specified backend.
+
+    Args:
+      * `backend` (str): a string representing the target backend. Currently supports
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
+      * `version`: version, for backwards compatibility. Can be `None` or `1`.
+
+    Return:
+        qconfig
+    """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: "
+            + str(backend)
+            + f" not supported. backend must be one of {supported_backends}"
+        )
+
+    # Histogram observer is too slow for quantization aware training
+    if version == 0:
+        if backend == "fbgemm":
+            qconfig = QConfig(
+                activation=FakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=True,
+                ),
+                weight=default_per_channel_weight_fake_quant,
+            )
+        elif backend == "qnnpack":
+            qconfig = QConfig(
+                activation=FakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=False,
+                ),
+                weight=default_weight_fake_quant,
+            )
+        elif backend == "onednn":
+            qconfig = QConfig(
+                activation=FakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255
+                ),
+                weight=default_per_channel_weight_fake_quant,
+            )
+        elif backend == "x86":
+            qconfig = QConfig(
+                activation=FakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=True,
+                ),
+                weight=default_per_channel_weight_fake_quant,
+            )
+        else:
+            qconfig = default_qat_qconfig
+    # Use the fused observe + fake_quant modules for doing QAT.
+    elif version == 1:
+        if backend == "fbgemm":
+            qconfig = QConfig(
+                activation=FusedMovingAvgObsFakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=True,
+                ),
+                weight=default_fused_per_channel_wt_fake_quant,
+            )
+        elif backend == "qnnpack":
+            # TODO: make this compatible with xnnpack constraints
+            qconfig = QConfig(
+                activation=FusedMovingAvgObsFakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=False,
+                ),
+                weight=default_fused_wt_fake_quant,
+            )
+        elif backend == "onednn":
+            qconfig = QConfig(
+                activation=FusedMovingAvgObsFakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255
+                ),
+                weight=default_fused_per_channel_wt_fake_quant,
+            )
+        elif backend == "x86":
+            qconfig = QConfig(
+                activation=FusedMovingAvgObsFakeQuantize.with_args(
+                    observer=MovingAverageMinMaxObserver,
+                    quant_min=0,
+                    quant_max=255,
+                    reduce_range=True,
+                ),
+                weight=default_fused_per_channel_wt_fake_quant,
+            )
+        else:
+            qconfig = default_qat_qconfig_v2
+    else:
+        raise AssertionError(
+            "Version number: "
+            + str(version)
+            + "in get_default_qat_qconfig is not supported. Version number must be 0 or 1"
+        )
+
+    return qconfig
+
+
+"""
+Default symmetric QAT qconfig for qnnpack. And its per channel weight variant.
+"""
+default_symmetric_qnnpack_qat_qconfig = QConfig(
+    activation=FusedMovingAvgObsFakeQuantize.with_args(
+        observer=MovingAverageMinMaxObserver,
+        quant_min=-128,
+        quant_max=127,
+        dtype=torch.qint8,
+        reduce_range=False,
+        eps=2**-12,
+    ),
+    weight=fused_wt_fake_quant_range_neg_127_to_127,
+)
+
+default_per_channel_symmetric_qnnpack_qat_qconfig = QConfig(
+    activation=FusedMovingAvgObsFakeQuantize.with_args(
+        observer=MovingAverageMinMaxObserver,
+        quant_min=-128,
+        quant_max=127,
+        dtype=torch.qint8,
+        reduce_range=False,
+        eps=2**-12,
+    ),
+    weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127,
+)
+
+_default_fp32_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.float32),
+    weight=PlaceholderObserver.with_args(dtype=torch.float32),
+)
+
+_default_quint8_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.quint8),
+    # operators using this qconfig doesn't have weights
+    weight=None,
+)
+
+
+@deprecated(
+    "`torch.ao.quantization.get_default_qconfig_dict` is deprecated and will be removed in "
+    "a future version. Please use `torch.ao.quantization.get_default_qconfig_mapping` instead.",
+    category=FutureWarning,
+)
+def get_default_qconfig_dict(backend="x86", version=0):
+    return torch.ao.quantization.get_default_qconfig_mapping(backend, version).to_dict()
+
+
+@deprecated(
+    "`torch.ao.quantization.get_default_qat_qconfig_dict` is deprecated and will be removed in "
+    "a future version. Please use `torch.ao.quantization.get_default_qat_qconfig_mapping` instead.",
+    category=FutureWarning,
+)
+def get_default_qat_qconfig_dict(backend="x86", version=1):
+    return torch.ao.quantization.get_default_qat_qconfig_mapping(
+        backend, version
+    ).to_dict()
+
+
+def _assert_valid_qconfig(qconfig: QConfig | None, mod: torch.nn.Module) -> None:
+    """
+    Verifies that this `qconfig` is valid.
+    """
+    if qconfig is None:
+        return
+    is_conv_transpose_mod = isinstance(
+        mod,
+        (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d),
+    )
+    if is_conv_transpose_mod:
+        if qconfig.weight is None:
+            # for now, we assume that any qconfig for ConvTranspose without a weight is valid
+            return
+        example_observer = qconfig.weight()
+        is_per_channel = isinstance(
+            example_observer,
+            (
+                torch.ao.quantization.PerChannelMinMaxObserver,
+                torch.ao.quantization.MovingAveragePerChannelMinMaxObserver,
+            ),
+        )
+        if is_per_channel:
+            raise AssertionError(
+                "Per channel weight observer is not supported yet for ConvTranspose{n}d."
+            )
+
+
+if sys.version_info < (3, 12):
+    QConfigAny = Optional[QConfig]
+    QConfigAny.__module__ = "torch.ao.quantization.qconfig"
+else:
+    from typing import TypeAliasType
+    # typing.TypeAliasType is only imported on Python 3.12 and newer.
+    QConfigAny = TypeAliasType("QConfigAny", QConfig | None)
+
+
+def _add_module_to_qconfig_obs_ctr(
+    qconfig: QConfigAny, module: nn.Module | None
+) -> Any:
+    r"""This is a helper function for use in quantization prepare that updates a qconfig so that
+    the constructors stored in the qconfig will create observers on the same device that
+    'module' is on. This is intended to be used when the qconfigs are propagated to each
+    module in order to avoid potential device alignment issues.
+
+    Args:
+        qconfig: QConfig with obs constructors stored in activation and weight
+        module: module which the qconfig is related to
+
+    Return:
+        qconfig: configured so that obs constructors set to construct on the same device as module
+    """
+
+    if module is None or qconfig is None or qconfig._fields != ("activation", "weight"):
+        return qconfig
+
+    def get_factory_kwargs_based_on_module_device():
+        if not isinstance(module, torch.nn.Module):
+            raise AssertionError("module must be an instance of torch.nn.Module")
+        devices = {p.device for p in module.parameters()} | {
+            p.device for p in module.buffers()
+        }
+        device = next(iter(devices)) if len(devices) > 0 else None
+        return None if device is None else {"device": device}
+
+    def configure_constructor_to_put_obs_on_module_device(original_constructor):
+        try:
+            # check if constructor can accept factory_kwargs
+            check = original_constructor.with_args(factory_kwargs=None)
+            check()
+            return original_constructor.with_callable_args(
+                factory_kwargs=get_factory_kwargs_based_on_module_device
+            )
+        except AttributeError:  # qconfig doesn't have activation or weight
+            return original_constructor
+        except TypeError:  # the class doesn't accept factory_kwargs argument
+            return original_constructor
+
+    activation = configure_constructor_to_put_obs_on_module_device(qconfig.activation)
+    weight = configure_constructor_to_put_obs_on_module_device(qconfig.weight)
+
+    return QConfig(activation, weight)
+
+
+_ObserverOrFakeQuantizeConstructor = Union[
+    _PartialWrapper, type[ObserverBase], type[FakeQuantizeBase]
+]
+
+
+def _obs_or_fq_ctr_equals(
+    obs_or_fq1: _ObserverOrFakeQuantizeConstructor,
+    obs_or_fq2: _ObserverOrFakeQuantizeConstructor,
+):
+    if isinstance(obs_or_fq1, _PartialWrapper) and isinstance(
+        obs_or_fq2, _PartialWrapper
+    ):
+        return _partial_wrapper_equals(obs_or_fq1, obs_or_fq2)
+    return obs_or_fq1 == obs_or_fq2
+
+
+def _partial_wrapper_equals(obs_or_fq1: _PartialWrapper, obs_or_fq2: _PartialWrapper):
+    """
+    Return whether the two partial wrappers are equal,
+    """
+    # functools.partial has no __eq__ operator defined so '==' defaults to 'is'
+    obs_or_fq1_keywords = copy.copy(obs_or_fq1.p.keywords)
+    obs_or_fq2_keywords = copy.copy(obs_or_fq2.p.keywords)
+    keywords_equal = True
+    # compare observer constructor with _obs_or_fq_ctr_equals since direct compare would fail
+    if "observer" in obs_or_fq1_keywords and "observer" in obs_or_fq2_keywords:
+        keywords_equal = keywords_equal and _obs_or_fq_ctr_equals(
+            obs_or_fq1_keywords["observer"], obs_or_fq2_keywords["observer"]
+        )
+        obs_or_fq1_keywords.pop("observer")
+        obs_or_fq2_keywords.pop("observer")
+    keywords_equal = keywords_equal and obs_or_fq1_keywords == obs_or_fq2_keywords
+    return (
+        obs_or_fq1.p.func == obs_or_fq2.p.func
+        and obs_or_fq1.p.args == obs_or_fq2.p.args
+        and keywords_equal
+    )
+
+
+def qconfig_equals(q1: QConfigAny, q2: QConfigAny):
+    """
+    Returns `True` if `q1` equals `q2`, and `False` otherwise.
+    """
+    if q1 is None or q2 is None:
+        return q1 == q2
+    else:
+        if q1 is None or q2 is None:
+            raise AssertionError(
+                "Both q1 and q2 must be non-None for qconfig comparison"
+            )
+        try:
+            # Qconfig weight and activation can be either a partial wrapper,
+            # or an observer class. Special handling is required (above) for
+            # comparing partial wrappers.
+            activation_same = _obs_or_fq_ctr_equals(q1.activation, q2.activation)
+            weight_same = _obs_or_fq_ctr_equals(q1.weight, q2.weight)
+            return activation_same and weight_same
+        except AttributeError:
+            return q1 == q2
+
+
+def _activation_is_memoryless(qconfig: QConfig):
+    """
+    Return whether the observer for activations defined in the given QConfig is memoryless.
+    This means a MovingAverage observer with averaging constant equal to 1.
+    """
+
+    def _is_memoryless(observer):
+        return (
+            hasattr(observer, "averaging_constant") and observer.averaging_constant == 1
+        )
+
+    act = qconfig.activation()
+    if isinstance(act, FakeQuantizeBase) and hasattr(act, "activation_post_process"):
+        return _is_memoryless(act.activation_post_process)
+    else:
+        return _is_memoryless(act)
+
+
+def _is_reuse_input_qconfig(qconfig: QConfig | None):
+    return (
+        qconfig is not None
+        and isinstance(qconfig.activation(), ReuseInputObserver)
+        and isinstance(qconfig.weight(), NoopObserver)
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig_mapping.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf896a96da055ea99d1e165c12dc450f50ad77dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/qconfig_mapping.py
@@ -0,0 +1,385 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+from collections import OrderedDict
+from typing import Any, TYPE_CHECKING
+
+import torch
+
+from .fake_quantize import default_weight_fake_quant, FixedQParamsFakeQuantize
+from .observer import (
+    _PartialWrapper,
+    default_fixed_qparams_range_0to1_observer,
+    default_fixed_qparams_range_neg1to1_observer,
+    default_placeholder_observer,
+    default_weight_observer,
+)
+from .qconfig import (
+    default_quint8_weight_qconfig,
+    default_reuse_input_qconfig,
+    default_symmetric_qnnpack_qat_qconfig,
+    default_symmetric_qnnpack_qconfig,
+    get_default_qat_qconfig,
+    get_default_qconfig,
+    QConfig,
+    QConfigAny,
+)
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+__all__ = [
+    "get_default_qconfig_mapping",
+    "get_default_qat_qconfig_mapping",
+    "QConfigMapping",
+]
+
+
+# TODO: replace all usages with these constants
+_GLOBAL_DICT_KEY = ""
+_OBJECT_TYPE_DICT_KEY = "object_type"
+_MODULE_NAME_REGEX_DICT_KEY = "module_name_regex"
+_MODULE_NAME_DICT_KEY = "module_name"
+_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order"
+
+# TODO: derive this map from the BackendConfig
+_FIXED_QPARAMS_OP_TO_OBSERVER: dict[Callable | str, _PartialWrapper] = {
+    torch.nn.Hardsigmoid: default_fixed_qparams_range_0to1_observer,
+    torch.nn.functional.hardsigmoid: default_fixed_qparams_range_0to1_observer,
+    "hardsigmoid": default_fixed_qparams_range_0to1_observer,
+    "hardsigmoid_": default_fixed_qparams_range_0to1_observer,
+    torch.nn.Sigmoid: default_fixed_qparams_range_0to1_observer,
+    torch.sigmoid: default_fixed_qparams_range_0to1_observer,
+    "sigmoid": default_fixed_qparams_range_0to1_observer,
+    "sigmoid_": default_fixed_qparams_range_0to1_observer,
+    torch.nn.Softmax: default_fixed_qparams_range_0to1_observer,
+    torch.nn.Tanh: default_fixed_qparams_range_neg1to1_observer,
+    torch.tanh: default_fixed_qparams_range_neg1to1_observer,
+    "tanh": default_fixed_qparams_range_neg1to1_observer,
+    "tanh_": default_fixed_qparams_range_neg1to1_observer,
+}
+
+
+def _get_default_qconfig_mapping(
+    is_qat: bool, backend: str, version: int
+) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for the given quantization type and backend.
+    """
+    if is_qat:
+        qconfig = get_default_qat_qconfig(backend, version)
+    else:
+        qconfig = get_default_qconfig(backend, version)
+    default_weight = default_weight_fake_quant if is_qat else default_weight_observer
+
+    # default_per_channel_weight_observer is not currently compatible with fbgemm backend
+    # so we have to modify the weight observer to default_weight_observer or another
+    # per tensor supported observer.
+    # see https://github.com/pytorch/pytorch/issues/47535
+    if backend in ("fbgemm", "x86"):
+        qconfig_transpose = QConfig(
+            activation=qconfig.activation, weight=default_weight
+        )
+    else:
+        qconfig_transpose = qconfig
+
+    # currently layernorm only supports float weights
+    # we have to add this because otherwise there will be an extra quantize-dequantize pair
+    qconfig_layernorm = QConfig(
+        activation=qconfig.activation, weight=default_placeholder_observer
+    )
+
+    qconfig_mapping = (
+        QConfigMapping()
+        .set_global(qconfig)
+        .set_object_type("reshape", default_reuse_input_qconfig)
+        .set_object_type(torch.nn.ConvTranspose1d, qconfig_transpose)
+        .set_object_type(torch.nn.ConvTranspose2d, qconfig_transpose)
+        .set_object_type(torch.nn.ConvTranspose3d, qconfig_transpose)
+        .set_object_type(torch.nn.functional.conv_transpose1d, qconfig_transpose)
+        .set_object_type(torch.nn.functional.conv_transpose2d, qconfig_transpose)
+        .set_object_type(torch.nn.functional.conv_transpose3d, qconfig_transpose)
+        .set_object_type(torch.nn.functional.layer_norm, qconfig_layernorm)
+        .set_object_type(torch.nn.LayerNorm, qconfig_layernorm)
+        .set_object_type(torch.nn.PReLU, default_quint8_weight_qconfig)
+    )
+    # Use special observers for ops with fixed qparams
+    fixed_qparams_observer_to_qconfig: dict[Any, QConfigAny] = {}
+    for fixed_qparams_op, observer in _FIXED_QPARAMS_OP_TO_OBSERVER.items():
+        if observer in fixed_qparams_observer_to_qconfig:
+            fixed_qparams_qconfig = fixed_qparams_observer_to_qconfig[observer]
+        else:
+            if is_qat:
+                activation = FixedQParamsFakeQuantize.with_args(observer=observer)
+            else:
+                activation = observer
+            fixed_qparams_qconfig = QConfig(
+                activation=activation, weight=default_weight
+            )
+            fixed_qparams_observer_to_qconfig[observer] = fixed_qparams_qconfig
+        qconfig_mapping.set_object_type(fixed_qparams_op, fixed_qparams_qconfig)
+
+    # TODO Currently it's required that separate ops in a fused op/module have the same qconfig.
+    #      Need to be able to support fusion of ops with different qconfigs
+
+    return qconfig_mapping
+
+
+def get_default_qconfig_mapping(backend="x86", version=0) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for post training quantization.
+
+    Args:
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
+      * ``version`` (int) : the version for the default qconfig mapping
+    """
+    # TODO: add assert for backend choices
+    return _get_default_qconfig_mapping(False, backend, version)
+
+
+def get_default_qat_qconfig_mapping(backend="x86", version=1) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for quantization aware training.
+
+    Args:
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
+      * ``version`` (int) : the version for the default qconfig mapping
+    """
+    return _get_default_qconfig_mapping(True, backend, version)
+
+
+def _get_symmetric_qnnpack_qconfig_mapping() -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qconfig`
+    as the default QConfig.
+    """
+    default_qconfig = default_symmetric_qnnpack_qconfig
+    return _get_default_qconfig_mapping_with_default_qconfig(
+        False, "qnnpack", default_qconfig
+    )
+
+
+def _get_symmetric_qnnpack_qat_qconfig_mapping() -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qat_qconfig`
+    as the default QConfig.
+    """
+    default_qconfig = default_symmetric_qnnpack_qat_qconfig
+    return _get_default_qconfig_mapping_with_default_qconfig(
+        True, "qnnpack", default_qconfig
+    )
+
+
+def _get_default_qconfig_mapping_with_default_qconfig(
+    is_qat: bool,
+    backend: str,
+    default_qconfig: QConfig,
+) -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses the provided qconfig as the default QConfig.
+    """
+    if is_qat:
+        qconfig_mapping = get_default_qat_qconfig_mapping(backend)
+    else:
+        qconfig_mapping = get_default_qconfig_mapping(backend)
+    qconfig_mapping.set_global(default_qconfig)
+    for pattern in qconfig_mapping.object_type_qconfigs:
+        if pattern not in _FIXED_QPARAMS_OP_TO_OBSERVER:
+            qconfig_mapping.set_object_type(pattern, default_qconfig)
+    return qconfig_mapping
+
+
+_QCONFIG_STYLE_ORDER: list[str] = [
+    "global_qconfig",
+    "object_type_qconfigs",
+    "module_name_regex_qconfigs",
+    "module_name_qconfigs",
+    "module_name_object_type_order_qconfigs",
+]
+
+
+class QConfigMapping:
+    """
+    Mapping from model ops to :class:`torch.ao.quantization.QConfig` s.
+
+    The user can specify QConfigs using the following methods (in increasing match priority):
+
+        ``set_global`` : sets the global (default) QConfig
+
+        ``set_object_type`` : sets the QConfig for a given module type, function, or method name
+
+        ``set_module_name_regex`` : sets the QConfig for modules matching the given regex string
+
+        ``set_module_name`` : sets the QConfig for modules matching the given module name
+
+        ``set_module_name_object_type_order`` : sets the QConfig for modules matching a combination
+        of the given module name, object type, and the index at which the module appears
+
+    Example usage::
+
+        qconfig_mapping = (QConfigMapping()
+            .set_global(global_qconfig)
+            .set_object_type(torch.nn.Linear, qconfig1)
+            .set_object_type(torch.nn.ReLU, qconfig1)
+            .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1)
+            .set_module_name_regex("foo.*", qconfig2)
+            .set_module_name("module1", qconfig1)
+            .set_module_name("module2", qconfig2)
+            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, qconfig3))
+
+    """
+
+    def __init__(self) -> None:
+        # In increasing match priority:
+        self.global_qconfig: QConfigAny = None
+        self.object_type_qconfigs: OrderedDict[Callable | str, QConfigAny] = (
+            OrderedDict()
+        )
+        self.module_name_regex_qconfigs: OrderedDict[str, QConfigAny] = OrderedDict()
+        self.module_name_qconfigs: OrderedDict[str, QConfigAny] = OrderedDict()
+        self.module_name_object_type_order_qconfigs: OrderedDict[
+            tuple[str, Callable, int], QConfigAny
+        ] = OrderedDict()
+
+    def set_global(self, global_qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the global (default) QConfig.
+        """
+        self.global_qconfig = global_qconfig
+        return self
+
+    def set_object_type(
+        self, object_type: Callable | str, qconfig: QConfigAny
+    ) -> QConfigMapping:
+        """
+        Set the QConfig for a given module type, function, or method name.
+        If the QConfig for an existing object type was already set, the new QConfig will override the old one.
+        """
+        self.object_type_qconfigs[object_type] = qconfig
+        return self
+
+    def set_module_name_regex(
+        self, module_name_regex: str, qconfig: QConfigAny
+    ) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching the given regex string.
+
+        Regexes will be matched in the order in which they are registered through this method.
+        Thus, the caller should register more specific patterns first, e.g.::
+
+            qconfig_mapping = (QConfigMapping()
+                .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1)
+                .set_module_name_regex("foo.*bar.*", qconfig2)
+                .set_module_name_regex("foo.*", qconfig3))
+
+        In this example, "foo.bar.conv0" would match qconfig1, "foo.bar.linear" would match qconfig2,
+        and "foo.baz.relu" would match qconfig3.
+
+        If the QConfig for an existing module name regex was already set, the new QConfig will override the
+        old one while preserving the order in which the regexes were originally registered.
+        """
+        self.module_name_regex_qconfigs[module_name_regex] = qconfig
+        return self
+
+    def set_module_name(self, module_name: str, qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching the given module name.
+        If the QConfig for an existing module name was already set, the new QConfig will override the old one.
+        """
+        self.module_name_qconfigs[module_name] = qconfig
+        return self
+
+    def set_module_name_object_type_order(
+        self, module_name: str, object_type: Callable, index: int, qconfig: QConfigAny
+    ) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching a combination of the given module name, object type,
+        and the index at which the module appears.
+
+        If the QConfig for an existing (module name, object type, index) was already set, the new QConfig
+        will override the old one.
+        """
+        self.module_name_object_type_order_qconfigs[
+            (module_name, object_type, index)
+        ] = qconfig
+        return self
+
+    def __repr__(self) -> str:
+        output = self.__class__.__name__ + " ("
+        for style_name in _QCONFIG_STYLE_ORDER:
+            output += f"\n {style_name}"
+            qconfigs = getattr(self, style_name)
+            if isinstance(qconfigs, OrderedDict) and len(qconfigs) > 0:
+                for key, qconfig in qconfigs.items():
+                    output += f"\n  {key}: {qconfig}"
+            else:
+                output += f"\n  {qconfigs}"
+        return output + "\n)"
+
+    # TODO: remove this
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert this ``QConfigMapping`` to a dictionary with the following keys:
+
+            "" (for global QConfig)
+
+            "object_type"
+
+            "module_name_regex"
+
+            "module_name"
+
+            "module_name_object_type_order"
+
+        Except for "" (a single QConfig), the values of this dictionary are lists of tuples.
+        """
+        return {
+            _GLOBAL_DICT_KEY: self.global_qconfig,
+            _OBJECT_TYPE_DICT_KEY: list(self.object_type_qconfigs.items()),
+            _MODULE_NAME_REGEX_DICT_KEY: list(self.module_name_regex_qconfigs.items()),
+            _MODULE_NAME_DICT_KEY: list(self.module_name_qconfigs.items()),
+            _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
+                (*k, v) for k, v in self.module_name_object_type_order_qconfigs.items()
+            ],
+        }
+
+    # TODO: remove this
+    @classmethod
+    def from_dict(cls, qconfig_dict: dict[str, Any]) -> QConfigMapping:
+        """
+        Create a ``QConfigMapping`` from a dictionary with the following keys (all optional):
+
+            "" (for global QConfig)
+
+            "object_type"
+
+            "module_name_regex"
+
+            "module_name"
+
+            "module_name_object_type_order"
+
+        Values other than "" (a single QConfig) are expected to be lists of tuples.
+        """
+        conf = cls()
+        if _GLOBAL_DICT_KEY in qconfig_dict:
+            conf.set_global(qconfig_dict[_GLOBAL_DICT_KEY])
+        for object_type, qconfig in qconfig_dict.get(_OBJECT_TYPE_DICT_KEY, []):
+            conf.set_object_type(object_type, qconfig)
+        for module_name_regex, qconfig in qconfig_dict.get(
+            _MODULE_NAME_REGEX_DICT_KEY, []
+        ):
+            conf.set_module_name_regex(module_name_regex, qconfig)
+        for module_name, qconfig in qconfig_dict.get(_MODULE_NAME_DICT_KEY, []):
+            conf.set_module_name(module_name, qconfig)
+        for module_name, object_type, index, qconfig in qconfig_dict.get(
+            _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY, []
+        ):
+            conf.set_module_name_object_type_order(
+                module_name, object_type, index, qconfig
+            )
+        return conf
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..18488d7f9ccba604ca8f1df7ea0ef4a88546d63e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quant_type.py
@@ -0,0 +1,35 @@
+import enum
+
+
+__all__ = [
+    "QuantType",
+]
+
+
+# Quantization type (dynamic quantization, static quantization).
+# Should match the c++ enum in quantization_type.h
+class QuantType(enum.IntEnum):
+    DYNAMIC = 0
+    STATIC = 1
+    QAT = 2
+    WEIGHT_ONLY = 3
+
+
+_quant_type_to_str = {
+    QuantType.STATIC: "static",
+    QuantType.DYNAMIC: "dynamic",
+    QuantType.QAT: "qat",
+    QuantType.WEIGHT_ONLY: "weight_only",
+}
+
+
+# Private helper: return the string name registered for a QuantType.
+def _get_quant_type_to_str(quant_type: QuantType) -> str:
+    return _quant_type_to_str[quant_type]
+
+
+def _quant_type_from_str(name: str) -> QuantType:
+    for quant_type, s in _quant_type_to_str.items():
+        if name == s:
+            return quant_type
+    raise ValueError(f"Unknown QuantType name '{name}'")
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..647ed5a4d4f3946626ef360a7a45541719136006
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantization_mappings.py
@@ -0,0 +1,369 @@
+import copy
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.ao.nn as ao_nn
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.qat.dynamic as nnqatd
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.quantized.reference as nnqr
+
+# Because `torch.ao.nn` uses lazy imports, we need to make
+# sure we import the contents explicitly here.
+import torch.ao.nn.sparse
+import torch.nn.functional as F
+from torch import nn
+from torch.ao.quantization.fake_quantize import (
+    default_fixed_qparams_range_0to1_fake_quant,
+    default_fixed_qparams_range_neg1to1_fake_quant,
+)
+from torch.ao.quantization.stubs import DeQuantStub, QuantStub
+from torch.ao.quantization.utils import get_combined_dict
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+
+__all__ = [
+    "DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_STATIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_QAT_MODULE_MAPPINGS",
+    "DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS",
+    "DEFAULT_MODULE_TO_ACT_POST_PROCESS",
+    "DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS",
+    "no_observer_set",
+    "get_default_static_quant_module_mappings",
+    "get_default_static_quant_reference_module_mappings",
+    "get_embedding_static_quant_module_mappings",
+    "get_default_static_sparse_quant_module_mappings",
+    "get_static_quant_module_class",
+    "get_dynamic_quant_module_class",
+    "get_default_qat_module_mappings",
+    "get_embedding_qat_module_mappings",
+    "get_default_dynamic_quant_module_mappings",
+    "get_default_dynamic_sparse_quant_module_mappings",
+    "get_default_qconfig_propagation_list",
+    "get_default_compare_output_module_list",
+    "get_default_float_to_quantized_operator_mappings",
+    "get_quantized_operator",
+]
+
+# Default map for swapping float module to reference quantized modules
+DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    QuantStub: nnq.Quantize,
+    DeQuantStub: nnq.DeQuantize,
+    nn.Linear: nnqr.Linear,
+    nn.Conv1d: nnqr.Conv1d,
+    nn.Conv2d: nnqr.Conv2d,
+    nn.Conv3d: nnqr.Conv3d,
+    nn.ConvTranspose1d: nnqr.ConvTranspose1d,
+    nn.ConvTranspose2d: nnqr.ConvTranspose2d,
+    nn.ConvTranspose3d: nnqr.ConvTranspose3d,
+    nn.Embedding: nnqr.Embedding,
+    nn.EmbeddingBag: nnqr.EmbeddingBag,
+    nn.GRUCell: nnqr.GRUCell,
+    nn.LSTMCell: nnqr.LSTMCell,
+    nn.RNNCell: nnqr.RNNCell,
+    nn.LSTM: nnqr.LSTM,
+}
+
+# Default map for swapping float module to quantized ones
+DEFAULT_STATIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    QuantStub: nnq.Quantize,
+    DeQuantStub: nnq.DeQuantize,
+    nn.BatchNorm2d: nnq.BatchNorm2d,
+    nn.BatchNorm3d: nnq.BatchNorm3d,
+    nn.Dropout: nnq.Dropout,
+    nn.Conv1d: nnq.Conv1d,
+    nn.Conv2d: nnq.Conv2d,
+    nn.Conv3d: nnq.Conv3d,
+    nn.ConvTranspose1d: nnq.ConvTranspose1d,
+    nn.ConvTranspose2d: nnq.ConvTranspose2d,
+    nn.ConvTranspose3d: nnq.ConvTranspose3d,
+    nn.ELU: nnq.ELU,
+    nn.Embedding: nnq.Embedding,
+    nn.EmbeddingBag: nnq.EmbeddingBag,
+    nn.GroupNorm: nnq.GroupNorm,
+    nn.Hardswish: nnq.Hardswish,
+    nn.InstanceNorm1d: nnq.InstanceNorm1d,
+    nn.InstanceNorm2d: nnq.InstanceNorm2d,
+    nn.InstanceNorm3d: nnq.InstanceNorm3d,
+    nn.LayerNorm: nnq.LayerNorm,
+    nn.LeakyReLU: nnq.LeakyReLU,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnq.Linear,
+    nn.Linear: nnq.Linear,
+    nn.ReLU6: nnq.ReLU6,
+    nn.PReLU: nnq.PReLU,
+    # Wrapper Modules:
+    nnq.FloatFunctional: nnq.QFunctional,
+    # Intrinsic modules:
+    nni.BNReLU2d: nniq.BNReLU2d,
+    nni.BNReLU3d: nniq.BNReLU3d,
+    nni.ConvReLU1d: nniq.ConvReLU1d,
+    nni.ConvReLU2d: nniq.ConvReLU2d,
+    nni.ConvReLU3d: nniq.ConvReLU3d,
+    nni.ConvAdd2d: nniq.ConvAdd2d,
+    nni.ConvAddReLU2d: nniq.ConvAddReLU2d,
+    nni.LinearReLU: nniq.LinearReLU,
+    nni.LinearLeakyReLU: nniq.LinearLeakyReLU,
+    nni.LinearTanh: nniq.LinearTanh,
+    nniqat.ConvBn1d: nnq.Conv1d,
+    nniqat.ConvBn2d: nnq.Conv2d,
+    nniqat.ConvBn3d: nnq.Conv3d,
+    nniqat.ConvBnReLU1d: nniq.ConvReLU1d,
+    nniqat.ConvBnReLU2d: nniq.ConvReLU2d,
+    nniqat.ConvBnReLU3d: nniq.ConvReLU3d,
+    nniqat.ConvReLU2d: nniq.ConvReLU2d,
+    nniqat.ConvReLU3d: nniq.ConvReLU3d,
+    nniqat.LinearReLU: nniq.LinearReLU,
+    nniqat.LinearBn1d: nnq.Linear,
+    # QAT modules:
+    nnqat.Linear: nnq.Linear,
+    nnqat.Conv2d: nnq.Conv2d,
+    nnqat.Conv3d: nnq.Conv3d,
+}
+
+# Default map for swapping float module to qat modules
+DEFAULT_QAT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    nn.Conv2d: nnqat.Conv2d,
+    nn.Conv3d: nnqat.Conv3d,
+    nn.Linear: nnqat.Linear,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnqat.Linear,
+    # Intrinsic modules:
+    nni.ConvBn1d: nniqat.ConvBn1d,
+    nni.ConvBn2d: nniqat.ConvBn2d,
+    nni.ConvBn3d: nniqat.ConvBn3d,
+    nni.ConvBnReLU1d: nniqat.ConvBnReLU1d,
+    nni.ConvBnReLU2d: nniqat.ConvBnReLU2d,
+    nni.ConvBnReLU3d: nniqat.ConvBnReLU3d,
+    nni.ConvReLU2d: nniqat.ConvReLU2d,
+    nni.ConvReLU3d: nniqat.ConvReLU3d,
+    nni.LinearReLU: nniqat.LinearReLU,
+    nni.LinearBn1d: nniqat.LinearBn1d,
+}
+
+# Default map for swapping dynamic modules
+DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    nn.GRUCell: nnqd.GRUCell,
+    nn.Linear: nnqd.Linear,
+    nnqatd.Linear: nnqd.Linear,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnqd.Linear,
+    nn.LSTM: nnqd.LSTM,
+    nn.GRU: nnqd.GRU,
+    nn.LSTMCell: nnqd.LSTMCell,
+    nn.RNNCell: nnqd.RNNCell,
+    nni.LinearReLU: nniqd.LinearReLU,
+    nn.EmbeddingBag: nnq.EmbeddingBag,
+    nn.Embedding: nnq.Embedding,
+    # Don't want to enable these by default because the numerical
+    # accuracy is poor compared to other dynamic ops
+    # nn.Conv1d: nnqd.Conv1d,
+    # nn.Conv2d: nnqd.Conv2d,
+    # nn.Conv3d: nnqd.Conv3d,
+    # nn.ConvTranspose1d: nnqd.ConvTranspose1d,
+    # nn.ConvTranspose2d: nnqd.ConvTranspose2d,
+    # nn.ConvTranspose3d: nnqd.ConvTranspose3d,
+}
+
+# Allowlist for propagating the qconfig
+_INCLUDE_QCONFIG_PROPAGATE_LIST: set[Callable] = {
+    nn.Sequential,
+}
+
+# Default mapping from floating point function or torch ops to quantized ops
+# TODO: merge with default static mapping
+DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS: dict[Callable | str, Callable] = {
+    F.elu: torch.ops.quantized.elu,
+    F.hardswish: torch.ops.quantized.hardswish,
+    F.instance_norm: torch.ops.quantized.instance_norm,
+    F.layer_norm: torch.ops.quantized.layer_norm,
+    F.leaky_relu: torch.ops.quantized.leaky_relu,
+    F.dropout: torch.ops.quantized.dropout,
+}
+
+# mapping from module to output activation post process class
+DEFAULT_MODULE_TO_ACT_POST_PROCESS: dict[Callable, Callable] = {
+    nn.Hardsigmoid: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Sigmoid: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Softmax: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Tanh: default_fixed_qparams_range_neg1to1_fake_quant,
+}
+
+# Default map for swapping float module to static sparse quantized ones
+DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    nn.Linear: ao_nn.sparse.quantized.Linear
+}
+
+# Default map for swapping float module to dynamic sparse quantized ones
+DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS: dict[Callable, Any] = {
+    nn.Linear: ao_nn.sparse.quantized.dynamic.Linear
+}
+
+
+def no_observer_set() -> set[Any]:
+    r"""These modules cannot have observers inserted by default."""
+    no_observers = {nn.quantizable.LSTM, nn.quantizable.MultiheadAttention}
+    return no_observers
+
+
+def get_default_static_quant_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping for post training static quantization"""
+    return copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS)
+
+
+def get_default_static_quant_reference_module_mappings() -> dict[Callable, Any]:
+    """Get reference module mapping for post training static quantization"""
+    return copy.deepcopy(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS)
+
+
+def get_embedding_static_quant_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping, including mapping for embedding QAT"""
+    mapping = copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS)
+    mapping[nnqat.EmbeddingBag] = nnq.EmbeddingBag
+    mapping[nnqat.Embedding] = nnq.Embedding
+    return mapping
+
+
+def get_default_static_sparse_quant_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping for post training static sparse quantization"""
+    return copy.deepcopy(DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS)
+
+
+def get_static_quant_module_class(
+    float_module_class: Callable,
+    additional_static_quant_mapping: dict[Callable, Any] | None = None,
+    is_reference: bool = False,
+) -> Any:
+    r"""Get the statically quantized module class corresponding to
+    the floating point module class
+    """
+    if additional_static_quant_mapping is None:
+        additional_static_quant_mapping = {}
+    all_mappings = get_combined_dict(
+        DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS
+        if is_reference
+        else DEFAULT_STATIC_QUANT_MODULE_MAPPINGS,
+        additional_static_quant_mapping,
+    )
+    static_quant_module_class = all_mappings.get(float_module_class, None)
+    if static_quant_module_class is None:
+        raise AssertionError(
+            f"Floating point module class {str(float_module_class)}"
+            + " does not have a corresponding quantized module class"
+        )
+    return copy.deepcopy(static_quant_module_class)
+
+
+def get_dynamic_quant_module_class(
+    float_module_class: Callable,
+    additional_dynamic_quant_mapping: dict[Callable, Any] | None = None,
+) -> Any:
+    r"""Get the dynamically quantized module class corresponding to
+    the floating point module class
+    """
+    if additional_dynamic_quant_mapping is None:
+        additional_dynamic_quant_mapping = {}
+    all_mappings = get_combined_dict(
+        DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, additional_dynamic_quant_mapping
+    )
+    dynamic_quant_module_class = all_mappings.get(float_module_class, None)
+    if dynamic_quant_module_class is None:
+        raise AssertionError(
+            f"Floating point module class {str(float_module_class)}"
+            + " does not have a corresponding quantized module class"
+        )
+    return copy.deepcopy(dynamic_quant_module_class)
+
+
+def get_default_qat_module_mappings() -> dict[Callable, Any]:
+    """Get default module mapping for quantization aware training"""
+    return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS)
+
+
+def get_embedding_qat_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping for quantization aware training
+    This includes default values in addition to
+    enabling qat for embeddings.
+    """
+    mapping = copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS)
+    mapping[nn.EmbeddingBag] = nnqat.EmbeddingBag
+    mapping[nn.Embedding] = nnqat.Embedding
+    return mapping
+
+
+def get_default_dynamic_quant_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping for post training dynamic quantization"""
+    return DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS
+
+
+def get_default_dynamic_sparse_quant_module_mappings() -> dict[Callable, Any]:
+    """Get module mapping for post training dynamic sparse quantization"""
+    return DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS
+
+
+def get_default_qconfig_propagation_list() -> set[Callable]:
+    """Get the default list of module types that we'll attach qconfig
+    attribute to in prepare
+    """
+    QCONFIG_PROPAGATE_MODULE_CLASS_LIST = (
+        set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_QAT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys())
+        | _INCLUDE_QCONFIG_PROPAGATE_LIST
+    )
+    return copy.deepcopy(QCONFIG_PROPAGATE_MODULE_CLASS_LIST)
+
+
+def get_default_compare_output_module_list() -> set[Callable]:
+    """Get list of module class types that we will record output
+    in numeric suite
+    """
+    NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST = (
+        set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_QAT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_QAT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys())
+        | _INCLUDE_QCONFIG_PROPAGATE_LIST
+    )
+    return copy.deepcopy(NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST)
+
+
+def get_default_float_to_quantized_operator_mappings() -> dict[
+    Callable | str, Callable
+]:
+    return copy.deepcopy(DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS)
+
+
+# TODO: merge with get_static_quant_module_class
+def get_quantized_operator(float_op: Callable | str) -> Callable:
+    """Get the quantized operator corresponding to the float operator"""
+    quantized_op = DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS.get(float_op)
+    if quantized_op is None:
+        raise AssertionError(
+            f"Operator {str(float_op)} does not have corresponding quantized op"
+        )
+    return quantized_op
+
+
+def _get_special_act_post_process(module: torch.nn.Module) -> Callable | None:
+    r"""Get the special activation post process for `module`, this has
+    higher priority than the activation post process in `qconfig`
+    e.g.
+    input: torch.nn.Sigmoid
+    output: default_fixed_qparams_range_0to1_fake_quant
+    """
+    return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(
+        type_before_parametrizations(module), None
+    )
+
+
+def _has_special_act_post_process(module: torch.nn.Module) -> bool:
+    return module.training and type(module) in DEFAULT_MODULE_TO_ACT_POST_PROCESS
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..e71dd24fda745d7f23f671eedaa1ff43df147a9a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize.py
@@ -0,0 +1,829 @@
+# mypy: allow-untyped-defs
+import copy
+import inspect
+import itertools
+import typing_extensions
+import warnings
+
+import torch
+import torch.ao.nn.quantized as nnq
+import torch.nn as nn
+from torch.ao.nn.intrinsic import _FusedModule
+from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.qconfig import (
+    _activation_is_memoryless,
+    _add_module_to_qconfig_obs_ctr,
+    default_dynamic_qconfig,
+    float16_dynamic_qconfig,
+    float_qparams_weight_only_qconfig,
+    float_qparams_weight_only_qconfig_4bit,
+)
+from torch.ao.quantization.quantization_mappings import (
+    _get_special_act_post_process,
+    _has_special_act_post_process,
+    get_default_dynamic_quant_module_mappings,
+    get_default_qat_module_mappings,
+    get_default_qconfig_propagation_list,
+    get_default_static_quant_module_mappings,
+    get_default_static_quant_reference_module_mappings,
+    no_observer_set,
+)
+from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from .utils import (
+    DEPRECATION_WARNING,
+    get_qparam_dict,
+    has_no_children_ignoring_parametrizations,
+)
+
+
+__all__ = [
+    "get_default_custom_config_dict",
+    "propagate_qconfig_",
+    "add_quant_dequant",
+    "prepare",
+    "quantize",
+    "quantize_dynamic",
+    "prepare_qat",
+    "quantize_qat",
+    "convert",
+    "swap_module",
+]
+
+
+# TODO remove this once BC is no longer required to avoid a SEV
+is_activation_post_process = _is_activation_post_process
+
+
+# Default custom-module class mappings, returned by `get_default_custom_config_dict`.
+# `prepare` reads the "float_to_observed_custom_module_class" entry and
+# `_convert` reads the "observed_to_quantized_custom_module_class" entry when
+# the caller does not supply a custom config dict.
+_DEFAULT_CUSTOM_CONFIG_DICT = {
+    "float_to_observed_custom_module_class": {
+        nn.LSTM: nn.quantizable.LSTM,
+        nn.MultiheadAttention: nn.quantizable.MultiheadAttention,
+    },
+    "observed_to_quantized_custom_module_class": {
+        nn.quantizable.LSTM: nn.quantized.LSTM,
+        nn.quantizable.MultiheadAttention: nn.quantized.MultiheadAttention,
+    },
+}
+
+
+def get_default_custom_config_dict():
+    r"""Defines the default custom config dict."""
+    return _DEFAULT_CUSTOM_CONFIG_DICT
+
+
+def _propagate_qconfig_helper(
+    module,
+    qconfig_dict,
+    qconfig_parent=None,
+    prefix="",
+    prepare_custom_config_dict=None,
+):
+    r"""This is a helper function for `propagate_qconfig_`
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name of submodule to quantization
+                     configuration
+        qconfig_parent: quantization config of parent module, we will fallback to
+                       this config when there is no specified config for current
+                       module
+        prefix: corresponding prefix of the current module, used as key in
+                qconfig_dict
+        prepare_custom_config_dict: dictionary for custom handling of modules
+                                    see docs for :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+
+    # Resolve the qconfig in increasing priority: parent fallback, then the
+    # entry keyed by module type, then the entry keyed by `prefix`, and finally
+    # a `qconfig` attribute already present on the module.
+    module_qconfig = qconfig_dict.get(
+        type_before_parametrizations(module), qconfig_parent
+    )
+    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
+    module_qconfig = getattr(module, "qconfig", module_qconfig)
+
+    torch.ao.quantization.qconfig._assert_valid_qconfig(module_qconfig, module)
+
+    qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(module_qconfig, module)
+    module.qconfig = qconfig_with_device_check
+
+    for name, child in module.named_children():
+        module_prefix = prefix + "." + name if prefix else name
+        # do not propagate qconfig to child if child is non traceable
+        # NOTE(review): the check below uses the child's local `name` rather
+        # than `module_prefix`, and the recursive call does not forward
+        # `prepare_custom_config_dict`, so this filter only takes effect for the
+        # direct children of the module the outermost call received -- confirm
+        # that this is intended.
+        if prepare_custom_config_dict is None or not (
+            name in prepare_custom_config_dict.get("non_traceable_module_name", [])
+            or type(child)
+            in prepare_custom_config_dict.get("non_traceable_module_class", [])
+        ):
+            _propagate_qconfig_helper(
+                child, qconfig_dict, qconfig_with_device_check, module_prefix
+            )
+
+
+def propagate_qconfig_(module, qconfig_dict=None, prepare_custom_config_dict=None):
+    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
+    attribute on each leaf module
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name or type of submodule to
+            quantization configuration, qconfig applies to all submodules of a
+            given module unless qconfig for the submodules are specified (when
+            the submodule already has qconfig attribute)
+        prepare_custom_config_dict: dictionary for custom handling of modules
+            see docs for :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    if qconfig_dict is None:
+        qconfig_dict = {}
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = {}
+    _propagate_qconfig_helper(
+        module, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict
+    )
+
+
+def _observer_forward_hook(self, input, output):
+    r"""Forward hook that calls observer on the output"""
+    return self.activation_post_process(output)
+
+
+def _observer_forward_pre_hook(self, input):
+    r"""Forward pre hook that calls observer on the first positional input"""
+    return self.activation_post_process(input[0])
+
+
+def _register_activation_post_process_hook(module, pre_hook=False):
+    if not hasattr(module, "activation_post_process"):
+        raise AssertionError(
+            "Expect activation_post_process attribute already attached to the module"
+        )
+    if pre_hook:
+        module.register_forward_pre_hook(_observer_forward_pre_hook, prepend=True)
+    else:
+        module.register_forward_hook(_observer_forward_hook, prepend=True)
+
+
+def _add_observer_(
+    module,
+    qconfig_propagation_list=None,
+    non_leaf_module_list=None,
+    device=None,
+    custom_module_class_mapping=None,
+):
+    r"""Add observer for the leaf child of the module.
+
+    This function insert observer module to all leaf child module that
+    has a valid qconfig attribute.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules that we want to quantize
+        qconfig_propagation_list: a list of quantizable modules that will have observers added to them
+            if they are leaf nodes; defaults to `get_default_qconfig_propagation_list()`
+        device: parent device, if any; when None it is inferred from the module's
+            parameters and buffers
+        non_leaf_module_list: list of non-leaf modules we want to add observer
+        custom_module_class_mapping: maps a float module type to an observed class;
+            a matching child with a qconfig is replaced by `observed_class.from_float(child)`
+
+    Return:
+        None, module is modified inplace with added observer modules and forward_hooks
+
+    Raises:
+        AssertionError: if `device` is None and the module spans more than one
+            (non-meta) device, or if a functional child with a qconfig has no
+            pre-defined `activation_post_process`
+    """
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+
+    if custom_module_class_mapping is None:
+        custom_module_class_mapping = {}
+
+    # respect device affinity when adding observers
+    if device is None:
+        devices = _get_unique_devices_(module)
+        if len(devices) > 1:
+            raise AssertionError(
+                f"_add_observer_ only works with cpu or single-device CUDA modules, but got devices {devices}"
+            )
+        device = next(iter(devices)) if len(devices) > 0 else None
+
+    def get_activation_post_process(qconfig, device, special_act_post_process=None):
+        # Build the observer from the special constructor when one is given,
+        # otherwise from the qconfig, then move it to `device` if one is known.
+        activation = (
+            qconfig.activation()
+            if special_act_post_process is None
+            else special_act_post_process()
+        )
+        if device is not None:
+            activation.to(device)
+        return activation
+
+    def needs_observation(m):
+        return hasattr(m, "qconfig") and m.qconfig is not None
+
+    def insert_activation_post_process(m, special_act_post_process=None):
+        """Adds an activation post process module and register
+        a pre or post hook that calls the module
+        """
+        # We don't insert observer/fake_quantize for DeQuantStub
+        if needs_observation(m) and not isinstance(m, DeQuantStub):
+            # observer and hook will be gone after we swap the module
+            m.add_module(
+                "activation_post_process",
+                get_activation_post_process(
+                    m.qconfig, device, special_act_post_process
+                ),
+            )
+            # Register observer as the first entry in the hook list
+            # All post forward hooks are preserved and will be executed after the observer before convert
+            _register_activation_post_process_hook(
+                m, pre_hook=_activation_is_memoryless(m.qconfig)
+            )
+
+    # The branches below are tried in order; the first matching one decides how
+    # a child is handled, and only the final `else` recurses into the child.
+    for name, child in module.named_children():
+        # TODO remove Dropout special after codebase stable
+        if type_before_parametrizations(child) is nn.Dropout:
+            continue
+        elif issubclass(
+            type_before_parametrizations(child), (nnq.FloatFunctional, nnq.QFunctional)
+        ):
+            # Functional wrappers get their existing `activation_post_process`
+            # attribute replaced directly; no hook is registered for them here.
+            if needs_observation(child):
+                if not hasattr(child, "activation_post_process"):
+                    raise AssertionError(
+                        f"functional class {type_before_parametrizations(child)} has no pre-defined `activation_post_process`"
+                    )
+                child.activation_post_process = get_activation_post_process(
+                    child.qconfig, device
+                )
+        elif isinstance(child, _FusedModule):
+            # activation_post_process are now added directly to nn.Sequential/_FusedModule
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif (
+            non_leaf_module_list is not None
+            and type_before_parametrizations(child) in non_leaf_module_list
+        ):
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif _has_special_act_post_process(child):
+            special_act_post_process = _get_special_act_post_process(child)
+            insert_activation_post_process(child, special_act_post_process)
+        elif (
+            needs_observation(child)
+            and type_before_parametrizations(child) in custom_module_class_mapping
+        ):
+            # Replace the float child by its observed custom counterpart.
+            observed_class = custom_module_class_mapping[
+                type_before_parametrizations(child)
+            ]
+            observed_child = observed_class.from_float(child)
+            setattr(module, name, observed_child)
+            # TODO: These are the modules that cannot be observed
+            #       Once there are more, we should move them to a separate list
+            if not issubclass(observed_class, tuple(no_observer_set())):
+                insert_activation_post_process(observed_child)
+        else:
+            _add_observer_(
+                child,
+                qconfig_propagation_list,
+                non_leaf_module_list,
+                device,
+                custom_module_class_mapping,
+            )
+
+    # Insert observers only for leaf nodes, note that this observer is for
+    # the output of the module, for input QuantStub will observe them
+    if (
+        has_no_children_ignoring_parametrizations(module)
+        and not isinstance(module, torch.nn.Sequential)
+        and type_before_parametrizations(module) in qconfig_propagation_list
+    ):
+        insert_activation_post_process(module)
+    # This is a special case for AdaRound eager mode
+    # AdaRound contains weight_fake_quant to be propagated from API to convert
+    # leaf node check with a number of children looks naive assumption that blocks
+    # Adding an exception case for AdaRound
+    # NOTE(review): this check is independent of the leaf check above, so a
+    # module satisfying both conditions would have the observer inserted twice
+    # -- confirm whether that combination can occur.
+    if (
+        hasattr(module, "weight_fake_quant")
+        and not isinstance(module, torch.nn.Sequential)
+        and type_before_parametrizations(module) in qconfig_propagation_list
+    ):
+        insert_activation_post_process(module)
+
+
+def _get_unique_devices_(module):
+    return {p.device for p in module.parameters() if p.device.type != "meta"} | {
+        p.device for p in module.buffers() if p.device.type != "meta"
+    }
+
+
+def add_quant_dequant(module):
+    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
+    Note that this function will modify the children of module inplace and it
+    can return a new module which wraps the input module as well.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules
+        that we want to quantize
+
+    Return:
+        Either the inplace modified module with submodules wrapped in
+        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
+        wraps the input module, the latter case only happens when the input
+        module is a leaf module and we want to quantize it.
+    """
+    if (
+        has_no_children_ignoring_parametrizations(module)
+        and hasattr(module, "qconfig")
+        and module.qconfig
+    ):
+        return QuantWrapper(module)
+
+    for name, child in module.named_children():
+        module._modules[name] = add_quant_dequant(child)
+    return module
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare(
+    model,
+    inplace=False,
+    allow_list=None,
+    observer_non_leaf_module_list=None,
+    prepare_custom_config_dict=None,
+):
+    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    The model will be attached with observer or fake quant modules, and qconfig
+    will be propagated.
+
+    Args:
+        `model`: input model; it is deep-copied first unless `inplace` is True
+        `inplace`: carry out model transformations in-place, the original module is mutated
+        `allow_list`: list of quantizable modules; defaults to
+            `get_default_qconfig_propagation_list()`
+        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
+        `prepare_custom_config_dict`: customization configuration dictionary for prepare function;
+            defaults to `get_default_custom_config_dict()`
+
+    Return:
+        The prepared model (the input object itself when `inplace` is True).
+
+    .. code-block:: python
+
+       # Example of prepare_custom_config_dict:
+       prepare_custom_config_dict = {
+           # user will manually define the corresponding observed
+           # module class which has a from_float class method that converts
+           # float custom module to observed custom module
+           "float_to_observed_custom_module_class": {CustomModule: ObservedCustomModule}
+       }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = get_default_custom_config_dict()
+    # Only the float -> observed custom module mapping is read from the dict here.
+    custom_module_class_mapping = prepare_custom_config_dict.get(
+        "float_to_observed_custom_module_class", {}
+    )
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    # TODO: remove allow_list
+    qconfig_propagation_list = allow_list
+    if allow_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+    # No qconfig dict is supplied: propagation relies on the `.qconfig`
+    # attributes already assigned on the model's submodules.
+    propagate_qconfig_(model, qconfig_dict=None)
+
+    # sanity check common API misusage
+    if not any(hasattr(m, "qconfig") and m.qconfig for m in model.modules()):
+        warnings.warn(
+            "None of the submodule got qconfig applied. Make sure you "
+            "passed correct configuration through `qconfig_dict` or "
+            "by assigning the `.qconfig` attribute directly on submodules",
+            stacklevel=2,
+        )
+
+    _add_observer_(
+        model,
+        qconfig_propagation_list,
+        observer_non_leaf_module_list,
+        custom_module_class_mapping=custom_module_class_mapping,
+    )
+    return model
+
+
+def _remove_activation_post_process(module):
+    # TODO: maybe we should change activation_post_process to _activation_post_process
+    # to prevent it from being used by user
+    if hasattr(module, "activation_post_process") and _is_activation_post_process(
+        module.activation_post_process
+    ):
+        delattr(module, "activation_post_process")
+
+    # remove activation_post_process pre and post hooks
+    def remove_hooks(pre_hook=False):
+        hook_map = module._forward_pre_hooks if pre_hook else module._forward_hooks
+        observer_hook = (
+            _observer_forward_pre_hook if pre_hook else _observer_forward_hook
+        )
+        handle_ids_to_remove = set()
+        for handle_id, hook_fn in hook_map.items():
+            if hook_fn is observer_hook:
+                handle_ids_to_remove.add(handle_id)
+        for handle_id in handle_ids_to_remove:
+            hook_map.pop(handle_id)
+
+    remove_hooks(pre_hook=True)
+    remove_hooks(pre_hook=False)
+
+
+# TODO: rename to something more general
+def _remove_qconfig(module):
+    r"""Clean up the qconfig left in the module so that new qconfig can be
+    propagated.
+
+    Args:
+        module: module to be cleaned up
+    """
+    for child in module.children():
+        _remove_qconfig(child)
+
+    if hasattr(module, "qconfig"):
+        del module.qconfig
+
+    _remove_activation_post_process(module)
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def quantize(model, run_fn, run_args, mapping=None, inplace=False):
+    r"""Quantize the input float model with post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn` which will run the calibration step, after that we will
+    convert the model to a quantized model.
+
+    Args:
+        model: input float model
+        run_fn: a calibration function for calibrating the prepared model
+        run_args: positional arguments for `run_fn`
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: correspondence between original module types and quantized counterparts
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    prepare(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, mapping, inplace=True)
+    return model
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def quantize_dynamic(
+    model, qconfig_spec=None, dtype=torch.qint8, mapping=None, inplace=False
+):
+    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
+
+    Replaces specified modules with dynamic weight-only quantized versions and output the quantized model.
+
+    For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization
+    by default is performed for layers with large weights size - i.e. Linear and RNN variants.
+
+    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
+    If `qconfig` is provided, the `dtype` argument is ignored.
+
+    Args:
+        model: input model
+        qconfig_spec: Either:
+
+            - A dictionary that maps from name or type of submodule to quantization
+              configuration, qconfig applies to all submodules of a given
+              module unless qconfig for the submodules are specified (when the
+              submodule already has qconfig attribute). Entries in the dictionary
+              need to be QConfig instances.
+
+            - A set of types and/or submodule names to apply dynamic quantization to,
+              in which case the `dtype` argument is used to specify the bit-width
+
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
+            with which the submodule needs to be replaced
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
+    if qconfig_spec is None:
+        if dtype == torch.qint8:
+            qconfig_spec = {
+                nn.Linear: default_dynamic_qconfig,
+                nn.LSTM: default_dynamic_qconfig,
+                nn.GRU: default_dynamic_qconfig,
+                nn.LSTMCell: default_dynamic_qconfig,
+                nn.RNNCell: default_dynamic_qconfig,
+                nn.GRUCell: default_dynamic_qconfig,
+            }
+        elif dtype == torch.float16:
+            qconfig_spec = {
+                nn.Linear: float16_dynamic_qconfig,
+                nn.LSTM: float16_dynamic_qconfig,
+                nn.GRU: float16_dynamic_qconfig,
+                nn.LSTMCell: float16_dynamic_qconfig,
+                nn.RNNCell: float16_dynamic_qconfig,
+                nn.GRUCell: float16_dynamic_qconfig,
+            }
+        elif dtype == torch.quint8:
+            qconfig_spec = {
+                nn.EmbeddingBag: float_qparams_weight_only_qconfig,
+                nn.Embedding: float_qparams_weight_only_qconfig,
+            }
+        elif dtype == torch.quint4x2:
+            qconfig_spec = {
+                nn.EmbeddingBag: float_qparams_weight_only_qconfig_4bit,
+            }
+        else:
+            raise ValueError(
+                f"Don't know how to quantize with default settings for {dtype}. Provide full qconfig please"
+            )
+    elif isinstance(qconfig_spec, set):
+        if dtype is torch.qint8:
+            default_qconfig = default_dynamic_qconfig
+        elif dtype is torch.float16:
+            default_qconfig = float16_dynamic_qconfig
+        elif dtype is torch.quint8:
+            default_qconfig = float_qparams_weight_only_qconfig
+        elif dtype is torch.quint4x2:
+            default_qconfig = float_qparams_weight_only_qconfig_4bit
+        else:
+            raise RuntimeError(
+                "Unknown dtype specified for quantize_dynamic: ", str(dtype)
+            )
+        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
+
+    if mapping is None:
+        mapping = get_default_dynamic_quant_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    propagate_qconfig_(model, qconfig_spec)
+    convert(model, mapping, inplace=True)
+    return model
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare_qat(model, mapping=None, inplace=False):
+    r"""
+    Prepares a copy of the model for quantization calibration or
+    quantization-aware training and converts it to quantized version.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    Args:
+        model: input model to be modified in-place
+        mapping: dictionary that maps float modules to quantized modules to be
+                 replaced.
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
+    if not model.training:
+        raise AssertionError("prepare_qat only works on models in training mode")
+    if mapping is None:
+        mapping = get_default_qat_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    propagate_qconfig_(model, qconfig_dict=None)
+    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
+    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
+    return model
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def quantize_qat(model, run_fn, run_args, inplace=False):
+    r"""Do quantization aware training and output a quantized model
+
+    Args:
+        model: input model
+        run_fn: a function for evaluating the prepared model, can be a
+                function that simply runs the prepared model or a training
+                loop
+        run_args: positional arguments for `run_fn`
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.train()
+    prepare_qat(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, inplace=True)
+    return model
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def convert(
+    module,
+    mapping=None,
+    inplace=False,
+    remove_qconfig=True,
+    is_reference=False,
+    convert_custom_config_dict=None,
+    use_precomputed_fake_quant=False,
+):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class. And remove qconfig at the
+    end if remove_qconfig is set to True.
+
+    Args:
+        `module`: prepared and calibrated module
+        `mapping`: a dictionary that maps from source module type to target
+                   module type, can be overwritten to allow swapping user defined
+                   Modules
+        `inplace`: carry out model transformations in-place, the original module
+                   is mutated
+        `convert_custom_config_dict`: custom configuration dictionary for convert function
+        `use_precomputed_fake_quant`: a flag to enable use of precomputed fake quant
+
+    .. code-block:: python
+
+       # Example of convert_custom_config_dict:
+       convert_custom_config_dict = {
+           # user will manually define the corresponding quantized
+           # module class which has a from_observed class method that converts
+           # observed custom module to quantized custom module
+           "observed_to_quantized_custom_module_class": {
+               ObservedCustomModule: QuantizedCustomModule
+           }
+       }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.convert")
+    if not inplace:
+        module = copy.deepcopy(module)
+    _convert(
+        module,
+        mapping,
+        inplace=True,
+        is_reference=is_reference,
+        convert_custom_config_dict=convert_custom_config_dict,
+        use_precomputed_fake_quant=use_precomputed_fake_quant,
+    )
+    if remove_qconfig:
+        _remove_qconfig(module)
+    return module
+
+
+def _convert(
+    module,
+    mapping=None,
+    inplace=False,
+    is_reference=False,
+    convert_custom_config_dict=None,
+    use_precomputed_fake_quant=False,
+):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class
+
+    Args:
+        module: input module
+        mapping: a dictionary that maps from source module type to target
+                 module type, can be overwritten to allow swapping user defined
+                 Modules; when None, the default static (or reference, if
+                 `is_reference`) module mappings are used
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+        is_reference: a flag to enable quantized reference module
+        convert_custom_config_dict: custom configuration dictionary; its
+                 "observed_to_quantized_custom_module_class" entry is used,
+                 defaults to `get_default_custom_config_dict()`
+        use_precomputed_fake_quant: a flag to enable use of precomputed fake quant
+
+    Return:
+        The module whose children have been swapped (a deep copy of the input
+        unless `inplace` is True). The top-level module itself is not swapped.
+    """
+    if mapping is None:
+        mapping = (
+            get_default_static_quant_reference_module_mappings()
+            if is_reference
+            else get_default_static_quant_module_mappings()
+        )
+    if convert_custom_config_dict is None:
+        convert_custom_config_dict = get_default_custom_config_dict()
+    custom_module_class_mapping = convert_custom_config_dict.get(
+        "observed_to_quantized_custom_module_class", {}
+    )
+
+    if not inplace:
+        module = copy.deepcopy(module)
+    # Replacements are collected first and assigned after the loop, so the
+    # children mapping is not modified while `named_children()` is iterating.
+    reassign = {}
+    for name, mod in module.named_children():
+        # both fused modules and observed custom modules are
+        # swapped as one unit
+        if (
+            not isinstance(mod, _FusedModule)
+            and type_before_parametrizations(mod) not in custom_module_class_mapping
+        ):
+            # convert the child's own children first (depth-first)
+            _convert(
+                mod,
+                mapping,
+                True,  # inplace
+                is_reference,
+                convert_custom_config_dict,
+                use_precomputed_fake_quant=use_precomputed_fake_quant,
+            )
+        reassign[name] = swap_module(
+            mod, mapping, custom_module_class_mapping, use_precomputed_fake_quant
+        )
+
+    for key, value in reassign.items():
+        module._modules[key] = value
+
+    return module
+
+
+def swap_module(
+    mod, mapping, custom_module_class_mapping, use_precomputed_fake_quant=False
+):
+    r"""Swaps the module if it has a quantized counterpart and it has an
+    `observer` attached.
+
+    Args:
+        mod: input module
+        mapping: a dictionary that maps from nn module to nnq module
+        custom_module_class_mapping: a dictionary that maps an observed custom
+            module type to the class whose `from_observed` builds the
+            replacement; checked before `mapping`
+        use_precomputed_fake_quant: forwarded to `from_float` when that method
+            accepts a parameter of this name
+
+    Return:
+        The corresponding quantized module of `mod`, or `mod` itself when it has
+        no qconfig or its type is in neither mapping
+    """
+    new_mod = mod
+    if hasattr(mod, "qconfig") and mod.qconfig is not None:
+        swapped = False
+        if type_before_parametrizations(mod) in custom_module_class_mapping:
+            new_mod = custom_module_class_mapping[
+                type_before_parametrizations(mod)
+            ].from_observed(mod)
+            swapped = True
+        elif type_before_parametrizations(mod) in mapping:
+            qmod = mapping[type_before_parametrizations(mod)]
+            if hasattr(qmod, "_IS_REFERENCE") and qmod._IS_REFERENCE:
+                # NOTE(review): the enclosing `if` already guarantees
+                # `mod.qconfig is not None`, so this guard looks unreachable.
+                if mod.qconfig is None:
+                    raise AssertionError(
+                        "module qconfig must not be None when swapping to reference module"
+                    )
+                # Run a fresh weight observer over the weight to obtain the
+                # qparams handed to the reference module.
+                weight_post_process = mod.qconfig.weight()
+                weight_post_process(mod.weight)
+                weight_qparams = get_qparam_dict(weight_post_process)
+                new_mod = qmod.from_float(mod, weight_qparams)
+            else:
+                # Only pass `use_precomputed_fake_quant` to implementations of
+                # `from_float` that declare it.
+                sig = inspect.signature(qmod.from_float)
+                if "use_precomputed_fake_quant" in sig.parameters:
+                    new_mod = qmod.from_float(
+                        mod, use_precomputed_fake_quant=use_precomputed_fake_quant
+                    )
+                else:
+                    new_mod = qmod.from_float(mod)
+            swapped = True
+
+        if swapped:
+            # Preserve module's pre forward hooks. They'll be called on quantized input
+            for pre_hook_fn in mod._forward_pre_hooks.values():
+                new_mod.register_forward_pre_hook(pre_hook_fn)
+            # Preserve module's post forward hooks except _observer_forward_hook
+            # After convert they'll work with quantized output
+            for hook_fn in mod._forward_hooks.values():
+                if hook_fn is not _observer_forward_hook:
+                    new_mod.register_forward_hook(hook_fn)
+
+            # respect device affinity when swapping modules
+            # NOTE(review): `_get_unique_devices_` filters out tensors on the
+            # "meta" device, so the `torch.device("meta") in devices` clause
+            # below looks like it can never be true -- confirm.
+            devices = _get_unique_devices_(mod)
+            if not (
+                len(devices) <= 1
+                or (len(devices) == 2 and torch.device("meta") in devices)
+            ):
+                raise AssertionError(
+                    f"swap_module only works with cpu or single-device CUDA modules, but got devices {devices}"
+                )
+            device = next(iter(devices)) if len(devices) > 0 else None
+            if device:
+                new_mod.to(device)
+    return new_mod
+
+
+def _get_observer_dict(mod, target_dict, prefix=""):
+    r"""Traverse the modules and save all observers into dict.
+    This is mainly used for quantization accuracy debug
+    Args:
+        mod: the top module we want to save all observers
+        prefix: the prefix for the current module
+        target_dict: the dictionary used to save all the observers
+    """
+
+    def get_prefix(prefix):
+        return prefix if prefix == "" else prefix + "."
+
+    if hasattr(mod, "activation_post_process"):
+        target_dict[get_prefix(prefix) + "activation_post_process"] = (
+            mod.activation_post_process
+        )
+    for name, child in mod.named_children():
+        module_prefix = get_prefix(prefix) + name if prefix else name
+        _get_observer_dict(child, target_dict, module_prefix)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba6ab86aaa048fbd128f9a89cc32d4e438d3fe12
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_fx.py
@@ -0,0 +1,759 @@
+import copy
+import typing_extensions
+import warnings
+from typing import Any
+
+import torch
+from torch.fx import GraphModule
+from torch.fx.graph_module import _USER_PRESERVED_ATTRIBUTES_KEY
+
+from .backend_config import BackendConfig, get_tensorrt_backend_config  # noqa: F401
+from .fx.convert import convert
+from .fx.custom_config import ConvertCustomConfig, FuseCustomConfig, PrepareCustomConfig
+from .fx.fuse import fuse  # noqa: F401
+from .fx.graph_module import ObservedGraphModule  # noqa: F401
+from .fx.prepare import prepare  # noqa: F401
+from .fx.tracer import QuantizationTracer, Scope, ScopeContextManager  # noqa: F401
+from .fx.utils import (  # noqa: F401
+    get_custom_module_class_keys,
+    get_skipped_module_name_and_classes,
+)
+from .qconfig_mapping import QConfigMapping
+from .utils import DEPRECATION_WARNING
+
+
+def attach_preserved_attrs_to_model(
+    model: GraphModule | torch.nn.Module,
+    preserved_attrs: dict[str, Any],
+) -> None:
+    """Store preserved attributes to the model.meta so that it can be preserved during deepcopy"""
+    model.meta[_USER_PRESERVED_ATTRIBUTES_KEY] = copy.copy(preserved_attrs)  # type: ignore[operator, index, assignment]
+    # set the preserved attributes in the model so that user can call
+    # model.attr as they do before calling fx graph mode quantization
+    for attr_name, attr in model.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():  # type: ignore[index, union-attr]
+        setattr(model, attr_name, attr)
+
+
+def _check_is_graph_module(model: torch.nn.Module) -> None:
+    if not isinstance(model, GraphModule):
+        raise ValueError(
+            "input model must be a GraphModule, "
+            + "Got type:"
+            + str(type(model))
+            + " Please make "
+            + "sure to follow the tutorials."
+        )
+
+
+def _attach_meta_to_node_if_not_exist(model: GraphModule) -> None:
+    """Attach meta field to all nodes of the graph if it does not exist,
+    meta field is a field that stores some meta information about the node, such
+    as dtype and shape information for the output of the node. It only exists
+    if the program is captured by make_fx (used in quantize_pt2e flow); if
+    the program is captured by torch.fx symbolic tracing, this field may not exist,
+    so we add it here to avoid checking for it all over the place
+    """
+    for node in model.graph.nodes:
+        if not hasattr(node, "meta"):
+            node.meta = {}
+
+
+def _swap_ff_with_fxff(model: torch.nn.Module) -> None:
+    r"""Swap FloatFunctional with FXFloatFunctional"""
+    modules_to_swap = []
+    for name, module in model.named_children():
+        if isinstance(module, torch.ao.nn.quantized.FloatFunctional):
+            modules_to_swap.append(name)
+        else:
+            _swap_ff_with_fxff(module)
+
+    for name in modules_to_swap:
+        del model._modules[name]
+        model._modules[name] = torch.ao.nn.quantized.FXFloatFunctional()
+
+
+def _fuse_fx(
+    model: GraphModule,
+    is_qat: bool,
+    fuse_custom_config: FuseCustomConfig | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""Internal helper function to fuse modules in preparation for quantization
+
+    Args:
+        model: GraphModule object from symbolic tracing (torch.fx.symbolic_trace)
+    """
+    _check_is_graph_module(model)
+    return fuse(model, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
+
+
+def _prepare_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: QConfigMapping | dict[str, Any],
+    is_qat: bool,
+    example_inputs: tuple[Any, ...],
+    prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None,
+    _equalization_config: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+    is_standalone_module: bool = False,
+) -> GraphModule:
+    r"""Internal helper function for prepare_fx
+        Args:
+          `model`, `qconfig_mapping`, `prepare_custom_config`, `_equalization_config`:
+          see docs for :func:`~torch.ao.quantization.prepare_fx`
+          `is_standalone_module`: a boolean flag indicates whether we are
+          quantizing a standalone module or not, a standalone module
+          is a submodule of the parent module that is not inlined in the
+    forward graph of the parent module,
+          the way we quantize standalone module is described in:
+          :func:`~torch.ao.quantization._prepare_standalone_module_fx`
+    """
+    if prepare_custom_config is None:
+        prepare_custom_config = PrepareCustomConfig()
+    if _equalization_config is None:
+        _equalization_config = QConfigMapping()
+
+    if isinstance(prepare_custom_config, dict):
+        warnings.warn(
+            "Passing a prepare_custom_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a PrepareCustomConfig instead.",
+            FutureWarning,
+            stacklevel=3,
+        )
+        prepare_custom_config = PrepareCustomConfig.from_dict(prepare_custom_config)
+
+    # swap FloatFunctional with FXFloatFunctional
+    _swap_ff_with_fxff(model)
+
+    skipped_module_names, skipped_module_classes = get_skipped_module_name_and_classes(
+        prepare_custom_config, is_standalone_module
+    )
+    preserved_attr_names = prepare_custom_config.preserved_attributes
+    preserved_attrs = {
+        attr: getattr(model, attr)
+        for attr in preserved_attr_names
+        if hasattr(model, attr)
+    }
+    # symbolically trace the model
+    tracer = QuantizationTracer(skipped_module_names, skipped_module_classes)  # type: ignore[arg-type]
+    graph_module = GraphModule(model, tracer.trace(model))
+    _attach_meta_to_node_if_not_exist(graph_module)
+
+    fuse_custom_config = FuseCustomConfig().set_preserved_attributes(
+        prepare_custom_config.preserved_attributes
+    )
+    graph_module = _fuse_fx(graph_module, is_qat, fuse_custom_config, backend_config)
+    prepared = prepare(
+        graph_module,
+        qconfig_mapping,
+        is_qat,
+        tracer.node_name_to_scope,
+        example_inputs=example_inputs,
+        prepare_custom_config=prepare_custom_config,
+        _equalization_config=_equalization_config,
+        backend_config=backend_config,
+        is_standalone_module=is_standalone_module,
+    )  # type: ignore[operator]
+
+    attach_preserved_attrs_to_model(prepared, preserved_attrs)
+    return prepared
+
+
+def _prepare_standalone_module_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: QConfigMapping | dict[str, Any],
+    is_qat: bool,
+    example_inputs: tuple[Any, ...],
+    prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""[Internal use only] Prepare a standalone module, so that it can be used when quantizing the
+    parent module.
+    standalone_module means it is a submodule that is not inlined in the parent module,
+    and will be quantized separately as one unit.
+
+    How the standalone module is observed is specified by `input_quantized_idxs` and
+    `output_quantized_idxs` in the prepare_custom_config for the standalone module
+
+    Returns:
+
+        * model(GraphModule): prepared standalone module. It has these attributes in
+          model.meta:
+
+            * `standalone_module_input_quantized_idxs(List[Int])`: a list of
+              indexes for the graph input that is expected to be quantized,
+              same as input_quantized_idxs configuration provided
+              for the standalone module
+            * `standalone_module_output_quantized_idxs(List[Int])`: a list of
+              indices for the graph output that is quantized
+              same as output_quantized_idxs configuration provided
+              for the standalone module
+
+    """
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        is_qat,
+        example_inputs,
+        prepare_custom_config,
+        backend_config=backend_config,
+        is_standalone_module=True,
+    )
+
+
+def fuse_fx(
+    model: torch.nn.Module,
+    fuse_custom_config: FuseCustomConfig | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode.
+    Fusion rules are defined in torch.ao.quantization.fx.fusion_pattern.py
+
+    Args:
+
+        * `model` (torch.nn.Module): a torch.nn.Module model
+        * `fuse_custom_config` (FuseCustomConfig): custom configurations for fuse_fx.
+            See :class:`~torch.ao.quantization.fx.custom_config.FuseCustomConfig` for more details
+    Example::
+
+        from torch.ao.quantization import fuse_fx
+
+        m = Model().eval()
+        m = fuse_fx(m)
+
+    """
+    if fuse_custom_config is None:
+        fuse_custom_config = FuseCustomConfig()
+
+    if isinstance(fuse_custom_config, dict):
+        warnings.warn(
+            "Passing a fuse_custom_config_dict to fuse is deprecated and will not be supported "
+            "in a future version. Please pass in a FuseCustomConfig instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        fuse_custom_config = FuseCustomConfig.from_dict(fuse_custom_config)
+
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx")
+    preserved_attr_names = fuse_custom_config.preserved_attributes
+    preserved_attrs = {
+        attr: getattr(model, attr)
+        for attr in preserved_attr_names
+        if hasattr(model, attr)
+    }
+
+    graph_module = torch.fx.symbolic_trace(model)
+    _attach_meta_to_node_if_not_exist(graph_module)
+    graph_module = _fuse_fx(graph_module, False, fuse_custom_config, backend_config)
+
+    attach_preserved_attrs_to_model(graph_module, preserved_attrs)
+    return graph_module
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: QConfigMapping | dict[str, Any],
+    example_inputs: tuple[Any, ...],
+    prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None,
+    _equalization_config: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r""" Prepare a model for post training quantization
+
+    Args:
+      * `model` (torch.nn.Module): torch.nn.Module model
+
+      * `qconfig_mapping` (QConfigMapping): QConfigMapping object to configure how a model is
+         quantized, see :class:`~torch.ao.quantization.qconfig_mapping.QConfigMapping`
+         for more details
+
+      * `example_inputs` (Tuple[Any, ...]): Example inputs for forward function of the model,
+         Tuple of positional args (keyword args can be passed as positional args as well)
+
+      * `prepare_custom_config` (PrepareCustomConfig): customization configuration for quantization tool.
+          See :class:`~torch.ao.quantization.fx.custom_config.PrepareCustomConfig` for more details
+
+      * `_equalization_config`: config for specifying how to perform equalization on the model
+
+      * `backend_config` (BackendConfig): config that specifies how operators are quantized
+         in a backend, this includes how the operators are observed,
+         supported fusion patterns, how quantize/dequantize ops are
+         inserted, supported dtypes etc. See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details
+
+    Return:
+      A GraphModule with observer (configured by qconfig_mapping), ready for calibration
+
+    Example::
+
+        import torch
+        from torch.ao.quantization import get_default_qconfig_mapping
+        from torch.ao.quantization.quantize_fx import prepare_fx
+
+        class Submodule(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        class M(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+                self.sub = Submodule()
+
+            def forward(self, x):
+                x = self.linear(x)
+                x = self.sub(x) + x
+                return x
+
+        # initialize a floating point model
+        float_model = M().eval()
+
+        # define calibration function
+        def calibrate(model, data_loader):
+            model.eval()
+            with torch.no_grad():
+                for image, target in data_loader:
+                    model(image)
+
+        # qconfig is the configuration for how we insert observers for a particular
+        # operator
+        # qconfig = get_default_qconfig("fbgemm")
+        # Example of customizing qconfig:
+        # qconfig = torch.ao.quantization.QConfig(
+        #    activation=MinMaxObserver.with_args(dtype=torch.qint8),
+        #    weight=MinMaxObserver.with_args(dtype=torch.qint8))
+        # `activation` and `weight` are constructors of observer module
+
+        # qconfig_mapping is a collection of quantization configurations, user can
+        # set the qconfig for each operator (torch op calls, functional calls, module calls)
+        # in the model through qconfig_mapping
+        # the following call will get the qconfig_mapping that works best for models
+        # that target "fbgemm" backend
+        qconfig_mapping = get_default_qconfig_mapping("fbgemm")
+
+        # We can customize qconfig_mapping in different ways.
+        # e.g. set the global qconfig, which means we will use the same qconfig for
+        # all operators in the model, this can be overwritten by other settings
+        # qconfig_mapping = QConfigMapping().set_global(qconfig)
+        # e.g. quantize the linear submodule with a specific qconfig
+        # qconfig_mapping = QConfigMapping().set_module_name("linear", qconfig)
+        # e.g. quantize all nn.Linear modules with a specific qconfig
+        # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig)
+        # for a more complete list, please see the docstring for :class:`torch.ao.quantization.QConfigMapping`
+        # argument
+
+        # example_inputs is a tuple of inputs, that is used to infer the type of the
+        # outputs in the model
+        # currently it's not used, but please make sure model(*example_inputs) runs
+        example_inputs = (torch.randn(1, 3, 224, 224),)
+
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        # `prepare_fx` inserts observers in the model based on qconfig_mapping and
+        # backend_config. If the configuration for an operator in qconfig_mapping
+        # is supported in the backend_config (meaning it's supported by the target
+        # hardware), we'll insert observer modules according to the qconfig_mapping
+        # otherwise the configuration in qconfig_mapping will be ignored
+        #
+        # Example:
+        # in qconfig_mapping, user sets linear module to be quantized with quint8 for
+        # activation and qint8 for weight:
+        # qconfig = torch.ao.quantization.QConfig(
+        #     activation=MinMaxObserver.with_args(dtype=torch.quint8),
+        #     weight=MinMaxObserver.with_args(dtype=torch.qint8))
+        # Note: current qconfig api does not support setting output observer, but
+        # we may extend this to support these more fine grained control in the
+        # future
+        #
+        # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig)
+        # in backend config, linear module also supports in this configuration:
+        # weighted_int8_dtype_config = DTypeConfig(
+        #   input_dtype=torch.quint8,
+        #   output_dtype=torch.quint8,
+        #   weight_dtype=torch.qint8,
+        #   bias_dtype=torch.float)
+
+        # linear_pattern_config = BackendPatternConfig(torch.nn.Linear) \
+        #    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+        #    .add_dtype_config(weighted_int8_dtype_config) \
+        #    ...
+
+        # backend_config = BackendConfig().set_backend_pattern_config(linear_pattern_config)
+        # `prepare_fx` will check that the setting requested by user in qconfig_mapping
+        # is supported by the backend_config and insert observers and fake quant modules
+        # in the model
+        prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs)
+        # Run calibration
+        calibrate(prepared_model, sample_inference_data)
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx")
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        False,  # is_qat
+        example_inputs,
+        prepare_custom_config,
+        _equalization_config,
+        backend_config,
+    )
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare_qat_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: QConfigMapping | dict[str, Any],
+    example_inputs: tuple[Any, ...],
+    prepare_custom_config: PrepareCustomConfig | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""Prepare a model for quantization aware training
+
+    Args:
+      * `model` (torch.nn.Module): torch.nn.Module model
+      * `qconfig_mapping` (QConfigMapping): see :func:`~torch.ao.quantization.prepare_fx`
+      * `example_inputs` (Tuple[Any, ...]): see :func:`~torch.ao.quantization.prepare_fx`
+      * `prepare_custom_config` (PrepareCustomConfig): see :func:`~torch.ao.quantization.prepare_fx`
+      * `backend_config` (BackendConfig): see :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+      A GraphModule with fake quant modules (configured by qconfig_mapping and backend_config), ready for
+      quantization aware training
+
+    Example::
+
+        import torch
+        from torch.ao.quantization import get_default_qat_qconfig_mapping
+        from torch.ao.quantization.quantize_fx import prepare_qat_fx
+
+
+        class Submodule(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+
+        class M(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+                self.sub = Submodule()
+
+            def forward(self, x):
+                x = self.linear(x)
+                x = self.sub(x) + x
+                return x
+
+
+        # initialize a floating point model
+        float_model = M().train()
+        # (optional, but preferred) load the weights from pretrained model
+        # float_model.load_weights(...)
+
+
+        # define the training loop for quantization aware training
+        def train_loop(model, train_data):
+            model.train()
+            for image, target in train_data:
+                ...
+
+
+        # qconfig is the configuration for how we insert observers for a particular
+        # operator
+        # qconfig = get_default_qconfig("fbgemm")
+        # Example of customizing qconfig:
+        # qconfig = torch.ao.quantization.QConfig(
+        #    activation=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8)),
+        #    weight=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8)))
+        # `activation` and `weight` are constructors of observer module
+
+        # qconfig_mapping is a collection of quantization configurations, user can
+        # set the qconfig for each operator (torch op calls, functional calls, module calls)
+        # in the model through qconfig_mapping
+        # the following call will get the qconfig_mapping that works best for models
+        # that target "fbgemm" backend
+        qconfig_mapping = get_default_qat_qconfig_mapping("fbgemm")
+
+        # We can customize qconfig_mapping in different ways, please take a look at
+        # the docstring for :func:`~torch.ao.quantization.prepare_fx` for different ways
+        # to configure this
+
+        # example_inputs is a tuple of inputs, that is used to infer the type of the
+        # outputs in the model
+        # currently it's not used, but please make sure model(*example_inputs) runs
+        example_inputs = (torch.randn(1, 3, 224, 224),)
+
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        # `prepare_qat_fx` inserts observers in the model based on qconfig_mapping and
+        # backend_config, if the configuration for an operator in qconfig_mapping
+        # is supported in the backend_config (meaning it's supported by the target
+        # hardware), we'll insert fake_quantize modules according to the qconfig_mapping
+        # otherwise the configuration in qconfig_mapping will be ignored
+        # see :func:`~torch.ao.quantization.prepare_fx` for a detailed explanation of
+        # how qconfig_mapping interacts with backend_config
+        prepared_model = prepare_qat_fx(float_model, qconfig_mapping, example_inputs)
+        # Run training
+        train_loop(prepared_model, train_data)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx")
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        True,  # is_qat
+        example_inputs,
+        prepare_custom_config,
+        backend_config=backend_config,
+    )
+
+
+def _convert_fx(
+    graph_module: GraphModule,
+    is_reference: bool,
+    convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None,
+    is_standalone_module: bool = False,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+    is_decomposed: bool = False,
+    keep_original_weights: bool = False,
+) -> GraphModule:
+    """`is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx`"""
+    if convert_custom_config is None:
+        convert_custom_config = ConvertCustomConfig()
+
+    if isinstance(convert_custom_config, dict):
+        warnings.warn(
+            "Passing a convert_custom_config_dict to convert is deprecated and will not be supported "
+            "in a future version. Please pass in a ConvertCustomConfig instead.",
+            FutureWarning,
+            stacklevel=3,
+        )
+        convert_custom_config = ConvertCustomConfig.from_dict(convert_custom_config)
+
+    _check_is_graph_module(graph_module)
+    preserved_attr_names = convert_custom_config.preserved_attributes
+    preserved_attrs = {
+        attr: getattr(graph_module, attr)
+        for attr in preserved_attr_names
+        if hasattr(graph_module, attr)
+    }
+
+    quantized = convert(
+        graph_module,
+        is_reference,
+        convert_custom_config,
+        is_standalone_module,
+        _remove_qconfig_flag=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        is_decomposed=is_decomposed,
+        keep_original_weights=keep_original_weights,
+    )
+
+    attach_preserved_attrs_to_model(quantized, preserved_attrs)
+    return quantized
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def convert_fx(
+    graph_module: GraphModule,
+    convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+    keep_original_weights: bool = False,
+) -> GraphModule:
+    r"""Convert a calibrated or trained model to a quantized model
+
+    Args:
+        * `graph_module` (torch.fx.GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :class:`~torch.ao.quantization.fx.custom_config.ConvertCustomConfig` for more details
+
+        * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+
+           The keys must include the ones in the qconfig_mapping passed to `prepare_fx` or `prepare_qat_fx`,
+           with the same values or `None`. Additional keys can be specified with values set to `None`.
+
+          For each entry whose value is set to None, we skip quantizing that entry in the model::
+
+            qconfig_mapping = QConfigMapping()
+                .set_global(qconfig_from_prepare)
+                .set_object_type(torch.nn.functional.add, None)  # skip quantizing torch.nn.functional.add
+                .set_object_type(torch.nn.functional.linear, qconfig_from_prepare)
+                .set_module_name("foo.bar", None)  # skip quantizing module "foo.bar"
+
+         * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend, this includes quantization
+            mode support (static/dynamic/weight_only), dtype support (quint8/qint8 etc.),
+            observer placement for each operators and fused operators.
+            See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details
+
+    Return:
+        A quantized model (torch.nn.Module)
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # convert_fx converts a calibrated/trained model to a quantized model for the
+        # target hardware, this includes converting the model first to a reference
+        # quantized model, and then lower the reference quantized model to a backend
+        # Currently, the supported backends are fbgemm (onednn), qnnpack (xnnpack) and
+        # they share the same set of quantized operators, so we are using the same
+        # lowering procedure
+        #
+        # backend_config defines the corresponding reference quantized module for
+        # the weighted modules in the model, e.g. nn.Linear
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        quantized_model = convert_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=False,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        keep_original_weights=keep_original_weights,
+    )
+
+
+def convert_to_reference_fx(
+    graph_module: GraphModule,
+    convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""Convert a calibrated or trained model to a reference quantized model,
+    see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details,
+    reference quantized model is a standard representation of a quantized model provided
+    by FX Graph Mode Quantization, it can be further lowered to run on the target
+    hardware, like accelerators
+
+    Args:
+        * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+         * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend. See
+            :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+    Return:
+        A reference quantized model (GraphModule)
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        reference_quantized_model = convert_to_reference_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_to_reference_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=True,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+    )
+
+
+def _convert_to_reference_decomposed_fx(
+    graph_module: GraphModule,
+    convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None,
+    qconfig_mapping: QConfigMapping | dict[str, Any] | None = None,
+    backend_config: BackendConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""Convert a calibrated or trained model to a reference quantized model, with
+    decomposed representation for quantized Tensor
+    see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details,
+    reference quantized model is a standard representation of a quantized model provided
+    by FX Graph Mode Quantization, it can be further lowered to run on the target
+    hardware, like accelerators
+
+    Note: this is not public API
+
+    Args:
+        * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `_remove_qconfig`: not a parameter here; this function always passes `_remove_qconfig=False` to `_convert_fx`.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+         * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend. See
+            :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+    Return:
+        A reference quantized model (GraphModule) with operators working with decomposed quantized Tensor
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        reference_quantized_model = _convert_to_reference_decomposed_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once(
+        "quantization_api.quantize_fx._convert_to_reference_decomposed_fx"
+    )
+    return _convert_fx(
+        graph_module,
+        is_reference=True,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=False,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        is_decomposed=True,
+    )
+
+
+def _convert_standalone_module_fx(
+    graph_module: GraphModule,
+    is_reference: bool = False,
+    convert_custom_config: ConvertCustomConfig | dict[str, Any] | None = None,
+) -> GraphModule:
+    r"""[Internal use only] Convert a model produced by :func:`~torch.ao.quantization.prepare_standalone_module_fx`
+    and convert it to a quantized model
+
+    Returns a quantized standalone module, whether input/output is quantized is
+    specified by prepare_custom_config, with
+    input_quantized_idxs, output_quantized_idxs, please
+    see docs for prepare_fx for details
+    """
+    return _convert_fx(
+        graph_module,
+        is_reference,
+        convert_custom_config,
+        is_standalone_module=True,
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4caab1edcd010a66032cab51cae77ad8e4ed62
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_jit.py
@@ -0,0 +1,423 @@
+# mypy: allow-untyped-defs
+
+import torch
+from torch.ao.quantization.qconfig import QConfig
+from torch.ao.quantization.quant_type import QuantType
+from torch.jit._recursive import wrap_cpp_module
+
+
+__all__ = [
+    "script_qconfig",
+    "script_qconfig_dict",
+    "fuse_conv_bn_jit",
+    "prepare_jit",
+    "prepare_dynamic_jit",
+    "convert_jit",
+    "convert_dynamic_jit",
+    "quantize_jit",
+    "quantize_dynamic_jit",
+]
+
+
+def _check_is_script_module(model):
+    """Raise ``ValueError`` if ``model`` is not a ``torch.jit.ScriptModule``."""
+    if not isinstance(model, torch.jit.ScriptModule):
+        raise ValueError("input must be a script module, got: " + str(type(model)))
+
+
+def _check_forward_method(model):
+    """Raise ``ValueError`` if ``model._c`` reports no method named ``forward``."""
+    if not model._c._has_method("forward"):
+        raise ValueError("input script module does not have forward method")
+
+
+def script_qconfig(qconfig):
+    r"""Instantiate the activation and weight observer modules and script
+    them; these observer module instances will be deepcopied during the
+    prepare_jit step.
+
+    Args:
+        `qconfig`: object whose `activation` and `weight` attributes are callables
+        that build the observer modules
+
+    Return:
+        A `QConfig` whose `activation` and `weight` fields hold the `_c` attribute
+        of the corresponding scripted observer.
+    """
+    return QConfig(
+        activation=torch.jit.script(qconfig.activation())._c,
+        weight=torch.jit.script(qconfig.weight())._c,
+    )
+
+
+def script_qconfig_dict(qconfig_dict):
+    r"""Helper function used by `prepare_jit`.
+    Apply `script_qconfig` to every truthy value in `qconfig_dict`; falsy
+    values (e.g. None) are mapped to None. Keys are kept unchanged.
+    """
+    return {k: script_qconfig(v) if v else None for k, v in qconfig_dict.items()}
+
+
+def fuse_conv_bn_jit(model, inplace=False):
+    r"""Fuse conv - bn module
+    Works for eval model only.
+
+    Args:
+        model: TorchScript model from scripting or tracing
+        inplace: if True, `model` is rebuilt in place from the fused module via
+        `model._reconstruct`; otherwise a new wrapper is created with
+        `wrap_cpp_module`
+
+    Return:
+        The fused model (`model` itself when `inplace` is True).
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.fuse_conv_bn_jit")
+    model_c = model._c
+    # The pass returns the module to use from here on; it is rebound to model_c.
+    model_c = torch._C._jit_pass_fold_convbn(model_c)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+
+def _prepare_jit(model, qconfig_dict, inplace=False, quant_type=QuantType.STATIC):
+    """Shared implementation of `prepare_jit` and `prepare_dynamic_jit`.
+
+    Validates the inputs, scripts the qconfigs, fuses conv - bn and then runs
+    the `_jit_pass_insert_observers` pass on the model's `forward` method.
+
+    Args:
+        model: script module to prepare
+        qconfig_dict: dictionary keyed by names (str); values are passed through
+        `script_qconfig_dict`
+        inplace: forwarded to `fuse_conv_bn_jit` and to the observer pass; also
+        selects whether `model` is reconstructed in place or re-wrapped
+        quant_type: `QuantType` forwarded to the observer pass
+
+    Raises:
+        ValueError: if `model` is not a script module, has no `forward` method,
+        or `qconfig_dict` contains a non-string key.
+    """
+    _check_is_script_module(model)
+    _check_forward_method(model)
+    # Iterating a dict yields its keys, so this checks the key types.
+    if not all(isinstance(x, str) for x in qconfig_dict):
+        raise ValueError("qconfig_dict should only contain names(str) as keys.")
+    scripted_qconfig_dict = script_qconfig_dict(qconfig_dict)
+    model = fuse_conv_bn_jit(model, inplace)
+    model_c = torch._C._jit_pass_insert_observers(
+        model._c, "forward", scripted_qconfig_dict, inplace, quant_type
+    )
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+
+def _prepare_ondevice_jit(
+    model,
+    qconfig_dict,
+    method_name="forward",
+    inplace=False,
+    quant_type=QuantType.STATIC,
+):
+    """Insert observers for on-device post training quantization of `method_name`.
+
+    Validates the inputs, scripts the qconfigs, inlines the graph of
+    `method_name`, fuses conv - bn and then runs the
+    `_jit_pass_insert_observer_method_for_ondevice_ptq` pass.
+
+    Args:
+        model: script module to prepare
+        qconfig_dict: dictionary keyed by names (str); values are passed through
+        `script_qconfig_dict`
+        method_name: name of the method whose graph is inlined and observed
+        inplace: forwarded to `fuse_conv_bn_jit` and to the observer pass; also
+        selects whether `model` is reconstructed in place or re-wrapped
+        quant_type: `QuantType` forwarded to the observer pass
+
+    Raises:
+        ValueError: if `model` is not a script module or `qconfig_dict` contains
+        a non-string key.
+    """
+    _check_is_script_module(model)
+    # Iterating a dict yields its keys, so this checks the key types.
+    if not all(isinstance(x, str) for x in qconfig_dict):
+        raise ValueError("qconfig_dict should only contain names(str) as keys.")
+    scripted_qconfig_dict = script_qconfig_dict(qconfig_dict)
+    # The inline pass is applied to the method's graph object; its return value
+    # is not used.
+    method_graph = model._c._get_method(method_name).graph
+    torch._C._jit_pass_inline(method_graph)
+    model = fuse_conv_bn_jit(model, inplace)
+    model_c = torch._C._jit_pass_insert_observer_method_for_ondevice_ptq(
+        model._c, method_name, scripted_qconfig_dict, inplace, quant_type
+    )
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+
+def prepare_jit(model, qconfig_dict, inplace=False):
+    """Log the API usage and run `_prepare_jit` with `quant_type=QuantType.STATIC`.
+
+    Return:
+        The model returned by `_prepare_jit`.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_jit")
+    return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.STATIC)
+
+
+def prepare_dynamic_jit(model, qconfig_dict, inplace=False):
+    """Log the API usage and run `_prepare_jit` with `quant_type=QuantType.DYNAMIC`.
+
+    Return:
+        The model returned by `_prepare_jit`.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_dynamic_jit")
+    return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.DYNAMIC)
+
+
+def _prepare_ondevice_dynamic_jit(
+    model, qconfig_dict, method_name="forward", inplace=False
+):
+    """Run `_prepare_ondevice_jit` with `quant_type=QuantType.DYNAMIC`."""
+    return _prepare_ondevice_jit(
+        model, qconfig_dict, method_name, inplace, quant_type=QuantType.DYNAMIC
+    )
+
+
+def _convert_jit(
+    model, inplace=False, debug=False, quant_type=QuantType.STATIC, preserved_attrs=None
+):
+    """Shared implementation of `convert_jit` and `convert_dynamic_jit`.
+
+    Puts `model` in eval mode, runs `_jit_pass_insert_quant_dequant` on its
+    `forward` method and, unless `debug` is set, runs `_jit_pass_quant_finalize`.
+    Constant propagation and DCE passes are then run on the resulting graph.
+
+    Args:
+        model: script module to convert
+        inplace: forwarded to the quant/dequant pass; also selects whether
+        `model` is reconstructed in place or re-wrapped
+        debug: forwarded to the quant/dequant pass; when True the finalize step
+        (and the move to CPU) is skipped
+        quant_type: `QuantType` forwarded to both passes
+        preserved_attrs: forwarded to the finalize pass; None is replaced by an
+        empty list
+
+    Raises:
+        ValueError: if `model` is not a script module.
+    """
+    _check_is_script_module(model)
+    model.eval()
+    model_c = model._c
+    model_c = torch._C._jit_pass_insert_quant_dequant(
+        model_c, "forward", inplace, debug, quant_type
+    )
+    if not debug:
+        # NOTE: all() over an empty iterable is True, so a model without any
+        # parameters is treated as XPU here and is not moved to CPU.
+        is_xpu = all(p.device.type == "xpu" for p in model.parameters())
+        if not is_xpu:
+            # Moving model parameters to CPU since quantized operators
+            # are only supported on CPU and XPU right now
+            model.cpu()
+        if preserved_attrs is None:
+            preserved_attrs = []
+        model_c = torch._C._jit_pass_quant_finalize(
+            model_c, quant_type, preserved_attrs
+        )
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    # Clean-up passes on the graph of the model that is returned.
+    torch._C._jit_pass_constant_propagation(model.graph)
+    torch._C._jit_pass_dce(model.graph)
+    return model
+
+
+def _convert_ondevice_jit(
+    model, method_name, inplace=False, debug=False, quant_type=QuantType.STATIC
+):
+    """Insert quant/dequant and finalize for on-device post training quantization.
+
+    Runs `_jit_pass_insert_quant_dequant_for_ondevice_ptq` on the
+    `observe_<method_name>` method and then
+    `_jit_pass_quant_finalize_for_ondevice_ptq` with `quantize_<method_name>`.
+
+    Args:
+        model: script module that was prepared for on-device quantization
+        method_name: base method name (without the `observe_` prefix)
+        inplace: forwarded to the quant/dequant pass; also selects whether
+        `model` is reconstructed in place or re-wrapped
+        debug: forwarded to the quant/dequant pass
+        quant_type: must be `QuantType.DYNAMIC`. NOTE: the default value
+        (`QuantType.STATIC`) is rejected by the check below, so callers have to
+        pass this explicitly.
+
+    Raises:
+        ValueError: if `model` is not a script module.
+        AssertionError: if `quant_type` is not `QuantType.DYNAMIC` or
+        `method_name` starts with `observe_`.
+    """
+    _check_is_script_module(model)
+    if quant_type != QuantType.DYNAMIC:
+        raise AssertionError(
+            "This API, while should work for static quant, is only tested for dynamic quant."
+        )
+    if method_name.startswith("observe_"):
+        raise AssertionError("Pass in valid method to be quantized, e.g. forward")
+    observe_method_name = "observe_" + method_name
+    quantize_method_name = "quantize_" + method_name
+    # NOTE: this assignment is overwritten by the next statement, which reads
+    # `model._c` directly.
+    model_c = model._c
+    # `quant_type` is only validated above; both passes are always invoked with
+    # QuantType.DYNAMIC.
+    model_c = torch._C._jit_pass_insert_quant_dequant_for_ondevice_ptq(
+        model._c, observe_method_name, inplace, debug, QuantType.DYNAMIC
+    )
+    model_c = torch._C._jit_pass_quant_finalize_for_ondevice_ptq(
+        model_c, QuantType.DYNAMIC, quantize_method_name
+    )
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+
+def convert_jit(model, inplace=False, debug=False, preserved_attrs=None):
+    """Log the API usage and run `_convert_jit` with `quant_type=QuantType.STATIC`.
+
+    All other arguments are forwarded unchanged to `_convert_jit`, whose result
+    is returned.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_jit")
+    return _convert_jit(
+        model,
+        inplace,
+        debug,
+        quant_type=QuantType.STATIC,
+        preserved_attrs=preserved_attrs,
+    )
+
+
+def convert_dynamic_jit(model, inplace=False, debug=False, preserved_attrs=None):
+    """Log the API usage and run `_convert_jit` with `quant_type=QuantType.DYNAMIC`.
+
+    All other arguments are forwarded unchanged to `_convert_jit`, whose result
+    is returned.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_dynamic_jit")
+    return _convert_jit(
+        model,
+        inplace,
+        debug,
+        quant_type=QuantType.DYNAMIC,
+        preserved_attrs=preserved_attrs,
+    )
+
+
+def _convert_ondevice_dynamic_jit(model, method_name, inplace=False, debug=False):
+    """Run `_convert_ondevice_jit` with `quant_type=QuantType.DYNAMIC`."""
+    return _convert_ondevice_jit(
+        model, method_name, inplace, debug, quant_type=QuantType.DYNAMIC
+    )
+
+
+def _quantize_ondevice_dynamic_jit_impl(
+    model, qconfig_dict, method_name, inplace=False
+):
+    """Run the on-device dynamic prepare step followed by the convert step.
+
+    The same `inplace` value is passed to both steps, and the model returned by
+    the prepare step is the one handed to the convert step.
+    """
+    model = _prepare_ondevice_dynamic_jit(model, qconfig_dict, method_name, inplace)
+    model = _convert_ondevice_dynamic_jit(model, method_name, inplace)
+    return model
+
+
+def _quantize_jit(
+    model,
+    qconfig_dict,
+    run_fn=None,
+    run_args=None,
+    inplace=False,
+    debug=False,
+    quant_type=QuantType.STATIC,
+):
+    """Shared implementation of `quantize_jit` and `quantize_dynamic_jit`.
+
+    For `QuantType.DYNAMIC` the model is prepared and converted directly. For
+    any other `quant_type` the model is prepared, `run_fn(model, *run_args)` is
+    called on the prepared model, and the model is then converted.
+
+    Args:
+        model: input float TorchScript model
+        qconfig_dict: forwarded to the prepare step
+        run_fn: calibration function; only used (and required) on the
+        non-dynamic path
+        run_args: positional arguments for `run_fn`; only used (and required) on
+        the non-dynamic path
+        inplace: forwarded to the prepare step only
+        debug: forwarded to the convert step
+        quant_type: selects the dynamic or the non-dynamic path
+
+    Raises:
+        AssertionError: on the non-dynamic path, if `run_fn` or `run_args` is
+        falsy. Note this also rejects an empty `run_args` sequence.
+    """
+    # Always do inplace convert because the Tensor is already
+    # copied in prepare_jit when inplace is False
+    if quant_type == QuantType.DYNAMIC:
+        model = prepare_dynamic_jit(model, qconfig_dict, inplace)
+        model = convert_dynamic_jit(model, True, debug)
+    else:
+        if not run_fn:
+            raise AssertionError(
+                "Must provide calibration function for post training static quantization"
+            )
+        if not run_args:
+            raise AssertionError(
+                "Must provide calibration dataset for post training static quantization"
+            )
+        model = prepare_jit(model, qconfig_dict, inplace)
+        run_fn(model, *run_args)
+        model = convert_jit(model, True, debug)
+
+    # Clean-up passes on the graph of the model that is returned.
+    torch._C._jit_pass_constant_propagation(model.graph)
+    torch._C._jit_pass_dce(model.graph)
+    return model
+
+
+def quantize_jit(model, qconfig_dict, run_fn, run_args, inplace=False, debug=False):
+    r"""Quantize the input float TorchScript model with
+    post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn` which will run the calibration step, after that we will
+    convert the model to a quantized model.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and
+        qconfig for that module as value, empty key means the qconfig will be applied
+        to whole model unless it's overwritten by more specific configurations, the
+        qconfig for each module is either found in the dictionary or fallback to
+        the qconfig of parent module.
+
+        Right now qconfig_dict is the only way to configure how the model is quantized,
+        and it is done in the granularity of module, that is, we only support one type
+        of qconfig for each torch.nn.Module, and the qconfig for sub module will
+        override the qconfig for parent module, empty string means global configuration.
+        `run_fn`: a calibration function for calibrating the prepared model
+        `run_args`: positional arguments for `run_fn`
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+        `debug`: flag for producing a debug friendly model (preserve weight attribute)
+
+    Return:
+        Quantized TorchScript model.
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import get_default_qconfig
+    from torch.ao.quantization import quantize_jit
+
+    ts_model = torch.jit.script(
+        float_model.eval()
+    )  # or torch.jit.trace(float_model, input)
+    qconfig = get_default_qconfig("fbgemm")
+
+
+    def calibrate(model, data_loader):
+        model.eval()
+        with torch.no_grad():
+            for image, target in data_loader:
+                model(image)
+
+
+    quantized_model = quantize_jit(
+        ts_model, {"": qconfig}, calibrate, [data_loader_test]
+    )
+    ```
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_jit")
+    return _quantize_jit(
+        model,
+        qconfig_dict,
+        run_fn,
+        run_args,
+        inplace,
+        debug,
+        quant_type=QuantType.STATIC,
+    )
+
+
+def quantize_dynamic_jit(model, qconfig_dict, inplace=False, debug=False):
+    r"""Quantize the input float TorchScript model with
+    post training dynamic quantization.
+    Currently only qint8 quantization of torch.nn.Linear is supported.
+
+    No calibration function or calibration data is taken by this API.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and
+        qconfig for that module as value, please see detailed
+        descriptions in :func:`~torch.ao.quantization.quantize_jit`
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+        `debug`: flag for producing a debug friendly model (preserve weight attribute)
+
+    Return:
+        Quantized TorchScript model.
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import per_channel_dynamic_qconfig
+    from torch.ao.quantization import quantize_dynamic_jit
+
+    ts_model = torch.jit.script(
+        float_model.eval()
+    )  # or torch.jit.trace(float_model, input)
+    qconfig = per_channel_dynamic_qconfig
+
+    quantized_model = quantize_dynamic_jit(ts_model, {"": qconfig})
+    ```
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_dynamic_jit")
+    return _quantize_jit(
+        model, qconfig_dict, inplace=inplace, debug=debug, quant_type=QuantType.DYNAMIC
+    )
+
+
+def _quantize_ondevice_dynamic_jit(
+    model, qconfig_dict, method_name="forward", inplace=False
+):
+    r"""Prepares the input float TorchScript model with
+    *on-device* post training dynamic quantization.
+    Currently only qint8 quantization of torch.nn.Linear is supported.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and
+        qconfig for that module as value, please see detailed
+        descriptions in :func:`~torch.ao.quantization.quantize_jit`
+        `method_name`: Name of the method within the model, to be prepared for quantization
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+
+    Return:
+        TorchScript model that is ready for on device quantization.
+        This means that the returned
+        model has:
+        - Method is inlined.
+        - Model has observer modules inserted in the model.
+        - Model has packed params inserted in the model. However they are empty as in they don't
+          contain valid quantized weights.
+        - observe_<method_name> is added that observe the values to be quantized.
+        - reset_observers_<method_name> to reset observers.
+        - quantize_<method_name> is added to the model.
+          - This method extract scale, zero points.
+          - Quantizes observed weights.
+          - Creates packed params from it and update the attribute of the model with the new values
+            for the packed params.
+          - Reset the original fp32 weights with empty tensor using SetAttr.
+        - quantized_<method_name> is added to the model.
+          - This method uses quantized weights and quantized linear ops instead of fp32 op.
+          - This method should be used for inference post PTQ.
+        - Note that all method's signatures should be the same as method_name.
+
+        Later on device:
+        - Run reset_observers_<method_name>
+        - Run observe_<method_name>
+        - Run quantize_<method_name>
+        - Now model can be saved and loaded later.
+        - Run model with quantized_<method_name>
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import per_channel_dynamic_qconfig
+    from torch.ao.quantization.quantize_jit import _quantize_ondevice_dynamic_jit
+
+    ts_model = torch.jit.script(
+        float_model.eval()
+    )  # or torch.jit.trace(float_model, input)
+    qconfig = per_channel_dynamic_qconfig
+    quant_ready_model = _quantize_ondevice_dynamic_jit(
+        ts_model, {"": qconfig}, "forward", True
+    )
+    ```
+    """
+    return _quantize_ondevice_dynamic_jit_impl(
+        model, qconfig_dict, method_name, inplace=inplace
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..169e2905ddbdcc2ec86d92d1b858abe7e91af298
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantize_pt2e.py
@@ -0,0 +1,262 @@
+import typing_extensions
+
+import torch
+from torch._export.passes.constant_folding import constant_fold
+from torch.ao.quantization.pt2e.duplicate_dq_pass import DuplicateDQPass
+from torch.ao.quantization.pt2e.port_metadata_pass import PortNodeMetaForQDQ
+from torch.ao.quantization.quantizer import (  # noqa: F401
+    DerivedQuantizationSpec,
+    FixedQParamsQuantizationSpec,
+    QuantizationAnnotation,
+    QuantizationSpec,
+    QuantizationSpecBase,
+    Quantizer,
+    SharedQuantizationSpec,
+)
+from torch.fx import GraphModule, Node
+from torch.fx.passes.infra.pass_manager import PassManager
+
+from .pt2e.prepare import prepare
+from .pt2e.qat_utils import _fold_conv_bn_qat, _fuse_conv_bn_qat
+from .pt2e.representation import reference_representation_rewrite
+from .pt2e.utils import _disallow_eval_train, _fuse_conv_bn_, _get_node_name_to_scope
+from .quantize_fx import _convert_to_reference_decomposed_fx
+from .utils import DEPRECATION_WARNING
+
+
+__all__ = [
+    "prepare_pt2e",
+    "prepare_qat_pt2e",
+    "convert_pt2e",
+]
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare_pt2e(
+    model: GraphModule,
+    quantizer: Quantizer,
+) -> GraphModule:
+    """Prepare a model for post training quantization
+
+    Args:
+      * `model` (torch.fx.GraphModule): a model captured by `torch.export.export_for_training` API.
+      * `quantizer`: A backend specific quantizer that conveys how user want the
+        model to be quantized. Tutorial for how to write a quantizer can be found here:
+        https://pytorch.org/tutorials/prototype/pt2e_quantizer.html
+
+    Return:
+      A GraphModule with observer (based on quantizer annotation), ready for calibration
+
+    Example::
+
+        import torch
+        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+        from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+            XNNPACKQuantizer,
+            get_symmetric_quantization_config,
+        )
+
+        class M(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # initialize a floating point model
+        float_model = M().eval()
+        # tuple of positional example inputs for the model
+        example_inputs = (torch.randn(1, 5),)
+
+        # define calibration function
+        def calibrate(model, data_loader):
+            model.eval()
+            with torch.no_grad():
+                for image, target in data_loader:
+                    model(image)
+
+        # Step 1. program capture
+        # NOTE: this API will be updated to torch.export API in the future, but the captured
+        # result should mostly stay the same
+        m = torch.export.export_for_training(float_model, example_inputs).module()
+        # we get a model with aten ops
+
+        # Step 2. quantization
+        # backend developer will write their own Quantizer and expose methods to allow
+        # users to express how they
+        # want the model to be quantized
+        quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
+        m = prepare_pt2e(m, quantizer)
+
+        # run calibration
+        # calibrate(m, sample_inference_data)
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_pt2e")
+    # Keep a reference to the incoming graph-level metadata; it is merged into
+    # the returned model's `meta` at the end.
+    original_graph_meta = model.meta
+    # Computed on the incoming model, before fusion and annotation.
+    node_name_to_scope = _get_node_name_to_scope(model)
+    # TODO: check qconfig_mapping to make sure conv and bn are both configured
+    # to be quantized before fusion
+    # TODO: (maybe) rewrite this with subgraph_rewriter
+    _fuse_conv_bn_(model)
+    model = quantizer.transform_for_annotation(model)
+    quantizer.annotate(model)
+    quantizer.validate(model)
+    model = prepare(
+        model,
+        node_name_to_scope,
+        is_qat=False,
+        obs_or_fq_callback=quantizer.prepare_obs_or_fq_callback,
+    )
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def prepare_qat_pt2e(
+    model: GraphModule,
+    quantizer: Quantizer,
+) -> GraphModule:
+    """Prepare a model for quantization aware training
+
+    Args:
+      * `model` (torch.fx.GraphModule): see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e`
+      * `quantizer`: see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e`
+
+    Return:
+      A GraphModule with fake quant modules (based on quantizer annotation), ready for
+      quantization aware training
+
+    Example::
+
+        import torch
+        from torch.ao.quantization.quantize_pt2e import prepare_qat_pt2e
+        from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+            XNNPACKQuantizer,
+            get_symmetric_quantization_config,
+        )
+
+        class M(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # initialize a floating point model
+        float_model = M().eval()
+        # tuple of positional example inputs for the model
+        example_inputs = (torch.randn(1, 5),)
+
+        # define the training loop for quantization aware training
+        def train_loop(model, train_data):
+            model.train()
+            for image, target in train_data:
+                ...
+
+        # Step 1. program capture
+        # NOTE: this API will be updated to torch.export API in the future, but the captured
+        # result should mostly stay the same
+        m = torch.export.export_for_training(float_model, example_inputs).module()
+        # we get a model with aten ops
+
+        # Step 2. quantization
+        # backend developer will write their own Quantizer and expose methods to allow
+        # users to express how they
+        # want the model to be quantized
+        quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
+        m = prepare_qat_pt2e(m, quantizer)
+
+        # run quantization aware training
+        # train_loop(m, train_data)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_qat_pt2e")
+    # Keep a reference to the incoming graph-level metadata; it is merged into
+    # the returned model's `meta` at the end.
+    original_graph_meta = model.meta
+    # Computed on the incoming model, before annotation and fusion.
+    node_name_to_scope = _get_node_name_to_scope(model)
+    model = quantizer.transform_for_annotation(model)
+    quantizer.annotate(model)
+    quantizer.validate(model)
+    # Perform fusion after annotate to avoid quantizing ops in the new
+    # subgraph that don't need to be quantized
+    # TODO: only fuse if conv and bn are both configured to be quantized
+    _fuse_conv_bn_qat(model)
+    model = prepare(
+        model,
+        node_name_to_scope,
+        is_qat=True,
+        obs_or_fq_callback=quantizer.prepare_obs_or_fq_callback,
+    )
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
+
+
+# Quantize ops recognised by `_quant_node_constraint`, which `convert_pt2e`
+# passes to `constant_fold` as its constraint function.
+_QUANT_OPS = [
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+    torch.ops.pt2e_quant.quantize_affine,
+]
+
+
+def _quant_node_constraint(n: Node) -> bool:
+    """Return True if `n` is a `call_function` node whose target is in `_QUANT_OPS`.
+
+    If there is any pure ops between get_attr and quantize op they will be const propagated
+    e.g. get_attr(weight) -> transpose -> quantize -> dequantize*
+    (Note: dequantize op is not going to be constant propagated)
+
+    This filter is added because we don't want to constant fold the things that are not
+    related to quantization
+    """
+    return n.op == "call_function" and n.target in _QUANT_OPS
+
+
+@typing_extensions.deprecated(DEPRECATION_WARNING)
+def convert_pt2e(
+    model: GraphModule,
+    use_reference_representation: bool = False,
+    fold_quantize: bool = True,
+) -> GraphModule:
+    """Convert a calibrated/trained model to a quantized model
+
+    Args:
+      * `model` (torch.fx.GraphModule): calibrated/trained model
+      * `use_reference_representation` (bool): boolean flag to indicate whether to produce reference representation or not
+      * `fold_quantize` (bool): boolean flag for whether fold the quantize op or not
+
+    Returns:
+        quantized model, either in q/dq representation or reference representation
+
+    Raises:
+        ValueError: if `use_reference_representation` is not a `bool`
+
+    Example::
+
+        # prepared_model: the model produced by `prepare_pt2e`/`prepare_qat_pt2e` and calibration/training
+        # `convert_pt2e` produces a quantized model that represents quantized computation with
+        # quantize dequantize ops and fp32 ops by default.
+        # Please refer to
+        # https://pytorch.org/tutorials/prototype/pt2e_quant_ptq_static.html#convert-the-calibrated-model-to-a-quantized-model
+        # for detailed explanation of output quantized model
+        quantized_model = convert_pt2e(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.convert_pt2e")
+    # Only `use_reference_representation` is type-checked; `fold_quantize` is
+    # used as given.
+    if not isinstance(use_reference_representation, bool):
+        raise ValueError(
+            "Unexpected argument type for `use_reference_representation`, "
+            f"please make sure you intend to pass argument {use_reference_representation} to convert_pt2e"
+        )
+    # Keep a reference to the incoming graph-level metadata; it is merged into
+    # the returned model's `meta` at the end.
+    original_graph_meta = model.meta
+    model = _convert_to_reference_decomposed_fx(model)
+    model = _fold_conv_bn_qat(model)
+
+    # Each pass runs in its own PassManager; the resulting graph module of one
+    # is the input of the next.
+    pm = PassManager([DuplicateDQPass()])
+    model = pm(model).graph_module
+
+    pm = PassManager([PortNodeMetaForQDQ()])
+    model = pm(model).graph_module
+
+    if fold_quantize:
+        # The return value is not used, so the fold is expected to act on
+        # `model` itself.
+        constant_fold(model, _quant_node_constraint)
+
+    if use_reference_representation:
+        model = reference_representation_rewrite(model)
+
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5cd5e8696d39781004960f47e6f44d3b1987ff4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/__init__.py
@@ -0,0 +1,22 @@
+from .quantizer import (
+    DerivedQuantizationSpec,
+    EdgeOrNode,
+    FixedQParamsQuantizationSpec,
+    QuantizationAnnotation,
+    QuantizationSpec,
+    QuantizationSpecBase,
+    Quantizer,
+    SharedQuantizationSpec,
+)
+
+
+__all__ = [
+    "EdgeOrNode",
+    "Quantizer",
+    "QuantizationSpecBase",
+    "QuantizationSpec",
+    "FixedQParamsQuantizationSpec",
+    "SharedQuantizationSpec",
+    "DerivedQuantizationSpec",
+    "QuantizationAnnotation",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..15404cc560117713bf8c952f594c051b1c13e3a4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .quantizer import QuantizationAnnotation, Quantizer
+
+
+if TYPE_CHECKING:
+    import torch
+    from torch.fx import Node
+
+__all__ = [
+    "ComposableQuantizer",
+]
+
+
+class ComposableQuantizer(Quantizer):
+    """
+    ComposableQuantizer allows users to combine more than one quantizer into a single quantizer.
+    This allows users to quantize a model with multiple quantizers. E.g., embedding quantization
+    may be supported by one quantizer while linear layers and other ops might be supported by another
+    quantizer.
+
+    ComposableQuantizer is initialized with a list of `Quantizer` instances.
+    The order of the composition matters since that is the order in which the quantizers will be
+    applied.
+    Example:
+    ```
+    embedding_quantizer = EmbeddingQuantizer()
+    linear_quantizer = MyLinearQuantizer()
+    xnnpack_quantizer = (
+        XNNPACKQuantizer()
+    )  # to handle ops not quantized by previous two quantizers
+    composed_quantizer = ComposableQuantizer(
+        [embedding_quantizer, linear_quantizer, xnnpack_quantizer]
+    )
+    prepared_m = prepare_pt2e(model, composed_quantizer)
+    ```
+    """
+
+    def __init__(self, quantizers: list[Quantizer]):
+        super().__init__()
+        self.quantizers = quantizers
+        # Annotation object last recorded for each node, used to detect a later
+        # quantizer replacing or removing it.
+        self._graph_annotations: dict[Node, QuantizationAnnotation] = {}
+
+    def _record_and_validate_annotations(
+        self, gm: torch.fx.GraphModule, quantizer: Quantizer
+    ) -> None:
+        """Record the annotations present on `gm` after `quantizer` has run.
+
+        Raises:
+            RuntimeError: if a node's previously recorded annotation has been
+            replaced by a different object, or has been removed.
+        """
+        for n in gm.graph.nodes:
+            if "quantization_annotation" in n.meta:
+                # check if the annotation has been changed by
+                # comparing QuantizationAnnotation object id
+                if n in self._graph_annotations and (
+                    id(self._graph_annotations[n])
+                    != id(n.meta["quantization_annotation"])
+                ):
+                    raise RuntimeError(
+                        f"Quantizer {quantizer.__class__.__name__} has changed annotations on node {n}"
+                    )
+                else:
+                    self._graph_annotations[n] = n.meta["quantization_annotation"]
+            else:
+                if n in self._graph_annotations:
+                    raise RuntimeError(
+                        f"Quantizer {quantizer.__class__.__name__} has removed annotations on node {n}"
+                    )
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now
+
+        Calls `annotate` on each quantizer in order and, after each one,
+        records and validates the annotations on `model`.
+        """
+        for quantizer in self.quantizers:
+            quantizer.annotate(model)
+            self._record_and_validate_annotations(model, quantizer)
+        return model
+
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        """Apply each quantizer's `transform_for_annotation` in order, feeding
+        the result of one into the next, and return the final model.
+        """
+        for quantizer in self.quantizers:
+            model = quantizer.transform_for_annotation(model)
+        return model
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        """No-op: the `validate` methods of the composed quantizers are not called."""
+        pass
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b8ef1030bfdcdeb88b58179f4f2ea83c895aad2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py
@@ -0,0 +1,94 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import copy
+
+import torch
+import torch.nn.functional as F
+from torch.ao.quantization.observer import PerChannelMinMaxObserver
+from torch.ao.quantization.quantizer.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    Quantizer,
+)
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    OperatorConfig,
+    OperatorPatternType,
+    QuantizationConfig,
+)
+
+
+__all__ = [
+    "get_embedding_operators_config",
+    "EmbeddingQuantizer",
+]
+
+
+def get_embedding_operators_config() -> OperatorConfig:
+    """Build the `OperatorConfig` used for embedding quantization.
+
+    Only a weight spec is set on the `QuantizationConfig` (the other three
+    positional fields are None). The operator patterns are
+    `[torch.nn.Embedding]` and `[F.embedding]`.
+
+    Returns:
+        A deep copy of the freshly built `OperatorConfig`.
+    """
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.uint8,
+        qscheme=torch.per_channel_affine_float_qparams,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=PerChannelMinMaxObserver.with_args(eps=2**-12),
+    )
+    quantization_config = QuantizationConfig(None, None, weight_quantization_spec, None)
+    ops: list[OperatorPatternType] = [[torch.nn.Embedding]]
+    ops.append([F.embedding])
+    supported_config_and_operators = OperatorConfig(
+        config=quantization_config, operators=ops
+    )
+    return copy.deepcopy(supported_config_and_operators)
+
+
+class EmbeddingQuantizer(Quantizer):
+    """Quantizer that annotates `torch.ops.aten.embedding.default` nodes with
+    the weight spec from `get_embedding_operators_config`.
+    """
+
+    @classmethod
+    def get_supported_quantization_configs(cls) -> list[QuantizationConfig]:
+        """Return the distinct configs found in `cls.get_supported_operators()`."""
+        # Each entry is unpacked as a (config, operators) pair; the set removes
+        # duplicate configs.
+        op_configs: set[QuantizationConfig] = {
+            spec for spec, _ in cls.get_supported_operators()
+        }
+        return list(op_configs)
+
+    @classmethod
+    def get_supported_operator_for_quantization_config(
+        cls, quantization_config: QuantizationConfig
+    ) -> list[OperatorPatternType]:
+        """Return the operator patterns of the first supported entry whose config
+        equals `quantization_config`, or an empty list if there is none.
+        """
+        for config, ops in cls.get_supported_operators():
+            # note: this assumes each entry in cls.get_supported_operators()
+            # corresponds to one spec, e.g. we don't have
+            # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)]
+            # where the first and second entry have the same spec but did not
+            # merge the op list
+            if config == quantization_config:
+                return ops
+        return []
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now
+
+        Annotates the embedding nodes of `model.graph` and returns `model`.
+        """
+        self._annotate_embedding_ops(model.graph)
+        return model
+
+    def _annotate_embedding_ops(self, graph: torch.fx.Graph) -> None:
+        """Set `meta["quantization_annotation"]` on every
+        `torch.ops.aten.embedding.default` call in `graph`.
+
+        Raises:
+            ValueError: if such a node is found and the embedding config has no
+            weight spec.
+        """
+        embedding_config: OperatorConfig = get_embedding_operators_config()
+        for node in graph.nodes:
+            # Keep node parsing based annotations instead of module partitioners
+            # just as an example of alternate ways of annotating
+            if (
+                node.op == "call_function"
+                and node.target is torch.ops.aten.embedding.default
+            ):
+                if embedding_config.config.weight is None:
+                    raise ValueError(
+                        "Embedding config must have a valid weight quantization spec."
+                    )
+                # The node's first argument is mapped to the weight spec; any
+                # existing annotation on the node is overwritten.
+                node.meta["quantization_annotation"] = QuantizationAnnotation(
+                    input_qspec_map={
+                        node.args[0]: embedding_config.config.weight,
+                    }
+                )
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        """No-op."""
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> list[OperatorConfig]:
+        """Return a one-element list holding `get_embedding_operators_config()`."""
+        return [get_embedding_operators_config()]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6e10526b4cc4ca58d099523d32ebd57a393a1dd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/quantizer.py
@@ -0,0 +1,182 @@
+# mypy: allow-untyped-defs
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Annotated
+
+import torch
+from torch import Tensor
+from torch.ao.quantization import ObserverOrFakeQuantize
+from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+from torch.fx import Node
+
+
+__all__ = [
+    "Quantizer",
+    "QuantizationSpecBase",
+    "QuantizationSpec",
+    "FixedQParamsQuantizationSpec",
+    "EdgeOrNode",
+    "SharedQuantizationSpec",
+    "DerivedQuantizationSpec",
+    "QuantizationAnnotation",
+]
+
+
+class QuantizationSpecBase(ABC):  # noqa: B024
+    """Base class for different types of quantization specs that allows users to
+    specify how to quantize a Tensor (input/output of a Node) in the model
+    """
+
+
+@dataclass(eq=True, frozen=True)
+class QuantizationSpec(QuantizationSpecBase):
+    """Quantization spec for common operators that lets the user specify how to
+    quantize a Tensor; this includes dtype, quant_min, quant_max etc.
+    """
+
+    dtype: torch.dtype
+    # observer or fake_quantize constructor such as
+    # MinMaxObserver, PerChannelHistogramObserver etc.
+    # or we can attach some custom args to them
+    # e.g. MinMaxObserver.with_args(eps=eps)
+    observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor
+    quant_min: int | None = None
+    quant_max: int | None = None
+    qscheme: torch.qscheme | None = None
+    ch_axis: int | None = None
+    is_dynamic: bool = False
+
+    def __post_init__(self):
+        # TODO: add init for quant_min/quant_max
+        # when both bounds are given, quant_min must not exceed quant_max (equal is allowed)
+        if (
+            self.quant_min is not None
+            and self.quant_max is not None
+            and self.quant_min > self.quant_max
+        ):
+            raise ValueError(
+                f"quant_min {self.quant_min} must be <= quant_max {self.quant_max}."
+            )
+
+        # ch_axis must be less than the number of channels
+        # but no way to check here. Just check that it is not < 0.
+        if self.ch_axis is not None and self.ch_axis < 0:
+            raise ValueError("Ch_axis is < 0.")
+
+
+@dataclass(eq=True, frozen=True)
+class FixedQParamsQuantizationSpec(QuantizationSpecBase):
+    dtype: torch.dtype
+    scale: float
+    zero_point: int
+    quant_min: int | None = None
+    quant_max: int | None = None
+    qscheme: torch.qscheme | None = None
+    is_dynamic: bool = False
+
+
+"""
+The way we refer to other points of quantization in the graph will be either
+an input edge or an output value
+input edge is the connection between input node and the node consuming the input, so it's a Tuple[Node, Node]
+output value is an fx Node
+"""
+EdgeOrNode = Annotated[tuple[Node, Node] | Node, None]
+EdgeOrNode.__module__ = "torch.ao.quantization.quantizer.quantizer"
+
+
+@dataclass(eq=True, frozen=True)
+class SharedQuantizationSpec(QuantizationSpecBase):
+    """
+    Quantization spec for the Tensors whose quantization parameters are shared with other Tensors
+    """
+
+    # the edge or node to share observer or fake quant instances with
+    edge_or_node: EdgeOrNode
+
+
+@dataclass(eq=True, frozen=True)
+class DerivedQuantizationSpec(QuantizationSpecBase):
+    """Quantization spec for the Tensors whose quantization parameters are derived from other Tensors"""
+
+    derived_from: list[EdgeOrNode]
+    derive_qparams_fn: Callable[[list[ObserverOrFakeQuantize]], tuple[Tensor, Tensor]]
+    dtype: torch.dtype
+    quant_min: int | None = None
+    quant_max: int | None = None
+    qscheme: torch.qscheme | None = None
+    ch_axis: int | None = None
+    is_dynamic: bool = False
+
+
+@dataclass
+class QuantizationAnnotation:
+    """How the input arguments or the output of a node should be quantized,
+    expressed as QuantizationSpec; this corresponds to how a Tensor in the
+    operator Graph is observed (PTQ) or fake quantized (QAT)
+    """
+
+    # a map from an input torch.fx.Node to its QuantizationSpecBase (or None)
+    input_qspec_map: dict[Node, QuantizationSpecBase | None] = field(
+        default_factory=dict
+    )
+
+    # How the output of this node is quantized, expressed as QuantizationSpec
+    # TODO: change the value to QuantizationSpec in a separate PR
+    output_qspec: QuantizationSpecBase | None = None
+
+    # For a Node: node1 and edge: (node1, node2), since they are observing the same
+    # Tensor, we may want to implicitly share observers, this flag allows people to
+    # turn off this behavior for the output of the node
+    allow_implicit_sharing: bool = True
+
+    # whether the node is annotated or not
+    _annotated: bool = False
+
+
+class Quantizer(ABC):
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        """Allows for user defined transforms to run before annotating the graph.
+        This lets the quantizer quantize parts of the model that are otherwise not quantizable.
+        For example the quantizer can
+        a) decompose a compound operator like scaled dot product attention
+        into bmm and softmax if the quantizer knows how to quantize bmm/softmax but not sdpa
+        or b) transform scalars to tensors to allow quantizing scalars.
+
+        Note: this is an optional method
+        """
+        return model
+
+    # annotate nodes in the graph with observer or fake quant constructors
+    # to convey the desired way of quantization
+    @abstractmethod
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        pass
+
+    # validate the annotated graph is supported by the backend
+    @abstractmethod
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
+
+    def prepare_obs_or_fq_callback(
+        self,
+        model: torch.fx.GraphModule,
+        edge_or_node_to_obs_or_fq: dict[EdgeOrNode, ObserverOrFakeQuantize],
+    ) -> None:
+        """A callback that will be called after the observers or fake quants are created
+        for each sharing group, but before they are inserted into the graph. The
+        callback can be used to make final quantization adjustments, such as enforcing
+        specific scale and zero point on model input or output.
+
+        Args:
+          * `model`: the graph module being prepared.
+          * `edge_or_node_to_obs_or_fq`: a dictionary mapping each annotated edge and
+            node to the corresponding observer or fake quant object. Note that multiple
+            edges and/or nodes can map to the same observer / fake quant instance if
+            they were annotated with SharedQuantizationSpec. This dictionary can be
+            modified by the callback.
+        """
+        return
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..06463ae0f2f3adb815d34b0f539fb6cde423e1ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/utils.py
@@ -0,0 +1,90 @@
+from collections.abc import Callable
+
+from torch.ao.quantization.pt2e.utils import _is_sym_size_node
+from torch.ao.quantization.quantizer.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpecBase,
+)
+from torch.fx import Node
+
+
+__all__: list[str] = []
+
+
+def _annotate_input_qspec_map(
+    node: Node, input_node: Node, qspec: QuantizationSpecBase | None
+) -> None:
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+    quantization_annotation.input_qspec_map[input_node] = qspec
+    node.meta["quantization_annotation"] = quantization_annotation
+
+
+def _annotate_output_qspec(node: Node, qspec: QuantizationSpecBase | None) -> None:
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    quantization_annotation.output_qspec = qspec
+    node.meta["quantization_annotation"] = quantization_annotation
+
+
+def _node_only_used_for_sym_size(node: Node, partition_nodes: list[Node]) -> bool:
+    """
+    This utility is used to handle cases when tracing with dynamic shapes leads
+    to symint nodes in the pattern of linear module. In those cases, we need to
+    distinguish between the nodes that are in input for just extracting value of
+    some dimensions (and symint nodes) vs. the one that is activation.
+    For example:
+    graph(x, y, weight):
+       size_0 = torch.ops.aten.sym_size([x], [0])
+       size_1 = torch.ops.aten.sym_size([y], [1])
+       view_size = size_0 * size_1
+       size_3 = torch.ops.aten.sym_size([x], [2])
+       view_out = torch.ops.aten.view(x, [view_size, size_3])
+       return mm(view_out, weight)
+    In the example above the y node is not an actual input. It exists only to extract size_1
+    """
+    if _is_sym_size_node(node):
+        return True
+
+    return all(
+        ((user not in partition_nodes) or _is_sym_size_node(user))
+        for user in node.users
+    )
+
+
+def _get_module_name_filter(module_name: str) -> Callable[[Node], bool]:
+    """Build a predicate that tells whether a node comes from the module `module_name`.
+
+    The returned filter accepts a node and returns True when one of the module
+    paths recorded in the node's `nn_module_stack` metadata equals `module_name`.
+
+    For example:
+        node: linear_op = call_function[...](...)  # comes from a module with name blocks.sub.linear1
+
+    >> module_name_filter = _get_module_name_filter("blocks.sub")
+    >> print(module_name_filter(node))
+    True  # the node is from "blocks.sub" based on the fully qualified name "blocks.sub.linear1"
+    """
+    # TODO This is non standard behavior and should be removed when we migrate off capture_pre_autograd_graph.
+    self_prefix = "L['self']."
+
+    def _normalize_path(path: str) -> str:
+        # Drop the leading "L['self']." when present; other paths pass through.
+        return path.removeprefix(self_prefix)
+
+    def module_name_filter(n: Node) -> bool:
+        # The stack metadata looks like: {
+        #    'L__self___sub': ("L['self'].sub", <class '....Sub'>),
+        #    'L__self___sub_linear': ("L['self'].sub.linear", <class 'torch.nn.modules.linear.Linear'>)
+        # }
+        # Nodes lacking the entry (get_attr nodes, maybe?) are treated as an empty stack.
+        module_stack = n.meta.get("nn_module_stack", {})
+
+        recorded_paths = [_normalize_path(path) for path, _ in module_stack.values()]
+        return module_name in recorded_paths
+
+    return module_name_filter
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9cde0e2d12a6d00abfef6c2564b679286d99262
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
@@ -0,0 +1,1605 @@
+# mypy: allow-untyped-defs
+import functools
+import itertools
+import operator
+import warnings
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from typing import Any, Optional, TYPE_CHECKING, TypeAlias
+
+import torch
+import torch.nn.functional as F
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantize,
+    FusedMovingAvgObsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    HistogramObserver,
+    MovingAverageMinMaxObserver,
+    MovingAveragePerChannelMinMaxObserver,
+    PerChannelMinMaxObserver,
+    PlaceholderObserver,
+)
+from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
+from torch.ao.quantization.quantizer.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    Quantizer,
+    SharedQuantizationSpec,
+)
+from torch.ao.quantization.quantizer.utils import _get_module_name_filter
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    get_bias_qspec,
+    get_input_act_qspec,
+    get_output_act_qspec,
+    get_weight_qspec,
+    QuantizationConfig,
+)
+from torch.fx import Node
+from torch.fx.passes.utils.source_matcher_utils import (
+    get_source_partitions,
+    SourcePartition,
+)
+
+
+FilterFn: TypeAlias = Callable[[list[Node]], bool]
+
+
+if TYPE_CHECKING:
+    from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+
+__all__ = [
+    "X86InductorQuantizer",
+    "get_default_x86_inductor_quantization_config",
+    "get_x86_inductor_linear_dynamic_fp16_config",
+]
+
+
+@dataclass
+class _X86InductorQuantizationAnnotation(QuantizationAnnotation):
+    # _is_output_of_quantized_pattern:
+    #  * Node as output node of a fusion pattern.
+    #  * The fusion pattern supports int8 data type.
+    #  * The fusion pattern has inputs annotated to insert observer.
+    #  * The quantization_config is not `None`.
+    _is_output_of_quantized_pattern: bool = False
+
+
+# Operators that:
+# 1. Operators are optimized to run with int8 when int8 input provided.
+# 2. Operators do not support int8 input and produce fp32 output.
+int8_in_int8_out_ops: set = {
+    torch.ops.aten.max_pool2d.default,
+    torch.ops.aten.cat.default,
+    torch.ops.aten.avg_pool2d.default,
+    torch.ops.aten.adaptive_avg_pool2d.default,
+    torch.ops.aten.flatten.using_ints,
+}
+
+# Operators that support the int8 data type for quantization config propagation.
+# Currently this is the very same set object as int8_in_int8_out_ops (no additional operators).
+propagation_quantizable_ops = int8_in_int8_out_ops
+
+# Operators support the int8 data type
+# and recipe is configured by default in X86InductorQuantizer.
+default_quantizable_ops = propagation_quantizable_ops | {
+    torch.ops.aten.conv1d.default,
+    torch.ops.aten.conv2d.default,
+    torch.ops.aten.linear.default,
+}
+
+# A superset of default_quantizable_ops includes operators support the int8 data type
+# but not enabled by default recipe of X86InductorQuantizer.
+quantizable_ops = default_quantizable_ops | {
+    torch.ops.aten.matmul.default,
+}
+
+QUANT_ANNOTATION_KEY = "quantization_annotation"
+
+
+def _skip_annotate(nodes: list[Node], filter_fn: FilterFn | None = None) -> bool:
+    """Decide whether annotation of the given list of nodes has to be skipped.
+
+    Annotation goes ahead (False is returned) only when none of the nodes is
+    annotated yet and a `filter_fn` was supplied that accepts `nodes`; in
+    every other case True is returned.
+    """
+    if _is_any_annotated(nodes):
+        # At least one node already carries an annotation.
+        return True
+    # No filter function, or one that rejects the node list, means skipping too.
+    accepted_by_filter = bool(filter_fn and filter_fn(nodes))
+    return not accepted_by_filter
+
+
+def _create_module_name_filter(module_name: str) -> FilterFn:
+    """Create a filter function for a given module name.
+
+    The filter function takes a list of nodes (as determined by the annotate function)
+    and returns True if *all* nodes come from the specified module name, False otherwise.
+
+    For example:
+        linear_1: "f32[3, 10]" = torch.ops.aten.linear.default(...) # comes from a module with name `sub.linear1`
+        relu: "f32[3, 10]" = torch.ops.aten.relu.default(linear_1); # comes from a module with name `sub.relu1`
+
+    >> module_name_filter = _create_module_name_filter("sub")
+    >> print(module_name_filter([relu, linear_1]))
+    # True  # These two nodes are determined by `_annotate_linear_unary` function and from "sub".
+    """
+
+    filter_fn = _get_module_name_filter(module_name)
+
+    def check_all_nodes_from_module(nodes: list[Node]) -> bool:
+        all_nodes_from_module_name: bool = all(filter_fn(n) for n in nodes)
+        return all_nodes_from_module_name
+
+    return check_all_nodes_from_module
+
+
+def _create_operator_type_filter(
+    operator_type: Callable,
+) -> FilterFn:
+    """Create a filter function for a given operator type.
+
+    The filter function takes a list of nodes and returns True if it contains
+    exactly one node with the specified operator type, False otherwise.
+
+    For example:
+        linear_1: "f32[3, 10]" = torch.ops.aten.linear.default(...) # comes from a module with name `sub.linear1`
+        relu: "f32[3, 10]" = torch.ops.aten.relu.default(linear_1); # comes from a module with name `sub.relu1`
+
+    >> operator_type_filter = _create_operator_type_filter(torch.ops.aten.linear.default)
+    >> print(operator_type_filter([relu, linear_1]))
+    # True  # These two nodes are determined by `_annotate_linear_unary` function and the second node is `linear`.
+    """
+
+    def operator_type_filter(nodes: list[Node]):
+        num_nodes_with_operator_type = sum(
+            node.target == operator_type for node in nodes
+        )
+        if num_nodes_with_operator_type > 1:
+            raise NotImplementedError(
+                f"Several nodes within a single pattern are {operator_type}."
+            )
+        return num_nodes_with_operator_type == 1
+
+    return operator_type_filter
+
+
+def _global_config_filter(nodes: list[Node]) -> bool:
+    """Filter function for global configuration.
+
+    This filter function takes a list of nodes and returns True if there is exactly one node
+    in the list that is a default quantizable operation, False otherwise.
+    """
+    num_nodes_in_default_quantizable_ops = sum(
+        node.target in default_quantizable_ops for node in nodes
+    )
+    if num_nodes_in_default_quantizable_ops > 1:
+        raise NotImplementedError(
+            "Several nodes within a single pattern are default quantizable operations."
+        )
+    return num_nodes_in_default_quantizable_ops == 1
+
+
+def _map_module_function_to_aten_operator_type():
+    """Build the lookup from user-facing modules/functions to aten operators.
+
+    Each row of `map_list` pairs a list of `torch.nn` module classes and/or
+    functional/torch callables with the aten operator they are mapped to.
+    Keys appearing in several rows would keep the operator of the last such row,
+    so every module class / callable is listed exactly once.
+
+    Returns:
+        A dict with one entry per listed module class or callable, whose value
+        is the aten operator of the row the key appears in.
+    """
+    module_function_to_aten_operator: dict[Callable, torch._ops.OpOverloadPacket] = {}
+    map_list = (
+        # torch.nn.Conv1d is the module counterpart of F.conv1d; torch.nn.Conv2d
+        # belongs to the conv2d row below.
+        ([torch.nn.Conv1d, F.conv1d], torch.ops.aten.conv1d.default),
+        ([torch.nn.Conv2d, F.conv2d], torch.ops.aten.conv2d.default),
+        ([torch.nn.Linear, F.linear], torch.ops.aten.linear.default),
+        ([torch.nn.MaxPool2d, F.max_pool2d], torch.ops.aten.max_pool2d.default),
+        ([torch.cat], torch.ops.aten.cat.default),
+        ([torch.nn.AvgPool2d, F.avg_pool2d], torch.ops.aten.avg_pool2d.default),
+        (
+            [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d],
+            torch.ops.aten.adaptive_avg_pool2d.default,
+        ),
+        ([torch.flatten], torch.ops.aten.flatten.using_ints),
+        ([torch.matmul], torch.ops.aten.matmul.default),
+    )
+
+    for sources, aten_operator in map_list:
+        # Every module class / callable of the row maps to the same aten operator.
+        module_function_to_aten_operator.update(dict.fromkeys(sources, aten_operator))  # type: ignore[arg-type, call-overload]
+    return module_function_to_aten_operator
+
+
+def _mark_nodes_as_annotated(nodes: list[Node]):
+    """Set the `_annotated` flag on every non-None node, creating the annotation if absent."""
+    for node in filter(lambda candidate: candidate is not None, nodes):
+        if QUANT_ANNOTATION_KEY not in node.meta:
+            node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation()
+        node.meta[QUANT_ANNOTATION_KEY]._annotated = True
+
+
+def _is_node_annotated(_node):
+    """Tell whether `_node` holds a quantization annotation that is flagged as annotated."""
+    meta = _node.meta
+    if QUANT_ANNOTATION_KEY not in meta:
+        # No annotation object is stored on the node at all.
+        return False
+    # Hand back the stored `_annotated` flag itself.
+    return meta[QUANT_ANNOTATION_KEY]._annotated
+
+
+def _is_any_annotated(nodes: list[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    check if any of the node is annotated, return True if any of the node
+    is annotated, otherwise return False.
+    """
+    return any(_is_node_annotated(node) for node in nodes)
+
+
+def _is_all_annotated(nodes: list[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    return True if all of the node is annotated, otherwise return False.
+    """
+    return all(_is_node_annotated(node) for node in nodes)
+
+
+def _is_quantized_op_pt2e(node: torch.fx.Node):
+    """
+    Used for pt2e flow to check if the node is a quantized node:
+    Case1: the node has been annotated as output node of a fusion pattern.
+    Case2: the node has been annotated as single quantized node.
+    """
+    if not _is_any_annotated([node]):
+        # The node has not been annotated, directly return False
+        return False
+    quantization_annotation = node.meta.get(QUANT_ANNOTATION_KEY, None)
+    if not isinstance(quantization_annotation, _X86InductorQuantizationAnnotation):
+        raise AssertionError(
+            "quantization_annotation must be an _X86InductorQuantizationAnnotation"
+        )
+    return quantization_annotation._is_output_of_quantized_pattern
+
+
+@functools.lru_cache
+def get_default_x86_inductor_quantization_config(
+    is_qat: bool = False,
+    is_dynamic: bool = False,
+    reduce_range: bool = False,
+):
+    """
+    reduce_range is False by default. Set it to True on earlier CPUs without VNNI to avoid accuracy issue.
+    """
+    extra_args: dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = FakeQuantize
+            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
+                averaging_constant=1
+            )
+            extra_args["observer"] = dynamic_quant_observer
+        else:
+            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
+    else:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
+        else:
+            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]
+
+    # Copy from x86 default qconfig from torch/ao/quantization/qconfig.py
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.uint8,
+        quant_min=0,
+        quant_max=127 if reduce_range else 255,
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=is_dynamic,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        FusedMovingAvgObsFakeQuantize if is_qat else PerChannelMinMaxObserver
+    )
+
+    if is_qat:
+        # Only support per channel quant for now
+        extra_args["observer"] = MovingAveragePerChannelMinMaxObserver  # type: ignore[dict-item]
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,  # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+    bias_quantization_spec = None  # will use placeholder observer by default
+    quantization_config = QuantizationConfig(
+        act_quantization_spec,
+        act_quantization_spec,
+        weight_quantization_spec,
+        bias_quantization_spec,
+        is_qat,
+    )
+    return quantization_config
+
+
+@functools.lru_cache
+def get_x86_inductor_linear_dynamic_fp16_config():
+    """
+    For linear_dynamic_fp16. The name may be confusing.
+    The op's behavior is fp32_input * (fp16_weight -> to_fp32) -> fp32_output.
+    """
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.float16,
+        observer_or_fake_quant_ctr=PlaceholderObserver,
+    )
+    quantization_config = QuantizationConfig(
+        None,  # input_quantization_spec
+        None,  # output_quantization_spec
+        weight_quantization_spec,
+        None,  # bias_quantization_spec
+    )
+    return quantization_config
+
+
+def _annotate_nodes_not_quantize(nodes: Node | list[Node]) -> None:
+    """Annotate nodes to exclude them from quantization (their `quantization_config` is `None`)."""
+    if not isinstance(nodes, list):
+        nodes = [nodes]
+    for node in nodes:
+        node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+            _annotated=True
+        )
+
+
+def _config_checker(method: Callable) -> Callable:
+    @functools.wraps(method)
+    def wrapper(
+        quantizer: "X86InductorQuantizer",
+        name: Any,
+        quantization_config: Optional["QuantizationConfig"],
+    ) -> "X86InductorQuantizer":
+        if quantizer._need_skip_config(quantization_config):
+            warnings.warn(
+                f"Skip the quantization config for {name}.",
+                stacklevel=2,
+            )
+            return quantizer
+        return method(quantizer, name, quantization_config)
+
+    return wrapper
+
+
+@dataclass
+class _CurrentQuantizationMode:
+    r"""Configuration defining the current quantization mode for the quantizer.
+
+    All possible current quantization modes are listed below:
+    ----------------------------------------------------------------------------------------------------------
+                |                                       dynamic_state
+     qat_state  |---------------------------------------------------------------------------------------------
+                |                           None                              |    True       |  False
+    ----------------------------------------------------------------------------------------------------------
+        None    | quantizer does not receive a non-None `quantization_config` | \             | \
+        False   | quantizer will not do QAT                                   | dynamic       | static
+        True    | quantizer will do QAT                                       | QAT + dynamic | QAT + static
+    """
+
+    qat_state: bool | None
+    dynamic_state: bool | None
+
+
+class X86InductorQuantizer(Quantizer):
+    module_function_to_aten_operator_type = _map_module_function_to_aten_operator_type()
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.global_config: QuantizationConfig | None = None
+        self.operator_type_qconfig: dict[
+            torch._ops.OpOverloadPacket, QuantizationConfig | None
+        ] = {}
+        self.module_name_qconfig: dict[str, QuantizationConfig | None] = {}
+
+    def _get_current_quantization_mode(self) -> _CurrentQuantizationMode:
+        """Retrieves the current quantization mode based on all configurations.
+
+        Raises AssertionError if the non-None configs disagree on `is_qat` or `is_dynamic`.
+        """
+        qat_state = None
+        dynamic_state = None
+
+        # As `_need_skip_config` is used to skip all invalid configurations, all
+        # existing non-None configurations are assumed to share one quantization mode.
+        # pyrefly: ignore [bad-assignment]
+        for qconfig in (
+            list(self.module_name_qconfig.values())
+            + list(self.operator_type_qconfig.values())
+            + [self.global_config]
+        ):
+            if qconfig is not None:
+                # Query the `is_qat` state
+                if qat_state is None:
+                    qat_state = qconfig.is_qat
+                elif qat_state != qconfig.is_qat:
+                    raise AssertionError(
+                        "All non-None quantization configs should have the same `is_qat`, "
+                        f"but got {qat_state} and {qconfig.is_qat}."
+                    )
+                # Query the `is_dynamic` state
+                input_activation_spec = qconfig.input_activation
+                if input_activation_spec is not None:
+                    if dynamic_state is None:
+                        dynamic_state = input_activation_spec.is_dynamic
+                    elif dynamic_state != input_activation_spec.is_dynamic:
+                        raise AssertionError(
+                            "All non-None `input_activation_spec` should have the same `is_dynamic`, "
+                            f"but got {dynamic_state} and {input_activation_spec.is_dynamic}."
+                        )
+        return _CurrentQuantizationMode(
+            qat_state=qat_state, dynamic_state=dynamic_state
+        )
+
+    def _need_skip_config(self, quantization_config: QuantizationConfig | None) -> bool:
+        """Check if the provided quantization config is valid for X86InductorQuantizer.
+
+        Mixed static/dynamic configurations or mixed QAT/non-QAT configurations are not supported.
+        To avoid such a mix, we compare the incoming configuration with current configuration status.
+        Refer the `_CurrentQuantizationMode` definition for all possible modes.
+        """
+        if quantization_config is None:
+            return False
+
+        need_skip = False
+        current_mode = self._get_current_quantization_mode()
+        if (
+            current_mode.qat_state is not None
+            and current_mode.qat_state != quantization_config.is_qat
+        ):
+            warnings.warn(
+                "Mixed QAT and Non-QAT quantization config is not supported.",
+                stacklevel=2,
+            )
+            need_skip = True
+        if current_mode.dynamic_state is not None:
+            input_activation_spec = quantization_config.input_activation
+            if (
+                input_activation_spec is not None
+                and current_mode.dynamic_state != input_activation_spec.is_dynamic
+            ):
+                warnings.warn(
+                    "Mixed dynamic and static quantization config is not supported.",
+                    stacklevel=2,
+                )
+                need_skip = True
+        return need_skip
+
+    def set_global(self, quantization_config: QuantizationConfig):
+        if self._need_skip_config(quantization_config):
+            warnings.warn("Skip the global quantization config.", stacklevel=2)
+            return self
+        self.global_config = quantization_config
+        return self
+
+    def get_global_quantization_config(self):
+        """Return the global config, warning when `set_global` has not stored a valid one."""
+        if not isinstance(self.global_config, QuantizationConfig):
+            # Built in two pieces: a backslash continuation inside the literal embeds the source indentation.
+            msg = "The global_config for X86InductorQuantizer is currently invalid. "
+            msg += "Please ensure that you use set_global to establish the global quantization configuration."
+            warnings.warn(msg, stacklevel=2)
+        return self.global_config
+
+    @_config_checker
+    def set_function_type_qconfig(
+        self,
+        function_type: Callable,
+        quantization_config: QuantizationConfig | None,
+    ) -> "X86InductorQuantizer":
+        if function_type in X86InductorQuantizer.module_function_to_aten_operator_type:
+            self._set_aten_operator_qconfig(
+                X86InductorQuantizer.module_function_to_aten_operator_type[
+                    function_type
+                ],
+                quantization_config,
+            )
+        else:
+            warnings.warn(
+                f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer.",
+                stacklevel=2,
+            )
+        return self
+
+    @_config_checker
+    def set_module_type_qconfig(
+        self,
+        module_type: torch.nn.Module,
+        quantization_config: QuantizationConfig | None,
+    ) -> "X86InductorQuantizer":
+        if module_type in X86InductorQuantizer.module_function_to_aten_operator_type:
+            self._set_aten_operator_qconfig(
+                X86InductorQuantizer.module_function_to_aten_operator_type[module_type],
+                quantization_config,
+            )
+        else:
+            warnings.warn(
+                f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer.",
+                stacklevel=2,
+            )
+        return self
+
+    @_config_checker
+    def set_module_name_qconfig(
+        self, module_name: str, quantization_config: QuantizationConfig | None
+    ):
+        """Set quantization_config for a submodule with name: `module_name`, for example:
+        quantizer.set_module_name_qconfig("blocks.sub", quantization_config), it will quantize all
+        supported operator/operator patterns in the submodule with this module name with the given
+        `quantization_config`.
+
+        The supported operators include `quantizable_ops` and `propagation_quantizable_ops`.
+
+        Returns `self` so that configuration calls can be chained.
+        """
+        # Stored per module name; a later call with the same name overwrites the entry.
+        self.module_name_qconfig[module_name] = quantization_config
+        return self
+
+    def _set_aten_operator_qconfig(
+        self,
+        operator_type: torch._ops.OpOverloadPacket,
+        quantization_config: QuantizationConfig | None,
+    ) -> "X86InductorQuantizer":
+        if operator_type in quantizable_ops:
+            self.operator_type_qconfig[operator_type] = quantization_config
+        else:
+            warnings.warn(
+                f"operator: Unable to quantize {operator} by X86InductorQuantizer.",
+                stacklevel=2,
+            )
+        return self
+
+    def _annotate_conv_node_helper(
+        self,
+        conv_node: torch.fx.Node,
+        annotate_output: bool,
+        quantization_config: QuantizationConfig | None,
+    ) -> None:
+        """Helper function to annotate the conv node"""
+        if quantization_config is None:
+            _annotate_nodes_not_quantize(conv_node)
+            return
+        input_qspec_map = {}
+        input_node = conv_node.args[0]
+        if not isinstance(input_node, Node):
+            raise AssertionError("input_node must be a FX Node")
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        weight_node = conv_node.args[1]
+        if not isinstance(weight_node, Node):
+            raise AssertionError("weight_node must be a FX Node")
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+        bias_node = None if len(conv_node.args) == 2 else conv_node.args[2]
+        if isinstance(bias_node, Node):
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+        if annotate_output:
+            conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+        else:
+            conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+            )
+
+    def _annotate_linear_node_helper(
+        self,
+        linear_node: torch.fx.Node,
+        annotate_output: bool,
+        quantization_config: QuantizationConfig | None,
+    ) -> None:
+        """Helper function to annotate the linear node"""
+        if quantization_config is None:
+            _annotate_nodes_not_quantize(linear_node)
+            return
+        input_qspec_map = {}
+        if linear_node.target is not torch.ops.aten.linear.default:
+            raise AssertionError(
+                "linear_node.target must be torch.ops.aten.linear.default"
+            )
+        has_bias = len(linear_node.args) == 3
+        input_index = 0
+        weight_index = 1
+        bias_index = 2
+
+        input_node = linear_node.args[input_index]
+        if not isinstance(input_node, Node):
+            raise AssertionError("input_node must be a FX Node")
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+
+        weight_node = linear_node.args[weight_index]
+        if not isinstance(weight_node, Node):
+            raise AssertionError("weight_node must be a FX Node")
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+
+        bias_node = linear_node.args[bias_index] if has_bias else None
+        if isinstance(bias_node, Node):
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+
+        if annotate_output:
+            linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+        else:
+            linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map, _annotated=True
+            )
+
+    def _get_output_nodes_of_partitions(
+        self,
+        partition_list: list[SourcePartition],
+    ) -> list[torch.fx.Node]:
+        """Helper function to get the output node list from partition list"""
+        output_node_list = []
+        for partition in partition_list:
+            if len(partition.output_nodes) > 1:
+                raise ValueError("Input partition has more than one output node")
+            output_node = partition.output_nodes[0]
+            if not isinstance(output_node, Node):
+                raise AssertionError("output_node must be a FX Node")
+            output_node_list.append(output_node)
+        if len(output_node_list) != len(partition_list):
+            raise ValueError(
+                "length of output_node_list should equal to length of partition_list"
+            )
+        return output_node_list
+
+    def _get_input_idx_for_binary_node(
+        self,
+        conv_gemm_node: torch.fx.Node,
+        binary_node: torch.fx.Node,
+    ):
+        """Helper function to check conv_gemm and extra input node index
+        for binary node fused with conv_gemm.
+        """
+        conv_gemm_node_idx = None
+        extra_input_node_idx = None
+        if (binary_node.args[0].op == "call_function") and (  # type: ignore[union-attr]
+            binary_node.args[0] == conv_gemm_node
+        ):
+            conv_gemm_node_idx = 0
+            extra_input_node_idx = 1
+        elif (binary_node.args[1].op == "call_function") and (  # type: ignore[union-attr]
+            binary_node.args[1] == conv_gemm_node
+        ):
+            conv_gemm_node_idx = 1
+            extra_input_node_idx = 0
+        extra_input_node = binary_node.args[extra_input_node_idx]  # type: ignore[index]
+        if not isinstance(extra_input_node, Node):
+            raise AssertionError("extra_input_node must be a FX Node")
+        return conv_gemm_node_idx, extra_input_node_idx
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """Annotate the given model with quantization configurations.
+
+        Annotation contracts:
+        1. Annotate each node according to the user's qconfig in the following order:
+        `module_name_qconfig`, `operator_type_qconfig`, and `global_config`.
+        2. Avoid re-annotating nodes already annotated in prior stages. For example,
+        if `linear1` has been annotated by `module_name_qconfig`, it won't be annotated again
+        during the processing of the 'operator_type_qconfig' or 'global_config'.
+        3. For config is `None`, the node will be annotated with `_X86InductorQuantizationAnnotation(_annotated=True)`.
+
+        For each pair of (module_name_or_operator_type_or_global, qconfig), a filter function is created.
+        This filter function checks if the node is marked by current stage and not annotated by the previous stage.
+        """
+        for module_name, quantization_config in self.module_name_qconfig.items():
+            self._annotate_with_config(
+                model, quantization_config, _create_module_name_filter(module_name)
+            )
+
+        for operator_type, quantization_config in self.operator_type_qconfig.items():
+            self._annotate_with_config(
+                model, quantization_config, _create_operator_type_filter(operator_type)
+            )
+
+        if self.global_config:
+            self._annotate_with_config(
+                model,
+                self.global_config,
+                _global_config_filter,
+            )
+
+        # Once we've annotated the model with quantization configurations, we also need to annotate
+        # the output of quantizable operations. For example, if we annotated `maxpool2d` to quantize its inputs,
+        # we will quantize its output accordingly. This enables us to fuse the dq-operator-q into a quantized op.
+        # Refer to
+        # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487  # noqa: B950
+
+        self._annotate_output_for_int8_in_int8_out_pattern_entry(model)
+
+        return model
+
+    def _annotate_with_config(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn,
+    ) -> None:
+        """Annotate the model with the given quantization configuration.
+
+        High-level description of quantization recipe for X86 Inductor Backend:
+        Step 1: Apply quantization recipe for fusion patterns of conv/linear to enable int8 data type actively.
+        Step 2: Propagate quantization annotation for patterns besides conv/linear. Go through the pattern in model
+        from start to the end. If a pattern supports computation with int8 data type and inputs connected to
+        quantized patterns, annotate its inputs as quantized pattern.
+        """
+
+        # Step1: Recipe of fusion patterns like conv/linear.
+        self._annotate_conv2d_fusion_pattern(model, quantization_config, filter_fn)
+        self._annotate_linear_fusion_pattern(model, quantization_config, filter_fn)
+        self._annotate_matmul(model, quantization_config, filter_fn)
+
+        # Step2: Recipe to propagate annotation for patterns beside conv/linear.
+        # Go through all the nodes from start to end.
+        # Recipe refer to
+        # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538  # noqa: B950
+
+        self._annotate_propagation_quantizable_pattern_entry(
+            model, quantization_config, filter_fn
+        )
+
+    def _annotate_qat_conv2d_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        # Annotate QAT Specific patterns
+        self._annotate_qat_conv2d_bn_binary_unary(model, quantization_config, filter_fn)
+        self._annotate_qat_conv2d_bn_binary(model, quantization_config, filter_fn)
+        self._annotate_qat_conv2d_bn_unary(model, quantization_config, filter_fn)
+        self._annotate_qat_conv2d_bn(model, quantization_config, filter_fn)
+
+    def _annotate_qat_conv2d_bn_binary_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        """Annotate QAT Conv2d -> BatchNorm2d -> add -> ReLU chains found in ``gm``.
+
+        For every matched chain that passes the checks below, the conv node's
+        inputs are annotated through ``_annotate_conv_node_helper`` (without
+        flagging it as a pattern output), the add node's other operand receives
+        an input-activation qspec, and the ReLU node receives the output qspec
+        and is flagged as the output of the quantized pattern. When
+        ``quantization_config`` is ``None`` the add and ReLU nodes are passed to
+        ``_annotate_nodes_not_quantize`` instead. All nodes of the four
+        partitions are finally passed to ``_mark_nodes_as_annotated``.
+
+        Raises:
+            ValueError: if the index reported for the BN output does not point
+                at the BN output node among the add node's args.
+        """
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add, torch.nn.ReLU]
+        )
+        for fused_partition in fused_partitions:
+            (
+                conv_partition,
+                bn_partition,
+                binary_partition,
+                unary_partition,
+            ) = fused_partition
+
+            (
+                conv_node,
+                bn_output_node,
+                binary_node,
+                unary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, binary_partition, unary_partition]
+            )
+            if len(bn_output_node.users) != 1:
+                # The Conv-BN output should have exactly one user.
+                continue
+            # Find which operand of the add is the BN output and which is the other input.
+            (
+                bn_output_node_idx,
+                extra_input_node_idx,
+            ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node)
+            if (bn_output_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if bn_output_node != binary_node.args[bn_output_node_idx]:
+                raise ValueError(f"{bn_output_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+
+            # Only chains whose conv partition ends in an aten conv2d call are handled.
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _skip_annotate(
+                [unary_node, binary_node, bn_output_node, conv_node], filter_fn
+            ):
+                continue
+
+            # The conv node is not the pattern output here, hence annotate_output=False.
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+
+            if quantization_config is not None:
+                binary_node_input_qspec_map = {}
+                binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                    quantization_config
+                )
+                binary_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        # pyrefly: ignore [bad-argument-type]
+                        input_qspec_map=binary_node_input_qspec_map,
+                        _annotated=True,
+                    )
+                )
+                unary_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        # TODO<leslie> Remove the annotate of output in QAT when qat util support pattern matcher.
+                        output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                        _annotated=True,
+                        _is_output_of_quantized_pattern=True,
+                    )
+                )
+            else:
+                _annotate_nodes_not_quantize([binary_node, unary_node])
+            # Mark every node of all four partitions, not only the output nodes.
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(binary_partition.nodes))
+            nodes_to_mark_annotated.extend(list(unary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn_binary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        """Annotate QAT Conv2d -> BatchNorm2d -> add chains found in ``gm``.
+
+        For every matched chain that passes the checks below, the conv node's
+        inputs are annotated through ``_annotate_conv_node_helper`` (without
+        flagging it as a pattern output) and the add node receives an
+        input-activation qspec for its other operand, the output qspec, and the
+        quantized-pattern-output flag. When ``quantization_config`` is ``None``
+        the add node is passed to ``_annotate_nodes_not_quantize`` instead. All
+        nodes of the three partitions are finally passed to
+        ``_mark_nodes_as_annotated``.
+
+        Raises:
+            ValueError: if the index reported for the BN output does not point
+                at the BN output node among the add node's args.
+        """
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition, binary_partition = fused_partition
+            (
+                conv_node,
+                bn_output_node,
+                binary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, binary_partition]
+            )
+            if len(bn_output_node.users) != 1:
+                # The Conv-BN output should have exactly one user.
+                continue
+            # Find which operand of the add is the BN output and which is the other input.
+            (
+                bn_output_node_idx,
+                extra_input_node_idx,
+            ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node)
+            if (bn_output_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if bn_output_node != binary_node.args[bn_output_node_idx]:
+                raise ValueError(f"{bn_output_node} doesn't match input of binary node")
+
+            extra_input_node = binary_node.args[extra_input_node_idx]
+
+            # Only chains whose conv partition ends in an aten conv2d call are handled.
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _skip_annotate([binary_node, bn_output_node, conv_node], filter_fn):
+                continue
+
+            # The conv node is not the pattern output here, hence annotate_output=False.
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+
+            if quantization_config is not None:
+                binary_node_input_qspec_map = {}
+                binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                    quantization_config
+                )
+                binary_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        # pyrefly: ignore [bad-argument-type]
+                        input_qspec_map=binary_node_input_qspec_map,
+                        # TODO<leslie> Remove the annotate of output in QAT when qat util support pattern matcher.
+                        output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                        _annotated=True,
+                        _is_output_of_quantized_pattern=True,
+                    )
+                )
+            else:
+                _annotate_nodes_not_quantize(binary_node)
+            # Mark every node of all three partitions, not only the output nodes.
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(binary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        fused_partitions = []
+        unary_patterns = [
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardtanh],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardswish],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU6],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.SiLU],
+        ]
+        for unary_pattern in unary_patterns:
+            partitions = find_sequential_partitions(gm, unary_pattern)
+            if partitions:
+                # Extend the fused_partitions if partitions is not empty
+                fused_partitions.extend(partitions)
+
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition, unary_partition = fused_partition
+            (
+                conv_node,
+                bn_output_node,
+                unary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, unary_partition]
+            )
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _skip_annotate([unary_node, bn_output_node, conv_node], filter_fn):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            if quantization_config is not None:
+                unary_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        # TODO<leslie> Remove the annotate of output in QAT when qat util support pattern matcher.
+                        output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                        _annotated=True,
+                        _is_output_of_quantized_pattern=True,
+                    )
+                )
+            else:
+                _annotate_nodes_not_quantize(unary_node)
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(unary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition = fused_partition
+            conv_node, bn_output_node = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition]
+            )
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _skip_annotate([bn_output_node, conv_node], filter_fn):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            if quantization_config is not None:
+                bn_output_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        # TODO<leslie> Remove the annotate of output in QAT when qat util support pattern matcher.
+                        output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                        _annotated=True,
+                        _is_output_of_quantized_pattern=True,
+                    )
+                )
+            else:
+                _annotate_nodes_not_quantize(bn_output_node)
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_conv2d_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        if (quantization_config is None) or (quantization_config.is_qat):
+            # Annotate QAT specific pattern: mainly due to BN not folded in prepare_qat
+            self._annotate_qat_conv2d_fusion_pattern(
+                model, quantization_config, filter_fn
+            )
+        self._annotate_conv2d_binary_unary(model, quantization_config, filter_fn)
+        self._annotate_conv2d_binary(model, quantization_config, filter_fn)
+        self._annotate_conv2d_unary(model, quantization_config, filter_fn)
+        self._annotate_conv2d(model, quantization_config, filter_fn)
+
+    def _annotate_linear_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        self._annotate_linear_binary_unary(model, quantization_config, filter_fn)
+        self._annotate_linear_unary(model, quantization_config, filter_fn)
+        self._annotate_linear(model, quantization_config, filter_fn)
+
+    def _annotate_matmul(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        for node in model.graph.nodes:
+            if node.target != torch.ops.aten.matmul.default:
+                continue
+            if _skip_annotate([node], filter_fn):
+                continue
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize(node)
+                continue
+
+            input_qspec_map = {}
+            matmul_node = node
+            for input_node in matmul_node.args:
+                input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+            matmul_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d_binary_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        """Annotate Conv2d -> add -> ReLU chains found in ``gm``.
+
+        For every matched chain that passes the checks below, the conv node's
+        inputs are annotated through ``_annotate_conv_node_helper`` (without
+        flagging it as a pattern output), the add node's other operand receives
+        an input-activation qspec, and the ReLU node is flagged as the output of
+        the quantized pattern. When ``quantization_config`` is ``None`` all
+        three nodes are passed to ``_annotate_nodes_not_quantize`` instead.
+
+        Raises:
+            ValueError: if the index reported for the conv node does not point
+                at the conv node among the add node's args.
+        """
+        # Conv2d + add + unary op
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, operator.add, torch.nn.ReLU]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, binary_partition, unary_partition = fused_partition
+            conv_node, binary_node, unary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, binary_partition, unary_partition]
+            )
+            if len(conv_node.users) != 1:
+                # The conv node should have exactly one user node.
+                continue
+            # Find which operand of the add is the conv output and which is the other input.
+            conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node(
+                conv_node, binary_node
+            )
+            if (conv_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if conv_node != binary_node.args[conv_node_idx]:
+                raise ValueError(f"{conv_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                # No conv node found to be fused with add
+                continue
+            if _skip_annotate([unary_node, binary_node, conv_node], filter_fn):
+                continue
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize([conv_node, binary_node, unary_node])
+                continue
+
+            # The conv node is not the pattern output here, hence annotate_output=False.
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                # pyrefly: ignore [bad-argument-type]
+                input_qspec_map=binary_node_input_qspec_map,
+                _annotated=True,
+            )
+            # The trailing ReLU is the node flagged as the pattern output.
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d_binary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        """Annotate Conv2d -> add chains found in ``gm``.
+
+        For every matched chain that passes the checks below, the conv node's
+        inputs are annotated through ``_annotate_conv_node_helper`` (without
+        flagging it as a pattern output) and the add node receives an
+        input-activation qspec for its other operand plus the
+        quantized-pattern-output flag. When ``quantization_config`` is ``None``
+        both nodes are passed to ``_annotate_nodes_not_quantize`` instead.
+
+        Raises:
+            ValueError: if the index reported for the conv node does not point
+                at the conv node among the add node's args.
+            AssertionError: if ``conv_node`` is not an FX Node.
+        """
+        # Conv2d + add
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, operator.add]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, binary_partition = fused_partition
+            conv_node, binary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, binary_partition]
+            )
+            if len(conv_node.users) != 1:
+                # The conv node should have exactly one user node.
+                continue
+            # Find which operand of the add is the conv output and which is the other input.
+            conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node(
+                conv_node, binary_node
+            )
+            if (conv_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if conv_node != binary_node.args[conv_node_idx]:
+                raise ValueError(f"{conv_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+            if not isinstance(conv_node, Node):
+                raise AssertionError("conv_node must be a FX Node")
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                # No conv node found to be fused with add
+                continue
+            if _skip_annotate([binary_node, conv_node], filter_fn):
+                continue
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize([conv_node, binary_node])
+                continue
+
+            # The conv node is not the pattern output here, hence annotate_output=False.
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            # The add node is the one flagged as the pattern output.
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                # pyrefly: ignore [bad-argument-type]
+                input_qspec_map=binary_node_input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        fused_partitions = []
+        unary_patterns = [
+            [torch.nn.Conv2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, torch.nn.Hardtanh],
+            [torch.nn.Conv2d, torch.nn.Hardswish],
+            [torch.nn.Conv2d, torch.nn.ReLU6],
+            [torch.nn.Conv2d, torch.nn.SiLU],
+            [torch.nn.Conv1d, torch.nn.ReLU],
+        ]
+        for unary_pattern in unary_patterns:
+            partitions = find_sequential_partitions(gm, unary_pattern)
+            if partitions:
+                # Extend the fused_partitions if partitions is not empty
+                fused_partitions.extend(partitions)
+
+        for fused_partition in fused_partitions:
+            conv_partition, unary_partition = fused_partition
+            conv_node, unary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, unary_partition]
+            )
+            if conv_node.op != "call_function" or conv_node.target not in (
+                torch.ops.aten.conv2d.default,
+                torch.ops.aten.conv1d.default,
+            ):
+                continue
+            if _skip_annotate([unary_node, conv_node], filter_fn):
+                continue
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize([conv_node, unary_node])
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        conv_partitions = get_source_partitions(
+            gm.graph, [torch.nn.Conv2d, torch.nn.functional.conv2d]
+        )
+        conv_partitions = list(itertools.chain.from_iterable(conv_partitions.values()))
+        for conv_partition in conv_partitions:
+            if len(conv_partition.output_nodes) > 1:
+                raise ValueError("conv partition has more than one output node")
+            conv_node = conv_partition.output_nodes[0]
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                raise ValueError(f"{conv_node} is not an aten conv2d operator")
+            # skip annotation if it is already annotated
+            if _skip_annotate([conv_node], filter_fn):
+                continue
+            self._annotate_conv_node_helper(conv_node, True, quantization_config)
+
+    def _annotate_maxpool2d(
+        self,
+        node: Node,
+        quantization_config: QuantizationConfig | None,
+    ) -> None:
+        if node.target is not torch.ops.aten.max_pool2d.default:
+            return
+        if quantization_config is None:
+            _annotate_nodes_not_quantize(node)
+            return
+
+        maxpool_node = node
+        if _is_any_annotated(
+            [
+                maxpool_node,
+            ]
+        ):
+            return
+
+        input_node = maxpool_node.args[0]
+        if not isinstance(input_node, Node):
+            raise AssertionError("input_node must be a FX Node")
+        input_qspec_map = {}
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        maxpool_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+            _is_output_of_quantized_pattern=True,
+        )
+
+    def _annotate_cat(
+        self, node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        if quantization_config is None:
+            _annotate_nodes_not_quantize(node)
+            return
+        cat_node = node
+        input_nodes = cat_node.args[0]
+        if not isinstance(input_nodes, Sequence):
+            raise AssertionError("input_nodes must be a Sequence of FX Nodes")
+        first_input_node = input_nodes[0]
+        input_qspec_map = {}
+        if not isinstance(first_input_node, Node):
+            raise AssertionError("first_input_node must be a FX Node")
+        if not isinstance(cat_node, Node):
+            raise AssertionError("cat_node must be a FX Node")
+        input_qspec_map[first_input_node] = get_input_act_qspec(quantization_config)
+        share_qparams_with_input_act0_qspec = SharedQuantizationSpec(
+            (first_input_node, cat_node)
+        )
+
+        for input_node in input_nodes[1:]:
+            if input_node not in input_qspec_map:
+                # There has the case of cat same nodes: torch.cat([input0, input0], 1)
+                if not isinstance(input_node, Node):
+                    raise AssertionError("input_node must be a FX Node")
+                input_qspec_map[input_node] = share_qparams_with_input_act0_qspec
+
+        cat_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+            _is_output_of_quantized_pattern=True,
+        )
+
+    def _annotate_propagation_quantizable_pattern_entry(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        for node in gm.graph.nodes:
+            self._annotate_propagation_quantizable_pattern(
+                node, quantization_config, filter_fn
+            )
+
+    def _annotate_propagation_quantizable_pattern(
+        self, node: Node, quantization_config, filter_fn
+    ) -> None:
+        # Propagate annotation to quantizable patterns.
+        if (
+            (node.target in propagation_quantizable_ops)
+            and (not _is_any_annotated([node]))
+            and (node.op == "call_function")
+        ):
+
+            def is_all_inputs_connected_to_quantized_op(input_nodes):
+                # Ensure all the inputs connect to fusion pattern or quantized node
+                for input_node in input_nodes:
+                    if not _is_quantized_op_pt2e(input_node):
+                        return False
+                return True
+
+            if _skip_annotate([node], filter_fn):
+                return
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize(node)
+                return
+
+            if node.target is torch.ops.aten.max_pool2d.default:
+                # Recipe of maxpool2d: check input arg[0] of maxpool2d is quantized or not
+                input_nodes_to_check = [node.all_input_nodes[0]]
+                if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check):
+                    if quantization_config is not None:
+                        warnings.warn(
+                            f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}.",
+                            stacklevel=2,
+                        )
+                    return
+
+                self._annotate_maxpool2d(node, quantization_config)
+                return
+            elif node.target is torch.ops.aten.cat.default:
+                input_nodes_to_check = node.all_input_nodes
+                if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check):
+                    return
+                self._annotate_cat(node, quantization_config)
+            elif (
+                node.target is torch.ops.aten.flatten.using_ints
+                and len(node.users) > 0
+                and not any(user.target in quantizable_ops for user in node.users)
+            ):
+                # Recipe of flatten: check if any users of flatten node are quantizable ops or not
+                return
+            else:
+                input_node = node.all_input_nodes[0]
+                if not is_all_inputs_connected_to_quantized_op(
+                    [
+                        input_node,
+                    ]
+                ):
+                    return
+                input_qspec_map = {}
+                input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+                node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                    input_qspec_map=input_qspec_map,
+                    _annotated=True,
+                    _is_output_of_quantized_pattern=True,
+                )
+        return
+
+    def _annotate_output_share_observer_as_input(
+        self, input_node: Node, source_node: Node
+    ):
+        source_node_quantization_annotation = source_node.meta.get(QUANT_ANNOTATION_KEY)
+        if (
+            source_node_quantization_annotation
+            and source_node_quantization_annotation._is_output_of_quantized_pattern
+        ):
+            edge_or_node = (input_node, source_node)
+            source_node_quantization_annotation.output_qspec = SharedQuantizationSpec(
+                edge_or_node
+            )
+        return
+
+    def _annotate_output_for_int8_in_int8_out_pattern_entry(
+        self,
+        model: torch.fx.GraphModule,
+    ):
+        for node in model.graph.nodes:
+            self._annotate_output_for_int8_in_int8_out_pattern(node)
+
+    def _annotate_output_for_int8_in_int8_out_pattern(
+        self,
+        node: Node,
+    ) -> None:
+        r"""
+        Check and insert observer at output of node in int8_in_int8_out_ops if needed.
+        Recipe refers to
+        https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
+        """  # noqa: B950
+        edge_or_node: tuple[Node, Node]
+        if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])):
+            if node.target is torch.ops.aten.max_pool2d.default:
+                maxpool_node = node
+                if not _is_all_annotated(
+                    [
+                        maxpool_node,
+                    ]
+                ):
+                    return
+
+                # Get the quantization_annotation from getitem_node
+                maxpool_node_quantization_annotation = maxpool_node.meta.get(
+                    QUANT_ANNOTATION_KEY
+                )
+                if (
+                    maxpool_node_quantization_annotation
+                    and maxpool_node_quantization_annotation._is_output_of_quantized_pattern
+                ):
+                    # Annotate the output_qspec of getitem_node
+                    input_act = maxpool_node.args[0]
+                    if not isinstance(input_act, Node):
+                        raise AssertionError("input_act must be a FX Node")
+                    if not isinstance(maxpool_node, Node):
+                        raise AssertionError("maxpool_node must be a FX Node")
+                    edge_or_node = (input_act, maxpool_node)
+                    maxpool_node_quantization_annotation.output_qspec = (
+                        SharedQuantizationSpec(edge_or_node)
+                    )
+            else:
+                input_node = node.all_input_nodes[0]
+                self._annotate_output_share_observer_as_input(input_node, node)
+        return
+
+    def _annotate_linear(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        linear_partitions = get_source_partitions(
+            gm.graph, [torch.nn.Linear, torch.nn.functional.linear]
+        )
+        linear_partitions = list(
+            itertools.chain.from_iterable(linear_partitions.values())
+        )
+        for partition in linear_partitions:
+            if len(partition.output_nodes) > 1:
+                raise ValueError(
+                    "Linear partition cannot have more than one output node"
+                )
+            linear_node = partition.output_nodes[0]
+            if (
+                linear_node.op != "call_function"
+                or linear_node.target != torch.ops.aten.linear.default
+            ):
+                raise ValueError(f"{linear_node} is not an aten linear operator")
+            # skip annotation if it is already annotated
+            if _skip_annotate([linear_node], filter_fn):
+                continue
+            self._annotate_linear_node_helper(linear_node, True, quantization_config)
+
+    def _annotate_linear_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        postop_list = [
+            torch.nn.ReLU,
+            torch.nn.LeakyReLU,
+            torch.nn.Tanh,
+            torch.nn.GELU,
+        ]
+        fused_partitions: list[tuple] = []
+        for postop in postop_list:
+            fused_partitions = fused_partitions + find_sequential_partitions(
+                gm, [torch.nn.Linear, postop]
+            )
+        for fused_partition in fused_partitions:
+            linear_partition, unary_partition = fused_partition
+            linear_node, unary_node = self._get_output_nodes_of_partitions(
+                [linear_partition, unary_partition]
+            )
+            if (
+                linear_node.op != "call_function"
+                or linear_node.target != torch.ops.aten.linear.default
+            ):
+                continue
+            if _skip_annotate([unary_node, linear_node], filter_fn):
+                continue
+
+            if quantization_config is None:
+                _annotate_nodes_not_quantize([linear_node, unary_node])
+                continue
+
+            self._annotate_linear_node_helper(linear_node, False, quantization_config)
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_linear_binary_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ) -> None:
+        """Annotate ``Linear -> add [-> ReLU]`` sequences as one fused pattern.
+
+        Two pattern shapes are searched: linear + add + ReLU, and linear + add.
+        A match is skipped when the linear node has more than one user, when the
+        linear input position on the binary node cannot be resolved, when the
+        linear node is not ``aten.linear.default``, or when ``_skip_annotate``
+        rejects the nodes. With a ``None`` config the matched nodes are handed
+        to ``_annotate_nodes_not_quantize``.
+
+        Raises:
+            ValueError: if the resolved binary input is not the linear node.
+            AssertionError: if the linear node is not an FX Node.
+        """
+        # linear + binary_op + (optional) unary op
+        binary_op_list = [operator.add]
+        unary_op_list = [torch.nn.ReLU, None]
+        combinations = itertools.product(binary_op_list, unary_op_list)
+        for binary_op, unary_op in combinations:
+            has_unary = unary_op is not None
+            seq_partition = [torch.nn.Linear, binary_op]
+            if has_unary:
+                # pyrefly: ignore [bad-argument-type]
+                seq_partition.append(unary_op)
+            fused_partitions = find_sequential_partitions(gm, seq_partition)
+            for fused_partition in fused_partitions:
+                unary_partition, unary_node = None, None
+                if has_unary:
+                    (
+                        linear_partition,
+                        binary_partition,
+                        unary_partition,
+                    ) = fused_partition
+                    (
+                        linear_node,
+                        binary_node,
+                        unary_node,
+                    ) = self._get_output_nodes_of_partitions(
+                        [linear_partition, binary_partition, unary_partition]
+                    )
+                else:
+                    linear_partition, binary_partition = fused_partition
+                    linear_node, binary_node = self._get_output_nodes_of_partitions(
+                        [linear_partition, binary_partition]
+                    )
+                if len(linear_node.users) != 1:
+                    # Linear Node should only has 1 user node
+                    continue
+                # Resolve which argument positions of the binary node hold the
+                # linear output and the extra input; either may come back None.
+                (
+                    linear_node_idx,
+                    extra_input_node_idx,
+                ) = self._get_input_idx_for_binary_node(linear_node, binary_node)
+                if (linear_node_idx is None) or (extra_input_node_idx is None):
+                    continue
+                if linear_node != binary_node.args[linear_node_idx]:
+                    raise ValueError(
+                        f"{linear_node} doesn't match input of binary node"
+                    )
+                if not isinstance(linear_node, Node):
+                    raise AssertionError("linear_node must be a FX Node")
+                if (
+                    linear_node.op != "call_function"
+                    or linear_node.target != torch.ops.aten.linear.default
+                ):
+                    # No linear node found to be fused with add
+                    continue
+                # Nodes of the match, ordered from the last op back to the linear.
+                node_list = (
+                    [binary_node, linear_node]
+                    if unary_node is None
+                    else [unary_node, binary_node, linear_node]
+                )
+                if _skip_annotate(node_list, filter_fn):
+                    continue
+
+                if quantization_config is None:
+                    _annotate_nodes_not_quantize(node_list)
+                    continue
+
+                self._annotate_linear_node_helper(
+                    linear_node, False, quantization_config
+                )
+                # We don't insert q-dq before the binary input node due to accuracy issues
+                # The binary node is the pattern output only when no unary op follows.
+                binary_node.meta[QUANT_ANNOTATION_KEY] = (
+                    _X86InductorQuantizationAnnotation(
+                        input_qspec_map={},
+                        _annotated=True,
+                        _is_output_of_quantized_pattern=(not has_unary),
+                    )
+                )
+                if unary_node is not None:
+                    unary_node.meta[QUANT_ANNOTATION_KEY] = (
+                        _X86InductorQuantizationAnnotation(
+                            _annotated=True,
+                            _is_output_of_quantized_pattern=True,
+                        )
+                    )
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a2234fdff3f137170d2810ef82fe8b7c706c0c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py
@@ -0,0 +1,451 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import copy
+import functools
+import typing_extensions
+from typing import Any, TYPE_CHECKING
+
+import torch
+import torch._dynamo as torchdynamo
+import torch.nn.functional as F
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantize,
+    FusedMovingAvgObsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    HistogramObserver,
+    MinMaxObserver,
+    MovingAverageMinMaxObserver,
+    MovingAveragePerChannelMinMaxObserver,
+    PerChannelMinMaxObserver,
+    PlaceholderObserver,
+)
+from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer
+from torch.ao.quantization.quantizer.utils import _get_module_name_filter
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    _convert_scalars_to_attrs,
+    OP_TO_ANNOTATOR,
+    OperatorConfig,
+    OperatorPatternType,
+    propagate_annotation,
+    QuantizationConfig,
+)
+from torch.fx._compatibility import compatibility
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+    from torch.fx import Node
+
+
+__all__ = [
+    "XNNPACKQuantizer",
+    "get_symmetric_quantization_config",
+]
+
+
+def _get_dynamo_graph(function: Callable, inputs) -> torch.fx.Graph:
+    gm, _ = torchdynamo.export(function, aten_graph=True)(*inputs)
+    gm.graph.eliminate_dead_code()
+    return gm.graph
+
+
+def _get_linear_patterns(input_size: list[int]):
+    in_channels = input_size[-1]
+    out_channels = 8  # hard coding but this should not matter
+    weight = torch.ones((out_channels, in_channels))
+    bias = torch.ones((out_channels,))
+    act = torch.ones(input_size)
+
+    def linear_op(act, weight, bias=None):
+        return F.linear(act, weight, bias)
+
+    pattern_w_bias = _get_dynamo_graph(linear_op, (act, weight, bias))
+    pattern_wo_bias = _get_dynamo_graph(linear_op, (act, weight))
+    return [pattern_w_bias, pattern_wo_bias]
+
+
+def _supported_symmetric_quantized_operators() -> dict[str, list[OperatorPatternType]]:
+    supported_operators: dict[str, list[OperatorPatternType]] = {
+        # Both conv and linear should be able to handle relu + hardtanh fusion since
+        # those are clamp ops
+        "conv2d": [
+            [torch.nn.Conv2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, F.relu],
+            [F.conv2d, torch.nn.ReLU],
+            [F.conv2d, F.relu],
+        ],
+        "linear": [[torch.nn.Linear], [F.linear]],
+        "add": [[torch.add]],
+        "adaptive_avg_pool2d": [
+            [torch.nn.AdaptiveAvgPool2d],
+            [F.adaptive_avg_pool2d],
+        ],
+    }
+    return copy.deepcopy(supported_operators)
+
+
+def _get_supported_symmetric_config_and_operators() -> list[OperatorConfig]:
+    supported_config_and_operators: list[OperatorConfig] = []
+    for quantization_config in [
+        get_symmetric_quantization_config(),
+        get_symmetric_quantization_config(is_qat=True),
+        get_symmetric_quantization_config(is_per_channel=True),
+        get_symmetric_quantization_config(is_per_channel=True, is_qat=True),
+    ]:
+        ops = _supported_symmetric_quantized_operators()
+        supported_config_and_operators.extend(
+            OperatorConfig(quantization_config, pattern_list)
+            for pattern_list in ops.values()
+        )
+    return copy.deepcopy(supported_config_and_operators)
+
+
+@functools.lru_cache
+def get_symmetric_quantization_config(
+    is_per_channel: bool = False,
+    is_qat: bool = False,
+    is_dynamic: bool = False,
+    act_qmin: int = -128,
+    act_qmax: int = 127,
+    weight_qmin: int = -127,
+    weight_qmax: int = 127,
+):
+    extra_args: dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = FakeQuantize
+            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
+                averaging_constant=1
+            )
+            extra_args["observer"] = dynamic_quant_observer
+        else:
+            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
+    else:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
+        else:
+            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]
+
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=act_qmin,
+        quant_max=act_qmax,
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=is_dynamic,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args,
+        ),
+    )
+    weight_qscheme = (
+        torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric
+    )
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        MinMaxObserver
+    )
+    if is_qat:
+        # TODO: qat + per channel?
+        weight_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize
+    elif is_per_channel:
+        weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver
+
+    extra_args: dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if weight_qscheme == torch.per_tensor_symmetric:
+            extra_args["observer"] = MovingAverageMinMaxObserver
+        else:
+            extra_args["observer"] = MovingAveragePerChannelMinMaxObserver  # type: ignore[dict-item]
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=weight_qmin,
+        quant_max=weight_qmax,
+        qscheme=weight_qscheme,
+        ch_axis=0,
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    bias_quantization_spec = None
+    if is_dynamic:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,
+            None,
+            weight_quantization_spec,
+            bias_quantization_spec,
+            is_qat,
+        )
+    else:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,
+            act_quantization_spec,
+            weight_quantization_spec,
+            bias_quantization_spec,
+            is_qat,
+        )
+    return quantization_config
+
+
+def _get_supported_config_and_operators() -> list[OperatorConfig]:
+    return _get_supported_symmetric_config_and_operators()
+
+
+def _get_module_type_filter(tp: Callable):
+    """Get the module_type_filter function for a given module type, the filter accepts
+    a node and checks if the node comes from a module that has certain module type
+
+    For example:
+        node: linear_op = call_function[...](...)  # comes from a module with type Block -> Sub -> Linear
+
+
+    >> module_type_filter = _get_module_type_filter(Sub)  # submodule with type `Sub`, under the `Block` submodule
+    >> print(module_type_filter(node))
+    True  # the node is from the submodule `Sub` (same for `Block` and `Linear` as well)
+    """
+
+    tp_str = tp.__module__ + "." + tp.__qualname__
+
+    def module_type_filter(n: Node) -> bool:
+        # example: {
+        #     'L__self___sub': ("L['self'].sub", <class '....Sub'>),
+        #     'L__self___sub_linear': ("L['self'].sub.linear", <class 'torch.nn.modules.linear.Linear'>)
+        # }
+        nn_module_stack = n.meta.get("nn_module_stack", {})
+        types = []
+        for _, t in nn_module_stack.values():
+            # export() returns str, but older APIs (e.g. capture_pre_autograd_graph)
+            # return type. Handle both cases.
+            if isinstance(t, type):
+                t = t.__module__ + "." + t.__qualname__
+            types.append(t)
+        return tp_str in types
+
+    return module_type_filter
+
+
+def _get_not_module_type_or_name_filter(
+    tp_list: list[Callable], module_name_list: list[str]
+) -> Callable[[Node], bool]:
+    module_type_filters = [_get_module_type_filter(tp) for tp in tp_list]
+    module_name_list_filters = [_get_module_name_filter(m) for m in module_name_list]
+
+    def not_module_type_or_name_filter(n: Node) -> bool:
+        return not any(f(n) for f in module_type_filters + module_name_list_filters)
+
+    return not_module_type_or_name_filter
+
+
+@compatibility(is_backward_compatible=False)
+@typing_extensions.deprecated(
+    "XNNPACKQuantizer is deprecated! Please use xnnpack quantizer in "
+    "ExecuTorch (https://github.com/pytorch/executorch/tree/main/backends/xnnpack/quantizer) instead."
+)
+class XNNPACKQuantizer(Quantizer):
+    """
+    !!! DEPRECATED !!!
+    XNNPACKQuantizer is a marked as deprecated. It will be removed in the future.
+    It has been moved to executorch.backends.xnnpack.quantizer.xnnpack_quantizer.XNNPACKQuantizer.
+    Please use the new quantizer instead.
+    """
+
+    supported_config_and_operators = _get_supported_config_and_operators()
+    STATIC_QAT_ONLY_OPS = [
+        "conv_bn_relu",
+        "conv_bn",
+        "conv_transpose_bn_relu",
+        "conv_transpose_bn",
+    ]
+
+    # static quantization ops (both PTQ and QAT)
+    # Preserve the order that fusions come before singular ops
+    STATIC_OPS = [
+        "linear_relu",
+        "linear",
+        "conv_relu",
+        "conv",
+        "conv_transpose_relu",
+        "adaptive_avg_pool2d",
+        # TODO: move this to BoltNNQuantizer?
+        "gru_io_only",
+        "add_relu",
+        "add",
+        "mul_relu",
+        "mul",
+        "cat",
+    ]
+
+    DYNAMIC_OPS = [
+        "linear",
+    ]
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.global_config: QuantizationConfig | None = None
+        self.operator_type_config: dict[
+            torch._ops.OpOverloadPacket, QuantizationConfig | None
+        ] = {}
+        self.module_type_config: dict[Callable, QuantizationConfig | None] = {}
+        self.module_name_config: dict[str, QuantizationConfig | None] = {}
+
+    @classmethod
+    def get_supported_quantization_configs(cls) -> list[QuantizationConfig]:
+        op_configs: set[QuantizationConfig] = {
+            spec for spec, _ in cls.supported_config_and_operators
+        }
+        return list(op_configs)
+
+    @classmethod
+    def get_supported_operator_for_quantization_config(
+        cls, quantization_config: QuantizationConfig | None
+    ) -> list[OperatorPatternType]:
+        if quantization_config is None:
+            all_ops = []
+            for _, ops in cls.supported_config_and_operators:
+                all_ops.extend(ops)
+            return all_ops
+
+        for config, ops in cls.supported_config_and_operators:
+            # note: this assumes each entry in cls.supported_spec_and_operators
+            # corresponds to one spec, e.g. we don't have
+            # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)]
+            # where the first and second entry have the same spec but did not
+            # merge the op list
+            if config == quantization_config:
+                return ops
+        return []
+
+    def set_global(self, quantization_config: QuantizationConfig) -> XNNPACKQuantizer:
+        self.global_config = quantization_config
+        return self
+
+    def set_operator_type(
+        self,
+        operator_type: torch._ops.OpOverloadPacket,
+        quantization_config: QuantizationConfig,
+    ) -> XNNPACKQuantizer:
+        self.operator_type_config[operator_type] = quantization_config
+        return self
+
+    def set_module_type(
+        self, module_type: Callable, quantization_config: QuantizationConfig
+    ):
+        """Set quantization_config for a submodule with type: `module_type`, for example:
+        quantizer.set_module_name(Sub) or quantizer.set_module_name(nn.Linear), it will quantize all supported operator/operator
+        patterns in the submodule with this module type with the given `quantization_config`
+        """
+        self.module_type_config[module_type] = quantization_config
+        return self
+
+    def set_module_name(
+        self, module_name: str, quantization_config: QuantizationConfig | None
+    ):
+        """Set quantization_config for a submodule with name: `module_name`, for example:
+        quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator
+        patterns in the submodule with this module name with the given `quantization_config`
+        """
+        if quantization_config is None:
+            raise AssertionError("quantization_config == None is not supported yet")
+        self.module_name_config[module_name] = quantization_config
+        return self
+
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        """Transforms scalar values to tensor attributes"""
+        return _convert_scalars_to_attrs(model)
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now"""
+        # hacked for handling dynamic linear quant. will fix later.
+        if self.global_config and self.global_config.input_activation.is_dynamic:  # type: ignore[union-attr]
+            model = self._annotate_for_dynamic_quantization_config(model)
+        else:
+            model = self._annotate_for_static_quantization_config(model)
+        propagate_annotation(model)
+        return model
+
+    def _annotate_all_static_patterns(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: Callable[[Node], bool] | None = None,
+    ) -> torch.fx.GraphModule:
+        # TODO: implement the support for None to be canceling out previous annotations
+        if quantization_config is None:
+            return model
+
+        if quantization_config.is_qat:
+            for op in self.STATIC_QAT_ONLY_OPS:
+                OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        for op in self.STATIC_OPS:
+            OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        return model
+
+    def _annotate_all_dynamic_patterns(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: Callable[[Node], bool] | None = None,
+    ) -> torch.fx.GraphModule:
+        # TODO: implement the support for None to be canceling out previous annotations
+        if quantization_config is None:
+            return model
+
+        for op in self.DYNAMIC_OPS:
+            OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        return model
+
+    def _annotate_for_static_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        module_name_list = list(self.module_name_config.keys())
+        for module_name, config in self.module_name_config.items():
+            self._annotate_all_static_patterns(
+                model, config, _get_module_name_filter(module_name)
+            )
+
+        tp_list = list(self.module_type_config.keys())
+        for module_type, config in self.module_type_config.items():
+            self._annotate_all_static_patterns(
+                model, config, _get_module_type_filter(module_type)
+            )
+
+        self._annotate_all_static_patterns(
+            model,
+            self.global_config,
+            _get_not_module_type_or_name_filter(tp_list, module_name_list),
+        )
+        return model
+
+    def _annotate_for_dynamic_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        module_name_list = list(self.module_name_config.keys())
+        for module_name, config in self.module_name_config.items():
+            self._annotate_all_dynamic_patterns(
+                model, config, _get_module_name_filter(module_name)
+            )
+
+        tp_list = list(self.module_type_config.keys())
+        for module_type, config in self.module_type_config.items():
+            self._annotate_all_dynamic_patterns(
+                model, config, _get_module_type_filter(module_type)
+            )
+
+        self._annotate_all_dynamic_patterns(
+            model,
+            self.global_config,
+            _get_not_module_type_or_name_filter(tp_list, module_name_list),
+        )
+        return model
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> list[OperatorConfig]:
+        return cls.supported_config_and_operators
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..22282d3d071a899e31cd4607027aa3abec249c7f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py
@@ -0,0 +1,1152 @@
+# mypy: allow-untyped-defs
+import itertools
+import typing
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import NamedTuple
+
+import torch
+import torch.nn.functional as F
+from torch._subclasses import FakeTensor
+from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
+from torch.ao.quantization.pt2e.export_utils import _WrapperModule
+from torch.ao.quantization.pt2e.utils import (
+    _get_aten_graph_module_for_pattern,
+    _is_conv_node,
+    _is_conv_transpose_node,
+)
+from torch.ao.quantization.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    SharedQuantizationSpec,
+)
+from torch.ao.quantization.quantizer.utils import (
+    _annotate_input_qspec_map,
+    _annotate_output_qspec,
+)
+from torch.fx import Node
+from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
+    SubgraphMatcherWithNameNodeMap,
+)
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+
+# Public API of this module: the names bound by ``from ... import *``.
+__all__ = [
+    "OperatorConfig",
+    "OperatorPatternType",
+    "QuantizationConfig",
+    "get_input_act_qspec",
+    "get_output_act_qspec",
+    "get_weight_qspec",
+    "get_bias_qspec",
+    "OP_TO_ANNOTATOR",
+    "propagate_annotation",
+]
+
+
+# In the absence of better name, just winging it with QuantizationConfig
+@dataclass(eq=True, frozen=True)
+class QuantizationConfig:
+    input_activation: QuantizationSpec | None
+    output_activation: QuantizationSpec | None
+    weight: QuantizationSpec | None
+    bias: QuantizationSpec | None
+    # TODO: remove, since we can use observer_or_fake_quant_ctr to express this
+    is_qat: bool = False
+
+
+# Alias for one operator pattern: a list of callables. It is wrapped in
+# ``typing.Annotated`` because ``list[Callable].__module__`` is read-only,
+# whereas the ``Annotated`` alias accepts the ``__module__`` assignment below.
+OperatorPatternType = typing.Annotated[list[Callable], None]
+OperatorPatternType.__module__ = (
+    "torch.ao.quantization.quantizer.xnnpack_quantizer_utils"
+)
+
+# Signature shared by every annotator: (graph module, quantization config or
+# None, optional node filter) -> list of annotated partitions, or None.
+AnnotatorType = Callable[
+    [
+        torch.fx.GraphModule,
+        QuantizationConfig | None,
+        Callable[[Node], bool] | None,
+    ],
+    list[list[Node]] | None,
+]
+# Registry from an operator-pattern name (e.g. "linear") to its annotator;
+# entries are added by ``register_annotator``.
+OP_TO_ANNOTATOR: dict[str, AnnotatorType] = {}
+
+
+def register_annotator(op: str) -> Callable[[AnnotatorType], None]:
+    def decorator(annotator: AnnotatorType) -> None:
+        OP_TO_ANNOTATOR[op] = annotator
+
+    return decorator
+
+
+class OperatorConfig(NamedTuple):
+    # Maps one quantization config to a list of operator patterns.
+    # A pattern is a list of nn modules, functions or builtin functions,
+    # e.g. [nn.Conv2d, torch.relu, torch.add].
+    # TODO: replace List[str] with
+    # List[List[Union[nn.Module, FunctionType, BuiltinFunctionType]]].
+    # Open question: whether fusion is an internal detail of the quantizer,
+    # in which case it would not need to be communicated to the user.
+    # Note that a pattern is not very informative, because the list of ops
+    # does not describe the graph structure that results from it.
+    config: QuantizationConfig
+    operators: list[OperatorPatternType]
+
+
+def _is_annotated(nodes: list[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    check if any of the node is annotated, return True if any of the node
+    is annotated, otherwise return False
+    """
+    annotated = False
+    for node in nodes:
+        annotated = annotated or (
+            "quantization_annotation" in node.meta
+            and node.meta["quantization_annotation"]._annotated
+        )
+    return annotated
+
+
+def _mark_nodes_as_annotated(nodes: list[Node]):
+    for node in nodes:
+        if node is not None:
+            if "quantization_annotation" not in node.meta:
+                node.meta["quantization_annotation"] = QuantizationAnnotation()
+            node.meta["quantization_annotation"]._annotated = True
+
+
+def get_input_act_qspec(quantization_config: QuantizationConfig | None):
+    if quantization_config is None:
+        return None
+    if quantization_config.input_activation is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.input_activation
+    if quantization_spec.qscheme not in [
+        torch.per_tensor_affine,
+        torch.per_tensor_symmetric,
+    ]:
+        raise AssertionError(
+            f"Unsupported activation qscheme: {quantization_spec.qscheme}"
+        )
+    return quantization_spec
+
+
+def get_output_act_qspec(quantization_config: QuantizationConfig | None):
+    if quantization_config is None:
+        return None
+    if quantization_config.output_activation is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.output_activation
+    if quantization_spec.qscheme not in [
+        torch.per_tensor_affine,
+        torch.per_tensor_symmetric,
+    ]:
+        raise AssertionError(
+            f"Unsupported activation qscheme: {quantization_spec.qscheme}"
+        )
+    return quantization_spec
+
+
+def get_weight_qspec(quantization_config: QuantizationConfig | None):
+    if quantization_config is None:
+        return None
+    if quantization_config is None:
+        raise AssertionError("quantization_config must not be None")
+    if quantization_config.weight is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.weight
+    if quantization_spec.qscheme not in [
+        torch.per_tensor_symmetric,
+        torch.per_channel_symmetric,
+        None,
+    ]:
+        raise ValueError(
+            f"Unsupported quantization_spec {quantization_spec} for weight"
+        )
+    return quantization_spec
+
+
+def get_bias_qspec(quantization_config: QuantizationConfig | None):
+    if quantization_config is None:
+        return None
+    if quantization_config is None:
+        raise AssertionError("quantization_config must not be None")
+    if quantization_config.bias is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.bias
+    if quantization_spec.dtype != torch.float:
+        raise AssertionError(
+            "Only float dtype for bias is supported for bias right now"
+        )
+    return quantization_spec
+
+
+@register_annotator("linear")
+def _annotate_linear(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    input_act_qspec = get_input_act_qspec(quantization_config)
+    output_act_qspec = get_output_act_qspec(quantization_config)
+    weight_qspec = get_weight_qspec(quantization_config)
+    bias_qspec = get_bias_qspec(quantization_config)
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target != torch.ops.aten.linear.default:
+            continue
+        if filter_fn and not filter_fn(node):
+            continue
+        act_node = node.args[0]
+        weight_node = node.args[1]
+        bias_node = None
+        if len(node.args) > 2:
+            bias_node = node.args[2]
+
+        if _is_annotated([node]) is False:  # type: ignore[list-item]
+            _annotate_input_qspec_map(
+                node,
+                act_node,
+                input_act_qspec,
+            )
+            _annotate_input_qspec_map(
+                node,
+                weight_node,
+                weight_qspec,
+            )
+            nodes_to_mark_annotated = [node, weight_node]
+            if bias_node:
+                _annotate_input_qspec_map(
+                    node,
+                    bias_node,
+                    bias_qspec,
+                )
+                nodes_to_mark_annotated.append(bias_node)
+            _annotate_output_qspec(node, output_act_qspec)
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+            annotated_partitions.append(nodes_to_mark_annotated)
+
+    return annotated_partitions
+
+
+@register_annotator("linear_relu")
+def _annotate_linear_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    input_act_qspec = get_input_act_qspec(quantization_config)
+    output_act_qspec = get_output_act_qspec(quantization_config)
+    weight_qspec = get_weight_qspec(quantization_config)
+    bias_qspec = get_bias_qspec(quantization_config)
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = node
+        maybe_linear_node = node.args[0]
+        if (
+            not isinstance(maybe_linear_node, Node)
+            or maybe_linear_node.op != "call_function"
+            or maybe_linear_node.target != torch.ops.aten.linear.default
+        ):
+            continue
+
+        linear_node = maybe_linear_node
+        if len(linear_node.users) > 1:
+            # if linear node has multiple users, then it can't be fused with relu
+            continue
+
+        input_qspec_map = {}
+        input_act = linear_node.args[0]
+        if not isinstance(input_act, Node):
+            raise AssertionError("input activation must be a FX Node")
+        input_qspec_map[input_act] = input_act_qspec
+
+        weight = linear_node.args[1]
+        if not isinstance(weight, Node):
+            raise AssertionError("weight must be a FX Node")
+        input_qspec_map[weight] = weight_qspec
+
+        # adding weight node to the partition as well
+        partition = [relu_node, linear_node, weight]
+        bias = linear_node.args[2] if len(linear_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = bias_qspec
+            partition.append(bias)
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        linear_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("conv")
+def _annotate_conv(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    for n in gm.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.conv1d.default,
+            torch.ops.aten.conv2d.default,
+        ]:
+            continue
+        conv_node = n
+
+        input_qspec_map = {}
+        input_act = conv_node.args[0]
+        if not isinstance(input_act, Node):
+            raise AssertionError("input activation must be a FX Node")
+        input_qspec_map[input_act] = get_input_act_qspec(quantization_config)
+
+        weight = conv_node.args[1]
+        if not isinstance(weight, Node):
+            raise AssertionError("weight must be a FX Node")
+        input_qspec_map[weight] = get_weight_qspec(quantization_config)
+
+        # adding weight node to the partition as well
+        partition = [conv_node, conv_node.args[1]]
+
+        bias = conv_node.args[2] if len(conv_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = get_bias_qspec(quantization_config)
+            partition.append(bias)
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=get_output_act_qspec(quantization_config),
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+def _do_annotate_conv_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+    is_conv_transpose: bool = False,
+):
+    annotated_partitions = []
+    for n in gm.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = n
+        maybe_conv_node = n.args[0]
+
+        is_conv_node = _is_conv_transpose_node if is_conv_transpose else _is_conv_node
+        if not isinstance(maybe_conv_node, Node) or not is_conv_node(maybe_conv_node):
+            continue
+        conv_node = maybe_conv_node
+
+        if len(conv_node.users) > 1:
+            # relu shouldn't be fuseable to conv if there are other users
+            # of convolution
+            continue
+
+        input_qspec_map = {}
+        input_act = conv_node.args[0]
+        if not isinstance(input_act, Node):
+            raise AssertionError("input activation must be a FX Node")
+        input_qspec_map[input_act] = get_input_act_qspec(quantization_config)
+
+        weight = conv_node.args[1]
+        if not isinstance(weight, Node):
+            raise AssertionError("weight must be a FX Node")
+        input_qspec_map[weight] = get_weight_qspec(quantization_config)
+
+        # adding weight node to the partition as well
+        partition = [relu_node, conv_node, conv_node.args[1]]
+        bias = conv_node.args[2] if len(conv_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = get_bias_qspec(quantization_config)
+            partition.append(bias)
+
+        # pyrefly: ignore [bad-argument-type]
+        if _is_annotated(partition):
+            continue
+
+        # pyrefly: ignore [bad-argument-type]
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map, _annotated=True
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+            _annotated=True,
+        )
+        # pyrefly: ignore [bad-argument-type]
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("conv_relu")
+def _annotate_conv_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    return _do_annotate_conv_relu(
+        gm, quantization_config, filter_fn, is_conv_transpose=False
+    )
+
+
+@register_annotator("conv_transpose_relu")
+def _annotate_conv_transpose_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    return _do_annotate_conv_relu(
+        gm, quantization_config, filter_fn, is_conv_transpose=True
+    )
+
+
+@register_annotator("conv_bn")
+def _annotate_conv_bn(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    """
+    Find conv + batchnorm partitions
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=False)
+
+
+@register_annotator("conv_bn_relu")
+def _annotate_conv_bn_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    """
+    Find conv + batchnorm + relu partitions
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=True)
+
+
+@register_annotator("conv_transpose_bn")
+def _annotate_conv_transpose_bn(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    """
+    Find conv_transpose + batchnorm partitions
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(
+        gm, quantization_config, filter_fn, has_relu=False, is_conv_transpose=True
+    )
+
+
+@register_annotator("conv_transpose_bn_relu")
+def _annotate_conv_transpose_bn_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    """
+    Find conv_transpose + batchnorm + relu partitions
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(
+        gm, quantization_config, filter_fn, has_relu=True, is_conv_transpose=True
+    )
+
+
+def _do_annotate_conv_bn(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None,
+    has_relu: bool,
+    is_conv_transpose: bool = False,
+) -> list[list[Node]]:
+    """
+    Match conv-bn[-relu] subgraphs in `gm` and annotate them.
+
+    Reference patterns are built for the 1d and 2d variants of conv (or of
+    conv_transpose when `is_conv_transpose` is set), converted to aten graph
+    modules and matched against `gm.graph`. Each pattern returns, next to its
+    output, a dictionary from string name to node with the names "input",
+    "conv", "weight", "bias" and "output"; the matched nodes are looked up
+    through that dictionary.
+
+    Side effect: dead code is eliminated from `gm.graph` and `gm` is
+    recompiled before matching.
+
+    Args:
+        gm: graph module to annotate in place.
+        quantization_config: source of the input/weight/bias/output specs.
+        filter_fn: optional predicate; a match is skipped when it rejects any
+            node of the partition.
+        has_relu: whether the pattern ends with a relu (both the in-place and
+            the out-of-place variant are then matched).
+        is_conv_transpose: match conv_transpose instead of conv.
+
+    Returns:
+        One `[conv, weight(, bias)]` list per annotated match.
+
+    Raises:
+        ValueError: if the matched conv node's args are not the matched
+            input, weight and bias nodes.
+    """
+
+    # Example inputs for conv-bn1d patterns
+    _conv1d_bn_example_inputs = (
+        torch.randn(1, 1, 3),  # x
+        torch.randn(1, 1, 1),  # conv_weight
+        torch.randn(1),  # conv_bias
+        torch.randn(1),  # bn_weight
+        torch.randn(1),  # bn_bias
+        torch.randn(1),  # bn_running_mean
+        torch.randn(1),  # bn_running_var
+    )
+
+    # Example inputs for conv-bn2d patterns
+    _conv2d_bn_example_inputs = (
+        torch.randn(1, 1, 3, 3),  # x
+        torch.randn(1, 1, 1, 1),  # conv_weight
+        torch.randn(1),  # conv_bias
+        torch.randn(1),  # bn_weight
+        torch.randn(1),  # bn_bias
+        torch.randn(1),  # bn_running_mean
+        torch.randn(1),  # bn_running_var
+    )
+
+    # Build the reference module for one conv function. `has_relu` is read
+    # from the enclosing scope; batch norm is called with training=True.
+    def get_pattern(conv_fn: Callable, relu_is_inplace: bool):
+        def _conv_bn(x, conv_weight, conv_bias, bn_weight, bn_bias, bn_rm, bn_rv):
+            conv = conv_fn(x, conv_weight, conv_bias)
+            bn = F.batch_norm(conv, bn_rm, bn_rv, bn_weight, bn_bias, training=True)
+            if has_relu:
+                output = F.relu_(bn) if relu_is_inplace else F.relu(bn)
+            else:
+                output = bn
+            # The second return value is the name -> node map consumed below.
+            return output, {
+                "input": x,
+                "conv": conv,
+                "weight": conv_weight,
+                "bias": conv_bias,
+                "output": output,
+            }
+
+        return _WrapperModule(_conv_bn)
+
+    # Needed for matching, otherwise the matches gets filtered out due to unused
+    # nodes returned by batch norm
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+
+    matches = []
+    if is_conv_transpose:
+        combinations = [
+            (F.conv_transpose1d, _conv1d_bn_example_inputs),
+            (F.conv_transpose2d, _conv2d_bn_example_inputs),
+        ]
+    else:
+        combinations = [
+            (F.conv1d, _conv1d_bn_example_inputs),  # type: ignore[list-item]
+            (F.conv2d, _conv2d_bn_example_inputs),  # type: ignore[list-item]
+        ]
+
+    # Add `is_cuda` and `relu_is_inplace` dimensions
+    # (the CUDA variant is only generated when CUDA is available, and the
+    # in-place relu variant only when the pattern has a relu)
+    combinations = itertools.product(  # type: ignore[assignment]
+        combinations,
+        [True, False] if torch.cuda.is_available() else [False],  # is_cuda
+        [True, False] if has_relu else [False],  # relu_is_inplace
+    )
+
+    # Match against all conv dimensions and cuda variants
+    for (conv_fn, example_inputs), is_cuda, relu_is_inplace in combinations:  # type: ignore[misc]
+        pattern = get_pattern(conv_fn, relu_is_inplace)  # type: ignore[has-type]
+        pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs, is_cuda)  # type: ignore[has-type]
+        # Same clean-up as applied to `gm` above, now on the pattern graph.
+        pattern.graph.eliminate_dead_code()
+        pattern.recompile()
+        matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True)
+        matches.extend(matcher.match(gm.graph))
+
+    # Annotate nodes returned in the matches
+    annotated_partitions = []
+    for match in matches:
+        name_node_map = match.name_node_map
+        input_node = name_node_map["input"]
+        conv_node = name_node_map["conv"]
+        weight_node = name_node_map["weight"]
+        bias_node = name_node_map["bias"]
+        output_node = name_node_map["output"]
+
+        # TODO: annotate the uses of input, weight, and bias separately instead
+        # of assuming they come from a single conv node. This is not possible today
+        # because input may have multiple users, and we can't rely on the conv node
+        # always being the first user. This was the case in models with skip
+        # connections like resnet18
+
+        # Validate conv args
+        if conv_node.args[0] is not input_node:
+            raise ValueError("Conv arg did not contain input node ", input_node)
+        if conv_node.args[1] is not weight_node:
+            raise ValueError("Conv arg did not contain weight node ", weight_node)
+        if len(conv_node.args) > 2 and conv_node.args[2] is not bias_node:
+            raise ValueError("Conv arg did not contain bias node ", bias_node)
+
+        # Skip if the partition is already annotated or is filtered out by the user
+        # (the partition holds the conv, its weight and, if present, its bias;
+        # the input and output nodes are not part of it)
+        partition = [conv_node, weight_node]
+        if bias_node is not None:
+            partition.append(bias_node)
+        if _is_annotated(partition):
+            continue
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        # Annotate conv inputs and pattern output
+        # (input specs go on the conv node, the output spec on the pattern's
+        # output node, i.e. the batchnorm or the relu)
+        input_qspec_map = {}
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+        if bias_node is not None:
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        output_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("gru_io_only")
+def _annotate_gru_io_only(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    gru_partitions = get_source_partitions(gm.graph, [torch.nn.GRU], filter_fn)
+    gru_partitions = list(itertools.chain.from_iterable(gru_partitions.values()))
+    annotated_partitions = []
+    for gru_partition in gru_partitions:
+        annotated_partitions.append(gru_partition.nodes)
+        output_nodes = gru_partition.output_nodes
+        input_nodes = gru_partition.input_nodes
+        # skip annotation if it is already annotated
+        if _is_annotated(input_nodes + output_nodes):
+            continue
+        # inside each GRU partition, we should be able to annotate each linear
+        # subgraph
+        input_act = input_nodes[0]
+        input_act_user = next(iter(input_act.users.keys()))
+        if not isinstance(input_act, Node):
+            raise AssertionError("input activation must be a FX Node")
+        if not isinstance(input_act_user, Node):
+            raise AssertionError("input activation user must be a FX Node")
+        input_act_user.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                input_act: get_input_act_qspec(quantization_config),
+            },
+            _annotated=True,
+        )
+
+        hidden_state = input_nodes[1]
+        hidden_state_user = next(iter(hidden_state.users.keys()))
+        if not isinstance(hidden_state, Node):
+            raise AssertionError("hidden state must be a FX Node")
+        if not isinstance(hidden_state_user, Node):
+            raise AssertionError("hidden state user must be a FX Node")
+        hidden_state_user.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                hidden_state: get_input_act_qspec(quantization_config),
+            },
+            _annotated=True,
+        )
+
+        if len(output_nodes) != 2:
+            raise AssertionError("expecting GRU to have two outputs")
+        for output in output_nodes:
+            output.meta["quantization_annotation"] = QuantizationAnnotation(
+                output_qspec=get_output_act_qspec(quantization_config),
+                _annotated=True,
+            )
+        nodes_to_mark_annotated = list(gru_partition.nodes)
+        _mark_nodes_as_annotated(nodes_to_mark_annotated)
+    return annotated_partitions
+
+
+@register_annotator("adaptive_avg_pool2d")
+def _annotate_adaptive_avg_pool2d(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    """Always annotate adaptive_avg_pool2d op"""
+    module_partitions = get_source_partitions(
+        gm.graph, [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d], filter_fn
+    )
+    partitions = list(itertools.chain.from_iterable(module_partitions.values()))
+    annotated_partitions = []
+    for partition in partitions:
+        pool_node = partition.output_nodes[0]
+        if (
+            pool_node.op != "call_function"
+            or pool_node.target != torch.ops.aten.adaptive_avg_pool2d.default
+        ):
+            raise ValueError(f"{pool_node} is not an aten adaptive_avg_pool2d operator")
+
+        if _is_annotated([pool_node]):
+            continue
+
+        annotated_partitions.append(partition.nodes)
+        input_act = pool_node.args[0]
+        if not isinstance(input_act, Node):
+            raise AssertionError("input activation must be a FX Node")
+
+        # only annotate input output sharing operator
+        # when the output of the input node is annotated
+        if (
+            "quantization_annotation" not in input_act.meta
+            or not input_act.meta["quantization_annotation"]._annotated
+            or input_act.meta["quantization_annotation"].output_qspec is None
+        ):
+            input_act_qspec = get_input_act_qspec(quantization_config)
+        else:
+            input_act_qspec = SharedQuantizationSpec(input_act)
+
+        # output sharing with input
+        output_act_qspec = SharedQuantizationSpec((input_act, pool_node))
+        pool_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                input_act: input_act_qspec,
+            },
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+def _is_input_large_scalar(node: Node, gm: torch.fx.GraphModule):
+    """Check if input is a large scalar value. So that we can skip quantization for the node
+    since histc op (in HistogramObserver) only works for values up to certain upper bound
+    """
+    if node.op == "get_attr":
+        qualified_name = str(node.target)
+        module_path, _, name = qualified_name.rpartition(".")
+        submod = gm.get_submodule(module_path)
+        tensor = getattr(submod, name)
+        # torch.histc works until this upper bound
+        HISTC_UPPER_BOUND = 3.4028235e15
+        return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND
+    return False
+
+
+def _is_input_non_float_tensor(node: Node):
+    """Check if the input is not a float tensor, so that we can skip quantization for the node
+    since observers only works with float Tensors
+    """
+    if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor):
+        return True
+    return node.meta["val"].dtype != torch.float32
+
+
+@register_annotator("add_relu")
+def _annotate_add_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = node
+        maybe_add = node.args[0]
+        if (
+            not isinstance(maybe_add, Node)
+            or maybe_add.op != "call_function"
+            or maybe_add.target
+            not in [
+                torch.ops.aten.add.Tensor,
+                torch.ops.aten.add_.Tensor,
+            ]
+        ):
+            continue
+
+        add_node = maybe_add
+
+        if len(add_node.users) > 1:
+            # add can't be fused with ReLU if the result of add is being used
+            # else where in the graph
+            continue
+
+        partition = [relu_node, add_node]
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = add_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            partition.append(input_act0)
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = add_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            partition.append(input_act1)
+            input_qspec_map[input_act1] = input_act_qspec
+
+        add_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("add")
+def _annotate_add(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.add_.Tensor,
+        ]:
+            continue
+        add_node = node
+        partition = [add_node]
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = add_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+            partition.append(input_act0)
+
+        input_act1 = add_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+            partition.append(input_act1)
+
+        add_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("mul_relu")
+def _annotate_mul_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = node
+        maybe_mul = node.args[0]
+        if (
+            not isinstance(maybe_mul, Node)
+            or maybe_mul.op != "call_function"
+            or maybe_mul.target
+            not in [
+                torch.ops.aten.mul.Tensor,
+                torch.ops.aten.mul_.Tensor,
+            ]
+        ):
+            continue
+
+        mul_node = maybe_mul
+        if len(mul_node.users) > 1:
+            # mul can't be fused with ReLU if the result of mul is being used
+            # else where in the graph
+            continue
+
+        partition = [relu_node, mul_node]
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = mul_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            partition.append(input_act0)
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = mul_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            partition.append(input_act1)
+            input_qspec_map[input_act1] = input_act_qspec
+
+        mul_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("mul")
+def _annotate_mul(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    annotated_partitions = []
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.mul_.Tensor,
+        ]:
+            continue
+
+        mul_node = node
+        partition = [mul_node]
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = mul_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+            partition.append(input_act0)
+
+        input_act1 = mul_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+            partition.append(input_act0)
+
+        mul_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+# TODO: remove Optional in return type, fix annotated_partitions logic
+@register_annotator("cat")
+def _annotate_cat(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig | None,
+    filter_fn: Callable[[Node], bool] | None = None,
+) -> list[list[Node]] | None:
+    cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn)
+    cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values()))
+    annotated_partitions = []
+    for cat_partition in cat_partitions:
+        cat_node = cat_partition.output_nodes[0]
+        if _is_annotated([cat_node]):
+            continue
+
+        if cat_node.target != torch.ops.aten.cat.default:
+            # TODO: change this to AnnotationException
+            raise Exception(  # noqa: TRY002
+                f"Expected cat node: torch.ops.aten.cat.default, but found {cat_node.target}"
+                " please check if you are calling the correct capture API"
+            )
+
+        annotated_partitions.append(cat_partition.nodes)
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        inputs = cat_node.args[0]
+
+        input_qspec_map = {}
+        input_act0 = inputs[0]  # type: ignore[index]
+        if isinstance(input_act0, Node):
+            input_qspec_map[input_act0] = input_act_qspec
+
+        shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node))  # type: ignore[arg-type]
+        for input_act in inputs[1:]:  # type: ignore[index, union-attr]
+            if input_act not in input_qspec_map:
+                input_qspec_map[input_act] = shared_with_input0_qspec  # type: ignore[index]
+
+        output_act_qspec = shared_with_input0_qspec
+
+        cat_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+def _is_share_obs_or_fq_op(op: Callable) -> bool:
+    return op in [
+        torch.ops.aten.relu.default,
+        torch.ops.aten.hardtanh.default,
+        torch.ops.aten.hardtanh_.default,
+        torch.ops.aten.max_pool2d.default,
+        torch.ops.aten.mean.default,
+        torch.ops.aten.mean.dim,
+        torch.ops.aten.permute.default,
+        torch.ops.aten.permute_copy.default,
+        torch.ops.aten.squeeze.dim,
+        torch.ops.aten.squeeze_copy.dim,
+        # TODO: remove?
+        torch.ops.aten.adaptive_avg_pool2d.default,
+        torch.ops.aten.view_copy.default,
+        torch.ops.aten.view.default,
+        torch.ops.aten.slice_copy.Tensor,
+        torch.ops.aten.flatten.using_ints,
+    ]
+
+
+def propagate_annotation(model: torch.fx.GraphModule) -> None:
+    for n in model.graph.nodes:
+        if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target):
+            continue
+
+        prev_node = n.args[0]
+        if not isinstance(prev_node, Node):
+            continue
+
+        quantization_annotation = prev_node.meta.get("quantization_annotation", None)
+        if not quantization_annotation:
+            continue
+
+        output_qspec = quantization_annotation.output_qspec
+        if not output_qspec:
+            continue
+
+        # make sure current node is not annotated
+        if (
+            "quantization_annotation" in n.meta
+            and n.meta["quantization_annotation"]._annotated
+        ):
+            continue
+
+        shared_qspec = SharedQuantizationSpec(prev_node)
+        # propagate the previous output_qspec to the current node
+        n.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                prev_node: shared_qspec,
+            },
+            output_qspec=shared_qspec,
+            _annotated=True,
+        )
+
+
+# TODO: make the list of ops customizable
+def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for n in model.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.mul.Tensor,
+        ]:
+            continue
+        args = list(n.args)
+        new_args = []
+        for i in range(len(args)):
+            if isinstance(args[i], torch.fx.Node):
+                new_args.append(args[i])
+                continue
+            prefix = "_tensor_constant_"
+            get_new_attr_name = get_new_attr_name_with_prefix(prefix)
+            tensor_constant_name = get_new_attr_name(model)
+            float_tensor = torch.tensor(float(args[i]))
+            model.register_buffer(tensor_constant_name, float_tensor)
+            fake_mode = n.meta["val"].fake_mode
+            with model.graph.inserting_before(n):
+                get_attr_node = model.graph.create_node(
+                    "get_attr", tensor_constant_name, (), {}
+                )
+                get_attr_node.meta["val"] = fake_mode.from_tensor(
+                    float_tensor, static_shapes=True
+                )
+                new_args.append(get_attr_node)
+        n.args = tuple(new_args)
+    model.recompile()
+    return model
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c0fc48fd54fa17b6ed0db900677ab339d62a988
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py
@@ -0,0 +1,117 @@
+# mypy: allow-untyped-defs
+import functools
+from typing import Any, TYPE_CHECKING
+
+import torch
+from torch.ao.quantization.observer import HistogramObserver, PerChannelMinMaxObserver
+from torch.ao.quantization.quantizer.quantizer import QuantizationSpec
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
+    _is_any_annotated,
+    FilterFn,
+    int8_in_int8_out_ops,
+    X86InductorQuantizer,
+)
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig
+from torch.fx import Node
+
+
+if TYPE_CHECKING:
+    from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+
+__all__ = [
+    "XPUInductorQuantizer",
+    "get_default_xpu_inductor_quantization_config",
+]
+
+
+@functools.lru_cache
+def get_default_xpu_inductor_quantization_config():
+    extra_args: dict[str, Any] = {"eps": 2**-12}
+    act_observer_or_fake_quant_ctr = HistogramObserver
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        PerChannelMinMaxObserver
+    )
+
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,  # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    bias_quantization_spec = None  # will use placeholder observer by default
+    quantization_config = QuantizationConfig(
+        act_quantization_spec,
+        act_quantization_spec,
+        weight_quantization_spec,
+        bias_quantization_spec,
+        False,
+    )
+    return quantization_config
+
+
+class XPUInductorQuantizer(X86InductorQuantizer):
+    """
+    XPUInductorQuantizer is a class designed to facilitate
+    quantization capability at Intel GPU backend. The class
+    highly reuses the existing implementation of
+    X86InductorQuantizer as both are intended to take advantage
+    of the optimized kernels in oneDNN library.
+    """
+
+    """
+        Following annotate_xx overrides the impls in base class, as
+        no XPU implementation for these operators currently. We would
+        gradually enable the XPU implementation and remove following
+        overrides. We keep the annotate methods but make the function
+        body empty, aiming to let `_generate_qdq_quantized_model`
+        generate qdq around op and graph execute on fp32 dtype for
+        unsupported operators.
+    """
+
+    def _annotate_qat_conv2d_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: QuantizationConfig | None,
+        filter_fn: FilterFn | None = None,
+    ):
+        pass
+
+    def _annotate_maxpool2d(
+        self,
+        node: Node,
+        quantization_config: QuantizationConfig | None,
+    ) -> None:
+        """
+        Here we skip the annotate logic for maxpool at XPU backend
+        as the quantized::max_pool2d is only implemented for CPU.
+        """
+        return
+
+    def _annotate_output_for_int8_in_int8_out_pattern(
+        self,
+        node: Node,
+    ) -> None:
+        if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])):
+            if node.target is torch.ops.aten.max_pool2d.default:
+                return
+            else:
+                input_node = node.all_input_nodes[0]
+                self._annotate_output_share_observer_as_input(input_node, node)
+        return
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dd05374eff844be2cec2d913b88a338aded4e6a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/stubs.py
@@ -0,0 +1,74 @@
+from typing import Any
+
+import torch
+from torch import nn
+from torch.ao.quantization import QConfig
+
+
+__all__ = ["QuantStub", "DeQuantStub", "QuantWrapper"]
+
+
+class QuantStub(nn.Module):
+    r"""Quantize stub module, before calibration, this is same as an observer,
+    it will be swapped as `nnq.Quantize` in `convert`.
+
+    Args:
+        qconfig: quantization configuration for the tensor,
+            if qconfig is not provided, we will get qconfig from parent modules
+    """
+
+    def __init__(self, qconfig: QConfig | None = None):
+        super().__init__()
+        if qconfig:
+            self.qconfig = qconfig
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+
+class DeQuantStub(nn.Module):
+    r"""Dequantize stub module, before calibration, this is same as identity,
+    this will be swapped as `nnq.DeQuantize` in `convert`.
+
+    Args:
+        qconfig: quantization configuration for the tensor,
+            if qconfig is not provided, we will get qconfig from parent modules
+    """
+
+    def __init__(self, qconfig: Any | None = None):
+        super().__init__()
+        if qconfig:
+            self.qconfig = qconfig
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+
+class QuantWrapper(nn.Module):
+    r"""A wrapper class that wraps the input module, adds QuantStub and
+    DeQuantStub and surround the call to module with call to quant and dequant
+    modules.
+
+    This is used by the `quantization` utility functions to add the quant and
+    dequant modules, before `convert` function `QuantStub` will just be observer,
+    it observes the input tensor, after `convert`, `QuantStub`
+    will be swapped to `nnq.Quantize` which does actual quantization. Similarly
+    for `DeQuantStub`.
+    """
+
+    quant: QuantStub
+    dequant: DeQuantStub
+    module: nn.Module
+
+    def __init__(self, module: nn.Module):
+        super().__init__()
+        qconfig = getattr(module, "qconfig", None)
+        self.add_module("quant", QuantStub(qconfig))
+        self.add_module("dequant", DeQuantStub(qconfig))
+        self.add_module("module", module)
+        self.train(module.training)
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        X = self.quant(X)
+        X = self.module(X)
+        return self.dequant(X)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..84a027e17e6b07cfbddc8b7b436ba0299b32ef91
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/ao/quantization/utils.py
@@ -0,0 +1,875 @@
+# mypy: allow-untyped-defs
+"""
+Utils shared by different modes of quantization (eager/graph)
+"""
+
+import functools
+import sys
+import warnings
+from collections import OrderedDict
+from collections.abc import Callable
+from inspect import getfullargspec, signature
+from typing import Any, Union
+
+import torch
+from torch.ao.quantization.quant_type import QuantType
+from torch.fx import Node
+from torch.nn.utils.parametrize import is_parametrized
+
+
+# NodePattern is built one of two ways depending on the interpreter: a plain
+# typing.Union alias (with __module__ set by hand) before Python 3.12, or a
+# named typing.TypeAliasType from 3.12 on.
+if sys.version_info < (3, 12):
+    NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any]
+    NodePattern.__module__ = "torch.ao.quantization.utils"
+else:
+    from typing import TypeAliasType
+
+    NodePattern = TypeAliasType(
+        "NodePattern", tuple[Node, Node] | tuple[Node, tuple[Node, Node]] | Any
+    )
+
+
+# This is the Quantizer class instance from torch/quantization/fx/quantize.py.
+# Define separately to prevent circular imports.
+# TODO(future PR): improve this.
+# make this public once fixed (can't be public as is because setting the module directly
+# doesn't work)
+QuantizerCls = Any
+
+# Type for fusion patterns, it can be more complicated than the following actually,
+# see pattern.md for docs
+# TODO: not sure if typing supports recursive data types
+
+# Same version split as NodePattern above: Union alias before Python 3.12,
+# TypeAliasType from 3.12 on.
+if sys.version_info < (3, 12):
+    Pattern = Union[
+        Callable,
+        tuple[Callable, Callable],
+        tuple[Callable, tuple[Callable, Callable]],
+        Any,
+    ]
+    Pattern.__module__ = "torch.ao.quantization.utils"
+else:
+    from typing import TypeAliasType
+
+    Pattern = TypeAliasType(
+        "Pattern",
+        Callable
+        | tuple[Callable, Callable]
+        | tuple[Callable, tuple[Callable, Callable]]
+        | Any,
+    )
+
+
+# TODO: maybe rename this to MatchInputNode
+class MatchAllNode:
+    """A node pattern that matches all nodes, used in defining
+    fusion patterns in FX Graph Mode Quantization.
+
+    The class defines no attributes or methods of its own; it only serves
+    as a marker type.
+    """
+
+
+# Lookup tables consulted by `check_node` below. Despite the `*_list` names,
+# all three are sets.
+
+# Module classes matched against the type of a `call_module` target.
+module_type_list = {
+    torch.nn.ReLU,
+    torch.nn.ReLU6,
+    torch.nn.AdaptiveAvgPool1d,
+    torch.nn.AdaptiveAvgPool2d,
+    torch.nn.AdaptiveAvgPool3d,
+    torch.nn.AvgPool1d,
+    torch.nn.AvgPool2d,
+    torch.nn.AvgPool3d,
+    torch.nn.MaxPool1d,
+    torch.nn.MaxPool2d,
+    torch.nn.MaxPool3d,
+    torch.nn.Identity,
+    torch.nn.Hardsigmoid,
+    torch.nn.Sigmoid,
+    torch.nn.Tanh,
+}
+# Callables matched against the target of a `call_function` node.
+func_list = {
+    torch.nn.functional.adaptive_avg_pool1d,
+    torch.nn.functional.adaptive_avg_pool2d,
+    torch.nn.functional.adaptive_avg_pool3d,
+    torch.nn.functional.elu,
+    torch.nn.functional.hardswish,
+    torch.nn.functional.instance_norm,
+    torch.nn.functional.layer_norm,
+    torch.nn.functional.leaky_relu,
+    torch.nn.functional.silu,
+    torch.nn.functional.mish,
+    torch.nn.functional.dropout,
+    torch.nn.functional.max_pool1d,
+    torch.nn.functional.max_pool2d,
+    torch.nn.functional.max_pool3d,
+    torch.nn.functional.relu,
+    torch.nn.functional.hardtanh,
+    torch.nn.functional.hardtanh_,
+    torch.nn.functional.hardsigmoid,
+    torch.nn.functional.sigmoid,
+    torch.transpose,
+    torch.repeat_interleave,
+    torch.sigmoid,
+    torch.squeeze,
+    torch.stack,
+    torch.sum,
+    torch.tanh,
+    torch.unsqueeze,
+    torch.cat,
+}
+# Targets matched against a `call_method` node. All entries are method-name
+# strings except `torch.mean`, which is a function object.
+method_list = {
+    torch.mean,
+    "relu",
+    "relu_",
+    "contiguous",
+    "detach",
+    "detach_",
+    "hardsigmoid",
+    "hardsigmoid_",
+    "permute",
+    "repeat",
+    "repeat_interleave",
+    "reshape",
+    "resize_",
+    "shape",
+    "sigmoid",
+    "sigmoid_",
+    "size",
+    "squeeze",
+    "squeeze_",
+    "tanh",
+    "tanh_",
+    "transpose",
+    "unsqueeze",
+    "unsqueeze_",
+    "view",
+}
+
+
+# TODO: not used now, remove
+def check_node(node, modules):
+    # TODO: reuse is_fixed_qparam_node after we move this function to _lower_to_native_backend.py
+    is_call_function = node.op == "call_function" and node.target in func_list
+    is_call_method = node.op == "call_method" and node.target in method_list
+    is_call_module = (
+        node.op == "call_module" and type(modules[str(node.target)]) in module_type_list
+    )
+    return is_call_function, is_call_method, is_call_module
+
+
+def get_combined_dict(default_dict, additional_dict):
+    """
+    Combines two dictionaries.
+
+    This function takes two dictionaries as input and returns a new dictionary
+    that contains all the key-value pairs from both input dictionaries.
+    If there are any duplicate keys in the `additional_dict`, the values
+    from the `additional_dict` will overwrite those in the `default_dict`.
+    Args:
+        default_dict (dict): The main dictionary that will be used as the base
+        additional_dict (dict): The dictionary used to update `default_dict`
+
+    Returns:
+        dict: The resulting dictionary
+    Example:
+        >>> x = dict(a=1, b=1)
+        >>> y = dict(b=2, c=3)
+        >>> get_combined_dict(x, y)
+        {'a': 1, 'b': 2, 'c': 3}
+    """
+    d = default_dict.copy()
+    d.update(additional_dict)
+    return d
+
+
+def is_per_tensor(qscheme):
+    return qscheme == torch.per_tensor_affine or qscheme == torch.per_tensor_symmetric
+
+
+def is_per_channel(qscheme):
+    return qscheme in [
+        torch.per_channel_affine,
+        torch.per_channel_affine_float_qparams,
+        torch.per_channel_symmetric,
+    ]
+
+
+def getattr_from_fqn(obj: Any, fqn: str) -> Any:
+    """
+    Given an obj and a fqn such as "foo.bar.baz", returns gm.foo.bar.baz.
+    """
+    return functools.reduce(getattr, fqn.split("."), obj)
+
+
+def to_underlying_dtype(qdtype):
+    DTYPE_MAPPING = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+        torch.quint4x2: torch.uint8,
+        torch.quint2x4: torch.uint8,
+        torch.uint8: torch.uint8,
+        torch.int8: torch.int8,
+        torch.uint16: torch.uint16,
+        torch.int16: torch.int16,
+        torch.int32: torch.int32,
+        torch.float8_e5m2: torch.float8_e5m2,
+        torch.float8_e4m3fn: torch.float8_e4m3fn,
+    }
+    if qdtype not in DTYPE_MAPPING:
+        raise AssertionError("Unsupported dtype: " + str(qdtype))
+    return DTYPE_MAPPING[qdtype]
+
+
+def get_qparam_dict(observer_or_fake_quant):
+    from torch.ao.quantization.observer import PlaceholderObserver
+
+    qscheme = getattr(observer_or_fake_quant, "qscheme", None)
+    dtype = observer_or_fake_quant.dtype
+    qparams = {"qscheme": qscheme, "dtype": dtype}
+
+    if not qscheme or isinstance(observer_or_fake_quant, PlaceholderObserver):
+        return {"qscheme": None, "dtype": dtype}
+
+    if is_per_tensor(qscheme):
+        qscheme = torch.per_tensor_affine
+    elif is_per_channel(qscheme):
+        # change symmetric to affine since we do not have symmetric
+        # quantized Tensor
+        if qscheme == torch.per_channel_symmetric:
+            qscheme = torch.per_channel_affine
+        qparams["axis"] = observer_or_fake_quant.ch_axis
+    else:
+        raise RuntimeError(f"Unrecognized qscheme: {qscheme}")
+    # update qscheme, since we don't have symmetric quant qscheme
+    # in quantized Tensor
+    qparams["qscheme"] = qscheme
+
+    scale, zero_point = observer_or_fake_quant.calculate_qparams()
+    qparams["scale"] = scale
+    qparams["zero_point"] = zero_point
+
+    if hasattr(observer_or_fake_quant, "quant_min"):
+        qparams["quant_min"] = observer_or_fake_quant.quant_min
+    if hasattr(observer_or_fake_quant, "quant_max"):
+        qparams["quant_max"] = observer_or_fake_quant.quant_max
+
+    return qparams
+
+
+def get_swapped_custom_module_class(
+    custom_module, custom_module_class_mapping, qconfig
+):
+    """Get the observed/quantized custom module class that we need
+    to swap `custom_module` to
+    Input:
+        custom_module: input, can be an instance of either a float or observed custom module
+        custom_module_class_mapping: the float to observed or observed to quantized custom module class mapping
+        qconfig: qconfig configured for the custom module
+
+    Output:
+        corresponding observed/quantized custom module class for input custom module instance
+    """
+    quant_type = get_quant_type(qconfig)
+    class_mapping = custom_module_class_mapping.get(quant_type, {})
+    if type(custom_module) not in class_mapping:
+        raise AssertionError(
+            "did not find corresponding observed "
+            f"module class for {type(custom_module)} in mapping: {class_mapping}"
+        )
+    return class_mapping[type(custom_module)]
+
+
+def activation_dtype(qconfig):
+    if qconfig is None:
+        raise AssertionError("qconfig must be provided to determine activation dtype")
+    activation = qconfig.activation()
+    return activation.dtype
+
+
+def weight_dtype(qconfig):
+    if qconfig is None:
+        raise AssertionError("qconfig must be provided to determine weight dtype")
+    weight = qconfig.weight()
+    return weight.dtype
+
+
+def activation_is_statically_quantized(qconfig):
+    """Given a qconfig, decide if the activation needs to be
+    quantized or not, this includes quantizing to quint8, qint8 and qint32 and float16
+    """
+    return activation_dtype(qconfig) in [
+        torch.quint8,
+        torch.qint8,
+        torch.qint32,
+        torch.float16,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+    ] and (not activation_is_dynamically_quantized(qconfig))
+
+
+def activation_is_dynamically_quantized(qconfig):
+    """Given a qconfig, decide if the activation needs to be
+    dynamically quantized or not, this includes dynamically quantizing to
+    quint8, qint8 and float16
+    """
+    _activation_dtype, _, activation_is_dynamic = get_qconfig_dtypes(qconfig)
+    return activation_is_dynamic
+
+
+def activation_is_int8_quantized(qconfig):
+    """Given a qconfig, decide if the activation needs to be
+    quantized to int8 or not, this includes quantizing to quint8, qint8
+    """
+    return activation_dtype(qconfig) in [
+        torch.quint8,
+        torch.qint8,
+        torch.uint8,
+        torch.int8,
+    ]
+
+
+def activation_is_int32_quantized(qconfig):
+    """Given a qconfig, decide if the activation needs to be
+    quantized to int32 or not
+    """
+    return activation_dtype(qconfig) in [torch.qint32, torch.int32]
+
+
+def weight_is_quantized(qconfig):
+    """Given a qconfig, decide if the weight needs to be
+    quantized or not
+    """
+    return weight_dtype(qconfig) in [
+        torch.quint8,
+        torch.qint8,
+        torch.float16,
+        torch.quint4x2,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+    ]
+
+
+def weight_is_statically_quantized(qconfig):
+    """Given a qconfig, decide if the weight needs to be statically
+    quantized or not
+    """
+    return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.uint8, torch.int8]
+
+
+def op_is_int8_dynamically_quantized(qconfig) -> bool:
+    """Given a qconfig, returns True if this op is using int8 dynamic
+    quantization
+    """
+    activation_dtype, weight_dtype, activation_is_dynamic = get_qconfig_dtypes(qconfig)
+    return (
+        activation_dtype in [torch.quint8, torch.uint8]
+        and
+        # for now, the lines below assume fbgemm or qnnpack
+        weight_dtype in [torch.qint8, torch.int8]
+        and activation_is_dynamic
+    )
+
+
+def get_qconfig_dtypes(qconfig):
+    r"""returns the qconfig tuple for qconfig:
+    (activation_dtype, weight_dtype, activation_is_dynamic)
+    """
+    if qconfig is None:
+        raise AssertionError("qconfig must be provided to extract dtypes")
+    activation = qconfig.activation()
+    weight = qconfig.weight()
+    act_is_dynamic = getattr(activation, "is_dynamic", False)
+    return (activation.dtype, weight.dtype, act_is_dynamic)
+
+
+def get_quant_type(qconfig):
+    if qconfig is None:
+        raise AssertionError("qconfig must be provided to determine quant type")
+    activation = qconfig.activation()
+    weight = qconfig.weight()
+    static_dtypes = [
+        torch.quint8,
+        torch.qint8,
+        torch.quint4x2,
+        torch.qint32,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+    ]
+    if weight.dtype in static_dtypes:
+        if hasattr(activation, "is_dynamic") and activation.is_dynamic:
+            return QuantType.DYNAMIC
+        elif activation.dtype in static_dtypes:
+            return QuantType.STATIC
+        else:
+            return QuantType.WEIGHT_ONLY
+
+    if weight.dtype == torch.float16:
+        if hasattr(activation, "is_dynamic") and activation.is_dynamic:
+            return QuantType.DYNAMIC
+        elif activation.dtype == torch.float16:
+            return QuantType.STATIC
+
+    raise Exception(  # noqa: TRY002
+        f"Unrecognized dtype combination in get_quant_type: activation({activation.dtype}),"
+        f"weight({weight.dtype})"
+    )
+
+
+def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool:
+    """Checks if the given minimum and maximum values are valid, meaning that
+    they exist and the min value is less than the max value.
+    """
+    if min_val.numel() == 0 or max_val.numel() == 0:
+        warnings.warn(
+            "must run observer before calling calculate_qparams. "
+            + "Returning default values.",
+            stacklevel=2,
+        )
+        return False
+
+    if min_val.dim() == 0 or max_val.dim() == 0:
+        if min_val == float("inf") and max_val == float("-inf"):
+            warnings.warn(
+                "must run observer before calling calculate_qparams. "
+                + "Returning default values.",
+                stacklevel=2,
+            )
+
+            return False
+
+        if min_val > max_val:
+            raise AssertionError(f"min {min_val} should be less than max {max_val}")
+    else:
+        if torch.any(min_val > max_val):
+            raise AssertionError(f"min {min_val} should be less than max {max_val}")
+
+    return True
+
+
+def calculate_qmin_qmax(
+    quant_min: int,
+    quant_max: int,
+    has_customized_qrange: bool,
+    dtype: torch.dtype,
+    reduce_range: bool,
+) -> tuple[int, int]:
+    r"""Calculates actual qmin and qmax based on the quantization range,
+    observer datatype and if range is reduced.
+    """
+    # TODO(jerryzh): Figure out why custom quant_min/quant_max are still adjusted.
+    if has_customized_qrange:
+        # This initialization is here to resolve TorchScript compilation issues and allow
+        # using of refinement to decouple initial_qmin and initial_qmax from quantization range.
+        # The actual values of initial_qmin and initial_qmax will be reset below.
+        if dtype in [torch.qint32, torch.int32]:
+            initial_quant_min, initial_quant_max = 0, 2**32 - 1
+        else:
+            initial_quant_min, initial_quant_max = 0, 255
+        # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the
+        # attribute from Optional valid integers for use, based on TorchScript's requirements.
+        custom_quant_min, custom_quant_max = quant_min, quant_max
+        if custom_quant_min is not None and custom_quant_max is not None:
+            initial_quant_min, initial_quant_max = (
+                custom_quant_min,
+                custom_quant_max,
+            )
+
+        qrange_len = initial_quant_max - initial_quant_min + 1
+        if dtype in [torch.qint8, torch.int8]:
+            if not (0 < qrange_len <= 256):
+                raise AssertionError(
+                    "quantization range should be positive and not exceed the maximum bit range (=256)."
+                )
+        elif dtype in [torch.qint32, torch.int32]:
+            if not (0 < qrange_len <= 2**32):
+                raise AssertionError(
+                    "quantization range should be positive and not exceed the maximum bit range (=4294967296)."
+                )
+        if reduce_range:
+            quant_min, quant_max = quant_min // 2, quant_max // 2
+    else:
+        # Fallback onto default 8-bit qmin and qmax calculation if dynamic range is not used.
+        if dtype in [torch.qint8, torch.int8]:
+            if reduce_range:
+                quant_min, quant_max = -64, 63
+            else:
+                quant_min, quant_max = -128, 127
+        elif dtype in [torch.quint8, torch.uint8]:
+            if reduce_range:
+                quant_min, quant_max = 0, 127
+            else:
+                quant_min, quant_max = 0, 255
+        elif dtype in [torch.qint32, torch.int32]:
+            quant_min, quant_max = -1 * (2**31), (2**31) - 1
+        elif dtype == torch.uint16:
+            quant_min, quant_max = 0, 2**16 - 1
+        elif dtype == torch.int16:
+            quant_min, quant_max = -(2**15), 2**15 - 1
+        else:
+            quant_min, quant_max = 0, 15
+    return quant_min, quant_max
+
+
+def _parent_name(target):
+    """
+    Turn 'foo.bar' into ('foo', 'bar'); a name with no dot yields ('', name)
+    """
+    r = target.rsplit(".", 1)
+    if len(r) == 1:
+        return "", r[0]
+    else:
+        return r[0], r[1]
+
+
+def has_no_children_ignoring_parametrizations(module):
+    """
+    Checks if module._modules is empty or
+    if module is a parametrization, checks that module._modules only has
+    the 'parametrizations' module
+    """
+    if len(module._modules) == 0:
+        return True
+    elif is_parametrized(module):
+        return len(module._modules) == 1 and "parametrizations" in module._modules
+    else:
+        return False
+
+
+def _get_path_of_module(
+    root: torch.nn.Module, submodule: torch.nn.Module
+) -> str | None:
+    """Get the path (fully qualified name) of a submodule
+
+    Example::
+
+    >> class M(torch.nn.Module):
+           def __init__(self) -> None:
+               self.linear = torch.nn.Linear(5, 5)
+           def forward(self, x):
+               return self.linear(x)
+
+    >> m = M()
+    >> l = m.linear
+    >> _get_path_of_module(m, l)
+    "linear"
+    """
+    for n, p in root.named_modules():
+        if submodule is p:
+            return n
+    return None
+
+
+def _get_signature_locals(f: Callable, loc: dict[str, Any]) -> dict[str, Any]:
+    """Get local keyword arguments
+
+    Example::
+
+    >> def f(self, a, b=9):
+           pass
+    >> loc = {"a": 6, "c": 7}
+    >> _get_signature_locals(f, loc)
+    {"a": 6}
+    """
+    return {k: v for k, v in loc.items() if k in signature(f).parameters}
+
+
+def _get_default_kwargs(f: Callable) -> "OrderedDict[str, Any]":
+    """Get all default keyword arguments from function signature
+
+    Example::
+
+    >> def f(self, a, b=9):
+           pass
+    >> _get_default_kwargs(f)
+    {"b": 9}
+    """
+    kwargs = {}
+    for name, param in signature(f).parameters.items():
+        if param.default is not param.empty:
+            kwargs[name] = param.default
+        elif param.kind is param.VAR_POSITIONAL:
+            kwargs[name] = ()
+        elif param.kind is param.VAR_KEYWORD:
+            kwargs[name] = {}
+    return OrderedDict(kwargs)
+
+
+def _normalize_kwargs(func: Callable, loc: dict[str, Any]) -> "OrderedDict[str, Any]":
+    """Given a function and local function arguments, normalize the keyword
+    arguments by filling in default arguments from function signature
+
+    Example::
+
+    >> def f(self, key1=3, key2=3):
+           pass
+    >> loc = {"key2": 6}
+    >> _normalize_kwargs(f, loc)
+    {"key1": 3, "key2": 6}
+    """
+    default_kwargs = _get_default_kwargs(func)
+    local_kwargs = _get_signature_locals(func, loc)
+    normalized_kwargs = default_kwargs.copy()
+    for attr, val in local_kwargs.items():
+        if attr in normalized_kwargs:
+            # override the default keyword arguments
+            normalized_kwargs[attr] = val
+    return normalized_kwargs
+
+
+def validate_qmin_qmax(quant_min: int, quant_max: int) -> None:
+    r"""Validates that the user-specified quantization range is properly initialized
+    and within the given bound supported by the observer dtype.
+
+    To accommodate lower-bit quantization with respect to the existing torch.qint8 and
+    torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
+    in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax
+    values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
+    fake quantization. These estimates are compared against parameters learned through backpropagation.
+    The related literature on learning scale and zero point via backpropagation is as follows:
+
+    Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
+    Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf
+    """
+    # The variable names are prefixed with "initial" because their values (qmin and qmax) might be adjusted
+    # based on whether quantization range is reduced and the datatype (signed/unsigned) used by the observer.
+    if not (quant_min <= 0 <= quant_max):
+        raise AssertionError("Used-specified quantization range must include 0.")
+    if quant_min >= quant_max:
+        raise AssertionError(
+            "qmin must be strictly less than qmax for user-specified quantization range."
+        )
+
+
+# Functionally equivalent to '_calculate_qparams' in observer.py. Observers must be torchscriptable however and qscheme
+# as far as I can tell is not allowed to be passed as a parameter in torchscript functions. This makes refactoring observer
+# to use this utility a massive pain and very gross. For now I'm opting just to duplicate as this code seems unlikely to change
+# (last update over 1 year ago) and when torchscript is fully deprecated we can refactor. TODO(jakeszwe, jerryzh168)
+def determine_qparams(
+    min_val: torch.Tensor,
+    max_val: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    eps: torch.Tensor,
+    has_customized_qrange: bool,
+    qscheme: torch.qscheme = torch.per_tensor_affine,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""Calculates the quantization parameters, given min and max
+    value tensors. Works for both per tensor and per channel cases
+
+    Args:
+        min_val: Minimum values per channel
+        max_val: Maximum values per channel
+
+    Returns:
+        scales: Scales tensor of shape (#channels,)
+        zero_points: Zero points tensor of shape (#channels,)
+    """
+    if not check_min_max_valid(min_val, max_val):
+        return torch.tensor([1.0], device=min_val.device.type), torch.tensor(
+            [0], device=min_val.device.type
+        )
+
+    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+
+    device = min_val_neg.device
+    scale = torch.ones(min_val_neg.size(), dtype=torch.double, device=device)
+    zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
+    eps = eps.to(device)
+
+    if qscheme == torch.per_tensor_symmetric or qscheme == torch.per_channel_symmetric:
+        max_val_pos = torch.max(-min_val_neg, max_val_pos)
+        scale = max_val_pos / (float(quant_max - quant_min) / 2)
+        scale = torch.max(scale, eps)
+        if dtype in [torch.uint8, torch.quint8]:
+            if has_customized_qrange:
+                # When customized quantization range is used, down-rounded midpoint of the range is chosen.
+                zero_point = zero_point.new_full(
+                    zero_point.size(), (quant_min + quant_max) // 2
+                )
+            else:
+                zero_point = zero_point.new_full(zero_point.size(), 128)
+    elif qscheme == torch.per_channel_affine_float_qparams:
+        scale = (max_val - min_val) / float(quant_max - quant_min)
+        scale = torch.where(scale > eps, scale, torch.ones_like(scale))
+        # We use the quantize function
+        # xq = Round(Xf * inv_scale + zero_point),
+        # setting zero_point to (-1 * min *inv_scale) we get
+        # Xq = Round((Xf - min) * inv_scale)
+        zero_point = -1 * min_val / scale
+    else:
+        scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+        scale = torch.max(scale, eps)
+        zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
+        zero_point = torch.clamp(zero_point, quant_min, quant_max)
+
+    # For scalar values, cast them to Tensors of size 1 to keep the shape
+    # consistent with default values in FakeQuantize.
+    if len(scale.shape) == 0:
+        # TODO: switch to scale.item() after adding JIT support
+        scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device)
+    if len(zero_point.shape) == 0:
+        # TODO: switch to zero_point.item() after adding JIT support
+        zero_point = torch.tensor(
+            [int(zero_point)], dtype=zero_point.dtype, device=device
+        )
+        if qscheme == torch.per_channel_affine_float_qparams:
+            zero_point = torch.tensor(
+                [float(zero_point)], dtype=zero_point.dtype, device=device
+            )
+
+    return scale.to(torch.double), zero_point.to(torch.int64)
+
+
+def _get_num_pos_args(f: Callable) -> int:
+    """Get number of positional args for a function
+
+    Example::
+
+    >> def f(self, key1=3, key2=3):
+           pass
+    >> _get_num_pos_args(f)
+    3
+    """
+    return len(getfullargspec(f).args)
+
+
+def get_fqn_to_example_inputs(
+    model: torch.nn.Module, example_inputs: tuple[Any, ...]
+) -> dict[str, tuple[Any, ...]]:
+    """Given a model and its example inputs, return a dictionary from
+    fully qualified name of submodules to example_inputs for that submodule,
+    e.g. {"linear1": (tensor1,), "linear2": (tensor2,), "sub": (tensor3,),
+          "sub.linear1": (tensor4,), ...}
+
+    Used to make quantizing submodules easier now that FX Graph Mode Quantization requires
+    example inputs.
+
+    Also works for keyword arguments with default values, we would flatten keyword
+    arguments as positional arguments and fill in the missing keyword args with default
+    values, e.g. if we have a forward function:
+    def forward(self, x, key1=3, key2=3):
+        ...
+
+    and we call it with self.submodule(x, key2=6)
+    we'll get example_inputs: (x, 3, 6)
+
+    user can also override `key1` with positional arguments as well:
+    for self.submodule(x, 5, key2=6)
+    we'll get: (x, 5, 6)
+
+    Variable positional arguments (*args) and variable keyword arguments (**kwargs) in the
+    forward function are not currently supported, so please make sure no submodule uses
+    them.
+    """
+    root = model
+    fqn_to_example_inputs = {}
+
+    def _patched_module_call(self, *args, **kwargs):
+        submodule_example_inputs = list(args).copy()
+        normalized_kwargs = _normalize_kwargs(self.forward, kwargs)
+        # minus 1 to skip counting `self`
+        num_args = _get_num_pos_args(self.forward) - 1
+        num_to_pop = num_args - len(submodule_example_inputs)
+        while num_to_pop and normalized_kwargs:
+            normalized_kwargs.popitem(last=False)
+            num_to_pop -= 1
+        submodule_example_inputs.extend(normalized_kwargs.values())
+        submodule_example_inputs_tuple = tuple(submodule_example_inputs)
+        fqn = _get_path_of_module(root, self)
+        if fqn is not None:
+            fqn_to_example_inputs[fqn] = submodule_example_inputs_tuple
+        return orig_module_call(self, *args, **kwargs)
+
+    orig_module_call = torch.nn.Module.__call__
+    torch.nn.Module.__call__ = _patched_module_call  # type: ignore[method-assign]
+    try:
+        model(*example_inputs)
+    finally:
+        # restore the module call even if there is an exception
+        torch.nn.Module.__call__ = orig_module_call  # type: ignore[method-assign]
+    return fqn_to_example_inputs
+
+
+def _assert_and_get_unique_device(module: torch.nn.Module) -> Any:
+    """
+    Returns the unique device for a module, or None if no device is found.
+    Throws an error if multiple devices are detected.
+    """
+    devices = {p.device for p in module.parameters()} | {
+        p.device for p in module.buffers()
+    }
+    """
+    As a temp workaround for AIMP HHC publish we added a CPU check. Remove it later. T163614564
+    """
+    if {torch.device("cpu"), torch.device("meta")} == devices:
+        warnings.warn(
+            "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'.",
+            stacklevel=2,
+        )
+        devices = {torch.device("cpu")}
+    ""
+    if len(devices) > 1:
+        raise AssertionError(
+            "prepare only works with cpu or single-device CUDA modules, "
+            f"but got devices {devices}"
+        )
+    device = next(iter(devices)) if len(devices) > 0 else None
+    return device
+
+
+DEPRECATION_WARNING = (
+    "torch.ao.quantization is deprecated and will be removed in 2.10. \n"
+    "For migrations of users: \n"
+    "1. Eager mode quantization (torch.ao.quantization.quantize, "
+    "torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode "
+    "quantize_ API instead \n"
+    "2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,"
+    "torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization "
+    "API instead (prepare_pt2e, convert_pt2e) \n"
+    "3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) \n"
+    "see https://github.com/pytorch/ao/issues/2259 for more details"
+)
+
+
+__all__ = [
+    "NodePattern",
+    "Pattern",
+    "MatchAllNode",
+    "check_node",
+    "get_combined_dict",
+    "is_per_tensor",
+    "is_per_channel",
+    "getattr_from_fqn",
+    "get_qparam_dict",
+    "get_swapped_custom_module_class",
+    "activation_dtype",
+    "weight_dtype",
+    "activation_is_statically_quantized",
+    "activation_is_dynamically_quantized",
+    "activation_is_int8_quantized",
+    "activation_is_int32_quantized",
+    "weight_is_quantized",
+    "weight_is_statically_quantized",
+    "op_is_int8_dynamically_quantized",
+    "get_qconfig_dtypes",
+    "get_quant_type",
+    "check_min_max_valid",
+    "calculate_qmin_qmax",
+    "has_no_children_ignoring_parametrizations",
+    "get_fqn_to_example_inputs",
+    "to_underlying_dtype",
+    "determine_qparams",
+    "validate_qmin_qmax",
+    "DEPRECATION_WARNING",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c55e7477bfeaa358e8cea60b9f8f83766b49e5f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_gpu_trace.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5e4684a41489bf2d116e5da37807c6971bb2541
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41f030eb7229531435763f8ddd5ed22fcd7a646d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/comm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e4d015304837ed7e7b61676c199a984198b56dc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/jiterator.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e20098b30541791318208544dfbce7b73810198
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/nccl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5580cf9d35c439cc51b6d3cc306bd8dd788d5109
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/random.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9563c01a46c519d4f5e02d832bc3c350454e785b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/sparse.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6667431d7ff0ee39a46a93f563859517b59adba6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/streams.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..331e0c59e25295447a08c5e0a72e0b978620ee47
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/__pycache__/tunable.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..88ef0b5acac5e5bdeb034169052bcf5aa7456e33
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/__init__.py
@@ -0,0 +1,13 @@
+# pyrefly: ignore [deprecated]
+from .autocast_mode import autocast, custom_bwd, custom_fwd
+from .common import amp_definitely_not_available
+from .grad_scaler import GradScaler
+
+
+__all__ = [
+    "amp_definitely_not_available",
+    "autocast",
+    "custom_bwd",
+    "custom_fwd",
+    "GradScaler",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6b63c708d3f2ddfc162a4431e114a2bcf47e9eb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/autocast_mode.py
@@ -0,0 +1,110 @@
+# mypy: allow-untyped-defs
+import functools
+import sys
+from typing import Any
+from typing_extensions import deprecated
+
+import torch
+
+
+__all__ = ["autocast", "custom_fwd", "custom_bwd"]
+
+
+@deprecated(
+    "`torch.cuda.amp.autocast(args...)` is deprecated. "
+    "Please use `torch.amp.autocast('cuda', args...)` instead.",
+    category=FutureWarning,
+)
+class autocast(torch.amp.autocast_mode.autocast):
+    r"""See :class:`torch.autocast`.
+
+    ``torch.cuda.amp.autocast(args...)`` is deprecated. Please use ``torch.amp.autocast("cuda", args...)`` instead.
+    """
+
+    # TODO: remove this conditional once we stop supporting Python < 3.13
+    # Prior to Python 3.13, inspect.signature could not retrieve the correct
+    # signature information for classes decorated with @deprecated (unless
+    # the __new__ static method was explicitly defined);
+    #
+    # However, this issue has been fixed in Python 3.13 and later versions.
+    if sys.version_info < (3, 13):
+
+        def __new__(
+            cls,
+            enabled: bool = True,
+            dtype: torch.dtype = torch.float16,
+            cache_enabled: bool = True,
+        ):
+            return super().__new__(cls)
+
+        def __init_subclass__(cls):
+            pass
+
+    def __init__(
+        self,
+        enabled: bool = True,
+        dtype: torch.dtype = torch.float16,
+        cache_enabled: bool = True,
+    ):
+        if torch._jit_internal.is_scripting():
+            self._enabled = enabled
+            self.device = "cuda"
+            self.fast_dtype = dtype
+            return
+        super().__init__(
+            "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled
+        )
+
+    def __enter__(self):
+        if torch._jit_internal.is_scripting():
+            return self
+        return super().__enter__()
+
+    # TODO: discuss a unified TorchScript-friendly API for autocast
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
+        if torch._jit_internal.is_scripting():
+            return
+        return super().__exit__(exc_type, exc_val, exc_tb)
+
+    def __call__(self, func):
+        if torch._jit_internal.is_scripting():
+            return func
+        return super().__call__(func)
+
+
+# Preserved only for BC reasons
+@deprecated(
+    "`torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. "
+    "Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.",
+    category=FutureWarning,
+)
+def _cast(value, dtype):
+    return torch.amp.autocast_mode._cast(value, "cuda", dtype)
+
+
+@deprecated(
+    "`torch.cuda.amp.custom_fwd(args...)` is deprecated. "
+    "Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.",
+    category=FutureWarning,
+)
+def custom_fwd(fwd=None, *, cast_inputs=None):
+    """
+    ``torch.cuda.amp.custom_fwd(args...)`` is deprecated. Please use
+    ``torch.amp.custom_fwd(args..., device_type='cuda')`` instead.
+    """
+    return functools.partial(torch.amp.custom_fwd, device_type="cuda")(
+        fwd=fwd, cast_inputs=cast_inputs
+    )
+
+
+@deprecated(
+    "`torch.cuda.amp.custom_bwd(args...)` is deprecated. "
+    "Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.",
+    category=FutureWarning,
+)
+def custom_bwd(bwd):
+    """
+    ``torch.cuda.amp.custom_bwd(args...)`` is deprecated. Please use
+    ``torch.amp.custom_bwd(args..., device_type='cuda')`` instead.
+    """
+    return functools.partial(torch.amp.custom_bwd, device_type="cuda")(bwd)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..915a9b4f4a9ca6c147abefd7c8ab1891ee5a8179
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/common.py
@@ -0,0 +1,11 @@
+# mypy: allow-untyped-defs
+from importlib.util import find_spec
+
+import torch
+
+
+__all__ = ["amp_definitely_not_available"]
+
+
+def amp_definitely_not_available():
+    return not (torch.cuda.is_available() or find_spec("torch_xla"))
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..62e2020073c8ed99f7295edd1aaea4c54d815f63
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/cuda/amp/grad_scaler.py
@@ -0,0 +1,38 @@
+from typing_extensions import deprecated
+
+import torch
+
+# We need to keep this unused import for BC reasons
+from torch.amp.grad_scaler import OptState  # noqa: F401
+
+
+__all__ = ["GradScaler"]
+
+
+class GradScaler(torch.amp.GradScaler):
+    r"""
+    See :class:`torch.amp.GradScaler`.
+    ``torch.cuda.amp.GradScaler(args...)`` is deprecated. Please use ``torch.amp.GradScaler("cuda", args...)`` instead.
+    """
+
+    @deprecated(
+        "`torch.cuda.amp.GradScaler(args...)` is deprecated. "
+        "Please use `torch.amp.GradScaler('cuda', args...)` instead.",
+        category=FutureWarning,
+    )
+    def __init__(
+        self,
+        init_scale: float = 2.0**16,
+        growth_factor: float = 2.0,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 2000,
+        enabled: bool = True,
+    ) -> None:
+        super().__init__(
+            "cuda",
+            init_scale=init_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            enabled=enabled,
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4cc8b31ffb08fe3da29174f3b6c6f8d8fea3cb3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/futures/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7649a5e1241acc8adf4cdf15f39b504b0787a4f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h
@@ -0,0 +1,218 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/intrusive_ptr.h>
+
+namespace c10 {
+
+struct IValue;
+using Stack = std::vector<IValue>;
+
+class OperatorHandle;
+class KernelFunction;
+
+// This kernel implements the behavior of falling through to the next available
+// registered dispatch key.  The implementation of this function is FAST; there
+// is no overhead to fall through to the next key.  See cpp file for some more
+// implementation notes; notably, this does NOT actually go through the
+// boxing/unboxing codepath.
+TORCH_API void fallthrough_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*unused*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+
+// Note [Ambiguity in AutogradOther kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This error-reporting kernel is registered to the AutogradOther entry in the
+// dispatch table when there is both a CompositeImplicitAutograd kernel and a
+// backend kernel for ANY backend that maps to AutogradOther.  To see why
+// this is necessary in the AutogradOther case, it's helpful to first see
+// why everything works out fine for a backend that has a reserved Autograd
+// entry (see rule 2.2 in [Note] DispatchTable computation):
+//
+//    CPU   AutogradCPU
+//    reg?  registers with...
+//    -------------------------------------------------
+//    y     Autograd registration takes precedence
+//          over CompositeImplicitAutograd.
+//          This is good, because the CPU specific backend
+//          implementation is more specialized and typically better;
+//          if we used the composite, we would bypass it.
+//          (NB: the Autograd key is guaranteed to exist because
+//          the autograd codegen requires it!)
+//
+//    n     CompositeImplicitAutograd takes precedence.
+//          This is also good, because the Autograd
+//          registration (if it exists) would try to redispatch
+//          to the (non-existent) CPU implementation; by
+//          using the composite, we ensure the operator
+//          actually works.
+//
+// As you can see, when we have a specific Autograd key (AutogradCPU), we can
+// decide whether or not to use the CompositeImplicitAutograd kernel or the
+// Autograd kernel based on whether or not the backend kernel exists.
+//
+// However, for AutogradOther (which is the catchall autograd kernel for
+// everything that doesn't have a specific Autograd key), we can't do this
+// trick because there isn't any unique backend to peek at to disambiguate;
+// backends that have implementations would prefer the Autograd kernel,
+// but unimplemented backends would prefer CompositeImplicitAutograd.  Rather
+// than arbitrarily pick one or the other, we just register a kernel that raises
+// an error and let the user decide how to proceed.
+TORCH_API void ambiguous_autogradother_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*op*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+
+// Note [named_not_supported_kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This kernel implements reporting an error message saying that named tensor is
+// not supported.  This kernel doesn't rely on the Stack, and so it is special
+// cased in the dispatcher to be triggered before we attempt boxing (so we can
+// give a good error message in cases when boxing is not supported).  When
+// boxing is universally supported this can be removed.
+[[noreturn]] TORCH_API void named_not_supported_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*op*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+
+/**
+ * BoxedKernel is similar to a std::function storing a boxed kernel.
+ */
+class TORCH_API BoxedKernel final {
+ public:
+  // This is how boxed kernels are actually stored
+  //
+  // Note [Plumbing Keys Through The Dispatcher]
+  // Benchmarks have shown that it is expensive for the dispatcher to read from
+  // thread-local storage (TLS) upon every dispatch call in order to compute
+  // which kernel to dispatch to.
+  //
+  // To mitigate this, we've updated the calling convention inside the
+  // dispatcher to expect every kernel that it stores to have a first argument
+  // of type DispatchKeySet.
+  //
+  // What are the invariants of the DispatchKeySet when it gets passed to a
+  // kernel?
+  // - All keys to the left of the current dispatch key have been masked out.
+  //   (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the
+  //   highest bit to be DispatchKey::Tracer)
+  // - All other keys that the dispatcher normally would have computed through
+  //   TLS + global state + op arguments
+  //   are still in the set.
+  //
+  // Kernels can then opt into using this keyset to save the dispatcher from
+  // doing repeated work during redispatches: recalculating the highest-priority
+  // dispatch key, which involves reading from TLS. Instead, the kernels that
+  // opt in will calculate an updated DispatchKeySet directly from the old one,
+  // and pass the updated set directly into the dispatcher upon redispatching.
+  //
+  // This is an opt-in mechanism: Kernels can automatically opt in by setting
+  // the first argument in their signature to be of type DispatchKeySet. See the
+  // kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for
+  // examples.
+  //
+  // The mechanism for optionally passing that DispatchKeySet into the kernel
+  // lives in make_boxed_from_unboxed_functor.h. See Note [Plumbing Keys Through
+  // The Dispatcher 2] for details.
+  using InternalBoxedKernelFunction =
+      void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+  // This is the public API for how boxed kernels are defined
+  using BoxedKernelFunction = void(const OperatorHandle&, Stack*);
+  using BoxedKernelFunction_withDispatchKeys =
+      void(const OperatorHandle&, DispatchKeySet, Stack*);
+
+  BoxedKernel();
+
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValid() const;
+  bool isFallthrough() const;
+
+  /**
+   * Call the function with boxed arguments.
+   */
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+
+  /**
+   * Create a BoxedKernel from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > BoxedKernel func = BoxedKernel::makeFromFunction<&boxed_func>();
+   */
+  template <BoxedKernelFunction* func>
+  static BoxedKernel makeFromFunction();
+
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs
+   * dispatch keys (currently there are none). See Note [Plumbing Keys Through
+   * The Dispatcher] for details.
+   */
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static BoxedKernel makeFromFunction();
+
+  /**
+   * Create a BoxedKernel from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > BoxedKernel func =
+   * >     BoxedKernel::makeFromFunctor(std::make_unique<MyFunctor>());
+   */
+  template <class KernelFunctor>
+  static BoxedKernel makeFromFunctor(
+      std::unique_ptr<KernelFunctor> kernelFunctor);
+
+  static BoxedKernel makeFallthrough();
+  static BoxedKernel makeAmbiguousAutogradOther();
+  static BoxedKernel makeNamedNotSupported();
+
+ private:
+  friend class KernelFunction;
+
+  template <BoxedKernelFunction* func>
+  static void make_boxed_function(
+      OperatorKernel* /*unused*/,
+      const OperatorHandle& opHandle,
+      DispatchKeySet /*unused*/,
+      Stack* stack);
+
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static void make_boxed_function(
+      OperatorKernel* /*unused*/,
+      const OperatorHandle& opHandle,
+      DispatchKeySet /*ks*/,
+      Stack* stack);
+
+  explicit BoxedKernel(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func);
+
+  OperatorKernel* getFunctor() const;
+  InternalBoxedKernelFunction* getFnPtr() const;
+
+  c10::intrusive_ptr<OperatorKernel> functor_;
+  InternalBoxedKernelFunction* boxed_kernel_func_;
+};
+
+} // namespace c10
+
+#include <ATen/core/boxing/BoxedKernel_impl.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..69c8b2cf65d6f0256193ee3899708ad18c7d6768
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h
@@ -0,0 +1,111 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+namespace c10 {
+
+inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {}
+
+inline BoxedKernel::BoxedKernel(
+    std::unique_ptr<OperatorKernel> functor,
+    InternalBoxedKernelFunction* boxed_kernel_func)
+    : functor_(std::move(functor)), boxed_kernel_func_(boxed_kernel_func) {}
+
+template <BoxedKernel::BoxedKernelFunction* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet /*unused*/,
+    Stack* stack) {
+  // Note that we're dropping the DispatchKeySet argument.
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  func(opHandle, stack);
+}
+
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet ks,
+    Stack* stack) {
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  func(opHandle, ks, stack);
+}
+
+inline bool BoxedKernel::isValid() const {
+  return boxed_kernel_func_ != nullptr;
+}
+
+inline bool BoxedKernel::isFallthrough() const {
+  return boxed_kernel_func_ == &fallthrough_kernel;
+}
+
+inline void BoxedKernel::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      boxed_kernel_func_ != nullptr,
+      "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel.");
+  (*boxed_kernel_func_)(functor_.get(), opHandle, dispatchKeySet, stack);
+}
+
+template <BoxedKernel::BoxedKernelFunction* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  return BoxedKernel(
+      nullptr, // no functor_ object
+      &make_boxed_function<func>);
+}
+
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  return BoxedKernel(
+      nullptr, // no functor_ object
+      &make_boxed_function<func>);
+}
+
+inline BoxedKernel BoxedKernel::makeFallthrough() {
+  return BoxedKernel(
+      nullptr, // no functor_ object
+      &fallthrough_kernel);
+}
+
+inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() {
+  return BoxedKernel(
+      nullptr, // no functor_ object
+      &ambiguous_autogradother_kernel);
+}
+
+inline BoxedKernel BoxedKernel::makeNamedNotSupported() {
+  return BoxedKernel(
+      nullptr, // no functor_ object
+      &named_not_supported_kernel);
+}
+
+template <class KernelFunctor>
+inline BoxedKernel BoxedKernel::makeFromFunctor(
+    std::unique_ptr<KernelFunctor> kernelFunctor) {
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to call BoxedKernel::makeFromFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+  return BoxedKernel(
+      std::move(kernelFunctor),
+      [](OperatorKernel* kernel,
+         const OperatorHandle& op,
+         DispatchKeySet ks,
+         Stack* stack) {
+        (*static_cast<KernelFunctor*>(kernel))(op, ks, stack);
+      });
+}
+
+inline OperatorKernel* BoxedKernel::getFunctor() const {
+  return functor_.get();
+}
+inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const {
+  return boxed_kernel_func_;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa53454d22edd1caa9d146b6dd3a5647a0b7dfee
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h
@@ -0,0 +1,346 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <ATen/core/stack.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <atomic>
+#include <memory>
+#include <type_traits>
+
+namespace c10 {
+
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack
+                                 // to the c10 namespace.
+
+class OperatorHandle;
+struct OperatorKernel;
+class KernelFunction;
+
+class KernelToken;
+class SafeKernelFunction;
+
+template <typename T>
+using has_symint = std::disjunction<
+    std::is_same<c10::SymInt, T>,
+    std::is_same<c10::SymIntArrayRef, T>,
+    std::is_same<at::OptionalSymIntArrayRef, T>,
+    std::is_same<std::optional<c10::SymInt>, T>>;
+
+template <typename T>
+struct remove_symint {
+  using type = T;
+};
+
+template <>
+struct remove_symint<c10::SymInt> {
+  using type = int64_t;
+};
+
+template <>
+struct remove_symint<at::OptionalSymIntArrayRef> {
+  using type = OptionalIntArrayRef;
+};
+
+template <>
+struct remove_symint<c10::SymIntArrayRef> {
+  using type = c10::IntArrayRef;
+};
+
+template <>
+struct remove_symint<std::optional<c10::SymInt>> {
+  using type = std::optional<int64_t>;
+};
+
+template <bool symint, typename T>
+struct maybe_keep_symint final {};
+
+template <typename T>
+struct maybe_keep_symint<true, T> {
+  using type = T;
+};
+
+template <typename T>
+struct maybe_keep_symint<false, T> {
+  using type = typename remove_symint<T>::type;
+};
+
+template <typename T>
+using fn_has_symint = typename guts::typelist::true_for_any_type<
+    has_symint,
+    typename guts::infer_function_traits<T>::type::parameter_types>;
+
+template <typename T>
+struct fn_remove_symint;
+
+template <typename Ret, typename... Args>
+struct fn_remove_symint<Ret(Args...)> {
+  using type = Ret(typename remove_symint<Args>::type...);
+};
+
+/**
+ * KernelFunction is similar to std::function but stores a kernel function.
+ * You can create a KernelFunction from a boxed or unboxed
+ * function/functor/lambda and call it in a boxed or unboxed way. If the way it
+ * was created doesn't match the way it was called, it will do boxing or
+ * unboxing as necessary.
+ */
+class TORCH_API KernelFunction final {
+ public:
+  using InternalBoxedKernelFunction = BoxedKernel::InternalBoxedKernelFunction;
+  using BoxedKernelFunction = BoxedKernel::BoxedKernelFunction;
+  using BoxedKernelFunction_withDispatchKeys =
+      BoxedKernel::BoxedKernelFunction_withDispatchKeys;
+
+  KernelFunction();
+  ~KernelFunction();
+
+  KernelFunction(const KernelFunction& other);
+  KernelFunction& operator=(const KernelFunction& other);
+
+  KernelFunction(KernelFunction&&) noexcept = default;
+
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValidUnboxed() const;
+  bool isValidSymUnboxed() const;
+  bool isValid() const;
+  bool isFallthrough() const;
+
+  /**
+   * Call the function in a boxed way.
+   * If the kernel function was created with an unboxed function,
+   * this will call an unboxing wrapper which then calls into that
+   * unboxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   * > func.callBoxed(op, ks, stack);
+   *
+   * Or, with an unboxed implementation:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > func.callBoxed(op, ks, stack);
+   */
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+
+  /**
+   * Call the function in an unboxed way.
+   * If the kernel function was created with a boxed function,
+   * this will box all inputs and then call into that boxed function.
+   *
+   * Note that this doesn't work for all types yet.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > Tensor result = func.call<Tensor, Tensor, bool>(op, ks, tensor1, true);
+   *
+   * Or, with a boxed implementation:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   * > Tensor result = func.call<Tensor, Tensor, bool>(op, ks, tensor1, true);
+   */
+  template <class Return, class... Args>
+  Return call(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) const;
+
+  /**
+   * Create a KernelFunction from a BoxedKernel.
+   */
+  static KernelFunction makeFromBoxedKernel(BoxedKernel boxed_fn);
+
+  /**
+   * Create a KernelFunction from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   */
+  template <BoxedKernelFunction* func>
+  static KernelFunction makeFromBoxedFunction();
+
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs
+   * dispatch keys (currently there are none). See Note [Plumbing Keys Through
+   * The Dispatcher] for details.
+   */
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static KernelFunction makeFromBoxedFunction();
+
+  /**
+   * Create a KernelFunction from an unboxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * > };
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromUnboxedFunctor<false, MyFunctor>(std::make_unique<MyFunctor>());
+   */
+  template <bool AllowLegacyTypes = false, class KernelFunctor>
+  static KernelFunction makeFromUnboxedFunctor(
+      std::unique_ptr<OperatorKernel> kernelFunctor);
+
+  /**
+   * Create a KernelFunction from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunctor(std::make_unique<MyFunctor>());
+   */
+  template <class KernelFunctor>
+  static KernelFunction makeFromBoxedFunctor(
+      std::unique_ptr<KernelFunctor> kernelFunctor);
+
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction
+   * because knowing the function pointer as a template argument (i.e. at
+   * compile time) allows the compiler to inline the function into its
+   * unboxing wrapper and yields better performance when calling the function.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromUnboxedFunction(
+   * >         TORCH_FN(unboxed_func));
+   */
+  template <class FuncPtr, bool AllowLegacyTypes = false>
+  static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
+
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * KernelFunction::makeFromUnboxedFunction is usually a better choice than
+   * this if you know the function pointer at compile time, see doc comment
+   * there for an explanation.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromUnboxedRuntimeFunction(&unboxed_func);
+   */
+  template <bool AllowLegacyTypes = false, class FuncType>
+  static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func);
+
+  static KernelFunction makeFallthrough();
+  static KernelFunction makeAmbiguousAutogradOther();
+  static KernelFunction makeNamedNotSupported();
+
+  /**
+   * Create a KernelFunction from an unboxed lambda.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   */
+  template <bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<
+      guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+      KernelFunction>
+  makeFromUnboxedLambda(Lambda&& lambda);
+  template <bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<
+      !guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+      KernelFunction>
+  makeFromUnboxedLambda(Lambda&& lambda);
+
+  std::string dumpState() const;
+  // For testing internal invariants only
+  bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
+
+  // Register a token to be invalidated when this KernelFunction is destroyed
+  void registerToken(std::weak_ptr<KernelToken> token) const;
+
+ private:
+  explicit KernelFunction(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+  explicit KernelFunction(
+      BoxedKernel boxed_fn,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+
+  BoxedKernel boxed_kernel_func_;
+  void* unboxed_kernel_func_;
+  void* sym_unboxed_kernel_func_;
+  // List of tokens that need to be invalidated when this KernelFunction is
+  // destroyed (lazy allocation to save memory when empty)
+  mutable std::unique_ptr<std::vector<std::weak_ptr<KernelToken>>> tokens_;
+};
+
+// Token held by SafeKernelFunction that gets invalidated when KernelFunction is
+// destroyed
+class KernelToken {
+ public:
+  bool isValid() const;
+  void invalidate();
+
+ private:
+  std::atomic<bool> invalid_{false};
+};
+
+class SafeKernelFunction {
+ public:
+  SafeKernelFunction(
+      const KernelFunction* kernel,
+      std::string debug,
+      std::shared_ptr<OperatorHandle> opHandle);
+
+  // Safe callBoxed - checks token validity first
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+
+  // Get debug information
+  const std::string& debug() const {
+    return debug_;
+  }
+
+  // Get the OpHandle that lives on this SafeKernelFunction
+  const OperatorHandle& opHandle() const {
+    return *opHandle_;
+  }
+
+ private:
+  KernelFunction kernel_;
+  std::shared_ptr<KernelToken> token_;
+  std::string debug_;
+  std::shared_ptr<OperatorHandle> opHandle_;
+};
+
+} // namespace c10
+
+#include <ATen/core/boxing/KernelFunction_impl.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d190e1809da3abeeff6b5ded93cf1694fef94f6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h
@@ -0,0 +1,395 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <ATen/core/boxing/impl/WrapFunctionIntoFunctor.h>
+#include <ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h>
+
+#include <c10/util/C++17.h>
+#include <type_traits>
+
+namespace c10 {
+
+namespace detail {
+template <typename Base, typename Child, typename... Args>
+std::enable_if_t<
+    !std::is_array_v<Base> && !std::is_array_v<Child> &&
+        std::is_base_of_v<Base, Child>,
+    std::unique_ptr<Base>>
+make_unique_base(Args&&... args) {
+  return std::make_unique<Child>(std::forward<Args>(args)...);
+}
+} // namespace detail
+
+inline KernelFunction::KernelFunction()
+    : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {}
+
+inline KernelFunction::~KernelFunction() {
+  if (tokens_) {
+    for (auto& weak_token : *tokens_) {
+      if (auto token = weak_token.lock()) {
+        token->invalidate();
+      }
+    }
+  }
+}
+
+inline KernelFunction::KernelFunction(const KernelFunction& other)
+    : boxed_kernel_func_(other.boxed_kernel_func_),
+      unboxed_kernel_func_(other.unboxed_kernel_func_),
+      sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) {
+  // tokens_ is intentionally not copied as we only care about invalidating
+  // tokens if the original KernelFunction is destroyed
+}
+
+inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) {
+  if (this != &other) {
+    boxed_kernel_func_ = other.boxed_kernel_func_;
+    unboxed_kernel_func_ = other.unboxed_kernel_func_;
+    sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_;
+
+    // tokens_ is intentionally not copied as we only care about invalidating
+    // tokens if the original KernelFunction is destroyed
+  }
+  return *this;
+}
+
+inline KernelFunction::KernelFunction(
+    std::unique_ptr<OperatorKernel> functor,
+    InternalBoxedKernelFunction* boxed_kernel_func,
+    void* unboxed_kernel_func,
+    void* sym_unboxed_kernel_func = nullptr)
+    : boxed_kernel_func_(std::move(functor), boxed_kernel_func),
+      unboxed_kernel_func_(unboxed_kernel_func),
+      sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {}
+
+inline KernelFunction::KernelFunction(
+    BoxedKernel boxed_fn,
+    void* unboxed_kernel_func,
+    void* sym_unboxed_kernel_func = nullptr)
+    : boxed_kernel_func_(std::move(boxed_fn)),
+      unboxed_kernel_func_(unboxed_kernel_func),
+      sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {}
+
+inline bool KernelFunction::isValidUnboxed() const {
+  return unboxed_kernel_func_ != nullptr;
+}
+
+inline bool KernelFunction::isValidSymUnboxed() const {
+  return sym_unboxed_kernel_func_ != nullptr;
+}
+
+inline bool KernelFunction::isValid() const {
+  return boxed_kernel_func_.isValid();
+}
+
+inline bool KernelFunction::isFallthrough() const {
+  return boxed_kernel_func_.isFallthrough();
+}
+
+inline void KernelFunction::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  boxed_kernel_func_.callBoxed(opHandle, dispatchKeySet, stack);
+}
+
+template <class Return, class... Args>
+inline Return callUnboxedKernelFunction(
+    void* unboxed_kernel_func,
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Args&&... args) {
+  using ActualSignature = Return(OperatorKernel*, DispatchKeySet, Args...);
+  ActualSignature* func =
+      reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
+  return (*func)(functor, dispatchKeySet, std::forward<Args>(args)...);
+}
+
+// This template requires you to explicitly specify the argument you want to
+// forward; it doesn't work if you try to deduce it
+// NB: keep this in sync with cloneWithRealTypes in function_schema.cpp
+
+template <typename T>
+inline typename remove_symint<T>::type unpackSymInt(T x) {
+  return x;
+}
+
+template <>
+inline remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+template <>
+inline remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
+    c10::SymIntArrayRef x) {
+  return C10_AS_INTARRAYREF_SLOW(x);
+}
+
+template <>
+inline remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
+    std::optional<c10::SymInt> x) {
+  return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__))
+                       : std::nullopt;
+}
+
+template <>
+inline remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
+    at::OptionalSymIntArrayRef x) {
+  return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x))
+                       : std::nullopt;
+}
+
+template <class Return, class... Args>
+C10_ALWAYS_INLINE Return KernelFunction::call(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Args... args) const {
+  // note: Args above is intentionally not Args&&. We don't want perfect
+  // forwarding, which would require Args to be deduced, but instead we
+  // want callers to explicitly specify the Args.
+
+  if constexpr (std::disjunction_v<has_symint<Args>...>) {
+    if (sym_unboxed_kernel_func_ != nullptr) {
+      auto* functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<Return, Args...>(
+          sym_unboxed_kernel_func_,
+          functor,
+          dispatchKeySet,
+          std::forward<Args>(args)...);
+    }
+
+    if (unboxed_kernel_func_ != nullptr) {
+      auto* functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<
+          Return,
+          typename remove_symint<Args>::type...>(
+          unboxed_kernel_func_,
+          functor,
+          dispatchKeySet,
+          unpackSymInt<Args>(args)...);
+    }
+  } else {
+    if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
+      auto* functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<Return, Args...>(
+          unboxed_kernel_func_,
+          functor,
+          dispatchKeySet,
+          std::forward<Args>(args)...);
+    }
+  }
+
+  return impl::BoxedKernelWrapper<Return(Args...)>::call(
+      boxed_kernel_func_,
+      opHandle,
+      dispatchKeySet,
+      std::forward<Args>(args)...);
+}
+
+inline void KernelFunction::registerToken(
+    std::weak_ptr<KernelToken> token) const {
+  if (!tokens_) {
+    tokens_ = std::make_unique<std::vector<std::weak_ptr<KernelToken>>>();
+  }
+  tokens_->push_back(std::move(token));
+}
+
+inline KernelFunction KernelFunction::makeFromBoxedKernel(
+    BoxedKernel boxed_fn) {
+  return KernelFunction(
+      std::move(boxed_fn), nullptr); // no unboxed function pointer
+}
+
+template <KernelFunction::BoxedKernelFunction* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFromFunction<func>());
+}
+
+template <KernelFunction::BoxedKernelFunction_withDispatchKeys* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFromFunction<func>());
+}
+
+inline KernelFunction KernelFunction::makeFallthrough() {
+  return KernelFunction::makeFromBoxedKernel(BoxedKernel::makeFallthrough()); // boxed-only wrapper around the BoxedKernel factory
+}
+
+inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() {
+  return KernelFunction::makeFromBoxedKernel( // boxed-only wrapper around the BoxedKernel factory
+      BoxedKernel::makeAmbiguousAutogradOther());
+}
+
+inline KernelFunction KernelFunction::makeNamedNotSupported() {
+  return KernelFunction::makeFromBoxedKernel( // boxed-only wrapper around the BoxedKernel factory
+      BoxedKernel::makeNamedNotSupported());
+}
+
+template <bool AllowLegacyTypes, class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromUnboxedFunctor(
+    std::unique_ptr<OperatorKernel> kernelFunctor) {
+#ifndef NDEBUG
+  // The is_functor check is costly for build time, so it is compiled only when NDEBUG is not defined.
+  static_assert(
+      guts::is_functor<KernelFunctor>::value,
+      "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor> but the argument is not a functor.");
+#endif
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+
+  auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed<KernelFunctor>::call;
+  void* void_unboxed_fn = reinterpret_cast<void*>(unboxed_fn); // type-erased to void*
+  bool is_symint = fn_has_symint<decltype(unboxed_fn)>::value; // selects which of the two pointer slots below is filled
+  return KernelFunction(
+      std::move(kernelFunctor),
+      &impl::make_boxed_from_unboxed_functor<KernelFunctor, AllowLegacyTypes>::
+          call,
+      is_symint ? nullptr : void_unboxed_fn, // non-null only when is_symint is false
+      is_symint ? void_unboxed_fn : nullptr); // non-null only when is_symint is true
+}
+
+template <class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromBoxedFunctor(
+    std::unique_ptr<KernelFunctor> kernelFunctor) {
+  return KernelFunction::makeFromBoxedKernel( // result is a boxed-only kernel
+      BoxedKernel::makeFromFunctor(std::move(kernelFunctor))); // ownership of the functor is handed over
+}
+
+template <class FuncPtr, bool AllowLegacyTypes>
+inline KernelFunction KernelFunction::makeFromUnboxedFunction(
+    FuncPtr func_ptr) {
+  static_assert(
+      is_compile_time_function_pointer<FuncPtr>::value,
+      "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN.");
+  static_assert(
+      !std::is_same_v<typename FuncPtr::FuncType, BoxedKernelFunction>,
+      "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+#if defined(__GNUC__) && defined(__SANITIZE_ADDRESS__) && !defined(__CUDACC__)
+  TORCH_INTERNAL_ASSERT( // in this configuration the null check runs at run time instead of compile time
+      FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr");
+#else
+  static_assert(
+      FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr");
+#endif
+
+#if !defined(C10_MOBILE)
+  (void)func_ptr; // Only the type FuncPtr is used on this path; suppress the unused variable warning
+  return makeFromUnboxedFunctor<
+      AllowLegacyTypes,
+      typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>(
+      detail::make_unique_base<
+          OperatorKernel,
+          typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>());
+#else
+  // On mobile, binary size matters more than speed, so the kernel is not
+  // inlined into the wrapper; makeFromUnboxedRuntimeFunction is used instead.
+  // AllowLegacyTypes is forwarded explicitly, matching the non-mobile branch.
+  return makeFromUnboxedRuntimeFunction<AllowLegacyTypes>(func_ptr.func_ptr());
+#endif
+}
+
+template <bool AllowLegacyTypes, class FuncType>
+inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(
+    FuncType* func) {
+  static_assert(
+      guts::is_function_type<FuncType>::value,
+      "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type.");
+  static_assert(
+      !std::is_same_v<FuncType, BoxedKernelFunction>,
+      "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+  TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); // func is a runtime value, so this is a runtime check
+
+  return makeFromUnboxedFunctor<
+      AllowLegacyTypes,
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>( // decay_t turns the function type into a function pointer type
+      detail::make_unique_base<
+          OperatorKernel,
+          impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(func)); // the functor is constructed from func
+}
+
+template <bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<
+    guts::is_stateless_lambda<std::decay_t<Lambda>>::value, // this overload is enabled for stateless lambdas only
+    KernelFunction>
+KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+  static_assert(
+      guts::is_functor<std::decay_t<Lambda>>::value,
+      "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+
+#if !defined(C10_MOBILE)
+  return makeFromUnboxedFunctor<
+      AllowLegacyTypes,
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+      detail::make_unique_base<
+          OperatorKernel,
+          impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+          std::forward<Lambda>(lambda)));
+#else
+  // On mobile, binary size matters more than speed, so the kernel is not
+  // inlined into the wrapper: the lambda is handed to
+  // makeFromUnboxedRuntimeFunction, whose parameter is a FuncType* pointer.
+  using FuncType =
+      typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type;
+  return makeFromUnboxedRuntimeFunction<AllowLegacyTypes, FuncType>(lambda);
+#endif
+}
+
+template <bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<
+    !guts::is_stateless_lambda<std::decay_t<Lambda>>::value, // this overload is enabled when Lambda is not a stateless lambda
+    KernelFunction>
+KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+  static_assert(
+      guts::is_functor<std::decay_t<Lambda>>::value,
+      "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+
+  return makeFromUnboxedFunctor< // same path on all platforms: there is no C10_MOBILE branch here
+      AllowLegacyTypes,
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+      detail::make_unique_base<
+          OperatorKernel,
+          impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+          std::forward<Lambda>(lambda))); // the callable is forwarded into the wrapping functor
+}
+
+inline bool KernelToken::isValid() const {
+  return !invalid_.load(std::memory_order_acquire); // acquire load; invalidate() performs the release store
+}
+
+inline void KernelToken::invalidate() {
+  invalid_.store(true, std::memory_order_release); // release store; isValid() reads the flag with an acquire load
+}
+
+inline SafeKernelFunction::SafeKernelFunction(
+    const KernelFunction* kernel,
+    std::string debug,
+    std::shared_ptr<OperatorHandle> opHandle)
+    : kernel_(kernel ? *kernel : KernelFunction()), // copy of *kernel, or a default-constructed KernelFunction when kernel is null
+      token_(std::make_shared<KernelToken>()),
+      debug_(std::move(debug)),
+      opHandle_(std::move(opHandle)) {
+  // Register the token with the original kernel (not with the copy held in
+  // kernel_) so it gets invalidated when the original kernel is destroyed.
+  if (kernel) {
+    kernel->registerToken(token_);
+  }
+}
+
+inline void SafeKernelFunction::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  TORCH_CHECK(
+      token_ && token_->isValid(),
+      "SafeKernelFunction has been invalidated ",
+      debug_); // refuse to call once the token is missing or has been invalidated
+  kernel_.callBoxed(opHandle, dispatchKeySet, stack); // forwards to the stored kernel copy
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf328983091cf4e02f66e60462c5b9ffb082462
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/intrusive_ptr.h>
+
+namespace c10 {
+
+/**
+ * Inherit from OperatorKernel to implement a c10 kernel.
+ *
+ * Example:
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ *
+ * The kernel class is allowed to have members but these are equivalent
+ * to global variables. The kernel implementation is responsible for
+ * preventing race conditions on them.
+ *
+ * NOTE(review): kernel registration is not shown in this header - TODO confirm where it is documented.
+ */
+struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target {
+  ~OperatorKernel() override = default; // virtual via the base class, so derived kernels can be destroyed through OperatorKernel*
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa1e5eb02d879ff1ca90a0261369e9b3e3ead4b2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h
@@ -0,0 +1,43 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/CompileTimeFunctionPointer.h>
+
+namespace c10::impl {
+namespace detail {
+template <class FuncPtr, class ReturnType, class ParameterList>
+class WrapFunctionIntoFunctor_ {}; // primary template: empty; only the typelist specialization below is callable
+template <class FuncPtr, class ReturnType, class... Parameters>
+class WrapFunctionIntoFunctor_<
+    FuncPtr,
+    ReturnType,
+    guts::typelist::typelist<Parameters...>>
+    final : public c10::OperatorKernel {
+ public:
+  C10_ALWAYS_INLINE decltype(auto) operator()(Parameters... args) { // parameters use exactly the types from the typelist
+    return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...); // call through the compile-time function pointer
+  }
+};
+} // namespace detail
+
+// WrapFunctionIntoFunctor: Wraps a compile time function pointer into a kernel
+// functor. `type` is the functor class, built from the return type and the
+// parameter typelist of FuncPtr::FuncType. Since the pointer is known at
+// compile time, many compilers can inline it, so wrapping adds no overhead.
+template <class FuncPtr>
+struct WrapFunctionIntoFunctor final {
+  static_assert(
+      c10::is_compile_time_function_pointer<FuncPtr>::value,
+      "WrapFunctionIntoFunctor can only wrap functions created with TORCH_FN.");
+  using type = detail::WrapFunctionIntoFunctor_<
+      FuncPtr,
+      typename guts::function_traits<typename FuncPtr::FuncType>::return_type,
+      typename guts::function_traits<
+          typename FuncPtr::FuncType>::parameter_types>;
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ff4e3dbc917c8dc86605c403c3733539c4779db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
@@ -0,0 +1,46 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/TypeTraits.h>
+
+namespace c10::impl {
+
+namespace detail {
+template <class FuncType, class ReturnType, class ParameterList>
+class WrapFunctionIntoRuntimeFunctor_ {}; // primary template: empty; only the typelist specialization below is callable
+template <class FuncType, class ReturnType, class... Parameters>
+class WrapFunctionIntoRuntimeFunctor_<
+    FuncType,
+    ReturnType,
+    guts::typelist::typelist<Parameters...>>
+    final : public c10::OperatorKernel {
+ public:
+  template <class FuncType_>
+  explicit WrapFunctionIntoRuntimeFunctor_(FuncType_&& kernel_func)
+      : kernel_func_(std::forward<FuncType_>(kernel_func)) {} // copies or moves the callable into the member
+
+  decltype(auto) operator()(Parameters... args) {
+    return kernel_func_(std::forward<Parameters>(args)...); // invoke the stored callable with the declared parameter types
+  }
+
+ private:
+  FuncType kernel_func_; // the wrapped callable, owned by this functor
+};
+} // namespace detail
+
+// WrapFunctionIntoRuntimeFunctor: Wraps any runtime functor into a functor that
+// inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
+// This can, for example, be used for lambdas, functors or even function
+// pointers. A function pointer is stored as a runtime value, so every kernel
+// invocation pays for an indirect call through it.
+template <class FuncType>
+using WrapFunctionIntoRuntimeFunctor = detail::WrapFunctionIntoRuntimeFunctor_<
+    FuncType,
+    typename guts::infer_function_traits_t<FuncType>::return_type,
+    typename guts::infer_function_traits_t<FuncType>::parameter_types>;
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed93dfef4637046783ab9d7e88c7919e9fc75d04
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h
@@ -0,0 +1,415 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// This file contains boxing (not unboxing) logic,
+// i.e. how to make a vector<IValue> from a set of concrete arguments.
+
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <c10/core/TensorOptions.h>
+
+#include <ATen/core/boxing/BoxedKernel.h>
+
+#include <c10/util/Metaprogramming.h>
+#include <type_traits>
+
+namespace c10::impl {
+
+//
+// utils
+//
+
+// is_mutable_tensor_ref: true only for exactly `at::Tensor&`, false for every other type
+template <class T>
+struct is_mutable_tensor_ref : std::false_type {};
+template <>
+struct is_mutable_tensor_ref<at::Tensor&> : std::true_type {};
+
+// is_tuple_of_mutable_tensor_refs: false for any T that is not a std::tuple;
+// for a std::tuple, the result of applying is_mutable_tensor_ref to all elements.
+template <class T, class Enable = void>
+struct is_tuple_of_mutable_tensor_refs : std::false_type {};
+
+template <class T>
+struct is_tuple_of_mutable_tensor_refs<
+    T,
+    std::enable_if_t<guts::is_instantiation_of<std::tuple, T>::value, void>>
+    : guts::typelist::
+          all<is_mutable_tensor_ref, guts::typelist::from_tuple_t<T>> {};
+
+// has_ivalue_to<T> tests the presence/absence of instance method
+// IValue::to<T>(): true when calling to<T>() on an IValue rvalue is
+// well-formed (detected through std::void_t), false otherwise.
+template <class T, class Enable = void>
+struct has_ivalue_to : std::false_type {};
+
+template <class T>
+struct ivalue_to_helper {
+  using type = decltype(std::declval<IValue>().template to<T>());
+};
+template <class T>
+using ivalue_to_helper_t = typename ivalue_to_helper<T>::type;
+
+template <class T>
+struct has_ivalue_to<T, std::void_t<ivalue_to_helper_t<T>>> : std::true_type {};
+
+//
+// boxing predicates
+//
+
+// A boxable arg type is one whose decayed type IValue has a constructor for (or TensorOptions, see below).
+template <typename T>
+using can_box = std::disjunction<
+    std::is_constructible<IValue, std::decay_t<T>>,
+    // TensorOptions are not directly constructible into IValue,
+    // but torch::jit::push knows how to handle them
+    std::is_same<TensorOptions, std::decay_t<T>>>;
+
+template <typename... Ts>
+using can_box_all = std::conjunction<can_box<Ts>...>; // true when every type in Ts is boxable (and for an empty pack)
+
+// An unboxable result type is not an lvalue reference and is either void or extractable via IValue::to<T>()
+template <typename T>
+using can_unbox = std::conjunction<
+    std::disjunction<
+        has_ivalue_to<T>,
+        // void returns are ok
+        std::is_same<void, T>>,
+    std::negation<std::is_lvalue_reference<T>>>;
+
+//
+// boxArgs - builds a fresh torch::jit::Stack and pushes the unboxed args onto
+// it. Args are taken with their declared types and forwarded as such to push().
+template <class... Args>
+torch::jit::Stack boxArgs(Args... args) {
+  // TODO Reuse stack vector instead of allocating?
+  torch::jit::Stack stack;
+  stack.reserve(sizeof...(Args)); // one slot reserved per argument
+  torch::jit::push(stack, std::forward<Args>(args)...);
+  return stack;
+}
+
+template <class T>
+inline constexpr size_t boxed_size_one() { // number of IValue slots for one argument of type T: 1 in the general case
+  static_assert( // a cv/ref-qualified TensorOptions would land here instead of the explicit specialization, so reject it
+      !std::is_same_v<std::decay_t<T>, c10::TensorOptions>,
+      "need to patch this path to support TensorOptions passed by reference");
+  return 1;
+}
+
+// torch::jit::push pushes 4 values for a TensorOptions (the TensorOptions
+// overload of boxToStack also writes 4); this needs to be kept in sync.
+template <>
+inline constexpr size_t boxed_size_one<c10::TensorOptions>() {
+  return 4;
+}
+
+// BoxedSize<Ts...>::value is the total number of IValue slots the types in
+// Ts occupy once boxed: the sum of boxed_size_one<T>() over the pack, and 0
+// for an empty pack. Expressed as a single C++17 fold expression rather than
+// a recursive primary template / partial specialization pair.
+template <typename... Ts>
+struct BoxedSize : std::integral_constant<
+                       size_t,
+                       (size_t{0} + ... + boxed_size_one<Ts>())> {};
+
+template <class... Args>
+static inline constexpr size_t boxed_size() { // function-call form of BoxedSize<Args...>::value
+  return BoxedSize<Args...>::value;
+}
+
+template <typename T>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValue*& dest, T& arg) {
+  new (dest++) IValue(arg); // placement-new an IValue at dest, then advance dest by one slot
+}
+
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
+    IValue*& dest,
+    c10::TensorOptions options) { // a TensorOptions is written as four consecutive IValues
+  new (dest++) IValue(c10::typeMetaToScalarType(options.dtype())); // slot 1: dtype, converted by typeMetaToScalarType
+  new (dest++) IValue(options.layout()); // slot 2: layout
+  new (dest++) IValue(options.device()); // slot 3: device
+  new (dest++) IValue(options.pinned_memory()); // slot 4: pinned_memory
+}
+
+inline void boxArgsToStack(IValue*& /*unused*/) {} // end of the recursion: no arguments left to box
+
+template <typename T, typename... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
+    IValue*& dest,
+    T& arg,
+    Args&... args) {
+  boxToStack(dest, arg); // box the first argument; dest is advanced past what was written
+  boxArgsToStack(dest, args...); // recurse on the remaining arguments
+}
+
+//
+// PopResult converts the values a boxed kernel left on the stack back into the
+// unboxed Result type. This primary template handles a single return value.
+//
+template <class Result>
+struct PopResult final {
+  static Result call(Stack& stack) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == 1,
+        "Boxed kernel was expected to return one value on the stack, ",
+        "but instead pushed ",
+        stack.size(),
+        " values.");
+    return std::move(stack[0]).to<Result>(); // converts from slot 0 as an rvalue; the stack is not resized
+  }
+};
+
+template <class... Types>
+struct PopResult<std::tuple<Types...>> final {
+  using Result = std::tuple<Types...>;
+
+  static Result call(Stack& stack) {
+    // for tuple return types, boxed kernel has pushed multiple values onto the
+    // stack: one per tuple element, checked below in debug builds
+    constexpr int RetCount = sizeof...(Types);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == RetCount,
+        "Boxed kernel was expected to return ",
+        RetCount,
+        " values on the stack, ",
+        "but instead pushed ",
+        stack.size(),
+        " values.");
+    return pop_to_tuple_impl(stack, std::make_index_sequence<RetCount>());
+  }
+
+ private:
+  // note: the pack expansion over `indices` lives in its own helper only to
+  // avoid a parse error that occurs when it is written inline in call().
+  // Stack slot i is converted to the i-th element type of the tuple.
+  template <size_t... indices>
+  static Result pop_to_tuple_impl(
+      Stack& stack,
+      std::index_sequence<indices...> /*unused*/) {
+    return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
+  }
+};
+
+//
+// BoxedKernelWrapper
+//
+// For a given function type FT, BoxedKernelWrapper<FT> implements
+// a `call` method that
+// - takes a boxed kernel and unboxed arguments as specified by FT,
+// - calls `boxArgs` to box the arguments
+// - calls the boxed kernel
+// - unboxes and returns the result
+//
+// The partial specializations below handle various cases: in
+// particular, not all types appearing in op signatures are supported,
+// and ops returning references have nonstandard wrapper implementations.
+//
+
+// 1. The base specialization of BoxedKernelWrapper should never be
+// instantiated. A "no call method defined on BoxedKernelWrapper" compile error
+// means that an op signature has failed to trigger any of the partial
+// specializations that follow this one.
+//
+template <class FuncType, class Enable = void>
+struct BoxedKernelWrapper {
+  // The reason we're not just doing a straight static_assert(false, ...) here:
+  // a static_assert should fire only when this template is actually
+  // instantiated, not every time the file is parsed, so its condition has to
+  // depend on a template parameter (FuncType here). The expression
+  // `sizeof(FuncType) != sizeof(FuncType)` is always false yet dependent,
+  // which gives exactly that behavior.
+  static_assert(
+      sizeof(FuncType) != sizeof(FuncType),
+      "Function signature contains one or more unsupported parameter and/or return types. "
+      "Look for a nearby error like "
+      "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" "
+      "- (your function type) is the unsupported signature.");
+};
+
+//
+// 2. Supported signatures, other than those involving non-const Tensor refs -
+// i.e., "functional" ops.
+//
+
+template <class Result, class... Args>
+struct BoxedKernelWrapper<
+    Result(Args...),
+    std::enable_if_t<
+        can_box_all<Args...>::value && can_unbox<Result>::value &&
+            !is_tuple_of_mutable_tensor_refs<Result>::value,
+        void>> {
+  static Result call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) {
+    torch::jit::Stack stack = boxArgs<Args...>(std::forward<Args>(args)...); // box the arguments onto a fresh stack
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); // afterwards the stack is expected to hold only the results
+
+    if constexpr (!std::is_same_v<void, Result>) {
+      // op has pushed one or more values onto the stack; PopResult unboxes them.
+      return PopResult<Result>::call(stack);
+    } else {
+      // op returns void, boxed kernel has pushed nothing onto stack.
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          stack.empty(),
+          "Boxed kernel was expected to return no values on the stack, ",
+          "but instead returned ",
+          stack.size(),
+          " values.");
+    }
+  }
+};
+
+//
+// 3. in-place ops take a single non-const Tensor reference
+// as their first argument, and return it.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// Because of this, the generated BoxedKernelWrapper specializations simply
+// return the in-place argument.
+//
+
+template <class... OtherArgs>
+struct BoxedKernelWrapper<
+    at::Tensor&(at::Tensor&, OtherArgs...),
+    std::enable_if_t<can_box_all<OtherArgs...>::value, void>> {
+  static at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      at::Tensor& outArg,
+      OtherArgs... otherArgs) {
+    torch::jit::Stack stack = boxArgs<at::Tensor&, OtherArgs...>( // box the in-place tensor together with the other args
+        outArg, std::forward<OtherArgs>(otherArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        stack.size(),
+        " values.");
+
+    return outArg; // the value left on the stack is not read; the caller's own reference is returned
+  }
+};
+
+//
+// 3.5. In-progress migration to make in-place ops take and return
+// const references instead.
+template <class... OtherArgs>
+struct BoxedKernelWrapper<
+    const at::Tensor&(const at::Tensor&, OtherArgs...),
+    std::enable_if_t<can_box_all<OtherArgs...>::value, void>> {
+  static const at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      const at::Tensor& outArg,
+      OtherArgs... otherArgs) {
+    torch::jit::Stack stack = boxArgs(outArg, otherArgs...); // boxArgs deduces its parameter types here, so the args are passed by value
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        stack.size(),
+        " values.");
+
+    return outArg; // the value left on the stack is not read; the caller's own reference is returned
+  }
+};
+
+//
+// 4. out of place ops that take a single non-const Tensor reference as their
+// final argument, and also return it.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to
+// simply return out arguments.
+//
+template <class FirstArg, class... RestArgs>
+struct BoxedKernelWrapper<
+    at::Tensor&(FirstArg, RestArgs...),
+    std::enable_if_t<
+        can_box_all<FirstArg, RestArgs...>::value
+            // this skips over in-place kernels with a non-const Tensor
+            // arg at the front, so those can unambiguously trigger the
+            // preceding specialization.
+            && !is_mutable_tensor_ref<FirstArg>::value,
+        void>> {
+  static at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      FirstArg firstArg,
+      RestArgs... restArgs) {
+    torch::jit::Stack stack = boxArgs<FirstArg, RestArgs...>(
+        std::forward<FirstArg>(firstArg), std::forward<RestArgs>(restArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        stack.size(),
+        " values.");
+
+    // reusing restArgs after it has been forwarded here is ok because we know
+    // that the last element is of type `Tensor&`; only that last element is read.
+    return std::get<sizeof...(RestArgs) - 1>(
+        std::tuple<RestArgs...>{restArgs...});
+  }
+};
+
+//
+// 5. out of place ops that take multiple non-const Tensor references as their
+// final arguments, and return them in a std::tuple.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to
+// simply return the out arguments.
+//
+template <class Result, class... Args>
+struct BoxedKernelWrapper<
+    Result(Args...),
+    std::enable_if_t<
+        can_box_all<Args...>::value &&
+            is_tuple_of_mutable_tensor_refs<Result>::value,
+        void>> {
+  static Result call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) {
+    using ArgTuple = std::tuple<Args...>;
+    constexpr int RetCount = std::tuple_size<Result>(); // number of Tensor& elements in the returned tuple
+
+    torch::jit::Stack stack = boxArgs<Args...>(std::forward<Args>(args)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == RetCount,
+        "Boxed kernel was expected to return ",
+        RetCount,
+        " values on the stack, ",
+        "but instead returned ",
+        stack.size(),
+        " values.");
+
+    // reusing args after it has been forwarded here is ok because we know
+    // that the last RetCount elements are of type `Tensor&`; only those are kept.
+    auto result = guts::tuple_take<ArgTuple, -RetCount>(
+        ArgTuple{std::forward<Args>(args)...});
+    static_assert(
+        std::is_same_v<Result, decltype(result)>,
+        "The parameter list of an op returning a tuple of Tensor references "
+        "must end with an equal number of Tensor reference parameters.");
+    return result;
+  }
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..fac192e893c4fc6d40714ebc2d24f848d736819a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -0,0 +1,790 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/IListRef.h>
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+
+#include <utility>
+
+namespace c10 {
+
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack
+                                 // to the c10 namespace.
+class OperatorHandle;
+
+/*
+ * [Note: Argument forwarding in the dispatcher]
+ *
+ * The dispatcher uses a somewhat unusual way to forward arguments through
+ * several layers of wrapper functions. This can be confusing because an
+ * experienced C++ programmer would look at this and think "oh this is supposed
+ * to be forwarding a universal reference but the && is missing. This is a
+ * bug.". It is not a bug. The common way in C++ to forward arguments is to use
+ * universal references:
+ *
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ * but that relies on inferring the correct reference type (i.e. value vs & vs
+ * &&) from the argument. In our case, we cannot rely on the argument as
+ * supplied by the caller, because that could infer a different reference type
+ * than was used in the kernel function. The correct reference type is dictated
+ * by the kernel signature and must be identical since we cast function pointers
+ * through void* pointers and mismatches would be UB. So we need a forwarding
+ * pattern that determines the reference type to use by looking at the
+ * explicitly supplied operator signature, not by looking at the argument we're
+ * calling it with.
+ *
+ * What does std::forward do, exactly?
+ * ------------------------------------
+ * std::forward<T>(t) is a way to cast t to the reference type supplied in T.
+ * Let's assume decay_t<T> == U and T is either U or some reference of U.
+ *  - std::forward<T&>(t) will return U&, no matter what kind of reference t is.
+ *  - std::forward<T&&>(t) will return U&&, no matter what kind of reference t
+ * is.
+ *  - std::forward<T>(t) will return U&& (not U!), no matter what kind of
+ * reference t is.
+ *
+ * For universal references, that means that in the following function
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ *  - when called with arg being an rvalue reference or non-reference value, T
+ * gets inferred to be a non-reference U, and std::forward<T>(arg) will return
+ * U&&, correctly moving the argument.
+ *  - when called with arg being an lvalue reference, T gets inferred to be U&
+ * because that's the only way to match the signature (in C++, a type that is
+ * (T&)&& will collapse to T&). That means std::forward<T>(arg) will return U&
+ * and the value will not be moved but passed on as an lvalue reference.
+ *
+ * How do we use that?
+ * ------------------------------------
+ * But std::forward can also be used outside of the common "universal
+ * forwarding" pattern to change reference types. So instead of following the
+ * common C++ pattern, we notice what std::forward<T>() actually does, and that
+ * is it takes a value and changes its reference to the type of reference passed
+ * in as T. If we don't infer T but explicitly specify it, we can use this to
+ * forward based on an explicitly specified reference type instead of the
+ * inferred argument type.
+ *
+ * This is why many of the dispatcher functions look like
+ * > template<class T> func(T t) { func2<T>(std::forward<T>(t)); }
+ * instead of the common
+ * > template<class T> func(T&& t) { func2(std::forward<T>(t)); }
+ *
+ * and are expected to be called by explicitly specifying the template
+ * parameters in a way that matches the expected operator signature at each call
+ * site.
+ */
+
+namespace impl {
+// supported_primitive_arg_types defines which primitive types we allow in
+// kernel functions as arguments or returns.
+// Additionally, we support lists, dicts and optionals containing these types.
+//
+// The list is consulted by the primary templates of
+// assert_is_valid_input_type / assert_is_valid_output_type, and by their
+// "unsupported integral type" specializations, which only reject integral
+// types that are NOT listed here.
+using supported_primitive_arg_types = guts::typelist::typelist<
+    int64_t,
+    double,
+    bool,
+    std::string_view,
+    at::Tensor,
+    at::Scalar,
+    c10::QScheme,
+    c10::ScalarType,
+    c10::Device,
+    c10::DeviceIndex,
+    c10::Layout,
+    c10::MemoryFormat,
+    at::Dimname>;
+
+// We have an unboxed functor in hand that takes C++ arguments, and
+// we're building a boxed functor wrapper for it that takes IValues.
+// So "outside" is boxed and "inside" is unboxed.
+//
+// So a valid input type is one that our boxed functor wrapper can
+// unbox from an IValue into a C++ value.
+//
+// Whereas a valid output type is one that our wrapper can receive
+// as a C++ value from the unboxed functor, and box into an IValue.
+
+//
+// assert_is_valid_input_type
+// checks that T can be unboxed from an IValue into a C++ value.
+//
+
+// Primary template, selected for every T that none of the specializations
+// below match. Default-constructing an instance is how callers run the check
+// (see ivalue_to_arg::call); for these types both constexpr branches are
+// intentionally empty, so no diagnostic is emitted.
+//
+// Template parameters:
+//   T                    - the kernel parameter type being checked.
+//   AllowDeprecatedTypes - consulted only by the specializations that guard
+//                          deprecated container types.
+//   Enable               - SFINAE slot for the enable_if-based specializations.
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct assert_is_valid_input_type {
+  assert_is_valid_input_type() {
+    if constexpr (guts::typelist::contains<supported_primitive_arg_types, T>::
+                      value) {
+      /* everything is ok, this is a primitive type */
+    } else {
+      /* otherwise this must be an instance of a valid custom class, since it
+         can only have been created via IValue(x), which ensures this. */
+    }
+  }
+};
+
+// std::optional<T> is a valid input whenever T is: inherit T's checks.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<std::optional<T>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<T, AllowDeprecatedTypes> {};
+
+// TypeCheckHelper applies assert_is_valid_input_type to every type in Args.
+// Only declared here; the empty-pack and Head/Rest cases are specialized
+// right below.
+template <bool AllowDeprecatedTypes, class... Args>
+struct TypeCheckHelper;
+
+// Recursion terminator: no types left to check.
+template <bool AllowDeprecatedTypes>
+struct TypeCheckHelper<AllowDeprecatedTypes> {};
+
+// Recursive case: Head is checked through the `check` member, the remaining
+// types through the base class.
+template <bool AllowDeprecatedTypes, class Head, class... Rest>
+struct TypeCheckHelper<AllowDeprecatedTypes, Head, Rest...>
+    : TypeCheckHelper<AllowDeprecatedTypes, Rest...> {
+  // A by-value member needs a complete type, which forces the checker for
+  // Head to be instantiated (and its static_asserts, if any, to be evaluated).
+  assert_is_valid_input_type<Head, AllowDeprecatedTypes> check;
+};
+
+// A std::tuple input is valid when each of its element types is valid.
+template <class... Contained, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    std::tuple<Contained...>,
+    AllowDeprecatedTypes>
+    : TypeCheckHelper<AllowDeprecatedTypes, Contained...> {};
+
+// Dict<Key, Value> input: Value is checked recursively through the base
+// class; Key must be one of impl::valid_dict_key_types (declared elsewhere).
+template <class Key, class Value, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<Dict<Key, Value>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Value, AllowDeprecatedTypes> {
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, Key>::value,
+      "You tried to register a kernel with an unsupported input type: Dict<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+};
+
+// std::unordered_map<Key, Value> input: same Key/Value rules as Dict, and
+// additionally only accepted when AllowDeprecatedTypes is true.
+template <class Key, class Value, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    std::unordered_map<Key, Value>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Value, AllowDeprecatedTypes> {
+  static_assert(
+      AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map<Key, Value>. Please use Dict<Key, Value> instead.");
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, Key>::value,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+};
+
+// Sequence-like inputs. Each specialization below checks the element type T
+// recursively through its base class and rejects Scalar elements with a
+// dedicated error message.
+
+// List<T> input.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<List<T>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported input type: List<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+
+// c10::ArrayRef<T> input.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<c10::ArrayRef<T>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported input type: ArrayRef<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+
+// c10::OptionalArrayRef<T> input.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    c10::OptionalArrayRef<T>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_input_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported input type: OptionalArrayRef<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+
+// std::array<T, N> input.
+template <class T, size_t N, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<std::array<T, N>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported input type: std::array<Scalar, N>. Please use std::array<int64_t, N> instead.");
+};
+
+// Explicitly rejected input types. Each specialization is selected through
+// the Enable (enable_if) slot and contains a static_assert whose condition is
+// guts::false_t<T>::value, i.e. it depends on T (guts::false_t is defined
+// elsewhere; presumably it is always false), so the tailored message is
+// reported when the specialization is instantiated for an offending type.
+
+// float: rejected in favour of double.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<float, T>>> {
+  // There is no reason to support float when we have double. Keep the API lean.
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string.");
+};
+// const char*: rejected in favour of std::string_view.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<const char*, T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: const char*. Please use std::string_view instead.");
+};
+// std::vector<bool>: rejected in favour of List<bool>.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<std::vector<bool>, T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: vector<bool>. Please use List<bool> instead.");
+};
+// Any integral type that is not listed in supported_primitive_arg_types.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<
+        std::is_integral_v<T> &&
+        !guts::typelist::contains<supported_primitive_arg_types, T>::value>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string.");
+};
+// const c10::SymInt&: SymInt must be taken by value.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<const c10::SymInt&, T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel taking c10::SymInt by reference. Please accept it by value instead.");
+};
+
+// TODO: it probably would be good to tighten this up quite a bit more with
+// an explicit list for everything
+
+//
+// assert_is_valid_output_type
+//
+
+// Primary template, selected for every return type T that none of the
+// specializations below match. Default-constructing an instance is how
+// callers run the check (see return_to_ivalue); for these types both
+// constexpr branches are intentionally empty, so no diagnostic is emitted.
+//
+// Template parameters:
+//   T                    - the kernel return type being checked.
+//   AllowDeprecatedTypes - consulted only by the specializations that guard
+//                          deprecated container types.
+//   Enable               - SFINAE slot for the enable_if-based specializations.
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct assert_is_valid_output_type {
+  assert_is_valid_output_type() {
+    if constexpr (guts::typelist::contains<supported_primitive_arg_types, T>::
+                      value) {
+      /* everything is ok, this is a primitive type */
+    } else {
+      /* otherwise T is verified to be a registered custom class in the IValue
+        constructor, so no benefit in double-checking here */
+    }
+  }
+};
+
+// std::optional<T> is a valid output whenever T is: inherit T's checks.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::optional<T>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<T, AllowDeprecatedTypes> {};
+
+// c10::OptionalArrayRef<T> is a valid output whenever T is. Unlike the input
+// counterpart, no extra Scalar restriction is applied here.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    c10::OptionalArrayRef<T>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_output_type<T, AllowDeprecatedTypes> {};
+
+// Dict<Key, Value> output: Value is checked recursively through the base
+// class, Key must be one of impl::valid_dict_key_types (declared elsewhere),
+// and Value must not be Scalar.
+template <class Key, class Value, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<Dict<Key, Value>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Value, AllowDeprecatedTypes> {
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, Key>::value,
+      "You tried to register a kernel with an unsupported output type: Dict<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+  static_assert(
+      !std::is_same_v<Value, at::Scalar>,
+      "You tried to register a kernel with an unsupported output type: Dict<Key, Scalar>. Please use Dict<Key, int64_t> or Dict<Key, double>.");
+};
+
+// std::unordered_map<Key, Value> output: same Key/Value rules as Dict, and
+// additionally only accepted when AllowDeprecatedTypes is true.
+template <class Key, class Value, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    std::unordered_map<Key, Value>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Value, AllowDeprecatedTypes> {
+  static_assert(
+      AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Value>. Please use Dict<Key, Value> instead.");
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, Key>::value,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+  static_assert(
+      !std::is_same_v<Value, at::Scalar>,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Scalar>. Please use Dict<Key, int64_t> or Dict<Key, double>.");
+};
+
+// Sequence-like outputs. Each specialization below checks the element type T
+// recursively through its base class and rejects Scalar elements with a
+// dedicated error message.
+
+// List<T> output.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<List<T>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported output type: List<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+
+// std::vector<T> output. Currently accepted regardless of
+// AllowDeprecatedTypes; the deprecation assert is still a TODO below.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::vector<T>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported output type: std::vector<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+  // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel
+  // with an unsupported output type: std::vector<T>. Please use List<T>
+  // instead.");
+};
+
+// std::array<T, N> output.
+template <class T, size_t N, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::array<T, N>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<T, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<T, at::Scalar>,
+      "You tried to register a kernel with an unsupported output type: std::array<Scalar, N>. Please use std::array<int64_t, N> instead.");
+};
+
+// The following specialisations of assert_is_valid_output_type are technically
+// not necessary since we would hit the base case and show an error message
+// there if they didn't exist, but we can show a better error message
+// in some common error scenarios.
+// NOTE(review): the primary template of assert_is_valid_output_type in this
+// file has two empty branches and emits no diagnostic of its own, so the
+// "show an error message there" part of the statement above looks stale -
+// confirm where (if anywhere) such types are rejected without these
+// specializations.
+//
+// Each specialization is selected through the Enable (enable_if) slot and
+// contains a static_assert whose condition, guts::false_t<T>::value, depends
+// on T (guts::false_t is defined elsewhere; presumably it is always false).
+
+// float: rejected in favour of double.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<float, T>>> {
+  // There is no reason to support float when we have double. Keep the API lean.
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string.");
+};
+// const char*: rejected in favour of std::string_view.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<const char*, T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: const char*. Please use std::string_view instead.");
+};
+// std::vector<bool>: rejected in favour of List<bool>.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<std::vector<bool>, T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: vector<bool>. Please use List<bool> instead.");
+};
+// Any integral type that is not listed in supported_primitive_arg_types.
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<
+        std::is_integral_v<T> &&
+        !guts::typelist::contains<supported_primitive_arg_types, T>::value>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string.");
+};
+
+// ivalue_to_arg
+
+// decay_if_not_tensor<T>::type is std::decay_t<T> for every T except
+// `at::Tensor&` and `const at::Tensor&`, which are kept as references. It is
+// used to pick the ivalue_to_arg specialization for each kernel parameter, so
+// that Tensor reference parameters reach the dedicated reference-returning
+// specializations instead of the generic by-value one.
+template <class T>
+struct decay_if_not_tensor final {
+  using type = std::decay_t<T>;
+};
+
+// Mutable Tensor reference: keep the reference.
+template <>
+struct decay_if_not_tensor<at::Tensor&> final {
+  using type = at::Tensor&;
+};
+
+// Const Tensor reference: keep the reference.
+template <>
+struct decay_if_not_tensor<const at::Tensor&> final {
+  using type = const at::Tensor&;
+};
+
+// Generic unboxing of one stack slot into a kernel argument of type T.
+// Runs the compile-time input-type check for T, then converts the IValue
+// with to<T>() on an rvalue, i.e. the value is moved out of the stack slot
+// `v` rather than copied.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg final {
+  static decltype(auto) call(IValue& v) {
+    // Instantiating the checker evaluates the static_asserts of whichever
+    // assert_is_valid_input_type specialization matches T.
+    assert_is_valid_input_type<T, AllowDeprecatedTypes>();
+    return std::move(v).to<T>();
+  }
+};
+
+// The following two specializations take advantage of specialized
+// `toTensor()` overloads on IValue to avoid copying. Both return a reference
+// obtained from the IValue `v` itself, so the result is only usable while
+// that stack slot is alive.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<at::Tensor&, AllowDeprecatedTypes> final {
+  // We cannot use the default implementation if they asked for a
+  // `at::Tensor&` because it moves from the IValue, so it can't get
+  // an lvalue reference.
+  static at::Tensor& call(IValue& v) {
+    // Tensor& is valid, don't bother asserting
+    return v.toTensor();
+  }
+};
+
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<const at::Tensor&, AllowDeprecatedTypes> final {
+  // We should not use the default implementation if they asked for
+  // a `const at::Tensor&` because it moves from the IValue and they
+  // didn't ask for that.
+  static const at::Tensor& call(IValue& v) {
+    // const Tensor& is valid, don't bother asserting
+    return v.toTensor();
+  }
+};
+
+// Unboxing for at::ITensorListRef parameters: returns the List<at::Tensor>
+// produced by IValue::toTensorList() rather than an ITensorListRef.
+// NOTE(review): presumably the returned List is what the ITensorListRef
+// parameter is then constructed from at the call site - confirm.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<at::ITensorListRef, AllowDeprecatedTypes> final {
+  static List<at::Tensor> call(IValue& v) {
+    return v.toTensorList();
+  }
+};
+
+// Unboxing for ArrayRef<T> parameters. ArrayRef does not own its elements,
+// so an owning std::vector<T> is returned instead.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<ArrayRef<T>, AllowDeprecatedTypes> final {
+  // If an argument is ArrayRef<T>, convert the IValue to a std::vector<T> and
+  // pass that to the operator. std::vector<T> is implicitly convertible to
+  // ArrayRef<T>.
+  static std::vector<T> call(IValue& v) {
+    // Delegate to the std::vector<T> conversion (which also validates T).
+    return ivalue_to_arg<std::vector<T>, AllowDeprecatedTypes>::call(v);
+  }
+};
+// Unboxing for c10::SymIntArrayRef parameters. Returns an owning
+// std::vector<c10::SymInt> built from the stack slot `v`:
+//  - if `v` holds a plain int list, every int64_t is wrapped in a SymInt;
+//  - otherwise the generic std::vector<c10::SymInt> conversion is used.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<c10::SymIntArrayRef, AllowDeprecatedTypes> final {
+  static std::vector<c10::SymInt> call(IValue& v) {
+    if (v.isIntList()) {
+      std::vector<c10::SymInt> r;
+      auto src = v.toIntList();
+      // The element count is known up front: allocate once instead of
+      // letting back_inserter grow the vector repeatedly.
+      r.reserve(src.size());
+      std::transform(
+          src.begin(), src.end(), std::back_inserter(r), [](int64_t i) {
+            return c10::SymInt(i);
+          });
+      return r;
+    } else {
+      return ivalue_to_arg<std::vector<c10::SymInt>, AllowDeprecatedTypes>::
+          call(v);
+    }
+  }
+};
+// Unboxing for c10::OptionalArray<c10::SymInt> parameters:
+//  - if `v` holds a plain int list, every int64_t is wrapped in a SymInt and
+//    the resulting vector is moved into the returned OptionalArray;
+//  - otherwise the value is moved out of `v` with to<OptionalArray<SymInt>>().
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<c10::OptionalArray<c10::SymInt>, AllowDeprecatedTypes>
+    final {
+  static OptionalArray<c10::SymInt> call(IValue& v) {
+    if (v.isIntList()) {
+      std::vector<c10::SymInt> r;
+      auto src = v.toIntList();
+      // The element count is known up front: allocate once instead of
+      // letting back_inserter grow the vector repeatedly.
+      r.reserve(src.size());
+      std::transform(
+          src.begin(), src.end(), std::back_inserter(r), [](int64_t i) {
+            return c10::SymInt(i);
+          });
+      return OptionalArray<c10::SymInt>(std::move(r));
+    } else {
+      return std::move(v).to<OptionalArray<c10::SymInt>>();
+    }
+  }
+};
+// Unboxing for std::optional<ArrayRef<T>> parameters: delegates to the
+// OptionalArray<T> conversion and returns that owning object.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<std::optional<ArrayRef<T>>, AllowDeprecatedTypes> final {
+  // If an argument is std::optional<ArrayRef<T>>, convert the IValue to an
+  // std::optional<std::vector<T>> and pass that to the operator.
+  // OptionalArray<T> is basically a std::optional<std::vector<T>> but
+  // implicitly convertible to std::optional<ArrayRef<T>>.
+  static OptionalArray<T> call(IValue& v) {
+    return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);
+  }
+};
+
+// Unboxing for OptionalArrayRef<T> parameters: same delegation as above.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<OptionalArrayRef<T>, AllowDeprecatedTypes> final {
+  // If an argument is OptionalArrayRef<T>, convert the IValue to an
+  // std::optional<std::vector<T>> and pass that to the operator.
+  // OptionalArray<T> is basically a std::optional<std::vector<T>> but
+  // implicitly convertible to OptionalArrayRef<T>
+  static OptionalArray<T> call(IValue& v) {
+    return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);
+  }
+};
+
+// return_to_ivalue
+// Boxes a kernel return value of type T into an IValue.
+// The primary template is intentionally empty; the usable implementations
+// are the specialization below (every T except `at::Tensor&`) and the
+// dedicated `at::Tensor&` specialization.
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct return_to_ivalue final {};
+
+// Every T other than `at::Tensor&`. Both entry points first run the
+// compile-time output-type check for T.
+template <class T, bool AllowDeprecatedTypes>
+struct return_to_ivalue<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<!std::is_same_v<at::Tensor&, T>>>
+    final {
+  // Consumes `v`: the value is moved into the resulting IValue.
+  static IValue call(T&& v) {
+    assert_is_valid_output_type<T, AllowDeprecatedTypes>();
+    return c10::ivalue::from(std::move(v));
+  }
+  // Leaves `v` untouched: the IValue is constructed from a const reference.
+  static IValue copy(const T& v) {
+    assert_is_valid_output_type<T, AllowDeprecatedTypes>();
+    return IValue(v);
+  }
+};
+
+// Special case to allow kernels to return `Tensor&`.
+// TODO Delete this once kernels don't do that anymore
+// Neither entry point moves from `v` (it is passed on as an lvalue), and no
+// output-type assertion is run for this case.
+template <bool AllowDeprecatedTypes>
+struct return_to_ivalue<at::Tensor&, AllowDeprecatedTypes, void> final {
+  static IValue call(at::Tensor& v) {
+    return c10::ivalue::from(v);
+  }
+  static IValue copy(at::Tensor& v) {
+    return IValue(v);
+  }
+};
+
+// wrap_kernel_functor_unboxed_
+
+// Adapts a KernelFunctor to the dispatcher's unboxed calling convention
+// `ReturnType(OperatorKernel*, DispatchKeySet, ParameterTypes...)`.
+// The primary template is intentionally empty; OpSignature selects one of the
+// two partial specializations below.
+template <class KernelFunctor, class OpSignature>
+struct wrap_kernel_functor_unboxed_ final {};
+
+// This specialization is for kernels with a first argument that is NOT of type
+// DispatchKeySet. This includes kernels with 0 arguments.
+template <class KernelFunctor, class ReturnType, class... ParameterTypes>
+struct wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    ReturnType(ParameterTypes...)>
+    final {
+  // The signature given as OpSignature must agree exactly with the one
+  // inferred from KernelFunctor itself.
+  static_assert(
+      std::is_same_v<
+          ReturnType,
+          typename guts::infer_function_traits_t<KernelFunctor>::return_type>,
+      "Return type mismatch");
+  static_assert(
+      std::is_same_v<
+          guts::typelist::typelist<ParameterTypes...>,
+          typename guts::infer_function_traits_t<
+              KernelFunctor>::parameter_types>,
+      "Parameter types mismatch");
+
+  // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes
+  // doesn't use &&
+  static ReturnType call(
+      OperatorKernel* functor,
+      DispatchKeySet /*unused*/,
+      ParameterTypes... args) {
+    // `functor` is handed in as the OperatorKernel base; cast it back to the
+    // concrete functor type before invoking it.
+    KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
+    // Note [Plumbing Keys Through The Dispatcher 2]
+    // See Note [Plumbing Keys Through The Dispatcher] for the background.
+    // This functor explicitly takes in a dispatchKeySet and drops it on the
+    // floor- it does not forward it to the registered kernel.
+    //
+    // This is due to the calling convention within the dispatcher, which
+    // expects all registered kernels to have a first argument of type
+    // DispatchKeySet.
+    // This is not the case for pretty much all manually written kernels,
+    // however- this functor serves to separate the calling convention of the
+    // dispatcher from the calling convention of manually written kernels.
+    return (*functor_)(std::forward<ParameterTypes>(args)...);
+  }
+};
+
+// This specialization is for kernels with a first argument of type
+// DispatchKeySet
+template <class KernelFunctor, class ReturnType, class... ParameterTypes>
+struct wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    ReturnType(DispatchKeySet, ParameterTypes...)>
+    final {
+  // The signature given as OpSignature must agree exactly with the one
+  // inferred from KernelFunctor itself (including the leading DispatchKeySet).
+  static_assert(
+      std::is_same_v<
+          ReturnType,
+          typename guts::infer_function_traits_t<KernelFunctor>::return_type>,
+      "Return type mismatch");
+  static_assert(
+      std::is_same_v<
+          guts::typelist::typelist<DispatchKeySet, ParameterTypes...>,
+          typename guts::infer_function_traits_t<
+              KernelFunctor>::parameter_types>,
+      "Parameter types mismatch");
+
+  // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes
+  // doesn't use &&
+  static ReturnType call(
+      OperatorKernel* functor,
+      DispatchKeySet dispatchKeySet,
+      ParameterTypes... args) {
+    // `functor` is handed in as the OperatorKernel base; cast it back to the
+    // concrete functor type before invoking it.
+    KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
+    // We're explicitly taking in a dispatchKeySet and forwarding it to the
+    // registered kernel. See Note [Plumbing Keys Through The Dispatcher 2] for
+    // details.
+    return (*functor_)(dispatchKeySet, std::forward<ParameterTypes>(args)...);
+  }
+};
+
+// Convenience alias: instantiates wrap_kernel_functor_unboxed_ with the
+// function type inferred from KernelFunctor, so callers only have to name
+// the functor.
+template <class KernelFunctor>
+using wrap_kernel_functor_unboxed = wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    typename guts::infer_function_traits_t<KernelFunctor>::func_type>;
+
+// call_functor_with_args_from_stack
+
+// Implementation helper for call_functor_with_args_from_stack: unboxes one
+// kernel argument per (index, type) pair from `stack` and invokes the functor
+// through wrap_kernel_functor_unboxed.
+//
+// Template parameters:
+//   Functor               - the kernel functor type to invoke.
+//   AllowDeprecatedTypes  - forwarded to ivalue_to_arg's type checks.
+//   ivalue_arg_indices... - 0..N-1, one index per stack argument.
+//   ArgTypes...           - the kernel's parameter types (without a leading
+//                           DispatchKeySet), in the same order as the indices.
+// The last two function parameters are tags that exist only so the packs
+// above can be deduced; their values are never read.
+//
+// Returns the functor's result by value: the return type is decayed, so a
+// kernel returning a reference yields a copy here.
+template <
+    class Functor,
+    bool AllowDeprecatedTypes,
+    size_t... ivalue_arg_indices,
+    typename... ArgTypes>
+std::decay_t<typename guts::infer_function_traits_t<Functor>::return_type>
+call_functor_with_args_from_stack_(
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack,
+    std::index_sequence<ivalue_arg_indices...> /*unused*/,
+    guts::typelist::typelist<ArgTypes...>* /*unused*/) {
+  (void)stack; // when sizeof...(ivalue_arg_indices) == 0, this argument would
+               // be unused and we have to silence the compiler warning.
+
+  // We're explicitly filtering out DispatchKeySet from the argument list.
+  // Some kernels take a DispatchKeySet as their first argument in order to
+  // plumb keys through the dispatcher. We don't want to expose the
+  // DispatchKeySet type to jit, so we don't include this argument on the stack.
+  // See Note [Plumbing Keys Through The Dispatcher] for the background.
+  //
+  // ArgTypes and ivalue_arg_indices are expanded in lockstep below: argument
+  // i is unboxed as decay_if_not_tensor<ArgTypes[i]>::type from
+  // peek(*stack, i, N). NOTE(review): peek presumably returns the i-th of the
+  // last N stack entries - confirm against torch::jit::peek.
+  return wrap_kernel_functor_unboxed<Functor>::call(
+      functor,
+      dispatchKeySet,
+      ivalue_to_arg<
+          typename decay_if_not_tensor<ArgTypes>::type,
+          AllowDeprecatedTypes>::
+          call(torch::jit::peek(
+              *stack, ivalue_arg_indices, sizeof...(ivalue_arg_indices)))...);
+}
+
+// Invokes `functor` with its arguments unboxed from `stack`.
+//
+// Derives the kernel's stack-visible parameter list (the kernel signature
+// with a leading DispatchKeySet removed, see Note [Plumbing Keys Through The
+// Dispatcher]), builds the matching 0..N-1 index sequence, and hands both to
+// call_functor_with_args_from_stack_ as deduction tags. The stack itself is
+// not popped here. The result is returned by value (decayed return type).
+template <class Functor, bool AllowDeprecatedTypes>
+std::decay_t<typename guts::infer_function_traits_t<Functor>::return_type>
+call_functor_with_args_from_stack(
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) {
+  // Parameter types as seen on the stack: DispatchKeySet is never exposed to
+  // jit, so it is filtered out of the list.
+  using StackArgTypes = typename c10::remove_DispatchKeySet_arg_from_func<
+      Functor>::parameter_types;
+  // One index per stack argument.
+  using StackArgIndices =
+      std::make_index_sequence<guts::typelist::size<StackArgTypes>::value>;
+
+  // Null tag pointer; only its static type is used by the callee.
+  StackArgTypes* arg_types_tag = nullptr;
+  return call_functor_with_args_from_stack_<Functor, AllowDeprecatedTypes>(
+      functor, dispatchKeySet, stack, StackArgIndices{}, arg_types_tag);
+}
+
+// push_outputs
+
+// Boxes a single kernel result and pushes it onto `stack`.
+//  - call(): consumes `output` (forwarded as an rvalue to return_to_ivalue).
+//  - copy(): leaves `output` untouched and pushes a copy.
+template <class OutputType, bool AllowDeprecatedTypes>
+struct push_outputs final {
+  // Contrary to [Note: Argument forwarding in the dispatcher], we use
+  // OutputType&& here to avoid one extra call to the move constructor in this
+  // case. This is still not a universal reference though because OutputType is
+  // an explicitly specified class template parameter.
+  static void call(OutputType&& output, Stack* stack) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputType, AllowDeprecatedTypes>::call(
+            std::forward<OutputType>(output)));
+  }
+  static void copy(const OutputType& output, Stack* stack) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputType, AllowDeprecatedTypes>::copy(output));
+  }
+};
+// Tuple results are flattened: every tuple element is boxed separately and
+// all of them are passed to a single torch::jit::push call, in tuple order.
+//  - call(): consumes the tuple (elements are forwarded as OutputTypes).
+//  - copy(): leaves the tuple untouched and pushes copies of its elements.
+template <class... OutputTypes, bool AllowDeprecatedTypes>
+struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
+  static void call(std::tuple<OutputTypes...>&& output, Stack* stack) {
+    call_(
+        std::move(output),
+        stack,
+        std::make_index_sequence<sizeof...(OutputTypes)>());
+  }
+  static void copy(const std::tuple<OutputTypes...>& output, Stack* stack) {
+    copy_(output, stack, std::make_index_sequence<sizeof...(OutputTypes)>());
+  }
+
+ private:
+  // `indices` is 0..sizeof...(OutputTypes)-1; it is expanded in lockstep with
+  // OutputTypes so that element i is boxed as its own type.
+  template <size_t... indices>
+  static void call_(
+      std::tuple<OutputTypes...>&& output,
+      Stack* stack,
+      std::index_sequence<indices...> /*unused*/) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
+            std::forward<OutputTypes>(std::get<indices>(output)))...);
+  }
+  // Same expansion as call_, but each element is boxed via copy().
+  template <size_t... indices>
+  static void copy_(
+      const std::tuple<OutputTypes...>& output,
+      Stack* stack,
+      std::index_sequence<indices...> /*unused*/) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
+            std::get<indices>(output))...);
+  }
+};
+// Kernels returning void have nothing to push: both entry points are no-ops.
+// The int parameter is a dummy placeholder for the missing result value.
+template <bool AllowDeprecatedTypes>
+struct push_outputs<void, AllowDeprecatedTypes> final {
+  static void call(int /*dummy*/, Stack* /*stack*/) {}
+  static void copy(int /*dummy*/, Stack* /*stack*/) {}
+};
+
+// make_boxed_from_unboxed_functor
+
+// Boxed entry point generated for an unboxed KernelFunctor.
+// call() reads the kernel's inputs from `stack`, invokes the functor, removes
+// the inputs from the stack and, for non-void kernels, pushes the boxed
+// result(s) in their place.
+template <class KernelFunctor, bool AllowDeprecatedTypes>
+struct make_boxed_from_unboxed_functor final {
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+
+  // functor        - the kernel object; must actually be a KernelFunctor.
+  // (OperatorHandle) - unused by this wrapper.
+  // dispatchKeySet - forwarded to the kernel wrapper, which passes it on or
+  //                  drops it depending on the kernel's signature.
+  // stack          - holds the boxed inputs on entry and the boxed outputs
+  //                  (if any) on return.
+  static void call(
+      OperatorKernel* functor,
+      const OperatorHandle& /*unused*/,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) {
+    using ReturnType =
+        typename guts::infer_function_traits_t<KernelFunctor>::return_type;
+    // We're explicitly filtering out DispatchKeySet from the argument list.
+    // Some kernels take a DispatchKeySet as their first argument in order to
+    // plumb keys through the dispatcher. We don't want to expose the
+    // DispatchKeySet type to jit, so we don't include this argument on the
+    // stack. See Note [Plumbing Keys Through The Dispatcher] for the
+    // background.
+    using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func<
+        KernelFunctor>::parameter_types;
+    constexpr bool has_outputs = !std::is_same_v<void, ReturnType>;
+    constexpr size_t num_inputs = guts::typelist::size<ArgTypes>::value;
+    if constexpr (has_outputs) {
+      // Decay ReturnType to ReturnType_ so that if a reference gets returned,
+      // we actually store it by value and don't get a dangling reference. This
+      // is only required because some kernels still return `Tensor&`. [Note:
+      // VC++ and 'std': ambiguous symbol]
+      using ReturnType_ = ::std::decay_t<ReturnType>;
+      ReturnType_ output = call_functor_with_args_from_stack<
+          KernelFunctor,
+          AllowDeprecatedTypes>(functor, dispatchKeySet, stack);
+      // The inputs are dropped only after the result has been stored by
+      // value, then the result is pushed in their place.
+      torch::jit::drop(*stack, num_inputs);
+      // See note [ VC++ and 'std': ambiguous symbol]
+      push_outputs<ReturnType_, AllowDeprecatedTypes>::call(
+          ::std::move(output), stack);
+    } else {
+      // void kernel: run it and remove the inputs; nothing is pushed.
+      call_functor_with_args_from_stack<KernelFunctor, AllowDeprecatedTypes>(
+          functor, dispatchKeySet, stack);
+      torch::jit::drop(*stack, num_inputs);
+    }
+  }
+};
+} // namespace impl
+
+} // namespace c10
+
+namespace torch {
+using OperatorKernel = c10::OperatorKernel;
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..aecf24471b02853caed9872783e3fdb3f3aaf011
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h
@@ -0,0 +1,145 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/core/ivalue.h>
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/irange.h>
+
+template <class... Inputs> // one IValue per forwarded input, in argument order
+inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
+  return std::vector<c10::IValue>{std::forward<Inputs>(inputs)...};
+}
+
+inline at::Tensor dummyTensor(
+    c10::DispatchKeySet ks,
+    bool requires_grad = false) {
+  // Back the tensor with CPU storage sized for exactly one float element.
+  auto float_meta = caffe2::TypeMeta::Make<float>();
+  int64_t element_count = 1;
+  int64_t storage_bytes = element_count * float_meta.itemsize();
+  auto* cpu_alloc = c10::GetCPUAllocator();
+  auto storage = c10::make_intrusive<c10::StorageImpl>(
+      c10::StorageImpl::use_byte_size_t(),
+      storage_bytes,
+      cpu_alloc->allocate(storage_bytes),
+      cpu_alloc,
+      /*resizable=*/true);
+  at::Tensor result =
+      at::detail::make_tensor<c10::TensorImpl>(storage, ks, float_meta);
+  // TODO: This simulates the ideal case where a Tensor only carries Autograd
+  // backend keys when it requires grad. Currently the TensorImpl constructor
+  // adds Autograd keys by default, so remove them when grad is not requested.
+  if (!requires_grad) {
+    result.unsafeGetTensorImpl()->remove_autograd_key();
+  }
+  return result;
+}
+
+inline at::Tensor
+dummyTensor(c10::DispatchKey dispatch_key, bool requires_grad = false) {
+  const c10::DispatchKeySet single_key_set(dispatch_key); // wrap the lone key
+  return dummyTensor(single_key_set, requires_grad);
+}
+
+template <class... Args>
+inline std::vector<c10::IValue>
+callOp(const c10::OperatorHandle& op, Args... args) {
+  // Box the arguments, invoke the op through callBoxed on that stack, and
+  // return the stack as it is after the call.
+  std::vector<c10::IValue> boxed = makeStack(std::forward<Args>(args)...);
+  op.callBoxed(&boxed);
+  return boxed;
+}
+
+template <class Result, class... Args> // together they spell the signature Result(Args...)
+inline Result callOpUnboxed(const c10::OperatorHandle& op, Args... args) {
+  return op.typed<Result(Args...)>().call(std::forward<Args>(args)...); // typed (unboxed) call
+}
+
+// Typed (unboxed) call of `op`, passing `dispatchKey` to callWithDispatchKey.
+template <class Result, class... Args>
+inline Result callOpUnboxedWithDispatchKey(
+    const c10::OperatorHandle& op, c10::DispatchKey dispatchKey, Args... args) {
+  auto typed_op = op.typed<Result(Args...)>();
+  return typed_op.callWithDispatchKey(
+      dispatchKey, std::forward<Args>(args)...);
+}
+
+// Typed (unboxed) redispatch of `op` with the caller-supplied key set `ks`.
+template <class Result, class... Args>
+inline Result callOpUnboxedWithPrecomputedDispatchKeySet(
+    const c10::OperatorHandle& op, c10::DispatchKeySet ks, Args... args) {
+  auto typed_op = op.typed<Result(Args...)>();
+  return typed_op.redispatch(
+      ks, std::forward<Args>(args)...);
+}
+
+inline void expectDoesntFindKernel(
+    const char* op_name,
+    c10::DispatchKey dispatch_key) {
+  auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); // "" = empty overload name
+  EXPECT_ANY_THROW(callOp(*op, dummyTensor(dispatch_key), 5);); // NOTE(review): *op is unchecked; assumes the schema exists
+}
+
+inline void expectDoesntFindOperator(const char* op_name) {
+  auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); // "" = empty overload name
+  EXPECT_FALSE(op.has_value()); // non-fatal failure if a schema is found
+}
+
+template <class Exception, class Functor>
+inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
+  try {
+    std::forward<Functor>(functor)(); // invoke, preserving the functor's value category
+  } catch (const Exception& e) { // only `Exception` (or a type derived from it) is caught here
+    EXPECT_THAT(e.what(), testing::HasSubstr(expectMessageContains));
+    return; // threw as expected; skip the failure below
+  }
+  ADD_FAILURE() << "Expected to throw exception containing \""
+                << expectMessageContains << "\" but didn't throw";
+}
+
+template <class T, size_t N>
+void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
+  ASSERT_EQ(expected.size(), actual.size()); // fatal: never index `actual` out of range
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]); // non-fatal: report every differing element
+  }
+}
+
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size()); // fatal: never index `actual` out of range
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]); // non-fatal: report every differing element
+  }
+}
+
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size()); // fatal: never call actual.get(i) out of range
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual.get(i)); // non-fatal: report every differing element
+  }
+}
+
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, std::vector<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size()); // fatal: never index `actual` out of range
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]); // non-fatal: report every differing element
+  }
+}
+
+// NB: Reducing the tensor's whole key set to one DispatchKey is not really
+// sound, but all of the key sets constructed here are singletons so it's fine
+static inline c10::DispatchKey extractDispatchKey(const at::Tensor& t) {
+  return legacyExtractDispatchKey(t.key_set());
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h
new file mode 100644
index 0000000000000000000000000000000000000000..6812e6c1dc0d6656f3522fa1832a90101d4d80e7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h
@@ -0,0 +1,72 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DispatchKeySet.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/Type.h>
+#include <typeindex>
+
+namespace c10::impl {
+
+// A CppSignature object holds RTTI information about a C++ function signature
+// at runtime and can compare them or get a debug-printable name.
+class TORCH_API CppSignature final {
+ public:
+  // Value type: copy and move construction/assignment are all defaulted.
+  CppSignature(const CppSignature&) = default;
+  CppSignature& operator=(const CppSignature&) = default;
+  CppSignature(CppSignature&&) noexcept = default;
+  CppSignature& operator=(CppSignature&&) noexcept = default;
+
+  // Builds the signature object for FuncType. Functors, lambdas, function
+  // pointers, etc. are normalized into the plain function type. The first
+  // argument might be of type DispatchKeySet, in which case it is removed.
+  // This guarantees that all CppSignatures for an operator match, even if
+  // they're registered with different calling conventions.
+  // See Note [Plumbing Keys Through The Dispatcher]
+  template <class FuncType>
+  static CppSignature make() {
+    using plain_function_type =
+        typename c10::remove_DispatchKeySet_arg_from_func<
+            std::decay_t<FuncType>>::func_type;
+    return CppSignature(std::type_index(typeid(plain_function_type)));
+  }
+
+  // Demangled, debug-printable name of the stored function type.
+  std::string name() const {
+    return c10::demangle(signature_.name());
+  }
+
+  // Two signatures are equal when their type_index values compare equal or,
+  // failing that, when their RTTI name strings are identical.
+  // The string fallback exists because, without RTLD_GLOBAL, the type_index
+  // comparison could yield false for the same type when both sides point to
+  // different instances of the RTTI data.
+  // Note that there still is a case where this might not work: libraries
+  // built by different compilers might have different ways to serialize a
+  // type name. That, together with a missing RTLD_GLOBAL, would still fail
+  // this.
+  friend bool operator==(const CppSignature& lhs, const CppSignature& rhs) {
+    return lhs.signature_ == rhs.signature_ ||
+        0 == strcmp(lhs.signature_.name(), rhs.signature_.name());
+  }
+
+ private:
+  // Private: new instances are created through make().
+  explicit CppSignature(std::type_index signature)
+      : signature_(std::move(signature)) {}
+
+  // RTTI handle of the normalized function type.
+  std::type_index signature_;
+};
+
+inline bool operator!=(const CppSignature& lhs, const CppSignature& rhs) {
+  return !(lhs == rhs); // exact negation of CppSignature's operator==
+}
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h
new file mode 100644
index 0000000000000000000000000000000000000000..78b8cecac1db5d571ec9fb88de9f294505f4b271
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h
@@ -0,0 +1,285 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/Variadic.h>
+#include <ATen/core/function_schema.h>
+#include <ATen/core/jit_type.h>
+#include <ATen/core/stack.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/Bitset.h>
+#include <c10/util/irange.h>
+#include <cstdint>
+
+namespace c10 {
+
+namespace impl {
+
+// Take a DispatchKeySet for a Tensor and determine what the actual dispatch
+// DispatchKey should be, taking into account TLS, and skipping backends which
+// fall through.
+//
+// Unlike Tensor::key_set(), the value of this on a tensor can change depending
+// on TLS.
+//
+// NB: If there is no valid dispatch key, this will return Undefined
+inline DispatchKeySet computeDispatchKeySet(
+    DispatchKeySet ks,
+    // key_mask eliminates (via zero entries) keys that must not be considered
+    // for dispatch. It is used in two cases:
+    //
+    // - If an operator's dispatch table contains a fallthrough entry, that
+    //   entry should be bypassed entirely when finding the key
+    // - If a user invokes with redispatch, the mask zeroes out the key the
+    //   user asked us to stop.
+    //
+    // These exclusions are NOT tracked in the TLS, but must be applied AFTER
+    // TLS (the included TLS may have introduced the backend for
+    // consideration), which is why they are passed in to this function
+    // rather than just being applied to the input 'ks'.
+    DispatchKeySet key_mask) {
+  // TODO: It's a bit irritating that logical ORs are needed here; can
+  // always_included be folded into the TLS? Troublesome: fastpath TLS access
+  // requires the TLS type to be zero-initialized, so nothing would be won.
+  const c10::impl::LocalDispatchKeySet tls =
+      c10::impl::tls_local_dispatch_key_set();
+  const DispatchKeySet with_tls_included = ks | tls.included_;
+  const DispatchKeySet after_tls_excluded = with_tls_included - tls.excluded_;
+  return after_tls_excluded & key_mask;
+}
+
+} // namespace impl
+
+namespace detail {
+// A small gadget to extract the DispatchKeySet from types which are known
+// to have it.  Used to extract dispatch keys from unboxed calls.
+struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
+  DispatchKeySet ts; // union of the key sets of all arguments visited so far
+  void operator()(const at::Tensor& tensor) {
+    ts = ts | tensor.key_set();
+  }
+  void operator()(const std::optional<at::Tensor>& maybe_tensor) {
+    if (maybe_tensor) { // an absent tensor contributes no keys
+      ts = ts | maybe_tensor->key_set();
+    }
+  }
+  void operator()(at::ArrayRef<at::Tensor> tensors) {
+    for (const at::Tensor& tensor : tensors) {
+      ts = ts | tensor.key_set();
+    }
+  }
+  // This overload is what Tensor?[] translates to.
+  void operator()(const c10::List<std::optional<at::Tensor>>& maybe_tensors) {
+    for (std::optional<at::Tensor> maybe_tensor : maybe_tensors) {
+      if (maybe_tensor) {
+        ts = ts | maybe_tensor->key_set();
+      }
+    }
+  }
+  // This overload is what a structured Tensor[] translates to.
+  void operator()(const at::ITensorListRef& tensors) {
+    for (const auto& tensor : tensors) {
+      ts = ts | tensor.key_set();
+    }
+  }
+  [[noreturn]] void operator()(
+      at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
+    // Tripwire: only checks that the handling of Tensor?[] didn't change.
+    TORCH_INTERNAL_ASSERT(false);
+  }
+  void operator()(const at::Generator& generator) {
+    if (generator.defined()) { // undefined generators contribute no keys
+      ts = ts | generator.key_set();
+    }
+  }
+  void operator()(const std::optional<at::Generator>& maybe_generator) {
+    if (maybe_generator && maybe_generator->defined()) {
+      ts = ts | maybe_generator->key_set();
+    }
+  }
+  template <typename T>
+  void operator()(const T& /*unused*/) {
+    // every other argument type is ignored
+  }
+};
+
+// NB: take by const reference; no universal forwarding, nothing is moved in.
+template <typename... Args>
+DispatchKeySet multi_dispatch_key_set(const Args&... args) {
+  MultiDispatchKeySet key_collector = MultiDispatchKeySet();
+  return key_collector.apply(args...).ts;
+}
+} // namespace detail
+
+/**
+ * An instance of DispatchKeyExtractor knows how to get a dispatch key given
+ * a list of arguments for an operator call.
+ *
+ * The instance is specific for a certain operator as:
+ *  - In boxed dispatch, different operators have different ways to extract
+ *    the dispatch key (e.g. different numbers of arguments), and we precompute
+ *    the stack locations we should look at; and
+ *  - In all dispatch, some backends should be excluded from dispatch because
+ *    they have been registered as fallthrough.  The set of excluded backends
+ *    varies from operator to operator, as some operators may have overridden
+ *    the fallthrough with custom behavior.
+ *
+ *   Note - this should maintain identical impl to the py dispatcher key
+ * extraction logic at pytorch/torch/dispatcher.py
+ */
+struct TORCH_API DispatchKeyExtractor final {
+ public:
+  static DispatchKeyExtractor make(const FunctionSchema& schema) {
+    return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema));
+  }
+
+  static DispatchKeyExtractor makeUninitialized() { // starts with an empty argument bitset
+    return DispatchKeyExtractor(c10::utils::bitset());
+  }
+
+  void registerSchema(const FunctionSchema& schema) {
+    TORCH_INTERNAL_ASSERT(dispatch_arg_indices_reverse_.is_entirely_unset()); // no bits may be set yet
+    dispatch_arg_indices_reverse_ = makeBitsetForDispatchArgs(schema);
+  }
+  void deregisterSchema() {
+    dispatch_arg_indices_reverse_ = c10::utils::bitset(); // back to the empty bitset
+  }
+
+  DispatchKeySet getDispatchKeySetBoxed(const torch::jit::Stack* stack) const {
+    DispatchKeySet ks; // union of the key sets found in the flagged stack slots
+    dispatch_arg_indices_reverse_.for_each_set_bit([&](size_t
+                                                           reverse_arg_index) {
+      const auto& ivalue = torch::jit::peek(*stack, 0, reverse_arg_index + 1);
+      if (C10_LIKELY(ivalue.isTensor())) {
+        // NB: Take care not to introduce a refcount bump (there's
+        // no safe toTensorRef method, alas)
+        ks = ks | ivalue.unsafeToTensorImpl()->key_set();
+      } else if (C10_UNLIKELY(ivalue.isTensorList())) {
+        // NB: use toListRef as it doesn't induce refcount bumps
+        // (toTensorListRef is not a thing)
+        for (const auto& nv : ivalue.toListRef()) {
+          auto* tensor = nv.unsafeToTensorImpl();
+          ks = ks | tensor->key_set();
+        }
+      }
+      // Tensor?[] translates to a c10::List<IValue> so we need to peek inside
+      else if (C10_UNLIKELY(ivalue.isList())) {
+        for (const auto& elt : ivalue.toListRef()) {
+          if (elt.isTensor()) { // elements that hold no tensor are skipped
+            ks = ks | elt.toTensor().key_set();
+          }
+        }
+      }
+    });
+    // Keys that are fallthrough should be skipped
+    if (requiresBitsetPerBackend_) { // slow path: the mask is looked up per backend
+      c10::impl::LocalDispatchKeySet tls =
+          c10::impl::tls_local_dispatch_key_set();
+      auto backend_idx =
+          ((ks | tls.included_) - tls.excluded_).getBackendIndex();
+      return impl::computeDispatchKeySet(
+          ks, nonFallthroughKeysPerBackend_[backend_idx]);
+    } else {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+  }
+
+  template <class... Args>
+  DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const {
+    auto ks = detail::multi_dispatch_key_set(args...);
+    // Keys that are fallthrough should be skipped
+    if (requiresBitsetPerBackend_) { // slow path: the mask is looked up per backend
+      c10::impl::LocalDispatchKeySet tls =
+          c10::impl::tls_local_dispatch_key_set();
+      auto backend_idx =
+          ((ks | tls.included_) - tls.excluded_).getBackendIndex();
+      return impl::computeDispatchKeySet(
+          ks, nonFallthroughKeysPerBackend_[backend_idx]);
+    } else {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+  }
+
+  void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough);
+
+  std::string dumpState() const;
+  void checkInvariants(const FunctionSchema& schema) const;
+
+ private:
+  static bool isDispatchType(const Type& type) {
+    // Checking isSubtypeOf on a DynamicType heap-allocates a
+    // DynamicType version of the argument if it's not a DynamicType
+    // already, and this has measurable overhead during startup.
+#ifdef C10_MOBILE
+    struct CachedTypes {
+      DynamicTypePtr listOfTensors;
+      DynamicTypePtr listOfOptionalTensors;
+      DynamicTypePtr optionalOfTensor;
+    };
+    static const CachedTypes ct = {
+        DynamicType::create(*ListType::ofTensors()),
+        DynamicType::create(*ListType::ofOptionalTensors()),
+        DynamicType::create(*OptionalType::ofTensor())};
+    return type.isSubtypeOf(c10::TypeFactory::get<TensorType>()) ||
+        type.isSubtypeOf(ct.listOfTensors) ||
+        type.isSubtypeOf(ct.listOfOptionalTensors) ||
+        type.isSubtypeOf(ct.optionalOfTensor);
+#else // C10_MOBILE
+    return type.isSubtypeOf(*TensorType::get()) ||
+        type.isSubtypeOf(*ListType::ofTensors()) ||
+        type.isSubtypeOf(*ListType::ofOptionalTensors()) ||
+        type.isSubtypeOf(*OptionalType::ofTensor());
+#endif // C10_MOBILE
+  }
+  static c10::utils::bitset makeBitsetForDispatchArgs(
+      const FunctionSchema& schema) {
+    TORCH_CHECK(
+        schema.arguments().size() <= c10::utils::bitset::NUM_BITS(),
+        "The function schema has ",
+        schema.arguments().size(),
+        " arguments but this PyTorch build only supports ",
+        c10::utils::bitset::NUM_BITS());
+    c10::utils::bitset dispatch_arg_indices_reverse;
+    for (const auto index : c10::irange(schema.arguments().size())) {
+      if (isDispatchType(*schema.arguments()[index].type())) {
+        dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index); // bit 0 <=> last argument
+      }
+    }
+    return dispatch_arg_indices_reverse;
+  }
+
+  explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
+      : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse),
+        nonFallthroughKeys_(DispatchKeySet::FULL) { // start with every key marked non-fallthrough
+    for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
+      nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL;
+    }
+  }
+
+  // This is a bitset that has ones for each argument index which has to be
+  // considered for dispatch. This avoids having to iterate over the stack
+  // to find all the tensors. The bits are stored in reverse order, i.e. if
+  // dispatch_arg_indices_reverse_[i] == true, then the i-th argument from
+  // the top of the stack (i.e. the i-th last argument of the function)
+  // is relevant for dispatch.
+  // dispatch_arg_indices_reverse_ is allowed to have zero bits set; that just
+  // means you must do the fallthrough
+  c10::utils::bitset dispatch_arg_indices_reverse_;
+
+  // Set of functionality keys for which the operator does NOT have fallthrough
+  // kernel.
+  DispatchKeySet nonFallthroughKeys_;
+  // Set of functionality keys for which the operator does NOT have fallthrough
+  // kernel, defined PER BACKEND. This is only needed if we know that the
+  // operator has a different set of fallthroughs defined for some backends.
+  std::array<DispatchKeySet, num_backends> nonFallthroughKeysPerBackend_;
+  // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast
+  // path), or if we need to fall back to the slower path and check
+  // nonFallthroughKeysPerBackend_
+  bool requiresBitsetPerBackend_{false};
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dc51027a01bb6fa8e83c3542d06e3c1008a4db5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h
@@ -0,0 +1,955 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/SequenceNumber.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/OperatorEntry.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/record_function.h>
+#include <c10/core/SafePyObject.h>
+#include <c10/util/Exception.h>
+#include <c10/util/LeftRight.h>
+#include <condition_variable>
+#include <list>
+#include <mutex>
+#include <type_traits>
+
+#include <ATen/core/enum_tag.h>
+#include <ATen/core/grad_mode.h>
+
+#ifndef NDEBUG
+#include <iostream>
+#endif
+
+namespace c10 {
+
+TORCH_API bool show_dispatch_trace();
+TORCH_API void dispatch_trace_nesting_incr();
+TORCH_API void dispatch_trace_nesting_decr();
+TORCH_API int64_t dispatch_trace_nesting_value();
+
+// Scoped helper around the dispatch-trace nesting calls declared above.
+struct DispatchTraceNestingGuard {
+  // Construction calls dispatch_trace_nesting_incr() ...
+  DispatchTraceNestingGuard() { dispatch_trace_nesting_incr(); }
+
+  // ... and destruction calls dispatch_trace_nesting_decr().
+  ~DispatchTraceNestingGuard() { dispatch_trace_nesting_decr(); }
+};
+
+class TORCH_API OperatorHandle;
+template <class FuncType>
+class TypedOperatorHandle;
+
+/**
+ * Implement this interface and register your instance with the dispatcher
+ * to get notified when operators are registered or deregistered with
+ * the dispatcher.
+ *
+ * NB: registration events only occur when a 'def' occurs; we don't trigger
+ * on 'impl' or 'fallback' calls.
+ */
+class TORCH_API OpRegistrationListener {
+ public:
+  virtual ~OpRegistrationListener(); // virtual so a derived listener can be destroyed through a base pointer
+  // Pure virtual callbacks; each receives the handle of the affected operator.
+  virtual void onOperatorRegistered(const OperatorHandle& op) = 0;
+  virtual void onOperatorDeregistered(const OperatorHandle& op) = 0;
+};
+
+namespace detail {
+class RegistrationListenerList;
+}
+class SchemaRegistrationHandleRAII;
+
+/**
+ * Top-level dispatch interface for dispatching via the dynamic dispatcher.
+ * Most end users shouldn't use this directly; if you're trying to register
+ * ops look in op_registration
+ */
+class TORCH_API Dispatcher final {
+ private:
+  // For direct access to backend fallback information
+  friend class impl::OperatorEntry;
+
+  struct OperatorDef final {
+    explicit OperatorDef(OperatorName&& op_name) : op(std::move(op_name)) {}
+
+    impl::OperatorEntry op;
+
+    // These refer to the number of outstanding RegistrationHandleRAII
+    // for this operator.  def_count reflects only def() registrations
+    // (in the new world, this should only ever be 1, but old style
+    // registrations may register the schema multiple times, which
+    // will increase this count).  def_and_impl_count reflects the number
+    // of combined def() and impl() registrations.  When the last def() gets
+    // unregistered, we must immediately call the Deregistered listeners, but we
+    // must not actually delete the handle as there are other outstanding RAII
+    // destructors which will try to destruct and they had better still have a
+    // working operator handle in this case
+    size_t def_count = 0;
+    size_t def_and_impl_count = 0;
+  };
+  friend class OperatorHandle;
+  template <class>
+  friend class TypedOperatorHandle;
+
+  struct Guard final {
+    Guard() : alive(true) {}
+    std::atomic<bool> alive;
+    std::mutex mutex;
+  };
+
+ public:
+  ~Dispatcher();
+
+  // Implementation note: this class abstracts over the fact that we have
+  // per-operator dispatch tables.  This could be easily adjusted to have a
+  // single global hash table.
+  static Dispatcher& realSingleton();
+
+  C10_ALWAYS_INLINE static Dispatcher& singleton() {
+#if !defined C10_MOBILE
+    // Implemented inline so that steady-state code needn't incur
+    // function-call overhead. We can't just inline `realSingleton`
+    // because the function-local static would get duplicated across
+    // all DSOs that include & use this header, leading to multiple
+    // singleton instances.
+    static Dispatcher& s = realSingleton();
+    return s;
+#else
+    // For C10_MOBILE, we should never inline a static function that
+    // has a static member, since the generated code calls
+    // __cxa_guard_acquire and __cxa_guard_release which help
+    // implement exactly once semantics for the initialization of the
+    // static Dispatcher& s above (for the non-mobile case). That
+    // additional code when duplicated across all operator stubs
+    // for every backend results in a lot of additional code
+    // being generated by the compiler.
+    return realSingleton();
+#endif
+  }
+
+  // ------------------------------------------------------------------------
+  //
+  // Accessing operators by schema
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Looks for an operator schema with the given name and overload name
+   * and returns it if it is registered WITH A SCHEMA.
+   * Returns nullopt otherwise.
+   */
+  std::optional<OperatorHandle> findSchema(const OperatorName& operator_name);
+
+  /**
+   * Variant of findSchema that results in less code generated at the call site.
+   * It (1) takes const char* pointer rather than OperatorName (so we skip
+   * generating std::string constructor calls at the call site), and (2)
+   * it raises an exception if the operator is not found (so we skip
+   * generating exception raising code at the call site)
+   *
+   * Irritatingly, we still have to generate the handful of instructions
+   * for dealing with an exception being thrown during static initialization
+   * (e.g. __cxa_guard_abort).  If we could annotate this method noexcept we
+   * could avoid this code too, but as the name of the function suggests,
+   * it does throw exceptions.
+   */
+  OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name);
+
+  // Like findSchema, but also returns OperatorHandle even if there is no schema
+  std::optional<OperatorHandle> findOp(const OperatorName& operator_name);
+
+  // Returns a list of all operator names present in the operatorLookupTable_
+  const std::vector<OperatorName> getAllOpNames();
+
+  // Returns a list of all operator names present in the operatorLookupTable_
+  // for a given dispatch key
+  const std::vector<OperatorName> getAllOpNamesForDispatchKey(DispatchKey k);
+
+  // ------------------------------------------------------------------------
+  //
+  // Invoking operators
+  //
+  // ------------------------------------------------------------------------
+
+  template <class Return, class... Args>
+  Return call(const TypedOperatorHandle<Return(Args...)>& op, Args... args)
+      const;
+
+  template <class Return, class... Args>
+  static Return callWithDispatchKeySlowPath(
+      const TypedOperatorHandle<Return(Args...)>& op,
+      at::StepCallbacks& stepCallbacks,
+      DispatchKeySet dispatchKeySet,
+      const KernelFunction& kernel,
+      Args... args);
+
+  // Like call, but intended for use in a redispatch in kernels that have
+  // explicitly performed the DispatchKey update calculation. This will take
+  // the DispatchKeySet completely as is and dispatch to the kernel of the
+  // corresponding highest priority key in the set. Note that this version of
+  // redispatch treats the inputted DispatchKeySet *as is*, and does NOT mask
+  // out the highest priority key. See Note [Plumbing Keys Through The
+  // Dispatcher]
+  template <class Return, class... Args>
+  Return redispatch(
+      const TypedOperatorHandle<Return(Args...)>& op,
+      DispatchKeySet currentDispatchKeySet,
+      Args... args) const;
+
+  // Invoke an operator via the boxed calling convention using an IValue stack
+  void callBoxed(const OperatorHandle& op, Stack* stack) const;
+  void callBoxedForDispatchKey(
+      const OperatorHandle& op,
+      DispatchKey dk,
+      Stack* stack) const;
+
+  // TODO: This will only be useful if we write a backend fallback that plumbs
+  // dispatch keys (currently there are none) See Note [Plumbing Keys Through
+  // The Dispatcher]
+  void redispatchBoxed(
+      const OperatorHandle& op,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+
+  bool hasBackendFallbackForDispatchKey(DispatchKey dk) {
+    auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk);
+    if (dispatch_ix < 0)
+      return false;
+    return backendFallbackKernels_[dispatch_ix].kernel.isValid();
+  }
+
+  // Used by torchdeploy/multipy for multiple  // codespell:ignore: multipy
+  // interpreters racing.
+  void waitForDef(const FunctionSchema& schema);
+  void waitForImpl(
+      const OperatorName& op_name,
+      std::optional<DispatchKey> dispatch_key);
+
+  // ------------------------------------------------------------------------
+  //
+  // Performing registrations (NON user public; use op_registration)
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Register a new operator schema.
+   *
+   * If a schema with the same operator name and overload name already exists,
+   * this function will check that both schemas are exactly identical.
+   */
+  RegistrationHandleRAII registerDef(
+      FunctionSchema schema,
+      std::string debug,
+      std::vector<at::Tag> tags = {});
+
+  /**
+   * Register a kernel to the dispatch table for an operator.
+   * If dispatch_key is nullopt, then this registers a fallback kernel.
+   *
+   * @return A RAII object that manages the lifetime of the registration.
+   *         Once that object is destructed, the kernel will be deregistered.
+   */
+  // NB: steals the inferred function schema, as we may need to hold on to
+  // it for a bit until the real schema turns up
+  RegistrationHandleRAII registerImpl(
+      OperatorName op_name,
+      std::optional<DispatchKey> dispatch_key,
+      KernelFunction kernel,
+      std::optional<impl::CppSignature> cpp_signature,
+      std::unique_ptr<FunctionSchema> inferred_function_schema,
+      std::string debug);
+
+  /**
+   * Given an operator, tells the Dispatcher that we have implemented a fake
+   * impl for this op in the given Python module. Call this a "pystub".
+   */
+  RegistrationHandleRAII registerPythonModule(
+      const OperatorName& op_name,
+      const char* pymodule,
+      const char* context);
+
+  /**
+   * Given an operator, throws if we have a pystub.
+   */
+  void throwIfHasPythonModule(OperatorName op_name);
+
+  std::optional<std::pair<const char*, const char*>> getPyStub(
+      OperatorName op_name);
+
+  /**
+   * Register a new operator by name.
+   */
+  RegistrationHandleRAII registerName(OperatorName op_name);
+
+  /**
+   * Register a fallback kernel for a backend.
+   * If an operator is called but there is no concrete kernel for the dispatch
+   * key of the given operator arguments, it will check if there is such a
+   * fallback kernel for the given dispatch key and, if yes, call that one.
+   */
+  RegistrationHandleRAII registerFallback(
+      DispatchKey dispatch_key,
+      KernelFunction kernel,
+      std::string debug);
+
+  /**
+   * Use to register whenever we had a TORCH_LIBRARY declaration in the frontend
+   * API.  These invocations are only permitted once per program, so we raise
+   * an error if this is called again for the same namespace.
+   */
+  RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
+
+  // ------------------------------------------------------------------------
+  //
+  // Listeners on registrations
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Add a listener that gets called whenever a new op is registered or an
+   * existing op is deregistered. Immediately after registering, this listener
+   * gets called for all previously registered ops, so it can be used to keep
+   * track of ops registered with this dispatcher.
+   */
+  RegistrationHandleRAII addRegistrationListener(
+      std::unique_ptr<OpRegistrationListener> listener);
+
+  void checkInvariants() const;
+
+  //
+  // ------------------------------------------------------------------------
+  //
+  // Assertions
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * For testing purposes.
+   * Returns a list of all operators that were created through calls to
+   * registerImpl(), without any corresponding calls to registerDef(). After
+   * static initialization is done this is almost certainly a bug, as the
+   * created OperatorHandle won't have any schema associated with it and users
+   * calling the op through the dispatcher won't be able to access it
+   *
+   * Note that we cannot enforce this invariant "as we go" during static
+   * initialization, due to undefined static initialization order- we have no
+   * guarantees over the order in which .def() and .impl() calls are registered
+   * in the dispatcher at static initialization time. So this function should
+   * only be called after static initialization.
+   */
+  std::vector<OperatorHandle> findDanglingImpls() const;
+
+  /**
+   * Useful for inspecting global Dispatcher registration state.
+   * Returns the names of all operators with a kernel registered for the
+   * specified DispatchKey. If no DispatchKey is specified, it returns all
+   * registered operators.
+   */
+  std::vector<OperatorName> getRegistrationsForDispatchKey(
+      std::optional<DispatchKey> k) const;
+
+ private:
+  Dispatcher();
+
+  static int64_t sequenceNumberForRunningRecordFunction(
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet);
+  static void runRecordFunction(
+      at::RecordFunction& guard,
+      at::RecordFunction::schema_ref_t schema_ref,
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet);
+  static void runRecordFunction(
+      at::RecordFunction& guard,
+      at::RecordFunction::schema_ref_t schema_ref,
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet,
+      c10::ArrayRef<const c10::IValue> args);
+
+#ifdef FBCODE_CAFFE2
+  static bool profilingOperatorEvents();
+  static void fireOpStartUSDT(
+      at::RecordFunction::schema_ref_t schema_ref,
+      std::vector<void*>& argsAddresses,
+      std::vector<const char*>& argsTypes);
+  static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref);
+#endif // FBCODE_CAFFE2
+
+  OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema);
+  OperatorHandle findOrRegisterName_(const OperatorName& op_name);
+
+  void deregisterDef_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterImpl_(
+      const OperatorHandle& op,
+      const OperatorName& op_name,
+      std::optional<DispatchKey> dispatch_key,
+      impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle);
+  void deregisterName_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterFallback_(DispatchKey dispatchKey);
+  void deregisterLibrary_(const std::string& ns);
+  void cleanup(const OperatorHandle& op, const OperatorName& op_name);
+  void checkSchemaCompatibility(
+      const OperatorHandle& op,
+      const FunctionSchema& schema,
+      const std::string& debug);
+
+  std::list<OperatorDef> operators_;
+#if !defined(C10_MOBILE)
+  LeftRight<ska::flat_hash_map<OperatorName, OperatorHandle>>
+      operatorLookupTable_;
+#else
+  RWSafeLeftRightWrapper<ska::flat_hash_map<OperatorName, OperatorHandle>>
+      operatorLookupTable_;
+#endif
+  // Map from namespace to debug string (saying, e.g., where the library was
+  // defined)
+  ska::flat_hash_map<std::string, std::string> libraries_;
+
+  std::array<impl::AnnotatedKernel, num_runtime_entries>
+      backendFallbackKernels_;
+
+  std::unique_ptr<detail::RegistrationListenerList> listeners_;
+
+  // This condition variable gets notified whenever we add a new def/impl to the
+  // dispatch table.  This is primarily used by multipy/torchdeploy, when
+  // we have multiple interpreters trying to register to the dispatch table.
+  // In this situation, whenever the non-primary interpreter would have tried
+  // to register to the dispatch table, instead it will check to see if the
+  // expected registration has already been made, and if it hasn't, wait on
+  // this condition variable to see if it was just racing with the primary
+  // interpreter.
+  //
+  // We expect it to be rare for there to be any waiters on this condition
+  // variable.  This is mostly just to help give better diagnostics if
+  // something goes horribly wrong
+  std::condition_variable cond_var_;
+
+  // Protect concurrent access to the dispatcher.  We store this in a
+  // `shared_ptr` as we return callbacks that call back into dispatcher methods,
+  // and we need to be able to handle and guard against the event when the
+  // `Dispatcher` has been destroyed before the callbacks fire.
+  std::shared_ptr<Guard> guard_;
+};
+
+/**
+ * This is a handle to an operator registered with the dispatcher (see
+ * hasSchema() for whether a schema is present). Queries forward to the
+ * referenced OperatorDef; boxed calls forward to Dispatcher::singleton().
+ */
+class TORCH_API OperatorHandle {
+  template <typename T>
+  friend struct std::hash; // std::hash<OperatorHandle> reads operatorDef_
+
+ public:
+  OperatorHandle(OperatorHandle&&) noexcept = default;
+  OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
+  OperatorHandle(const OperatorHandle&) = default;
+  OperatorHandle& operator=(const OperatorHandle&) = default;
+  // NOLINTNEXTLINE(performance-trivially-destructible)
+  ~OperatorHandle();
+
+  const OperatorName& operator_name() const {
+    return operatorDef_->op.operator_name();
+  }
+
+  bool hasSchema() const {
+    return operatorDef_->op.hasSchema();
+  }
+
+  const FunctionSchema& schema() const {
+    return operatorDef_->op.schema();
+  }
+
+  const std::string& debug() const {
+    return operatorDef_->op.debug();
+  }
+
+  std::string dumpState() const {
+    return operatorDef_->op.dumpState();
+  }
+
+  bool hasKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasKernelForDispatchKey(k);
+  }
+
+  bool isKernelFallthroughKernel(DispatchKey k) const {
+    return operatorDef_->op.kernelForDispatchKey(k).isFallthrough();
+  }
+
+  bool hasKernelForAnyDispatchKey(DispatchKeySet k) const {
+    return operatorDef_->op.hasKernelForAnyDispatchKey(k);
+  }
+
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasComputedKernelForDispatchKey(k);
+  }
+
+  SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.getComputedKernelForDispatchKey(k);
+  }
+
+  std::string dumpComputedTable() const {
+    return operatorDef_->op.dumpComputedTable();
+  }
+
+  void checkInvariants() const {
+    operatorDef_->op.checkInvariants();
+  }
+
+  c10::ArrayRef<at::Tag> getTags() const {
+    return operatorDef_->op.getTags();
+  }
+
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback) {
+    operatorDef_->op.setReportErrorCallback_(std::move(callback));
+  }
+
+  bool hasTag(const at::Tag& tag) const {
+    for (const auto& tag_ : getTags()) { // linear scan over the tag list
+      if (tag == tag_) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <class FuncType>
+  TypedOperatorHandle<FuncType> typed() const {
+    // NB: This assert is not 100% sound: you can retrieve a typed() operator
+    // handle prior to ANY C++ signature being registered on the operator
+    // and the check will say everything is OK (at which point you can then
+    // smuggle in a kernel that is typed incorrectly).  For everything
+    // in core library this won't happen, because all the static registrations
+    // will be done by the time a typed() handle is acquired.
+#if !defined C10_MOBILE // the signature checks are compiled out on mobile
+    operatorDef_->op.assertSignatureIsCorrect<FuncType>();
+    if (fn_has_symint<FuncType>::value) {
+      operatorDef_->op.assertSignatureIsCorrect<
+          typename fn_remove_symint<FuncType>::type>();
+    }
+#endif
+    return TypedOperatorHandle<FuncType>(operatorIterator_);
+  }
+
+  void callBoxed(Stack* stack) const {
+    c10::Dispatcher::singleton().callBoxed(*this, stack);
+  }
+
+  void callBoxed(Stack& stack) const { // forwards to the pointer overload
+    callBoxed(&stack);
+  }
+
+  void callBoxedForDispatchKey(DispatchKey dk, Stack& stack) const {
+    c10::Dispatcher::singleton().callBoxedForDispatchKey(*this, dk, &stack);
+  }
+
+  void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
+    c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
+  }
+
+  template <typename F>
+  PyObject* getPythonOp(
+      c10::impl::PyInterpreter* self_interpreter,
+      F slow_accessor) const {
+    return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor);
+  }
+
+  bool operator==(const OperatorHandle& other) const {
+    return operatorDef_ == other.operatorDef_; // identity: same OperatorDef
+  }
+
+  bool operator!=(const OperatorHandle& other) const {
+    return operatorDef_ != other.operatorDef_;
+  }
+
+ private:
+  explicit OperatorHandle(
+      std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+      : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator) {}
+  friend class Dispatcher; // friends may use the private constructor above
+  template <class>
+  friend class TypedOperatorHandle;
+
+  // Storing a direct pointer to the OperatorDef even though we
+  // already have the iterator saves an instruction in the critical
+  // dispatch path. The iterator is effectively a
+  // pointer-to-std::list-node, and (at least in libstdc++'s
+  // implementation) the element is at an offset 16 bytes from that,
+  // because the prev/next pointers come first in the list node
+  // struct. So, an add instruction would be necessary to convert from the
+  // iterator to an OperatorDef*.
+  Dispatcher::OperatorDef* operatorDef_;
+
+  // We need to store this iterator in order to make
+  // Dispatcher::cleanup() fast -- it runs a lot on program
+  // termination (and presumably library unloading).
+  std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
+};
+
+/**
+ * This is a handle to an operator schema registered with the dispatcher.
+ * It holds the same information as an OperatorHandle, but it is templated
+ * on the operator's function type and allows calling the operator in an
+ * unboxed way through call() and redispatch().
+ */
+template <class FuncType>
+class TypedOperatorHandle final { // primary template: rejects its FuncType
+  static_assert(
+      guts::false_t<FuncType>(),
+      "FuncType in OperatorHandle::typed<FuncType> was not a valid function type");
+};
+template <class Return, class... Args>
+class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
+ public:
+  TypedOperatorHandle(TypedOperatorHandle&&) noexcept = default;
+  TypedOperatorHandle& operator=(TypedOperatorHandle&&) noexcept = default;
+  TypedOperatorHandle(const TypedOperatorHandle&) = default;
+  TypedOperatorHandle& operator=(const TypedOperatorHandle&) = default;
+
+  // Unboxed call via the Dispatcher singleton. See [Note: Argument forwarding
+  // in the dispatcher] for why Args doesn't use &&
+  C10_ALWAYS_INLINE Return call(Args... args) const {
+    return c10::Dispatcher::singleton().call<Return, Args...>(
+        *this, std::forward<Args>(args)...);
+  }
+
+  // Unboxed redispatch with an explicit key set. See [Note: Argument
+  // forwarding in the dispatcher] for why Args doesn't use &&
+  C10_ALWAYS_INLINE Return
+  redispatch(DispatchKeySet currentDispatchKeySet, Args... args) const {
+    return c10::Dispatcher::singleton().redispatch<Return, Args...>(
+        *this, currentDispatchKeySet, std::forward<Args>(args)...);
+  }
+
+ private:
+  explicit TypedOperatorHandle(
+      std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+      : OperatorHandle(operatorIterator) {}
+  friend class OperatorHandle; // OperatorHandle::typed() uses the private ctor
+};
+
+namespace detail {
+template <class... Args>
+inline void unused_arg_(const Args&... /*unused*/) {} // no-op sink
+
+// CaptureKernelCall is intended to capture return values from Dispatcher
+// unboxed kernel calls. A record function may request to get outputs from the
+// kernel calls. For boxed kernels this is straightforward: the returned values
+// are in the stack object, which can be passed to record functions. For
+// unboxed kernels, we need to handle different kinds of return values, cache
+// them temporarily, then release the values for the actual function call
+// return.
+template <typename ReturnType>
+struct CaptureKernelCall {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<ReturnType(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args)
+      // Calls the kernel and captures the result in output_.
+      : output_{kernel.template call<ReturnType, Args...>(
+            op,
+            dispatchKeySet,
+            std::forward<Args>(args)...)} {}
+  // Puts the captured return value(s) into a fresh Stack via push_outputs.
+  Stack getOutputs() {
+    Stack stack;
+    impl::push_outputs<ReturnType, false>::copy(output_, &stack);
+    return stack;
+  }
+  // Since we are returning the output_, we don't expect the output_ to be used
+  // afterward. Copy elision and RVO do not apply to class data members, so use
+  // move semantics to avoid copies when possible.
+  ReturnType release() && {
+    return std::move(output_);
+  }
+
+ private:
+  ReturnType output_;
+};
+
+// Handle the lvalue reference differently since it should not be moved.
+template <>
+inline at::Tensor& CaptureKernelCall<at::Tensor&>::release() && {
+  return output_;
+}
+
+// Handle the case where the kernel returns void.
+template <>
+struct CaptureKernelCall<void> {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<void(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args) {
+    // Call the kernel; there is no value to capture for void.
+    kernel.template call<void, Args...>(
+        op, dispatchKeySet, std::forward<Args>(args)...);
+  }
+  Stack getOutputs() {
+    return Stack(); // nothing was returned, so the Stack is empty
+  }
+  void release() && {}
+};
+
+TORCH_API void _print_dispatch_trace(
+    const std::string& label,
+    const std::string& op_name,
+    const DispatchKeySet& dispatchKeySet);
+
+} // namespace detail
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template <class Return, class... Args>
+inline Return Dispatcher::callWithDispatchKeySlowPath(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    at::StepCallbacks& stepCallbacks,
+    DispatchKeySet dispatchKeySet,
+    const KernelFunction& kernel,
+    Args... args) {
+  // If callbacks need inputs, we box the arguments and pass them to the guard.
+  // Note: For perf reasons we wouldn't want to prematurely box the arguments.
+  at::RecordFunction guard(std::move(stepCallbacks)); // moves from the caller
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved());
+  auto dispatchKey = dispatchKeySet.highestPriorityTypeId();
+  auto& schema = op.schema();
+  auto schema_ref = std::reference_wrapper<const FunctionSchema>(schema);
+  constexpr auto num_boxed_args = impl::boxed_size<Args...>();
+  if constexpr (num_boxed_args != 0) {
+    if (guard.needsInputs()) {
+      // If we used std::array<IValue, num_boxed_args> here, we would
+      // have to spend time default constructing the IValues in
+      // boxedArgs. A raw, IValue-aligned byte buffer has no such requirement.
+      // NOLINTNEXTLINE(*array*)
+      alignas(IValue) std::byte boxedArgs[num_boxed_args * sizeof(IValue)];
+      // boxArgsToStack is expected to leave boxedArgsPtr one past the last
+      // boxed argument; the debug-only assert below checks that (it's nice to
+      // have the extra assurance of correctness from our debug builds).
+      IValue* boxedArgsPtr = reinterpret_cast<IValue*>(boxedArgs);
+      impl::boxArgsToStack(boxedArgsPtr, args...);
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          reinterpret_cast<std::byte*>(boxedArgsPtr) ==
+          boxedArgs + num_boxed_args * sizeof(IValue));
+      // I don't *think* we need std::launder here, because IValue has
+      // no subclasses and no const or reference fields.
+      runRecordFunction(
+          guard,
+          schema_ref,
+          dispatchKey,
+          dispatchKeySet,
+          c10::ArrayRef<const c10::IValue>(
+              reinterpret_cast<IValue*>(boxedArgs), num_boxed_args));
+      boxedArgsPtr = reinterpret_cast<IValue*>(boxedArgs); // back to the start
+      for (size_t ii = 0; ii < num_boxed_args; ++ii) {
+        (boxedArgsPtr + ii)->~IValue(); // raw buffer: destroy by hand
+      }
+    } else {
+      runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet);
+    }
+  } else {
+    runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet);
+  }
+
+  if (C10_UNLIKELY(guard.needsOutputs())) {
+    // Calls the kernel and captures the output temporarily to pass to
+    // RecordFunction.
+    detail::CaptureKernelCall<Return> captureKernelCall(
+        kernel, op, dispatchKeySet, std::forward<Args>(args)...);
+    guard.setOutputs(captureKernelCall.getOutputs());
+    // Releases the captured output to return to caller.
+    return std::move(captureKernelCall).release();
+  }
+
+  // keeping the guard alive while executing the kernel
+  return kernel.template call<Return, Args...>(
+      op, dispatchKeySet, std::forward<Args>(args)...);
+}
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template <class Return, class... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    Args... args) const {
+  auto dispatchKeySet = // derived from the unboxed arguments
+      op.operatorDef_->op.dispatchKeyExtractor()
+          .template getDispatchKeySetUnboxed<Args...>(args...);
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[call]", toString(op.operator_name()), dispatchKeySet);
+  }
+#endif
+  const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  auto step_callbacks =
+      at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(
+          step_callbacks.has_value() && op.operatorDef_->op.isObserved())) {
+    return callWithDispatchKeySlowPath<Return, Args...>(
+        op,
+        *step_callbacks,
+        dispatchKeySet,
+        kernel,
+        std::forward<Args>(args)...);
+  }
+#endif // PYTORCH_DISABLE_PER_OP_PROFILING
+  // From here on no RecordFunction is involved in the call.
+#ifdef FBCODE_CAFFE2
+  if (profilingOperatorEvents()) {
+    std::vector<void*> argsAddresses = {(void*)(&args)...};
+    std::vector<const char*> argsTypes = {(typeid(args).name())...};
+    struct FireOpRAII {
+      FireOpRAII(
+          at::RecordFunction::schema_ref_t schema_ref,
+          std::vector<void*>& argsAddresses,
+          std::vector<const char*>& argsTypes)
+          : schema_ref_(schema_ref) {
+        fireOpStartUSDT(schema_ref, argsAddresses, argsTypes);
+      }
+      ~FireOpRAII() { // the end event fires when `event` leaves scope
+        fireOpEndUSDT(schema_ref_);
+      }
+      at::RecordFunction::schema_ref_t schema_ref_;
+    } event(op.schema(), argsAddresses, argsTypes); // start event fires here
+    return kernel.template call<Return, Args...>(
+        op, dispatchKeySet, std::forward<Args>(args)...);
+  } else {
+    return kernel.template call<Return, Args...>(
+        op, dispatchKeySet, std::forward<Args>(args)...);
+  }
+#else // !FBCODE_CAFFE2
+  return kernel.template call<Return, Args...>(
+      op, dispatchKeySet, std::forward<Args>(args)...);
+#endif // FBCODE_CAFFE2
+}
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template <class Return, class... Args>
+inline Return Dispatcher::redispatch(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    DispatchKeySet currentDispatchKeySet,
+    Args... args) const {
+  // RecordFunction is intentionally not used on redispatch.
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[redispatch]", toString(op.operator_name()), currentDispatchKeySet);
+  }
+#endif
+  // Look up the kernel for the caller-supplied key set and call it unboxed.
+  return op.operatorDef_->op.lookup(currentDispatchKeySet)
+      .template call<Return, Args...>(
+          op, currentDispatchKeySet, std::forward<Args>(args)...);
+}
+
+inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack)
+    const {
+  // Note: this doesn't need the mutex because write operations on the list
+  // keep iterators intact.
+  const auto& entry = op.operatorDef_->op;
+  auto dispatchKeySet =
+      entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[callBoxed]", toString(op.operator_name()), dispatchKeySet);
+  }
+#endif
+  const auto& kernel = entry.lookup(dispatchKeySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  auto step_callbacks =
+      at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(step_callbacks.has_value() && entry.isObserved())) {
+    at::RecordFunction guard(std::move(*step_callbacks));
+    auto dispatchKey = dispatchKeySet.highestPriorityTypeId();
+    auto& schema = op.schema();
+    auto schema_ref = std::reference_wrapper<const FunctionSchema>(schema);
+    guard.needsInputs() // when asked, the inputs are the whole current stack
+        ? runRecordFunction(
+              guard,
+              schema_ref,
+              dispatchKey,
+              dispatchKeySet,
+              c10::ArrayRef<const c10::IValue>(stack->data(), stack->size()))
+        : runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet);
+
+    // keeping the guard alive while executing the kernel
+    kernel.callBoxed(op, dispatchKeySet, stack);
+
+    if (C10_UNLIKELY(guard.needsOutputs())) {
+      guard.setOutputs(*stack); // the stack as left by the kernel call
+    }
+    return; // the kernel already ran above, inside the guard's scope
+  }
+#endif // PYTORCH_DISABLE_PER_OP_PROFILING
+  kernel.callBoxed(op, dispatchKeySet, stack);
+}
+
+// NB: this doesn't count as a "true" dispatcher jump, so no instrumentation
+inline void Dispatcher::callBoxedForDispatchKey(
+    const OperatorHandle& op,
+    DispatchKey dk,
+    Stack* stack) const {
+  // Note: this doesn't need the mutex because write operations on the list
+  // keep iterators intact.
+  const auto& entry = op.operatorDef_->op;
+  // We still compute this as we're obligated to pass it on to the internal
+  // kernel, if it is a boxed fallback
+  auto dispatchKeySet =
+      entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+  const auto& kernel = ([&]() {
+    if (op.hasKernelForDispatchKey(dk)) {
+      return entry.kernelForDispatchKey(dk);
+    } else { // no kernel for dk on this op: use the backend fallback for dk
+      auto idx = getDispatchTableIndexForDispatchKey(dk);
+      TORCH_INTERNAL_ASSERT(idx >= 0);
+      return backendFallbackKernels_[idx].kernel;
+    }
+  })(); // no trailing return type: the lambda returns KernelFunction by value
+  kernel.callBoxed(op, dispatchKeySet, stack);
+}
+
+inline void Dispatcher::redispatchBoxed(
+    const OperatorHandle& op,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep
+  // iterators intact.
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[redispatchBoxed]", toString(op.operator_name()), dispatchKeySet);
+  }
+#endif
+  // Use the caller-supplied key set as is and run the kernel on the stack.
+  op.operatorDef_->op.lookup(dispatchKeySet)
+      .callBoxed(op, dispatchKeySet, stack);
+}
+
+} // namespace c10
+
+namespace std {
+// Hashes an OperatorHandle by the address of the OperatorDef it refers to.
+template <>
+struct hash<c10::OperatorHandle> {
+  size_t operator()(const c10::OperatorHandle& handle) const noexcept {
+    void* const def_address = handle.operatorDef_;
+    return std::hash<void*>()(def_address);
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddd4e653c3f67786dd37e93e3ca1ab1e75acf697
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h
@@ -0,0 +1,22 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/operator_name.h>
+#include <string>
+#include <unordered_set>
+
+namespace c10 {
+
+struct TORCH_API ObservedOperators { // static-only; not constructible
+  ObservedOperators() = delete;
+  // Declared here, defined elsewhere; presumably true unless name is listed.
+  static bool isObserved(const OperatorName& name);
+  // Returns a mutable reference to a set of strings (presumably op names).
+  static std::unordered_set<std::string>& getUnobservedOperatorList();
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb78faeedd41167e446c29542349acfcb2f2cce5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h
@@ -0,0 +1,342 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/DispatchKeyExtractor.h>
+#include <ATen/core/function_schema.h>
+#include <ATen/core/ivalue.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/PyHandleCache.h>
+#include <c10/core/SafePyObject.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/flat_hash_map.h>
+
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/OperatorOptions.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/core/enum_tag.h>
+
+#include <array>
+#include <list>
+#include <optional>
+
+#ifdef C10_MOBILE
+#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+#endif
+
+namespace c10 {
+
+class Dispatcher;
+
+namespace impl {
+
+// A kernel registered by a user, bundled with bookkeeping metadata.  A plain
+// KernelFunction is all that dispatching itself needs (which is why the
+// actual DispatchTable does not store AnnotatedKernel); the extra fields
+// kept here exist to produce good error messages.
+struct AnnotatedKernel final {
+  AnnotatedKernel() = default;
+  AnnotatedKernel(
+      KernelFunction kernel_fn,
+      std::unique_ptr<FunctionSchema> inferred_schema,
+      std::string debug_info)
+      : kernel(std::move(kernel_fn)),
+        inferred_function_schema(std::move(inferred_schema)),
+        debug(std::move(debug_info)) {}
+  // The kernel itself.
+  KernelFunction kernel;
+  // Owned schema for the kernel; null after default construction.
+  std::unique_ptr<FunctionSchema> inferred_function_schema;
+  // Short debug string identifying this kernel; most importantly it records
+  // the TORCH_LIBRARY block that performed the registration.
+  std::string debug;
+};
+
+// An operator schema paired with a debug string that records where the
+// registration of this schema occurred.
+struct AnnotatedSchema final {
+  AnnotatedSchema(FunctionSchema fn_schema, std::string origin)
+      : schema(std::move(fn_schema)), debug(std::move(origin)) {}
+  FunctionSchema schema; // the registered schema
+  std::string debug; // where the registration occurred
+};
+
+// Internal data structure that records information about a specific operator.
+// It's not part of the public API; typically, users will interact with
+// OperatorHandle instead.
+//
+// Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher
+// lock (this is important because some methods in OperatorEntry access
+// dispatcher state)
+class TORCH_API OperatorEntry final {
+ public:
+  explicit OperatorEntry(OperatorName&& operator_name);
+
+  OperatorEntry(const OperatorEntry&) = delete;
+  OperatorEntry(OperatorEntry&&) noexcept = delete;
+  OperatorEntry& operator=(const OperatorEntry&) = delete;
+  OperatorEntry& operator=(OperatorEntry&&) noexcept = delete;
+
+  const FunctionSchema& schema() const {
+    TORCH_INTERNAL_ASSERT(
+        schema_.has_value(),
+        "Tried to access the schema for ",
+        name_,
+        " which doesn't have a schema registered yet");
+    return schema_->schema;
+  }
+  const std::string& debug() const {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    return schema_->debug;
+  }
+  bool hasSchema() const {
+    return schema_.has_value();
+  }
+
+  bool isObserved() const {
+    return is_observed_;
+  }
+
+  // We may allocate an OperatorEntry for an operator even when we don't
+  // have a schema.  When we receive the schema registration, we post
+  // facto register a schema.
+  //
+  // NB: registerSchema/deregisterSchema are not idempotent; if you
+  // attempt to register a schema when one is already present or vice
+  // versa that is an error.  (Refcounting for the registrations is
+  // handled in the OperatorHandle in Dispatcher)
+  void registerSchema(
+      FunctionSchema&& /*schema*/,
+      std::string&& debug,
+      std::vector<at::Tag> tags = {});
+  void deregisterSchema();
+
+  const OperatorName& operator_name() const {
+    return name_;
+  }
+
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+  using AnnotatedKernelContainer = std::array<AnnotatedKernel, 1>;
+#else
+  using AnnotatedKernelContainer = std::list<AnnotatedKernel>;
+#endif
+  using AnnotatedKernelContainerIterator = AnnotatedKernelContainer::iterator;
+
+  // Why are kernels and fallback asymmetric?  It has to do with ownership.
+  // Kernels and the computed dispatch tables for them are canonically
+  // owned by OperatorEntry, but backend fallbacks are specified once
+  // and apply for all operators, so they should be owned by Dispatcher.
+  // However, the registration of a backend fallback affects the
+  // state of the computed dispatch table, so when a backend fallback
+  // is updated, we need to update the operator tables too.  Thus,
+  // registerKernel is the mechanism by which we give kernels to
+  // operator entry to own (and update dispatch table), but we only
+  // need a non-owning mechanism to update fallback.
+
+  // Precondition: Dispatcher::mutex_ is held
+  // Postcondition: caller is responsible for disposing of the kernel
+  AnnotatedKernelContainerIterator registerKernel(
+      const Dispatcher& dispatcher,
+      std::optional<DispatchKey> dispatch_key,
+      KernelFunction kernel,
+      std::optional<CppSignature> cpp_signature,
+      std::unique_ptr<FunctionSchema> inferred_function_schema,
+      std::string debug);
+
+  // Precondition: Dispatcher::mutex_ is held
+  void deregisterKernel_(
+      const Dispatcher& dispatcher,
+      std::optional<DispatchKey> dispatch_key,
+      AnnotatedKernelContainerIterator kernel);
+
+  // Precondition: Dispatcher::mutex_ is held
+  void updateFallback(const Dispatcher& dispatcher, DispatchKey dispatch_key);
+
+  // Precondition: Dispatcher::mutex_ is held
+  void updateSchemaAliasAnalysis(AliasAnalysisKind a) {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    schema_->schema.setAliasAnalysis(a);
+  }
+
+  std::string dumpComputedTable() const;
+  std::string dumpState() const;
+  void checkInvariants() const;
+
+  // Read-only access to this entry's DispatchKeyExtractor member.
+  const DispatchKeyExtractor& dispatchKeyExtractor() const {
+    return dispatchKeyExtractor_;
+  }
+
+  // Asserts that the given FuncType is correct for calling this operator in an
+  // unboxed way.
+  template <class FuncType>
+  inline void assertSignatureIsCorrect() {
+    // Derive the signature and the symint flag from FuncType, then defer to
+    // the non-template overload, which performs the actual check.
+    const CppSignature call_signature = CppSignature::make<FuncType>();
+    const bool has_symint = fn_has_symint<FuncType>::value;
+    assertSignatureIsCorrect(call_signature, has_symint);
+  }
+
+  void assertSignatureIsCorrect(
+      const CppSignature& call_signature,
+      bool has_symint) const;
+
+  [[noreturn]] void reportError(DispatchKey dispatchKey) const;
+
+  const KernelFunction& lookup(DispatchKeySet ks) const {
+    // Returns the dispatch-table entry selected by `ks`. reportError() is
+    // [[noreturn]], so both failure paths below never fall through.
+    const auto table_index = ks.getDispatchTableIndexForDispatchKeySet();
+    if (C10_UNLIKELY(table_index == -1)) {
+      reportError(ks.highestPriorityTypeId());
+    }
+    const KernelFunction& entry = dispatchTable_[table_index];
+    // A valid kernel *always* has a boxed kernel and *may* have an unboxed
+    // one. Calls coming from at:: APIs are typically unboxed and very likely
+    // hit a valid kernel that has an unboxed variant, so the unboxed check
+    // goes first; the boxed kernel is only inspected when that check fails
+    // (the && short-circuits otherwise).
+    if (C10_UNLIKELY(!entry.isValidUnboxed()) && !entry.isValid()) {
+      reportError(ks.highestPriorityTypeId());
+    }
+    return entry;
+  }
+
+  std::string listAllDispatchKeys() const;
+
+  // Returns true if kernels_ has an entry for any key in ks.
+  //
+  // Invariant: There are no alias keys in the passed-in dispatch key set.
+  // Note [No Alias Keys in DispatchKeySet]
+  // Alias keys should be checked using `hasKernelForDispatchKey`
+  // Alias keys shouldn't go inside of a DispatchKeySet, since they can
+  // technically have a value > 63 (causing overflow).
+  bool hasKernelForAnyDispatchKey(DispatchKeySet ks) const;
+  // Returns true if kernels_ has an entry for a particular key.
+  bool hasKernelForDispatchKey(DispatchKey k) const;
+  // Retrieves the kernel entry at a particular key.  Symmetric with
+  // hasKernelForDispatchKey.  To get the AnnotatedKernel, see
+  // getKernelForDispatchKey (private)
+  const KernelFunction& kernelForDispatchKey(DispatchKey k) const;
+  // Returns true if the "computed table" has an entry for a particular key.
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const;
+  // Returns a KernelFunction corresponding to the kernel in dispatchTable
+  SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const;
+  // Returns all the operator tags added at the time of registration
+  const std::vector<at::Tag>& getTags() const;
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback);
+
+  // Forwards to py_cache_.ptr_or(self_interpreter, slow_accessor) and returns
+  // the PyObject* it produces.
+  // NOTE(review): presumably slow_accessor is only consulted when the cache
+  // holds no pointer for self_interpreter -- confirm against PyHandleCache.
+  template <typename F>
+  PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor)
+      const {
+    return py_cache_.ptr_or(self_interpreter, slow_accessor);
+  }
+
+ private:
+  OperatorName name_;
+  std::optional<AnnotatedSchema> schema_;
+#ifndef C10_MOBILE
+  std::vector<at::Tag> tags_;
+#endif
+  std::array<KernelFunction, c10::num_runtime_entries> dispatchTable_;
+  DispatchKeyExtractor dispatchKeyExtractor_;
+  // Pointer to the torch.ops.ns.op.overload object for speed
+  c10::PyHandleCache py_cache_;
+
+  // kernels_ stores all registered kernels for the corresponding dispatch key
+  // and catchAllKernels_ stores the catch-all kernels.
+  // If an operator library gets loaded that overwrites an already existing
+  // kernel, both kernels will be in that list but only the newer one will be in
+  // dispatchTable. If any of the kernels go away (say the library gets
+  // unloaded), we remove the kernel from this list and update the
+  // dispatchTable if necessary.
+  // Kernels in the list are ordered by registration time descendingly,
+  // newer registrations are before older registrations.
+  // We do not combine dispatchTable and kernels into one hash map because
+  // kernels is a larger data structure and accessed quite infrequently
+  // while dispatchTable is accessed often and should be kept small to fit
+  // into CPU caches.
+  // Invariants:
+  //  - dispatchTable[dispatch_key] == kernels_[dispatch_key].front()
+  //  - dispatchTable[dispatch_key] does not exist if and only if
+  //    kernels_[dispatch_key] does not exist
+  //  - If kernels_[dispatch_key] exists, then it has elements.
+  //    It is never an empty list.
+  //
+  // Why do we do that?
+  // -----
+  // We mostly do this to enable Jupyter notebooks where a cell registering
+  // a kernel could be executed multiple times and the later execution
+  // should overwrite the earlier one. Note that this still fails when the
+  // function schema changed between the executions, but it works as long
+  // as the function schema didn't change. A better solution would be to
+  // unload the old extension library from the Jupyter cell when the cell is
+  // re-executed and then only allow one kernel here, i.e. error if a kernel
+  // is already registered, but that's a lot of effort to implement and
+  // currently not high-pri.
+  ska::flat_hash_map<
+      DispatchKey,
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+      // On mobile, we needn't worry about Jupyter notebooks.
+      std::array<AnnotatedKernel, 1>
+#else
+      std::list<AnnotatedKernel>
+#endif
+      >
+      kernels_;
+
+  const AnnotatedKernel& missingKernel() const;
+  const AnnotatedKernel& ambiguousAutogradOtherKernel() const;
+
+  // cpp_signature_ stores function signature if any of
+  // the kernels was created in a way that allowed us to know the function
+  // signature (i.e. by supplying an unboxed C++ kernel function).
+  // If this is set, it will be used to check that future kernel
+  // registrations match and it will be used in unboxed function calls
+  // to verify their arguments against the known function signature.
+  struct CppSignatureWithDebug {
+    // The recorded C++ function signature.
+    CppSignature signature;
+    // Free-form debug text stored next to the signature.
+    // NOTE(review): presumably the `debug` string supplied at kernel
+    // registration -- confirm in the implementation file.
+    std::string debug;
+    // NOTE(review): presumably the dispatch key of the registration the
+    // signature came from, empty for a catch-all kernel -- confirm in the
+    // implementation file.
+    std::optional<DispatchKey> dispatch_key;
+  };
+  std::optional<CppSignatureWithDebug> cpp_signature_;
+  std::optional<CppSignatureWithDebug> sym_cpp_signature_;
+
+  // A Python custom error handler for OperatorEntry::reportError
+  std::unique_ptr<c10::SafePyObject> report_error_callback_;
+
+  // Whether this operator needs to be observed with RecordFunction
+  const bool is_observed_;
+
+  [[noreturn]] void reportSignatureError(
+      const CppSignature& call_signature,
+      const CppSignatureWithDebug& saved_signature) const;
+  const KernelFunction& computeDispatchTableEntry(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key) const;
+  std::pair<const AnnotatedKernel&, const char*>
+  computeDispatchTableEntryWithDebug(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key) const;
+  // This function re-establishes the invariant that dispatchTable
+  // contains the front element from the kernels list for a given runtime
+  // dispatch key.
+  void updateDispatchTableEntry_(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key);
+  // Like above, but also handles alias dispatch keys.
+  void updateDispatchTable_(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key);
+  // Like above, but for ALL entries in the dispatch table.
+  void updateDispatchTableFull_(const c10::Dispatcher& dispatcher);
+  // Retrieves a pointer to AnnotatedKernel at
+  // kernels_.at(dispatch_key).front().
+  const AnnotatedKernel* getKernelForDispatchKey(
+      DispatchKey dispatch_key) const;
+};
+
+} // namespace impl
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d506e7a43784a38e1646e6ced419bdf8f080aac
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h
@@ -0,0 +1,35 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+
+namespace c10 {
+
+// The alias-analysis modes an operator can be tagged with.
+enum class AliasAnalysisKind : uint8_t {
+  INTERNAL_SPECIAL_CASE,
+  CONSERVATIVE, // The most conservative alias analysis type, assumes
+                // side-effects. This is the default analysis.
+  FROM_SCHEMA,
+  PURE_FUNCTION
+};
+
+// Returns the enumerator name for `aliasAnalysisKind`; any value that is not
+// one of the four enumerators yields "UNKNOWN".
+#if !defined(_MSC_VER)
+constexpr // Our current MSVC version has a bug that doesn't allow this to be
+          // constexpr.
+#endif
+    inline const char*
+    toString(AliasAnalysisKind aliasAnalysisKind) {
+  if (aliasAnalysisKind == AliasAnalysisKind::CONSERVATIVE) {
+    return "CONSERVATIVE";
+  }
+  if (aliasAnalysisKind == AliasAnalysisKind::FROM_SCHEMA) {
+    return "FROM_SCHEMA";
+  }
+  if (aliasAnalysisKind == AliasAnalysisKind::PURE_FUNCTION) {
+    return "PURE_FUNCTION";
+  }
+  if (aliasAnalysisKind == AliasAnalysisKind::INTERNAL_SPECIAL_CASE) {
+    return "INTERNAL_SPECIAL_CASE";
+  }
+  return "UNKNOWN";
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h
new file mode 100644
index 0000000000000000000000000000000000000000..c66b08e8350e355de45777c2e15f71dcdbb8a2f2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <functional>
+
+namespace c10 {
+
+// Move-only handle that runs a stored callback exactly once: either when the
+// owning handle is destroyed, or when that handle is overwritten by a
+// move-assignment. A handle whose callback has been moved away does nothing.
+class RegistrationHandleRAII final {
+ public:
+  // Takes ownership of `onDestruction`. An empty std::function is accepted
+  // and results in a handle that does nothing.
+  explicit RegistrationHandleRAII(std::function<void()> onDestruction)
+      : onDestruction_(std::move(onDestruction)) {}
+
+  // Runs the callback if this handle still owns one.
+  ~RegistrationHandleRAII() {
+    if (onDestruction_) {
+      onDestruction_();
+    }
+  }
+
+  // Copying would run the callback twice, so the handle is move-only.
+  RegistrationHandleRAII(const RegistrationHandleRAII&) = delete;
+  RegistrationHandleRAII& operator=(const RegistrationHandleRAII&) = delete;
+
+  // Transfers the callback out of `rhs`, leaving `rhs` empty.
+  RegistrationHandleRAII(RegistrationHandleRAII&& rhs) noexcept
+      : onDestruction_(std::move(rhs.onDestruction_)) {
+    // A moved-from std::function has an unspecified state; reset it so the
+    // destructor of `rhs` is guaranteed not to fire.
+    rhs.onDestruction_ = nullptr;
+  }
+
+  // Releases whatever this handle currently owns (by running its callback),
+  // then takes over the callback of `rhs`. Self-assignment is a no-op.
+  RegistrationHandleRAII& operator=(RegistrationHandleRAII&& rhs) noexcept {
+    if (this != &rhs) {
+      // Without this the previously owned callback would be dropped without
+      // ever being run.
+      if (onDestruction_) {
+        onDestruction_();
+      }
+      onDestruction_ = std::move(rhs.onDestruction_);
+      rhs.onDestruction_ = nullptr;
+    }
+    return *this;
+  }
+
+ private:
+  // Callback still owed by this handle; empty once moved away.
+  std::function<void()> onDestruction_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h
new file mode 100644
index 0000000000000000000000000000000000000000..41936f74d3f79450df15220020866d9ca2de492a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h
@@ -0,0 +1,86 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/List.h>
+#include <c10/core/TensorOptions.h>
+
+/*
+ * [Note: hacky wrapper removal for optional tensor]
+ *
+ * The kernel implementation takes an optional tensor marked in the schema as
+ * Tensor? but the C++ function takes Tensor instead of the std::optional<Tensor>
+ * expected by the dispatcher.
+ *
+ * To remove the hacky wrapper, the C++ function is changed to take
+ * std::optional<Tensor> and unwrap the Tensor value at the beginning of
+ * the function, e.g.:
+ *   > c10::MaybeOwned<Tensor> weight_maybe_owned =
+ *   >     at::borrow_from_optional_tensor(weight_opt);
+ *   > const Tensor& weight = *weight_maybe_owned;
+ *
+ * We may want to make the kernel handle optional directly without
+ * going through the creation of a default-constructed Tensor in
+ * at::borrow_from_optional_tensor.
+ */
+
+/*
+ * [Note: hacky wrapper removal for TensorOptions]
+ *
+ * The kernel implementation takes a TensorOptions argument but the dispatcher
+ * expects separate arguments for dtype, layout, device, pin_memory.
+ *
+ * To remove the hacky wrapper, the kernel implementation is changed to take
+ * the 4 arguments (dtype, layout, device, pin_memory), and assemble the
+ * TensorOptions value at the beginning of the function, e.g.:
+ *   > TensorOptions options = TensorOptions().dtype(dtype).layout(layout)
+ *   >    .device(device).pinned_memory(pin_memory);
+ *
+ * We may want to make the kernel handle these parameters directly without
+ * going through the creation of a TensorOptions value.
+ */
+
+namespace c10::impl {
+
+// Failure path used when a tensor's device differs from the common device;
+// only declared in this header.
+TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName);
+
+// Single tensor: an undefined tensor is ignored; the first defined tensor
+// seeds `common_device`; every later one must be on that same device.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  // TODO: Remove this once the following issue is addressed:
+  // https://github.com/pytorch/pytorch/issues/57380
+  if (!tensor.defined()) {
+    return;
+  }
+
+  if (common_device.has_value()) {
+    if (C10_UNLIKELY(common_device != tensor.device())) {
+      common_device_check_failure(*common_device, tensor, methodName, argName);
+    }
+  } else {
+    common_device = tensor.device();
+  }
+}
+
+// Optional tensor: an empty optional is ignored, otherwise the contained
+// tensor goes through the single-tensor check.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const std::optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  if (!tensor.has_value()) {
+    return;
+  }
+  check_and_update_common_device(common_device, *tensor, methodName, argName);
+}
+
+// Tensor list: every element is checked in iteration order.
+inline void check_and_update_common_device(std::optional<Device>& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& element : tensors) {
+    check_and_update_common_device(common_device, element, methodName, argName);
+  }
+}
+
+// List of optional tensors: every element is checked in iteration order.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const List<std::optional<at::Tensor>>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& element : tensors) {
+    check_and_update_common_device(common_device, element, methodName, argName);
+  }
+}
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb01fcab0b4d7314188acbd761f61a12de6d14d8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h
@@ -0,0 +1,162 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+/**
+ * This file contains functionality to take a C++ function and infer its
+ * c10::FunctionSchema.
+ */
+
+#include <ATen/core/function_schema.h>
+#include <c10/util/Metaprogramming.h>
+
+namespace c10 {
+namespace detail::infer_schema {
+
+/// The templated inference code creates `ArgumentDef` instead of `Argument`,
+/// because that can be constructed at compile time and has a much smaller
+/// binary size than having calls to `Argument` constructors in the template.
+/// Creating `Argument` objects from `ArgumentDef` can then be done at
+/// runtime in a non-templated way.
+struct ArgumentDef final {
+  // Signature shared by both type getters: no arguments, returns a TypePtr.
+  using GetTypeFn = TypePtr();
+  GetTypeFn* getTypeFn;
+  GetTypeFn* getFakeTypeFn;
+  // Default state: both getters are null.
+  constexpr ArgumentDef(): ArgumentDef(nullptr, nullptr) {}
+  // Stores the two getter pointers as given.
+  explicit constexpr ArgumentDef(GetTypeFn *typeFn, GetTypeFn *fakeTypeFn): getTypeFn(typeFn), getFakeTypeFn(fakeTypeFn) {}
+};
+
+// Lifts a compile-time bool into a type: bool_t<true> derives from
+// std::true_type and bool_t<false> from std::false_type, so a boolean
+// expression can be passed where a trait type is expected (as done with
+// std::conjunction_v in checkStaticTypes).
+template<bool V>
+struct bool_t {};
+template<> struct bool_t<true> : std::true_type {};
+template<> struct bool_t<false> : std::false_type {};
+
+/// Checks the static C++ types `Types` for correctness to catch common error cases.
+template <class... Types>
+constexpr int checkStaticTypes() {
+  // Give nice error messages for some of the common error cases.
+  // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT
+
+  // Every integral type in the pack must be int8_t, int64_t or bool.
+  static_assert(
+      (... && (!std::is_integral_v<Types> || std::is_same_v<Types, int8_t> || std::is_same_v<Types, int64_t> || std::is_same_v<Types, bool>)),
+      "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type");
+  // float is rejected outright.
+  static_assert(
+      (... && (!std::is_same_v<Types, float>)),
+      "INVALID TYPE: float is not supported as an argument type, use double instead");
+  // The value carries no information; it only makes the call usable inside
+  // an expression.
+  return 0;
+}
+
+template <typename... Ts, size_t... Is>
+constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
+  // Check types for common errors; only the static_asserts matter, the
+  // returned value is discarded.
+  static_cast<void>(checkStaticTypes<Ts...>());
+
+  // One ArgumentDef per type, built from the getters for the decayed type.
+  using DefArray = std::array<ArgumentDef, sizeof...(Ts)>;
+  return DefArray{
+      ArgumentDef(&getTypePtrCopy<std::decay_t<Ts>>, &getFakeTypePtrCopy<std::decay_t<Ts>>)...};
+}
+
+/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified
+/// as template arguments.
+// Primary template is empty; only a guts::typelist::typelist<...> argument
+// selects the specialization that provides call().
+template<class ParameterTypes> struct createArguments final {};
+template<class... ParameterTypes>
+struct createArguments<guts::typelist::typelist<ParameterTypes...>> final {
+  // Builds one ArgumentDef per entry of the typelist.
+  static constexpr std::array<ArgumentDef, sizeof...(ParameterTypes)> call() {
+    using Indices = std::make_index_sequence<sizeof...(ParameterTypes)>;
+    return createArgumentVectorFromTypes<ParameterTypes...>(Indices{});
+  }
+};
+
+/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified
+/// as a tuple (i.e. in the way c10 kernels return values).
+/// It can be a tuple<A, B, C> if there's three output arguments with types A, B, C.
+/// It can be an empty tuple<>, or void for kernels that don't return anything.
+/// It can be a single type A (i.e. no tuple) for the case where a kernel just
+/// returns one value.
+// Primary template: intentionally empty, so call() exists only through the
+// specializations below.
+template<class ReturnTypeTuple, class Enable = void> struct createReturns final {};
+
+// std::tuple<...> return type: one ArgumentDef per tuple element.
+template<class... ReturnTypes>
+struct createReturns<std::tuple<ReturnTypes...>, void> final {
+  static constexpr std::array<ArgumentDef, sizeof...(ReturnTypes)> call() {
+    return createArgumentVectorFromTypes<ReturnTypes...>(
+        std::make_index_sequence<sizeof...(ReturnTypes)>()
+    );
+  }
+};
+
+// Any return type that is neither void nor a std::tuple: handled as if it
+// were a one-element tuple.
+template<class ReturnType>
+struct createReturns<ReturnType, std::enable_if_t<!std::is_same_v<void, ReturnType> && !guts::is_instantiation_of<std::tuple, ReturnType>::value>> final {
+  static constexpr std::array<ArgumentDef, 1> call() {
+    return createReturns<std::tuple<ReturnType>>::call();
+  }
+};
+
+// void return type: handled as an empty tuple, i.e. zero returns.
+template<>
+struct createReturns<void, void> final {
+  static constexpr std::array<ArgumentDef, 0> call() {
+    return createReturns<std::tuple<>>::call();
+  }
+};
+
+template <typename ReturnType>
+struct createSingleReturn {
+  // Always yields exactly one ArgumentDef for ReturnType; unlike
+  // createReturns, a std::tuple is not unpacked here.
+  static constexpr std::array<ArgumentDef, 1> call() {
+    using SingleIndex = std::make_index_sequence<1>;
+    return createArgumentVectorFromTypes<ReturnType>(SingleIndex{});
+  }
+};
+
+TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+TORCH_API FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+
+/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a
+/// function. Flattens std::tuple returns into multiple return types
+template <typename FunctionTraits>
+FunctionSchema createFunctionSchemaFromTraitsFlattenedReturns() {
+  using Returns = typename FunctionTraits::return_type;
+  using Params = typename FunctionTraits::parameter_types;
+
+  // Both tables are std::arrays computed at compile time and embedded into
+  // the binary. The only code executed at runtime here is the one that
+  // creates a std::vector of the arguments/returns from those arrays.
+  constexpr auto argumentDefs = createArguments<Params>::call();
+  constexpr auto returnDefs = createReturns<Returns>::call();
+
+  return make_function_schema(argumentDefs, returnDefs);
+}
+
+/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a
+/// function. Preserves std::tuple returns as a Tuple return type
+template <typename FunctionTraits>
+FunctionSchema createFunctionSchemaFromTraitsSingleReturn(std::string&& name, std::string&& overload_name) {
+  using Returns = typename FunctionTraits::return_type;
+  using Params = typename FunctionTraits::parameter_types;
+
+  // Both tables are std::arrays computed at compile time and embedded into
+  // the binary. The only code executed at runtime here is the one that
+  // creates a std::vector of the arguments/returns from those arrays.
+  constexpr auto argumentDefs = createArguments<Params>::call();
+  // createSingleReturn keeps the return type as one entry.
+  constexpr auto returnDefs = createSingleReturn<Returns>::call();
+
+  return make_function_schema(std::move(name), std::move(overload_name), argumentDefs, returnDefs);
+}
+
+}
+
+template<class FuncType>
+FunctionSchema inferFunctionSchemaFlattenedReturns() {
+  // Extract the function traits of FuncType and hand them to the
+  // flattened-returns schema builder.
+  using Traits = guts::infer_function_traits_t<FuncType>;
+  return detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns<Traits>();
+}
+
+template<class FuncType>
+FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& overload_name) {
+  // Extract the function traits of FuncType and hand them, together with the
+  // moved-in names, to the single-return schema builder.
+  using Traits = guts::infer_function_traits_t<FuncType>;
+  return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn<Traits>(std::move(name), std::move(overload_name));
+}
+
+TORCH_API std::optional<std::string> findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified);
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h
new file mode 100644
index 0000000000000000000000000000000000000000..85169f8a1ab8684c84e08188ef66fe9e945ed7ec
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h
@@ -0,0 +1,186 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// TODO: unify to C10_MOBILE. In theory this header could be used in OSS.
+#ifdef TEMPLATE_SELECTIVE_BUILD
+#include <ATen/selected_mobile_ops.h>
+#endif
+
+/**
+ * This header implements functionality to build PyTorch with only a certain
+ * set of operators (+ dependencies) included.
+ *
+ * - Build with -DTORCH_OPERATOR_WHITELIST="aten::add;aten::sub" and only these
+ *   two ops will be included in your build.  The allowlist records operators
+ *   only, no overloads; if you include aten::add, all overloads of aten::add
+ *   will be included.
+ *
+ * Internally, this is done by removing the operator registration calls
+ * using compile time programming, and the linker will then prune all
+ * operator functions that weren't registered.
+ * See Note [Selective build] for more details
+ *
+ * WARNING: The allowlist mechanism doesn't work for all ways you could go about
+ * registering an operator.  If the dispatch key / operator name is not
+ * sufficiently obvious at compile time, then the allowlisting mechanism
+ * will fail (and the operator will be included in the binary anyway).
+ */
+
+#include <string_view>
+#include <c10/core/DispatchKey.h>
+#include <c10/macros/Macros.h>
+
+
+#if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
+#include <ATen/record_function.h>
+#endif
+
+namespace c10::impl {
+
+constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item);  // Forward Declare
+
+/**
+ * In selective build mode returns true/false depending on whether a build
+ * feature is available or not.
+ *
+ * In instrumenting mode (tracing mode), always returns true, and doesn't
+ * trigger any side effects.
+ */
+constexpr bool is_build_feature_available(const char* name) {
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
+  // Selective Build mode.
+#if !defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+  // No allowlist configured: every feature counts as available.
+  (void)name;
+  return true;
+#else
+  // Available only if `name` equals one of the ';'-separated entries of the
+  // stringized TORCH_BUILD_FEATURE_ALLOWLIST macro.
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_BUILD_FEATURE_ALLOWLIST),
+    name);
+#endif
+
+#else
+  // Instrumenting mode.
+  (void)name;
+  return true;
+#endif
+}
+
+[[noreturn]] void build_feature_required_feature_not_available(const char* feature);
+
+/**
+ * Use BUILD_FEATURE_REQUIRED macro in user-code.
+ *
+ * In selective build mode becomes a no-op if the build feature passed
+ * in is available. If not available, throws an exception (c10::Error).
+ * The compiler is able to perform dead code elimination for code
+ * following this method if the build feature is not available.
+ *
+ * In instrumenting mode (tracing mode), registers (as a side effect)
+ * the presence of this specific build feature being triggered.
+ */
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)  // selective build mode
+
+#if defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+#define BUILD_FEATURE_REQUIRED(NAME)                                 \
+  if (!c10::impl::is_build_feature_available(NAME)) {                \
+    ::c10::impl::build_feature_required_feature_not_available(NAME); \
+  }
+#else  // Everything trivially selected
+#define BUILD_FEATURE_REQUIRED(NAME)
+
+#endif
+
+#else  // trace mode
+#define BUILD_FEATURE_REQUIRED(NAME)  \
+  RECORD_FUNCTION_WITH_SCOPE(         \
+      at::RecordScope::BUILD_FEATURE, \
+      std::string(NAME),              \
+      {});
+#endif
+
+// Use this macro, and not is_build_feature_available
+#define BUILD_FEATURE_AVAILABLE(NAME) ::c10::impl::is_build_feature_available(NAME)
+
+// returns true iff allowlist contains item
+// allowlist_contains("a;bc;d", "bc") == true
+constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item) {
+    // Walk the ';'-separated entries from left to right, comparing each one
+    // to `item` as a whole. An empty entry (empty allowlist, leading or
+    // trailing ';', or ";;") matches only an empty `item`.
+    std::string_view remaining = allowlist;
+    for (;;) {
+      const size_t separator = remaining.find(';');
+      if (separator == std::string_view::npos) {
+        // Last entry: nothing left to scan after this comparison.
+        return remaining == item;
+      }
+      if (remaining.substr(0, separator) == item) {
+        return true;
+      }
+      // Drop the entry just examined together with its ';'.
+      remaining.remove_prefix(separator + 1);
+    }
+}
+
+// Returns true iff the given op name is on the allowlist
+// and should be registered
+constexpr bool op_allowlist_check(std::string_view op_name [[maybe_unused]]) {
+  // Sanity checks on the name: it must contain a "::" namespace separator ...
+  assert(op_name.find("::") != std::string_view::npos);
+  // Use assert() instead of throw() due to a gcc bug. See:
+  // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function
+  // https://github.com/fmtlib/fmt/issues/682
+  // ... and must not contain '(' (i.e. it is a name, not a full schema).
+  assert(op_name.find('(') == std::string_view::npos);
+#if !defined(TORCH_OPERATOR_WHITELIST)
+  // If the TORCH_OPERATOR_WHITELIST parameter is not defined,
+  // all ops are to be registered
+  return true;
+#else
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_OPERATOR_WHITELIST),
+    // This function is mainly used for mobile selective build with
+    // root operators, where the overload is included in the allowlist,
+    // so op_name is matched as-is (nothing is stripped from it).
+    op_name);
+    // // Strip overload name (as allowlist doesn't contain overloads)
+    // // Another function based on this may be added when there's usage
+    // // on op names without overload.
+    // OperatorNameView::parse(op_name).name);
+#endif
+}
+
+// Returns true iff the given schema string is on the allowlist
+// and should be registered
+constexpr bool schema_allowlist_check(std::string_view schema) {
+#if defined(TORCH_FORCE_SCHEMA_REGISTRATION)
+  // Forced registration: every schema passes, the allowlist is not consulted.
+  return true;
+#else
+  // Check only the part of the schema before the first '('. If there is no
+  // '(', find() returns npos and substr() keeps the whole string.
+  return op_allowlist_check(schema.substr(0, schema.find('(')));
+#endif
+}
+
+// Returns true iff the given custom class name is on the allowlist
+// and should be registered
+constexpr bool custom_class_allowlist_check(std::string_view custom_class_name [[maybe_unused]]) {
+#if !defined(TORCH_CUSTOM_CLASS_ALLOWLIST)
+  // If the TORCH_CUSTOM_CLASS_ALLOWLIST parameter is not defined,
+  // all custom classes are to be registered
+  return true;
+#else
+  // The name must equal one of the ';'-separated entries of the stringized
+  // TORCH_CUSTOM_CLASS_ALLOWLIST macro.
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_CUSTOM_CLASS_ALLOWLIST),
+    custom_class_name);
+#endif
+}
+
+// schema_allowlist_check() implicitly depends on a macro, TORCH_OPERATOR_WHITELIST.
+// Add this API to pass arbitrary allowlist.
+constexpr bool op_allowlist_contains_name_in_schema(std::string_view allowlist, std::string_view schema) {
+  // Everything before the first '(' is looked up in `allowlist`. If there is
+  // no '(', find() returns npos and substr() keeps the whole string.
+  const auto paren_pos = schema.find('(');
+  const std::string_view name_part = schema.substr(0, paren_pos);
+  return allowlist_contains(allowlist, name_part);
+}
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e5f8ffe59479fb8e8da0dcf4716b6d14c9d15db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h
@@ -0,0 +1,599 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+/**
+ * Include this file if you want to register operators. It includes all
+ * functionality needed to do so for you.
+ */
+
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/CompileTimeFunctionPointer.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/core/op_registration/infer_schema.h>
+#if defined(EXPOSE_C2_OPS) || !defined(CAFFE2_IS_XPLAT_BUILD)
+#include <torch/csrc/jit/frontend/function_schema_parser.h>
+#endif
+#include <ATen/core/ATenOpList.h>
+
+namespace c10 {
+
+namespace detail {
+// Infers a FunctionSchema from a kernel functor type. A leading DispatchKeySet parameter,
+// if present, is stripped first: every schema argument is expected to be convertible
+// to an ivalue, and DispatchKeySet is deliberately kept out of the types the jit knows about.
+// See Note [Plumbing Keys Through The Dispatcher]
+template<class KernelFunctor>
+std::unique_ptr<FunctionSchema> inferFunctionSchemaFromFunctor() {
+  using schema_func_t = typename c10::remove_DispatchKeySet_arg_from_func<KernelFunctor>::func_type;
+  return std::make_unique<FunctionSchema>(inferFunctionSchemaFlattenedReturns<schema_func_t>());
+}
+}
+
+/**
+ * An instance of this class handles the registration for one or more operators.
+ * Make sure you keep the RegisterOperators instance around since it will
+ * deregister the operator it's responsible for in its destructor.
+ *
+ * Example:
+ *
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ * >
+ * > static auto registry = c10::RegisterOperators()
+ * >     .op(c10::RegisterOperators::options()
+ * >         .schema("my_op")
+ * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+ */
+class TORCH_API RegisterOperators final {
+public:
+  RegisterOperators() = default;
+  ~RegisterOperators() = default;
+
+  RegisterOperators(const RegisterOperators&) = delete;
+  RegisterOperators& operator=(const RegisterOperators&) = delete;
+  RegisterOperators(RegisterOperators&&) noexcept = default;
+  RegisterOperators& operator=(RegisterOperators&&) noexcept = default;
+
+  class TORCH_API Options final {
+  public:
+    Options(const Options&) = delete;
+    Options(Options&&) noexcept = delete;
+    Options& operator=(const Options&) = delete;
+    Options& operator=(Options&&) noexcept = delete;
+
+    // internal-only for registering stack based (boxed) kernels: kernel_func is a template argument and is registered for the given dispatch key
+    template<KernelFunction::BoxedKernelFunction* kernel_func>
+    Options&& kernel(DispatchKey dispatch_key) && {
+      return std::move(*this).kernel(dispatch_key, KernelFunction::makeFromBoxedFunction<kernel_func>(), std::nullopt, nullptr);  // no C++ signature and no inferred schema are recorded for boxed kernels
+    }
+
+    // internal-only for registering stack based (boxed) catch-all kernels: same as the boxed kernel<>() overload but without a dispatch key
+    template<KernelFunction::BoxedKernelFunction* kernel_func>
+    Options&& catchAllKernel() && {
+      return std::move(*this).kernel(std::nullopt, KernelFunction::makeFromBoxedFunction<kernel_func>(), std::nullopt, nullptr);  // dispatch key, C++ signature and inferred schema are all left unset
+    }
+
+    // internal only (caffe2 op registration): accepts an already-built FunctionSchema; may be called at most once per Options
+    Options&& schema(FunctionSchema&& schema) {
+      TORCH_CHECK(!schemaOrName_.has_value(), "You can only specify the schema once per operator registration.");
+      schemaOrName_ = std::move(schema);  // stored as the FunctionSchema alternative of the variant
+      return std::move(*this);
+    }
+
+    /**
+     * Use this to specify the schema for an operator. You can also specify
+     * the operator name only to have the function signature part of the
+     * schema be inferred from the kernel function.
+     *
+     * Example:
+     *
+     * > // Infer function signature from my_kernel_cpu
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     * >
+     * >
+     * > // Explicitly specify full schema
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op(Tensor a) -> Tensor")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     */
+    Options&& schema(const std::string& schemaOrName) {
+      TORCH_CHECK(!schemaOrName_.has_value(), "Tried to register operator ", schemaOrName," but specified schema multiple times. You can only specify the schema once per operator registration.");
+
+      #if !defined(EXPOSE_C2_OPS) && defined(CAFFE2_IS_XPLAT_BUILD)  // in this configuration function_schema_parser.h is not included above, so parsing is unavailable
+        throw std::logic_error("Tried to register operator " + schemaOrName + ". We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
+      #else  // parser available: the string may be a bare operator name or a full schema
+        schemaOrName_ = torch::jit::parseSchemaOrName(schemaOrName);
+      #endif
+
+      return std::move(*this);  // hand the same Options object back for further chaining
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU, "some_configuration", 3, true));
+     */
+    template<class KernelFunctor, class... ConstructorParameters>
+    // enable_if: only enable it if KernelFunctor is actually a functor; the static_asserts below additionally require it to derive from OperatorKernel and to be constructible from the given arguments
+    std::enable_if_t<guts::is_functor<KernelFunctor>::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && {
+      static_assert(std::is_base_of_v<OperatorKernel, KernelFunctor>, "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible_v<KernelFunctor, ConstructorParameters...>, "Wrong argument list for constructor of kernel functor. The arguments to kernel<Functor>(arguments...) must match one of the constructors of Functor.");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedFunctor<false, KernelFunctor>(std::make_unique<KernelFunctor>(std::forward<ConstructorParameters>(constructorParameters)...)),  // the functor instance is constructed here from the forwarded arguments
+        impl::CppSignature::make<KernelFunctor>(),
+        detail::inferFunctionSchemaFromFunctor<KernelFunctor>()  // schema inferred from KernelFunctor's function type (minus a leading DispatchKeySet argument)
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<my_kernel_cpu>());
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<my_kernel_cpu>("some_configuration", 3, true));
+     */
+    template<class KernelFunctor, class... ConstructorParameters>
+    // enable_if: only enable it if KernelFunctor is actually a functor; the static_asserts below additionally require it to derive from OperatorKernel and to be constructible from the given arguments
+    std::enable_if_t<guts::is_functor<KernelFunctor>::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && {
+      static_assert(std::is_base_of_v<OperatorKernel, KernelFunctor>, "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible_v<KernelFunctor, ConstructorParameters...>, "Wrong argument list for constructor of kernel functor. The arguments to kernel<Functor>(arguments...) must match one of the constructors of Functor.");
+
+      return std::move(*this).kernel(
+        std::nullopt,  // no dispatch key: this is what makes the kernel a catch-all
+        KernelFunction::makeFromUnboxedFunctor<false, KernelFunctor>(std::make_unique<KernelFunctor>(std::forward<ConstructorParameters>(constructorParameters)...)),  // the functor instance is constructed here from the forwarded arguments
+        impl::CppSignature::make<KernelFunctor>(),
+        detail::inferFunctionSchemaFromFunctor<KernelFunctor>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<decltype(my_kernel_cpu), &my_kernel_cpu>(DispatchKey::CPU));
+     */
+    template<class FuncType, FuncType* kernel_func>
+    // enable_if: only enable it if FuncType is actually a function; boxed (stack based) kernel functions are rejected by the first static_assert
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> kernel(DispatchKey dispatch_key) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");  // kernel_func is a template argument, so this is checked at compile time
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<typename impl::WrapFunctionIntoFunctor<CompileTimeFunctionPointer<FuncType, kernel_func>>::type>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<decltype(my_kernel_cpu), &my_kernel_cpu>());
+     */
+    template<class FuncType, FuncType* kernel_func>
+    // enable_if: only enable it if FuncType is actually a function; boxed (stack based) kernel functions are rejected by the first static_assert
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel() && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");  // kernel_func is a template argument, so this is checked at compile time
+
+      return std::move(*this).kernel(
+        std::nullopt,  // no dispatch key: this is what makes the kernel a catch-all
+        KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<typename impl::WrapFunctionIntoFunctor<CompileTimeFunctionPointer<FuncType, kernel_func>>::type>()
+      );
+    }
+
+    template<class FuncType>
+    // enable_if: only enable it if FuncType is actually a function. Variant of kernel<FuncType, kernel_func>() that takes the function pointer as a runtime argument for the given dispatch key.
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");  // runtime check here, since the pointer is not a template argument
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>()
+      );
+    }
+
+    template<class FuncType>
+    // enable_if: only enable it if FuncType is actually a function. Variant of catchAllKernel<FuncType, kernel_func>() that takes the function pointer as a runtime argument.
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel(FuncType* kernel_func) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");  // runtime check here, since the pointer is not a template argument
+
+      return std::move(*this).kernel(
+        std::nullopt,  // no dispatch key: this is what makes the kernel a catch-all
+        KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU, [] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) whose call signature is not that of a boxed kernel function
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same_v<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>,
+        Options&&> kernel(DispatchKey dispatch_key, Lambda&& functor) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, std::decay_t<Lambda>>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel<Functor>() API instead.");
+
+      // We don't support stateful lambdas (i.e. lambdas with a capture), because their
+      // behavior would be nonobvious. A functor kernel with cache gets a new instance of
+      // its cache each time the kernel is looked up from the dispatch table.
+      // A lambda with a capture would be global and share its capture between all kernel lookups.
+      // So, instead of making users think about it (including the thread-safety
+      // issues this causes), let's just forbid stateful lambdas altogether.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel<Functor>() instead.");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(functor)),
+        impl::CppSignature::make<Lambda>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel([] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) whose call signature is not that of a boxed kernel function
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same_v<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>,
+        Options&&> catchAllKernel(Lambda&& lambda) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, std::decay_t<Lambda>>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel<Functor>() API instead.");
+
+      // We don't support stateful lambdas (i.e. lambdas with a capture), because their
+      // behavior would be nonobvious.
+      // A lambda with a capture would be global and share its capture between all kernel lookups.
+      // This would be a likely source of unexpected race conditions, so we forbid it.
+      // If a kernel really needs global state, it can just use regular global state
+      // in the .cpp file next to the kernel lambda.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel<Functor>() instead.");
+
+      return std::move(*this).kernel(
+        std::nullopt,  // no dispatch key: this is what makes the kernel a catch-all
+        KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<Lambda>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      );
+    }
+
+    Options&& aliasAnalysis(AliasAnalysisKind aliasAnalysisKind) && {  // records the alias analysis kind for this registration
+      TORCH_CHECK(!aliasAnalysisKind_.has_value(), "You can only call aliasAnalysis() once per operator registration.");  // a second call fails this check
+      aliasAnalysisKind_ = aliasAnalysisKind;
+      return std::move(*this);  // hand the same Options object back for further chaining
+    }
+
+  private:
+    Options&& kernel(std::optional<DispatchKey> dispatch_key, KernelFunction&& func, std::optional<impl::CppSignature> cpp_signature, std::unique_ptr<FunctionSchema>&& inferred_function_schema) && {
+      KernelRegistrationConfig entry;  // common sink for every public kernel(...) / catchAllKernel(...) overload
+      entry.inferred_function_schema = std::move(inferred_function_schema);
+      entry.cpp_signature = cpp_signature;
+      entry.func = std::move(func);
+      entry.dispatch_key = dispatch_key;  // std::nullopt when called from a catchAllKernel overload
+      kernels.push_back(std::move(entry));
+      return std::move(*this);
+    }
+
+    Options()  // private: instances are created through RegisterOperators::options() (a friend of this class)
+    : schemaOrName_(std::nullopt)  // no schema or operator name specified yet
+    , aliasAnalysisKind_(std::nullopt)  // aliasAnalysis() not called yet
+    {}
+
+    // KernelRegistrationConfig holds everything recorded for one kernel by a
+    // kernel(...) / catchAllKernel(...) call; Options keeps one per kernel in `kernels`.
+    struct KernelRegistrationConfig final {
+      KernelRegistrationConfig()
+        : dispatch_key(std::nullopt)
+        , cpp_signature(std::nullopt)
+        , inferred_function_schema(nullptr)
+      {}
+
+      std::optional<DispatchKey> dispatch_key;  // std::nullopt for catch-all kernels
+      KernelFunction func;  // the kernel itself
+      std::optional<impl::CppSignature> cpp_signature;  // left as std::nullopt by the boxed-kernel overloads
+      std::unique_ptr<FunctionSchema> inferred_function_schema;  // left as nullptr by the boxed-kernel overloads
+    };
+
+    std::optional<std::variant<OperatorName, FunctionSchema>> schemaOrName_;
+
+    std::vector<KernelRegistrationConfig> kernels;
+    std::optional<AliasAnalysisKind> aliasAnalysisKind_;
+    friend class RegisterOperators;
+    friend class Library;
+  };
+
+  /**
+   * Call this to get an instance of registration options, which
+   * can be passed to a call to RegisterOperators::op() to specify
+   * these options for the operator registration.
+   * See class doc comment for examples.
+   */
+  static Options options() {
+    return {};  // built via Options' private default constructor: no schema, no kernels, no alias analysis kind
+  }
+
+  /**
+   * Call this to register an operator. See class doc comment for examples.
+   */
+  RegisterOperators&& op(Options&& options) && {  // rvalue overload, for chaining on a temporary RegisterOperators
+    checkSchemaAndRegisterOp_(std::move(options));
+    return std::move(*this);
+  }
+
+  // Regular mutator version of the && version above: same registration, but callable on an lvalue and returning an lvalue reference
+  RegisterOperators& op(Options&& options) & {
+    checkSchemaAndRegisterOp_(std::move(options));
+    return *this;
+  }
+
+  /**
+   * This is a shorthand for RegisterOperators::op(Options) where you can
+   * specify the operator schema outside of the options parameter.
+   * See class doc comment for examples.
+   */
+  RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && {
+    return std::move(*this).op(std::move(options).schema(schemaOrName));  // schema() fails its check if `options` already carries a schema
+  }
+
+  // internal only for registering caffe2 ops: same as the string overload, but with an already-built FunctionSchema
+  RegisterOperators&& op(FunctionSchema schema, Options&& options) && {
+    return std::move(*this).op(std::move(options).schema(std::move(schema)));
+  }
+
+  template<class FuncType>
+  explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options())
+  : RegisterOperators() {  // delegate to the default constructor, then perform a single registration
+    std::move(*this).op(schemaOrName, std::forward<FuncType>(func), std::move(options));  // the reference returned by op() is intentionally discarded
+  }
+
+  /**
+   * This API registers an operator based on a kernel function pointer.
+   *
+   * Given a kernel
+   *
+   * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+   *
+   * This API looks like:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", &my_kernel_cpu);
+   *
+   * If your kernel is small and the overhead of calling it matters,
+   * then this API might be the wrong choice since the following API
+   * has a slightly lower overhead for calling into the kernel:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .catchAllKernel<decltype(my_kernel_cpu), &my_kernel_cpu>());
+   *
+   * Or, alternatively, write your kernel as a functor:
+   *
+   * > namespace {
+   * >   class my_kernel_cpu final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * >   };
+   * > }
+   * >
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .catchAllKernel<my_kernel_cpu>());
+   */
+   template<class FuncType>
+   // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction.
+   std::enable_if_t<guts::is_function_type<FuncType>::value && !std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, RegisterOperators&&>
+   op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && {
+     constexpr bool AllowLegacyTypes = true;  // forwarded as the template argument of makeFromUnboxedRuntimeFunction below
+     return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+       std::nullopt,  // no dispatch key: the function is registered as a catch-all kernel
+       KernelFunction::makeFromUnboxedRuntimeFunction<AllowLegacyTypes>(func),
+       impl::CppSignature::make<FuncType>(),
+       // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+       detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>()
+     ));
+   }
+
+   /**
+    * This API registers an operator based on a kernel lambda.
+    *
+    * This API looks like:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", [] (Tensor a, Tensor b) {...});
+    *
+    * This is equivalent to:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", c10::RegisterOperators::options()
+    * >         .catchAllKernel([] (Tensor a, Tensor b) {...}));
+    *
+    */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is actually a stateless lambda
+    std::enable_if_t<guts::is_functor<Lambda>::value && guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, Lambda>, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+
+      constexpr bool AllowLegacyTypes = true;  // forwarded as the template argument of makeFromUnboxedLambda below
+      return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+        std::nullopt,  // no dispatch key: the lambda is registered as a catch-all kernel
+        KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<Lambda>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      ));
+    }
+
+    template<class Lambda>
+    C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.")
+    // enable_if: only enable it if Lambda is actually a functor but not a stateless lambda; the body matches the stateless-lambda overload, only the deprecation marker differs
+    std::enable_if_t<guts::is_functor<Lambda>::value && !guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, Lambda>, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+
+      constexpr bool AllowLegacyTypes = true;  // forwarded as the template argument of makeFromUnboxedLambda below
+      return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+        std::nullopt,  // no dispatch key: the lambda is registered as a catch-all kernel
+        KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<Lambda>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      ));
+    }
+
+private:
+  void checkSchemaAndRegisterOp_(Options&& config);
+
+  static c10::FunctionSchema inferSchemaFromKernels_(const OperatorName& opNameStr, const Options& options);
+  void checkNoDuplicateKernels_(const Options& options);
+  void registerOp_(Options&& options);
+
+  std::vector<RegistrationHandleRAII> registrars_;
+};
+
+} // namespace c10
+
+namespace torch {
+  // Old-style API: keeps torch::RegisterOperators usable as an alias of c10::RegisterOperators
+  using RegisterOperators = c10::RegisterOperators;
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..60e0025a2d63d264c9baef2fce846ae400b73cc5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/sve_helper.h
@@ -0,0 +1,85 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+#include <ATen/cpu/vec/vec_base.h>
+
+#if defined(CPU_CAPABILITY_SVE)
+
+// Define the VLS (vector-length-specific) data types: SVE types fixed to VECTOR_WIDTH * 8 bits.
+typedef svbool_t vls_pred_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svint8_t vls_int8_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svint16_t vls_int16_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svint32_t vls_int32_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svint64_t vls_int64_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svuint8_t vls_uint8_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svuint16_t vls_uint16_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svuint32_t vls_uint32_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svuint64_t vls_uint64_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svfloat16_t vls_float16_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svbfloat16_t vls_bfloat16_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svfloat32_t vls_float32_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+typedef svfloat64_t vls_float64_t
+    __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8)));
+
+#define ptrue svptrue_b8() // all-true predicate, used as the governing predicate for SVE intrinsics
+#define ZERO_S8 svdup_n_s8(0) // ZERO_*: vector with every lane set to zero
+#define ZERO_S16 svdup_n_s16(0)
+#define ZERO_S32 svdup_n_s32(0)
+#define ZERO_S64 svdup_n_s64(0)
+#define ZERO_U8 svdup_n_u8(0)
+#define ZERO_U16 svdup_n_u16(0)
+#define ZERO_U32 svdup_n_u32(0)
+#define ZERO_U64 svdup_n_u64(0)
+#define ZERO_F16 svdup_n_f16(0.f)
+#define ZERO_F32 svdup_n_f32(0.f)
+#define ZERO_F64 svdup_n_f64(0.0)
+#define ONE_S8 svdup_n_s8(1) // ONE_*: vector with every lane set to one
+#define ONE_S16 svdup_n_s16(1)
+#define ONE_S32 svdup_n_s32(1)
+#define ONE_S64 svdup_n_s64(1)
+#define ONE_U8 svdup_n_u8(1)
+#define ONE_U16 svdup_n_u16(1)
+#define ONE_U32 svdup_n_u32(1)
+#define ONE_U64 svdup_n_u64(1)
+#define ONE_F16 svdup_n_f16(1.f)
+#define ONE_BF16 svdup_n_bf16(1.f) // NOTE(review): no ZERO_BF16 counterpart is defined in this list
+#define ONE_F32 svdup_n_f32(1.f)
+#define ONE_F64 svdup_n_f64(1.0)
+#define ALL_S8_TRUE_MASK svdup_n_s8(0xff) // signed *_TRUE_MASK: every bit of every lane set; *_FALSE_MASK: all bits clear
+#define ALL_S8_FALSE_MASK svdup_n_s8(0x0)
+#define ALL_S16_TRUE_MASK svdup_n_s16(0xffff)
+#define ALL_S16_FALSE_MASK svdup_n_s16(0x0)
+#define ALL_S32_TRUE_MASK svdup_n_s32(0xffffffff)
+#define ALL_S32_FALSE_MASK svdup_n_s32(0x0)
+#define ALL_S64_TRUE_MASK svdup_n_s64(0xffffffffffffffff)
+#define ALL_S64_FALSE_MASK svdup_n_s64(0x0)
+#define ALL_U8_TRUE_MASK svdup_n_u8(0x01) // NOTE(review): 0x01 per lane, unlike the all-bits-set signed masks above; confirm this is intended
+#define ALL_U8_FALSE_MASK svdup_n_u8(0x00)
+#define ALL_F16_TRUE_MASK svreinterpret_f16_s16(ALL_S16_TRUE_MASK) // floating-point masks reinterpret the bits of the same-width signed masks
+#define ALL_F16_FALSE_MASK svreinterpret_f16_s16(ALL_S16_FALSE_MASK)
+#define ALL_BF16_TRUE_MASK svreinterpret_bf16_s16(ALL_S16_TRUE_MASK)
+#define ALL_BF16_FALSE_MASK svreinterpret_bf16_s16(ALL_S16_FALSE_MASK)
+#define ALL_F32_TRUE_MASK svreinterpret_f32_s32(ALL_S32_TRUE_MASK)
+#define ALL_F32_FALSE_MASK svreinterpret_f32_s32(ALL_S32_FALSE_MASK)
+#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
+#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb712e8d7ee510503f0a812fdcd4617b7678922a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -0,0 +1,598 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#include <ATen/cpu/vec/sve/vec_float.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/bit_cast.h>
+#include <cmath>
+namespace at {
+namespace vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
+
+template <>
+struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {}; // trait is true for BFloat16 in this build
+
+template <>
+class Vectorized<BFloat16> { // SVE vector of BFloat16 lanes; most math below is routed through fp32
+ private:
+  vls_bfloat16_t values; // the lanes themselves
+
+ public:
+  using value_type = BFloat16;
+  using size_type = int;
+
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(BFloat16); // number of BFloat16 lanes
+  }
+  // Constructors without a body here are defined out of line later in this header.
+  Vectorized();
+  Vectorized(svbfloat16_t v) : values(v) {}
+  Vectorized(int val); // NOTE(review): Vectorized(1.0f) presumably resolves to this overload (float->int) — confirm
+  Vectorized(BFloat16 val);
+
+  template <
+      typename... Args,
+      typename = std::enable_if_t<(sizeof...(Args) == size())>>
+  Vectorized(Args... vals) {
+    __at_align__ BFloat16 buffer[size()] = {vals...}; // exactly one argument per lane
+    values = svld1_bf16(ptrue, reinterpret_cast<const bfloat16_t*>(buffer));
+  }
+
+  operator svbfloat16_t() const {
+    return values;
+  }
+  static Vectorized<BFloat16> blendv(
+      const Vectorized<BFloat16>& a,
+      const Vectorized<BFloat16>& b,
+      const Vectorized<BFloat16>& mask_) {
+    svbool_t mask = // a lane counts as selected only if its 16 mask bits equal ALL_S16_TRUE_MASK
+        svcmpeq_s16(ptrue, svreinterpret_s16_bf16(mask_), ALL_S16_TRUE_MASK);
+    return svsel_bf16(mask, b, a); // selected lanes come from b, the rest from a
+  }
+  template <typename step_t>
+  static Vectorized<BFloat16> arange(
+      BFloat16 base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    __at_align__ BFloat16 buffer[size()];
+    for (int64_t i = 0; i < size(); i++) {
+      buffer[i] = base + i * step;
+    }
+    return svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer));
+  }
+  static Vectorized<BFloat16> set(
+      const Vectorized<BFloat16>& a,
+      const Vectorized<BFloat16>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    } else if (count < size()) {
+      return svsel_bf16(svwhilelt_b16(0ull, count), b, a); // first `count` lanes from b, remainder from a
+    }
+    return b;
+  }
+  static Vectorized<BFloat16> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return svld1_bf16(ptrue, reinterpret_cast<const bfloat16_t*>(ptr));
+    svbool_t pg = svwhilelt_b16(0ull, count); // predicate enabling only the first `count` lanes
+    return svld1_bf16(pg, reinterpret_cast<const bfloat16_t*>(ptr));
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    __at_align__ bfloat16_t tmp[size()]; // lanes are staged here, then `count` elements are copied to ptr
+    std::memset(tmp, 0, sizeof(tmp));
+    if (count == size()) {
+      svst1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(tmp), values);
+    } else {
+      svbool_t pg = svwhilelt_b16(0ull, count);
+      svst1_bf16(pg, reinterpret_cast<bfloat16_t*>(tmp), values);
+    }
+    std::memcpy(
+        reinterpret_cast<bfloat16_t*>(ptr),
+        reinterpret_cast<const bfloat16_t*>(tmp),
+        count * sizeof(bfloat16_t));
+  }
+  const BFloat16& operator[](int idx) const = delete; // per-element indexing is deliberately unavailable
+  BFloat16& operator[](int idx) = delete;
+  int64_t zero_mask() const {
+    int64_t mask = 0;
+    // returns an integer mask where all zero elements are translated to
+    // 1-bit and others are translated to 0-bit
+    __at_align__ int16_t mask_array[size()];
+
+    svbool_t svbool_mask = // the bf16 bits are reinterpreted as f16 for the comparison with zero
+        svcmpeq_f16(ptrue, svreinterpret_f16_bf16(values), ZERO_F16);
+    svst1_s16(
+        ptrue,
+        mask_array,
+        svsel_s16(svbool_mask, ALL_S16_TRUE_MASK, ALL_S16_FALSE_MASK));
+    for (int64_t i = 0; i < size(); ++i) {
+      if (mask_array[i])
+        mask |= (1ull << i); // bit i corresponds to lane i
+    }
+    return mask;
+  }
+  Vectorized<BFloat16> isnan() const;
+  bool has_inf_nan() const;
+  Vectorized<BFloat16> map(BFloat16 (*f)(BFloat16)) const {
+    __at_align__ BFloat16 tmp[size()];
+    store(tmp);
+    for (int64_t i = 0; i < size(); ++i) {
+      tmp[i] = f(tmp[i]); // scalar callback applied lane by lane
+    }
+    return loadu(tmp);
+  }
+  Vectorized<BFloat16> abs() const {
+    auto mask = svdup_n_u16(0x7FFF); // every bit except the sign bit
+    auto vals = svreinterpret_u16_bf16(values);
+    vals = svand_u16_x(ptrue, vals, mask);
+    return svreinterpret_bf16_u16(vals);
+  }
+  Vectorized<BFloat16> angle() const;
+  Vectorized<BFloat16> real() const {
+    return values;
+  }
+  Vectorized<BFloat16> imag() const {
+    return Vectorized<BFloat16>(0.f);
+  }
+  Vectorized<BFloat16> conj() const {
+    return values;
+  }
+  Vectorized<BFloat16> acos() const;
+  Vectorized<BFloat16> acosh() const;
+  Vectorized<BFloat16> asin() const;
+  Vectorized<BFloat16> atan() const;
+  Vectorized<BFloat16> atanh() const;
+  Vectorized<BFloat16> atan2(const Vectorized<BFloat16>& b) const;
+  Vectorized<BFloat16> copysign(const Vectorized<BFloat16>& sign) const;
+  Vectorized<BFloat16> erf() const;
+  Vectorized<BFloat16> erfc() const;
+  Vectorized<BFloat16> erfinv() const;
+  Vectorized<BFloat16> exp() const;
+  Vectorized<BFloat16> exp2() const;
+  Vectorized<BFloat16> expm1() const;
+  Vectorized<BFloat16> exp_u20() const {
+    return exp(); // no separate reduced-precision path: forwards to exp()
+  }
+  Vectorized<BFloat16> fexp_u20() const {
+    return exp(); // likewise forwards to exp()
+  }
+  Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
+  Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
+  Vectorized<BFloat16> i0() const;
+  Vectorized<BFloat16> i0e() const;
+  Vectorized<BFloat16> digamma() const;
+  Vectorized<BFloat16> igamma(const Vectorized<BFloat16>& x) const;
+  Vectorized<BFloat16> igammac(const Vectorized<BFloat16>& x) const;
+  Vectorized<BFloat16> nextafter(const Vectorized<BFloat16>& b) const;
+  Vectorized<BFloat16> log() const;
+  Vectorized<BFloat16> log2() const;
+  Vectorized<BFloat16> log10() const;
+  Vectorized<BFloat16> log1p() const;
+  Vectorized<BFloat16> frac() const;
+  Vectorized<BFloat16> sin() const;
+  Vectorized<BFloat16> sinh() const;
+  Vectorized<BFloat16> cos() const;
+  Vectorized<BFloat16> cosh() const;
+  Vectorized<BFloat16> ceil() const;
+  Vectorized<BFloat16> floor() const;
+  Vectorized<BFloat16> neg() const {
+    auto mask = svdup_n_u16(0x8000); // the sign bit only
+    auto vals = svreinterpret_u16_bf16(values);
+    vals = sveor_u16_x(ptrue, vals, mask); // XOR flips the sign
+    return svreinterpret_bf16_u16(vals);
+  }
+  Vectorized<BFloat16> round() const;
+  Vectorized<BFloat16> tan() const;
+  Vectorized<BFloat16> tanh() const;
+  Vectorized<BFloat16> trunc() const;
+  Vectorized<BFloat16> lgamma() const;
+  Vectorized<BFloat16> sqrt() const;
+  Vectorized<BFloat16> reciprocal() const;
+  Vectorized<BFloat16> rsqrt() const;
+  Vectorized<BFloat16> pow(const Vectorized<BFloat16>& b) const;
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<BFloat16> operator==(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> operator!=(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> operator<(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> operator<=(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> operator>(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> operator>=(const Vectorized<BFloat16>& other) const;
+
+  Vectorized<BFloat16> eq(const Vectorized<BFloat16>& other) const; // eq..le: the operator result ANDed with 1.0
+  Vectorized<BFloat16> ne(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> gt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ge(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> lt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+};
+
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
+  // Widen bf16 to fp32 by zipping each bf16 lane with a zero 16-bit lane.
+  static_assert(
+      Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
+  auto zeros = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
+  auto first = svreinterpret_f32_bf16(svzip1_bf16(zeros, a));
+  auto second = svreinterpret_f32_bf16(svzip2_bf16(zeros, a));
+  // zip1 feeds the first tuple element, zip2 the second.
+  return {Vectorized<float>(first), Vectorized<float>(second)};
+}
+
+inline Vectorized<c10::BFloat16> convert_float_bfloat16(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Narrow both fp32 vectors to bf16, then pack their even 16-bit lanes.
+  static_assert(
+      Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
+  return Vectorized<c10::BFloat16>(svuzp1_bf16(
+      svcvt_bf16_f32_z(ptrue, a), svcvt_bf16_f32_z(ptrue, b)));
+}
+
+inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
+  // Converts one fp32 vector's worth of bf16 scalars through an aligned buffer.
+  __at_align__ float widened[Vectorized<float>::size()];
+  for (int k = 0; k < Vectorized<float>::size(); ++k)
+    widened[k] = data[k];
+  out = Vectorized<float>::loadu(widened);
+}
+
+inline void load_fp32_from_bf16(
+    const BFloat16* data,
+    Vectorized<float>& out1,
+    Vectorized<float>& out2) {
+  // Loads one full bf16 vector and returns its two widened fp32 parts:
+  // out1 takes tuple element 0 of the conversion, out2 takes element 1.
+  std::tie(out1, out2) =
+      convert_bfloat16_float(Vectorized<BFloat16>::loadu(data));
+}
+
+template <typename Op>
+Vectorized<c10::BFloat16> binary_operator_via_float(
+    Op op, // callable taking two Vectorized<float> operands
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  const auto [a_float_low, a_float_high] = convert_bfloat16_float(a);
+  const auto [b_float_low, b_float_high] = convert_bfloat16_float(b);
+  return convert_float_bfloat16(
+      op(a_float_low, b_float_low), op(a_float_high, b_float_high)); // op runs per fp32 part, result narrowed to bf16
+}
+
+template <>
+Vectorized<c10::BFloat16> inline operator+(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  return binary_operator_via_float(std::plus<>{}, a, b); // add in fp32
+}
+
+template <>
+Vectorized<c10::BFloat16> inline operator-(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  return binary_operator_via_float(std::minus<>{}, a, b); // subtract in fp32
+}
+
+template <>
+Vectorized<c10::BFloat16> inline operator*(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  return binary_operator_via_float(std::multiplies<>{}, a, b); // multiply in fp32
+}
+
+template <>
+Vectorized<c10::BFloat16> inline operator/(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  return binary_operator_via_float(std::divides<>{}, a, b); // divide in fp32
+}
+
+inline Vectorized<BFloat16>::Vectorized() {
+  const Vectorized<float> zeros(svdup_n_f32(0)); // all fp32 lanes zero
+  values = convert_float_bfloat16(zeros, zeros);
+}
+
+inline Vectorized<BFloat16>::Vectorized(int val) {
+  const Vectorized<float> splat(svdup_n_f32(val)); // int broadcast as fp32
+  values = convert_float_bfloat16(splat, splat);
+}
+
+inline Vectorized<BFloat16>::Vectorized(BFloat16 val) {
+  const Vectorized<float> splat(svdup_n_f32(static_cast<float>(val)));
+  values = convert_float_bfloat16(splat, splat); // narrow the broadcast back
+}
+
+bool inline Vectorized<c10::BFloat16>::has_inf_nan() const {
+  auto parts = convert_bfloat16_float(values); // two widened fp32 vectors
+  return std::get<0>(parts).has_inf_nan() || std::get<1>(parts).has_inf_nan();
+}
+// frac: x - trunc(x). Defined here, after operator-, so it can use subtraction.
+Vectorized<BFloat16> inline Vectorized<BFloat16>::frac() const {
+  return *this - this->trunc();
+}
+// Defines member func_name(): widen to two fp32 vectors, call func_name() on each, narrow back.
+#define DEFINE_BF16_FUNC_VIA_FLOAT(func_name)                           \
+  Vectorized<BFloat16> inline Vectorized<BFloat16>::func_name() const { \
+    auto [v1, v2] = convert_bfloat16_float(*this);                      \
+    v1 = v1.func_name();                                                \
+    v2 = v2.func_name();                                                \
+    return convert_float_bfloat16(v1, v2);                              \
+  }
+// One-argument variant: both *this and the argument are widened before calling func_name().
+#define DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(func_name)            \
+  Vectorized<BFloat16> inline Vectorized<BFloat16>::func_name( \
+      const Vectorized<BFloat16>& a) const {                   \
+    auto [v1, v2] = convert_bfloat16_float(*this);             \
+    auto [v3, v4] = convert_bfloat16_float(a);                 \
+    v1 = v1.func_name(v3);                                     \
+    v2 = v2.func_name(v4);                                     \
+    return convert_float_bfloat16(v1, v2);                     \
+  }
+
+DEFINE_BF16_FUNC_VIA_FLOAT(isnan) // out-of-line bodies for members declared in Vectorized<BFloat16>
+DEFINE_BF16_FUNC_VIA_FLOAT(angle)
+DEFINE_BF16_FUNC_VIA_FLOAT(acos)
+DEFINE_BF16_FUNC_VIA_FLOAT(acosh)
+DEFINE_BF16_FUNC_VIA_FLOAT(asin)
+DEFINE_BF16_FUNC_VIA_FLOAT(atan)
+DEFINE_BF16_FUNC_VIA_FLOAT(atanh)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign)
+DEFINE_BF16_FUNC_VIA_FLOAT(erf)
+DEFINE_BF16_FUNC_VIA_FLOAT(erfc)
+DEFINE_BF16_FUNC_VIA_FLOAT(erfinv) // declared in the class but previously had no definition; assumes Vectorized<float>::erfinv()
+DEFINE_BF16_FUNC_VIA_FLOAT(exp)
+DEFINE_BF16_FUNC_VIA_FLOAT(exp2)
+DEFINE_BF16_FUNC_VIA_FLOAT(expm1)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot)
+DEFINE_BF16_FUNC_VIA_FLOAT(i0)
+DEFINE_BF16_FUNC_VIA_FLOAT(i0e)
+DEFINE_BF16_FUNC_VIA_FLOAT(digamma)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter)
+DEFINE_BF16_FUNC_VIA_FLOAT(log)
+DEFINE_BF16_FUNC_VIA_FLOAT(log2)
+DEFINE_BF16_FUNC_VIA_FLOAT(log10)
+DEFINE_BF16_FUNC_VIA_FLOAT(log1p)
+DEFINE_BF16_FUNC_VIA_FLOAT(sin)
+DEFINE_BF16_FUNC_VIA_FLOAT(sinh)
+DEFINE_BF16_FUNC_VIA_FLOAT(cos)
+DEFINE_BF16_FUNC_VIA_FLOAT(cosh)
+DEFINE_BF16_FUNC_VIA_FLOAT(ceil)
+DEFINE_BF16_FUNC_VIA_FLOAT(floor)
+DEFINE_BF16_FUNC_VIA_FLOAT(round)
+DEFINE_BF16_FUNC_VIA_FLOAT(tan)
+DEFINE_BF16_FUNC_VIA_FLOAT(tanh)
+DEFINE_BF16_FUNC_VIA_FLOAT(trunc)
+DEFINE_BF16_FUNC_VIA_FLOAT(lgamma)
+DEFINE_BF16_FUNC_VIA_FLOAT(sqrt)
+DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal)
+DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt)
+DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow)
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator==(
+    const Vectorized<BFloat16>& other) const {
+  // Compare as fp32, turn each predicate into all-ones / all-zeros 32-bit
+  // lanes, then keep the even 16-bit lanes of both parts as the bf16 mask.
+  auto [a_lo, a_hi] = convert_bfloat16_float(values);
+  auto [b_lo, b_hi] = convert_bfloat16_float(other);
+  auto lo = svsel_f32(
+      svcmpeq_f32(ptrue, a_lo, b_lo), ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  auto hi = svsel_f32(
+      svcmpeq_f32(ptrue, a_hi, b_hi), ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  return svuzp1_bf16(
+      svreinterpret_bf16_f32(lo), svreinterpret_bf16_f32(hi));
+}
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator!=(
+    const Vectorized<BFloat16>& other) const {
+  // Compare as fp32, turn each predicate into all-ones / all-zeros 32-bit
+  // lanes, then keep the even 16-bit lanes of both parts as the bf16 mask.
+  auto [a_lo, a_hi] = convert_bfloat16_float(values);
+  auto [b_lo, b_hi] = convert_bfloat16_float(other);
+  auto lo = svsel_f32(
+      svcmpne_f32(ptrue, a_lo, b_lo), ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  auto hi = svsel_f32(
+      svcmpne_f32(ptrue, a_hi, b_hi), ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  return svuzp1_bf16(
+      svreinterpret_bf16_f32(lo), svreinterpret_bf16_f32(hi));
+}
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator>(
+    const Vectorized<BFloat16>& other) const {
+  auto [v1, v2] = convert_bfloat16_float(*this);
+  auto [v3, v4] = convert_bfloat16_float(other);
+  return convert_float_bfloat16(v1 > v3, v2 > v4); // NOTE(review): fp32 compare result goes through the numeric fp32->bf16 convert, whereas ==/!= repack bits with svuzp1 — confirm lane values match
+}
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator>=(
+    const Vectorized<BFloat16>& other) const {
+  auto [v1, v2] = convert_bfloat16_float(*this);
+  auto [v3, v4] = convert_bfloat16_float(other);
+  return convert_float_bfloat16(v1 >= v3, v2 >= v4); // same numeric-convert path as operator>
+}
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator<(
+    const Vectorized<BFloat16>& other) const {
+  auto [v1, v2] = convert_bfloat16_float(*this);
+  auto [v3, v4] = convert_bfloat16_float(other);
+  return convert_float_bfloat16(v1 < v3, v2 < v4); // same numeric-convert path as operator>
+}
+Vectorized<BFloat16> inline Vectorized<BFloat16>::operator<=(
+    const Vectorized<BFloat16>& other) const {
+  auto [v1, v2] = convert_bfloat16_float(*this);
+  auto [v3, v4] = convert_bfloat16_float(other);
+  return convert_float_bfloat16(v1 <= v3, v2 <= v4); // same numeric-convert path as operator>
+}
+
+// IEEE 754-2019 `maximum`: the result is NaN whenever either input is NaN.
+// Evaluated by the fp32 overload on each widened part, then narrowed to bf16.
+template <>
+Vectorized<BFloat16> inline maximum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return binary_operator_via_float(
+      [](const Vectorized<float>& x, const Vectorized<float>& y) {
+        return maximum(x, y);
+      },
+      a, b);
+}
+
+// IEEE 754-2019 `minimum`: the result is NaN whenever either input is NaN.
+// Evaluated by the fp32 overload on each widened part, then narrowed to bf16.
+template <>
+Vectorized<BFloat16> inline minimum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return binary_operator_via_float(
+      [](const Vectorized<float>& x, const Vectorized<float>& y) {
+        return minimum(x, y);
+      },
+      a, b);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_max(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& max) {
+  return binary_operator_via_float(
+      [](const Vectorized<float>& x, const Vectorized<float>& hi) {
+        return clamp_max(x, hi); // the fp32 overload does the clamping
+      },
+      a, max);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_min(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min) {
+  return binary_operator_via_float(
+      [](const Vectorized<float>& x, const Vectorized<float>& lo) {
+        return clamp_min(x, lo); // the fp32 overload does the clamping
+      },
+      a, min);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min,
+    const Vectorized<BFloat16>& max) {
+  return clamp_min(clamp_max(a, max), min); // upper bound is applied first, then the lower bound
+}
+
+template <>
+Vectorized<BFloat16> inline operator&(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return svreinterpret_bf16_u16( // bitwise AND on the raw 16-bit lane patterns
+      svand_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b)));
+}
+
+template <>
+Vectorized<BFloat16> inline operator|(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return svreinterpret_bf16_u16( // bitwise OR on the raw 16-bit lane patterns
+      svorr_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b)));
+}
+
+template <>
+Vectorized<BFloat16> inline operator^(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return svreinterpret_bf16_u16( // bitwise XOR on the raw 16-bit lane patterns
+      sveor_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b)));
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::eq(
+    const Vectorized<BFloat16>& other) const {
+  return (*this == other) & Vectorized<BFloat16>(1.0f); // comparison result ANDed with the bit pattern of 1.0
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::ne(
+    const Vectorized<BFloat16>& other) const {
+  return (*this != other) & Vectorized<BFloat16>(1.0f); // same masking as eq
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::gt(
+    const Vectorized<BFloat16>& other) const {
+  return (*this > other) & Vectorized<BFloat16>(1.0f); // same masking as eq
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::ge(
+    const Vectorized<BFloat16>& other) const {
+  return (*this >= other) & Vectorized<BFloat16>(1.0f); // same masking as eq
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::lt(
+    const Vectorized<BFloat16>& other) const {
+  return (*this < other) & Vectorized<BFloat16>(1.0f); // same masking as eq
+}
+
+Vectorized<BFloat16> inline Vectorized<BFloat16>::le(
+    const Vectorized<BFloat16>& other) const {
+  return (*this <= other) & Vectorized<BFloat16>(1.0f); // same masking as eq
+}
+
+template <>
+inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
+  // Straight bf16 -> bf16 copy done with SVE loads/stores: whole vectors
+  // first, then a single predicated step for whatever is left over.
+  const auto* in = reinterpret_cast<const bfloat16_t*>(src);
+  auto* out = reinterpret_cast<bfloat16_t*>(dst);
+  constexpr int64_t kLanes = Vectorized<BFloat16>::size();
+  // n with its remainder modulo kLanes removed.
+  const int64_t full_end = n - n % kLanes;
+
+#pragma unroll
+  for (int64_t i = 0; i < full_end; i += kLanes) {
+    // Non-temporal load, regular store, every lane active.
+    svst1_bf16(ptrue, out + i, svldnt1_bf16(ptrue, in + i));
+  }
+
+  // Fewer than kLanes elements remain at this point, so one predicated
+  // load/store pair finishes the copy.
+  if (full_end < n) {
+    // The predicate enables one lane per remaining element, i.e. the
+    // indices in [full_end, n).
+    svbool_t pg = svwhilelt_b16(full_end, n);
+    svst1_bf16(pg, out + full_end, svldnt1_bf16(pg, in + full_end));
+  }
+}
+
+template <>
+Vectorized<BFloat16> inline fmadd(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b,
+    const Vectorized<BFloat16>& c) {
+  return a * b + c; // not fused: operator* narrows the product to bf16 before operator+ runs
+}
+
+#endif // defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h
new file mode 100644
index 0000000000000000000000000000000000000000..d11be323e05416cb0d7ef821e8bd0dde7ad1d0c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_common_sve.h
@@ -0,0 +1,241 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with SVE]
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+#include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+#if defined(CPU_CAPABILITY_SVE)
+#include <ATen/cpu/vec/sve/vec_bfloat16.h>
+#include <ATen/cpu/vec/sve/vec_double.h>
+#include <ATen/cpu/vec/sve/vec_float.h>
+#include <ATen/cpu/vec/sve/vec_int.h>
+#include <ATen/cpu/vec/sve/vec_qint.h>
+#endif
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE)
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix)                 \
+  template <>                                                             \
+  inline Vectorized<t1_t> cast<t1_t, t2_t>(const Vectorized<t2_t>& src) { \
+    return svreinterpret_##t1_prefix##_##t2_prefix(src);                  \
+  }                                                                       \
+  template <>                                                             \
+  inline Vectorized<t2_t> cast<t2_t, t1_t>(const Vectorized<t1_t>& src) { \
+    return svreinterpret_##t2_prefix##_##t1_prefix(src);                  \
+  } // emits cast<> in both directions; each is a svreinterpret (bit pattern kept, no value conversion)
+
+DEFINE_SVE_CAST(int64_t, s64, double, f64)
+DEFINE_SVE_CAST(int32_t, s32, double, f64) // element widths differ; the register bits are still reinterpreted as-is
+DEFINE_SVE_CAST(int16_t, s16, double, f64)
+DEFINE_SVE_CAST(int64_t, s64, float, f32)
+DEFINE_SVE_CAST(int32_t, s32, float, f32)
+DEFINE_SVE_CAST(int16_t, s16, float, f32)
+DEFINE_SVE_CAST(float, f32, double, f64)
+
+#ifdef __ARM_FEATURE_BF16
+DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
+DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
+DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
+#endif // __ARM_FEATURE_BF16
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        double>> inline gather(const double* base_addr, const Vectorized<int64_t>& vindex_) {
+  svint64_t offsets = svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale));
+  offsets = svasrd_n_s64_x(ptrue, offsets, 3); // divide by 8 == sizeof(double)
+  return svld1_gather_s64index_f64(ptrue, base_addr, offsets);
+}
+
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        float>> inline gather(const float* base_addr, const Vectorized<int32_t>& vindex_) {
+  svint32_t offsets = svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale));
+  offsets = svasrd_n_s32_x(ptrue, offsets, 2); // divide by 4 == sizeof(float)
+  return svld1_gather_s32index_f32(ptrue, base_addr, offsets);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>> inline mask_gather(
+        const Vectorized<double>& src,
+        const double* base_addr,
+        const Vectorized<int64_t>& vindex_,
+        const Vectorized<double>& mask_) {
+  svbool_t active =
+      svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK);
+  svint64_t offsets = svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale));
+  offsets = svasrd_n_s64_x(ptrue, offsets, 3); // divide by 8 == sizeof(double)
+  svfloat64_t loaded = svld1_gather_s64index_f64(active, base_addr, offsets);
+  return svsel_f64(active, loaded, src); // lanes whose mask is not all-ones keep src
+}
+
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>> inline mask_gather(
+        const Vectorized<float>& src,
+        const float* base_addr,
+        const Vectorized<int32_t>& vindex_,
+        const Vectorized<float>& mask_) {
+  svbool_t active =
+      svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK);
+  svint32_t offsets = svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale));
+  offsets = svasrd_n_s32_x(ptrue, offsets, 2); // divide by 4 == sizeof(float)
+  svfloat32_t loaded = svld1_gather_s32index_f32(active, base_addr, offsets);
+  return svsel_f32(active, loaded, src); // lanes whose mask is not all-ones keep src
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Magic-number conversion; only valid for inputs in the range [-2^51, 2^51].
+// From: https://stackoverflow.com/a/41148578
+template <>
+Vectorized<int64_t> inline convert_to_int_of_same_size<double>(
+    const Vectorized<double>& src) {
+  // The integer literal converts to the double 2^52 + 2^51.
+  svfloat64_t magic = svdup_n_f64(0x0018000000000000);
+  svfloat64_t biased = svadd_f64_x(ptrue, src, magic);
+  return svsub_s64_x(
+      ptrue, svreinterpret_s64_f64(biased), svreinterpret_s64_f64(magic));
+}
+
+template <>
+Vectorized<int32_t> inline convert_to_int_of_same_size<float>(
+    const Vectorized<float>& src) {
+  return svcvt_s32_f32_x(ptrue, src); // direct fp32 -> int32 convert intrinsic, all lanes active
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // Shown for four lanes:
+  //   a = {a0, a1, a2, a3}
+  //   b = {b0, b1, b2, b3}
+  // zip1 pairs up the low halves and zip2 the high halves:
+  //   first  = {a0, b0, a1, b1}
+  //   second = {a2, b2, a3, b3}
+  Vectorized<double> first(svzip1_f64(a, b));
+  Vectorized<double> second(svzip2_f64(a, b));
+  return {first, second};
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Shown for eight lanes, a = {a0, ..., a7} and b = {b0, ..., b7}.
+  // zip1 pairs up the low halves and zip2 the high halves:
+  //   first  = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   second = {a4, b4, a5, b5, a6, b6, a7, b7}
+  Vectorized<float> first(svzip1_f32(a, b));
+  Vectorized<float> second(svzip2_f32(a, b));
+  // Returned in that order as a pair.
+  return {first, second};
+}
+
+#ifdef __ARM_FEATURE_BF16
+template <>
+std::pair<
+    Vectorized<c10::BFloat16>,
+    Vectorized<c10::BFloat16>> inline interleave2<c10::
+                                                      BFloat16>(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  // inputs (illustrated with 8 lanes):
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5, a6, b6, a7, b7}
+  return std::make_pair(
+      Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
+      Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
+}
+#endif // __ARM_FEATURE_BF16
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // Shown for four lanes, with the inputs holding interleaved pairs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+  // uzp1 collects the even lanes and uzp2 the odd lanes of a followed by b:
+  //   evens = {a0, a1, a2, a3}
+  //   odds  = {b0, b1, b2, b3}
+  Vectorized<double> evens(svuzp1_f64(a, b));
+  Vectorized<double> odds(svuzp2_f64(a, b));
+  return {evens, odds};
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Shown for eight lanes, with the inputs holding interleaved pairs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+  // uzp1 yields {a0, ..., a7} (even lanes), uzp2 yields {b0, ..., b7} (odd).
+  Vectorized<float> evens(svuzp1_f32(a, b));
+  Vectorized<float> odds(svuzp2_f32(a, b));
+  // Returned in that order as a pair.
+  return {evens, odds};
+}
+
+#ifdef __ARM_FEATURE_BF16
+template <>
+std::pair<
+    Vectorized<c10::BFloat16>,
+    Vectorized<c10::BFloat16>> inline deinterleave2<c10::
+                                                        BFloat16>(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  // inputs (illustrated with 8 lanes):
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+  // swap lanes:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7}
+  return std::make_pair(
+      Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
+      Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
+}
+#endif // __ARM_FEATURE_BF16
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..8abd6d275e80db7658c8c187ccc78031b6c600b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_double.h
@@ -0,0 +1,622 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <cmath>
+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE)
+
+template <> // Trait flag: evaluates to true for double in this SVE build.
+struct is_vec_specialized_for<double> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<double> { // SVE specialization: wraps one vls_float64_t of doubles
+ private:
+  vls_float64_t values; // underlying SVE vector; vls_float64_t is declared outside this header
+
+ public:
+  using value_type = double;
+  using size_type = int;
+  static constexpr size_type size() { // lane count: VECTOR_WIDTH / sizeof(double)
+    return VECTOR_WIDTH / sizeof(double);
+  }
+  Vectorized() { // default-construct with every lane set to 0
+    values = svdup_n_f64(0);
+  }
+  Vectorized(svfloat64_t v) : values(v) {} // wrap a raw SVE vector
+  Vectorized(double val) { // broadcast val to every lane
+    values = svdup_n_f64(val);
+  }
+  template <
+      typename... Args,
+      typename = std::enable_if_t<(sizeof...(Args) == size())>>
+  Vectorized(Args... vals) { // per-lane init; only enabled for exactly size() arguments
+    __at_align__ double buffer[size()] = {vals...};
+    values = svld1_f64(ptrue, buffer);
+  }
+  operator svfloat64_t() const { // lets a Vectorized be passed directly to SVE intrinsics
+    return values;
+  }
+  template <uint64_t mask> // bit i of mask set -> lane i taken from b, otherwise from a
+  static Vectorized<double> blend(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b) {
+    // Build an array of flags: each element is 1 if the corresponding bit in
+    // 'mask' is set, 0 otherwise.
+    __at_align__ int64_t flag_arr[size()];
+    for (int i = 0; i < size(); i++) {
+      flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
+    }
+    // Load the flag array into an SVE int64 vector.
+    svint64_t int_mask = svld1_s64(svptrue_b64(), flag_arr);
+    // Compare each lane of int_mask to 0; returns an svbool_t predicate where
+    // true indicates a nonzero flag.
+    svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0);
+
+    // Use svsel to select elements from b where the predicate is true, else
+    // from a.
+    svfloat64_t result = svsel(blend_mask, b.values, a.values);
+    return Vectorized<double>(result);
+  }
+  static Vectorized<double> blendv(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      const Vectorized<double>& mask_) {
+    svbool_t mask =
+        svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK);
+    return svsel_f64(mask, b, a); // b where mask_'s bits equal ALL_S64_TRUE_MASK, else a
+  }
+  template <typename step_t>
+  static Vectorized<double> arange(
+      double base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    __at_align__ double buffer[size()];
+    for (int64_t i = 0; i < size(); i++) {
+      buffer[i] = base + i * step; // lane i holds base + i * step
+    }
+    return svld1_f64(ptrue, buffer);
+  }
+  static Vectorized<double> set(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    } else if (count < size()) {
+      return svsel_f64(svwhilelt_b64(0ull, count), b, a); // first `count` lanes from b, rest from a
+    }
+    return b;
+  }
+  static Vectorized<double> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return svld1_f64(ptrue, reinterpret_cast<const double*>(ptr));
+    svbool_t pg = svwhilelt_b64(0ull, count); // predicate covering only the first `count` lanes
+    return svld1_f64(pg, reinterpret_cast<const double*>(ptr));
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      svst1_f64(ptrue, reinterpret_cast<double*>(ptr), values);
+    } else {
+      svbool_t pg = svwhilelt_b64(0ull, count); // only the first `count` lanes are written
+      svst1_f64(pg, reinterpret_cast<double*>(ptr), values);
+    }
+  }
+  const double& operator[](int idx) const = delete; // no per-lane element access
+  double& operator[](int idx) = delete;
+  int64_t zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    int64_t mask = 0;
+    __at_align__ int64_t mask_array[size()];
+
+    svbool_t svbool_mask = svcmpeq_f64(ptrue, values, ZERO_F64);
+    svst1_s64(
+        ptrue,
+        mask_array,
+        svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK));
+    for (int64_t i = 0; i < size(); ++i) {
+      if (mask_array[i])
+        mask |= (1ull << i); // bit i corresponds to lane i
+    }
+    return mask;
+  }
+  Vectorized<double> isnan() const {
+    // NaN check: svcmpuo (unordered compare) against ZERO_F64 flags lanes holding NaN
+    svbool_t mask = svcmpuo_f64(ptrue, values, ZERO_F64);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+  bool has_inf_nan() const { // x - x is NaN when x is Inf or NaN, so test the difference
+    return svptest_any(
+        ptrue,
+        svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64));
+  }
+  Vectorized<double> map(double (*f)(double)) const { // scalar fallback: apply f lane by lane
+    __at_align__ double tmp[size()];
+    store(tmp);
+    for (int64_t i = 0; i < size(); ++i) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> abs() const {
+    return svabs_f64_x(ptrue, values);
+  }
+  Vectorized<double> angle() const { // pi where value < 0, ZERO_F64 otherwise; NaN inputs yield NaN
+    const auto nan_vec = svdup_n_f64(NAN);
+    const auto nan_mask = svcmpuo_f64(ptrue, values, ZERO_F64);
+    const auto pi = svdup_n_f64(c10::pi<double>);
+
+    const auto neg_mask = svcmplt_f64(ptrue, values, ZERO_F64);
+    auto angle = svsel_f64(neg_mask, pi, ZERO_F64);
+    angle = svsel_f64(nan_mask, nan_vec, angle); // NaN lanes override the 0/pi choice
+    return angle;
+  }
+  Vectorized<double> real() const { // real-valued type: real() returns the vector unchanged
+    return *this;
+  }
+  Vectorized<double> imag() const { // real-valued type: imaginary part is 0.0 in every lane
+    return Vectorized<double>(0.0);
+  }
+  Vectorized<double> conj() const { // real-valued type: conj() returns the vector unchanged
+    return *this;
+  }
+  Vectorized<double> acos() const { // USE_SLEEF picks the SLEEF call if enabled, else per-lane map()
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_acosdx_u10sve(values)), map(std::acos));
+  }
+  Vectorized<double> acosh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_acoshdx_u10sve(values)), map(std::acosh));
+  }
+  Vectorized<double> asin() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_asindx_u10sve(values)), map(std::asin));
+  }
+  Vectorized<double> asinh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_asinhdx_u10sve(values)), map(std::asinh));
+  }
+  Vectorized<double> atan() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_atandx_u10sve(values)), map(std::atan));
+  }
+  Vectorized<double> atanh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_atanhdx_u10sve(values)), map(std::atanh));
+  }
+  Vectorized<double> atan2(const Vectorized<double>& b) const {USE_SLEEF(
+      { return Vectorized<double>(Sleef_atan2dx_u10sve(values, b)); },
+      {
+        __at_align__ double tmp[size()];
+        __at_align__ double tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (int64_t i = 0; i < size(); i++) {
+          tmp[i] = std::atan2(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      })} Vectorized<double> copysign(const Vectorized<double>& sign) const {
+      USE_SLEEF(
+          { return Vectorized<double>(Sleef_copysigndx_sve(values, sign)); },
+          {
+            __at_align__ double tmp[size()];
+            __at_align__ double tmp_sign[size()];
+            store(tmp);
+            sign.store(tmp_sign);
+            for (int64_t i = 0; i < size(); i++) {
+              tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
+            }
+            return loadu(tmp);
+          })} Vectorized<double> erf() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_erfdx_u10sve(values)), map(std::erf));
+  }
+  Vectorized<double> erfc() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_erfcdx_u15sve(values)), map(std::erfc));
+  }
+  Vectorized<double> erfinv() const { // no SLEEF branch: always per-lane calc_erfinv
+    return map(calc_erfinv);
+  }
+  Vectorized<double> exp() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_expdx_u10sve(values)), map(std::exp));
+  }
+  Vectorized<double> exp2() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_exp2dx_u10sve(values)), map(std::exp2));
+  }
+  Vectorized<double> expm1() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_expm1dx_u10sve(values)), map(std::expm1));
+  }
+  Vectorized<double> exp_u20() const { // simply forwards to exp()
+    return exp();
+  }
+  Vectorized<double> fexp_u20() const { // simply forwards to exp()
+    return exp();
+  }
+  Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
+      { return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
+      {
+        __at_align__ double tmp[size()];
+        __at_align__ double tmp_q[size()];
+        store(tmp);
+        q.store(tmp_q);
+        for (int64_t i = 0; i < size(); i++) {
+          tmp[i] = std::fmod(tmp[i], tmp_q[i]);
+        }
+        return loadu(tmp);
+      })} Vectorized<double> hypot(const Vectorized<double>& b) const {
+      USE_SLEEF(
+          { return Vectorized<double>(Sleef_hypotdx_u05sve(values, b)); },
+          {
+            __at_align__ double tmp[size()];
+            __at_align__ double tmp_b[size()];
+            store(tmp);
+            b.store(tmp_b);
+            for (int64_t i = 0; i < size(); i++) {
+              tmp[i] = std::hypot(tmp[i], tmp_b[i]);
+            }
+            return loadu(tmp);
+          })} Vectorized<double> i0() const {
+    return map(calc_i0);
+  }
+  Vectorized<double> i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized<double> digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized<double> igamma(const Vectorized<double>& x) const { // per-lane calc_igamma(this, x)
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (int64_t i = 0; i < size(); i++) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> igammac(const Vectorized<double>& x) const { // per-lane calc_igammac(this, x)
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (int64_t i = 0; i < size(); i++) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> nextafter(const Vectorized<double>& b) const {USE_SLEEF(
+      { return Vectorized<double>(Sleef_nextafterdx_sve(values, b)); },
+      {
+        __at_align__ double tmp[size()];
+        __at_align__ double tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (int64_t i = 0; i < size(); ++i) {
+          tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      })} Vectorized<double> log() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_logdx_u10sve(values)), map(std::log));
+  }
+  Vectorized<double> log2() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log2dx_u10sve(values)), map(std::log2));
+  }
+  Vectorized<double> log10() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log10dx_u10sve(values)), map(std::log10));
+  }
+  Vectorized<double> log1p() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log1pdx_u10sve(values)), map(std::log1p));
+  }
+  Vectorized<double> frac() const; // defined after the class, once operator- is available
+  Vectorized<double> sin() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_sindx_u10sve(values)), map(std::sin));
+  }
+  Vectorized<double> sinh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_sinhdx_u10sve(values)), map(std::sinh));
+  }
+  Vectorized<double> cos() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_cosdx_u10sve(values)), map(std::cos));
+  }
+  Vectorized<double> cosh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_coshdx_u10sve(values)), map(std::cosh));
+  }
+  Vectorized<double> ceil() const {
+    return svrintp_f64_x(ptrue, values); // svrintp: round toward +infinity
+  }
+  Vectorized<double> floor() const {
+    return svrintm_f64_x(ptrue, values); // svrintm: round toward -infinity
+  }
+  Vectorized<double> neg() const {
+    return svneg_f64_x(ptrue, values);
+  }
+  Vectorized<double> round() const {
+    return svrinti_f64_x(ptrue, values); // svrinti: round to integral using the current rounding mode
+  }
+  Vectorized<double> tan() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_tandx_u10sve(values)), map(std::tan));
+  }
+  Vectorized<double> tanh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_tanhdx_u10sve(values)), map(std::tanh));
+  }
+  Vectorized<double> trunc() const {
+    return svrintz_f64_x(ptrue, values); // svrintz: round toward zero
+  }
+  Vectorized<double> lgamma() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_lgammadx_u10sve(values)), map(std::lgamma));
+  }
+  Vectorized<double> sqrt() const {
+    return svsqrt_f64_x(ptrue, values);
+  }
+  Vectorized<double> reciprocal() const {
+    return svdivr_f64_x(ptrue, values, ONE_F64); // svdivr is the reversed divide: ONE_F64 / values
+  }
+  Vectorized<double> rsqrt() const {
+    return svdivr_f64_x(ptrue, svsqrt_f64_x(ptrue, values), ONE_F64); // ONE_F64 / sqrt(values)
+  }
+  Vectorized<double> pow(const Vectorized<double>& b) const {USE_SLEEF(
+      { return Vectorized<double>(Sleef_powdx_u10sve(values, b)); },
+      {
+        __at_align__ double tmp[size()];
+        __at_align__ double tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (int64_t i = 0; i < size(); i++) {
+          tmp[i] = std::pow(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      })} // Comparison operators. The _CMP_**_OQ naming is x86 AVX terminology; SVE svcmp* is used here.
+          //   `O`: get false if an operand is NaN
+          //   `Q`: do not raise if an operand is NaN
+  Vectorized<double> operator==(const Vectorized<double>& other) const {
+    svbool_t mask = svcmpeq_f64(ptrue, values, other);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); // per-lane TRUE/FALSE mask pattern
+  }
+
+  Vectorized<double> operator!=(const Vectorized<double>& other) const {
+    svbool_t mask = svcmpne_f64(ptrue, values, other); // NOTE(review): presumably true for NaN lanes, unlike the `O` note above - confirm
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+
+  Vectorized<double> operator<(const Vectorized<double>& other) const {
+    svbool_t mask = svcmplt_f64(ptrue, values, other);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+
+  Vectorized<double> operator<=(const Vectorized<double>& other) const {
+    svbool_t mask = svcmple_f64(ptrue, values, other);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+
+  Vectorized<double> operator>(const Vectorized<double>& other) const {
+    svbool_t mask = svcmpgt_f64(ptrue, values, other);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+
+  Vectorized<double> operator>=(const Vectorized<double>& other) const {
+    svbool_t mask = svcmpge_f64(ptrue, values, other);
+    return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK);
+  }
+
+  Vectorized<double> eq(const Vectorized<double>& other) const; // eq..le: defined after the class using operator&
+  Vectorized<double> ne(const Vectorized<double>& other) const;
+  Vectorized<double> gt(const Vectorized<double>& other) const;
+  Vectorized<double> ge(const Vectorized<double>& other) const;
+  Vectorized<double> lt(const Vectorized<double>& other) const;
+  Vectorized<double> le(const Vectorized<double>& other) const;
+};
+
+template <> // Lane-wise addition of two double vectors.
+Vectorized<double> inline operator+(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svadd_f64_x(ptrue, a, b); // a + b per lane
+}
+
+template <> // Lane-wise subtraction of two double vectors.
+Vectorized<double> inline operator-(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svsub_f64_x(ptrue, a, b); // a - b per lane
+}
+
+template <> // Lane-wise multiplication of two double vectors.
+Vectorized<double> inline operator*(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svmul_f64_x(ptrue, a, b); // a * b per lane
+}
+
+template <> // Lane-wise division of two double vectors.
+Vectorized<double> inline operator/(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svdiv_f64_x(ptrue, a, b); // a / b per lane
+}
+
+// frac: fractional part, x - trunc(x). Defined out of line because it needs operator- declared above.
+Vectorized<double> inline Vectorized<double>::frac() const {
+  return *this - this->trunc(); // trunc() rounds toward zero, so the result keeps the sign of x
+}
+
+// Implements the IEEE 754-2019 (draft name "201X") `maximum` operation, which
+// propagates NaN if either input is a NaN.
+template <>
+Vectorized<double> inline maximum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svmax_f64_x(ptrue, a, b); // lane-wise max via the SVE svmax intrinsic
+}
+
+// Implements the IEEE 754-2019 (draft name "201X") `minimum` operation, which
+// propagates NaN if either input is a NaN.
+template <>
+Vectorized<double> inline minimum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svmin_f64_x(ptrue, a, b); // lane-wise min via the SVE svmin intrinsic
+}
+
+template <> // Clamps each lane of a into the range given by min and max.
+Vectorized<double> inline clamp(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min,
+    const Vectorized<double>& max) {
+  return svmin_f64_x(ptrue, max, svmax_f64_x(ptrue, min, a)); // lower bound applied first, then upper bound
+}
+
+template <> // Caps each lane of a at the corresponding lane of max.
+Vectorized<double> inline clamp_max(
+    const Vectorized<double>& a,
+    const Vectorized<double>& max) {
+  return svmin_f64_x(ptrue, max, a); // min(max, a) per lane
+}
+
+template <> // Raises each lane of a to at least the corresponding lane of min.
+Vectorized<double> inline clamp_min(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min) {
+  return svmax_f64_x(ptrue, min, a); // max(min, a) per lane
+}
+
+template <> // Bitwise AND of the raw 64-bit lane patterns of two double vectors.
+Vectorized<double> inline operator&(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svreinterpret_f64_s64( // reinterpret to s64, AND, reinterpret back (no value conversion)
+      svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
+}
+
+template <> // Bitwise OR of the raw 64-bit lane patterns of two double vectors.
+Vectorized<double> inline operator|(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svreinterpret_f64_s64( // reinterpret to s64, OR, reinterpret back (no value conversion)
+      svorr_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
+}
+
+template <> // Bitwise XOR of the raw 64-bit lane patterns of two double vectors.
+Vectorized<double> inline operator^(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return svreinterpret_f64_s64( // reinterpret to s64, XOR, reinterpret back (no value conversion)
+      sveor_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b)));
+}
+
+Vectorized<double> inline Vectorized<double>::eq( // numeric form of operator==
+    const Vectorized<double>& other) const {
+  return (*this == other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+Vectorized<double> inline Vectorized<double>::ne( // numeric form of operator!=
+    const Vectorized<double>& other) const {
+  return (*this != other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+Vectorized<double> inline Vectorized<double>::gt( // numeric form of operator>
+    const Vectorized<double>& other) const {
+  return (*this > other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+Vectorized<double> inline Vectorized<double>::ge( // numeric form of operator>=
+    const Vectorized<double>& other) const {
+  return (*this >= other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+Vectorized<double> inline Vectorized<double>::lt( // numeric form of operator<
+    const Vectorized<double>& other) const {
+  return (*this < other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+Vectorized<double> inline Vectorized<double>::le( // numeric form of operator<=
+    const Vectorized<double>& other) const {
+  return (*this <= other) & Vectorized<double>(1.0); // mask AND bits of 1.0: presumably 1.0 where true, 0.0 elsewhere
+}
+
+template <> // double -> double "conversion": copies n elements from src to dst with SVE loads/stores.
+inline void convert(const double* src, double* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<double>::size(); // elements left over after the last full vector
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<double>::size()) { // full vectors
+    svst1_f64(ptrue, dst + i, svldnt1_f64(ptrue, src + i)); // svldnt1 is the non-temporal load form
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<double>::size()) { // tail: runs at most once
+    svbool_t pg = svwhilelt_b64(i, n); // active only for lanes whose index i + lane is below n
+    svst1_f64(pg, dst + i, svldnt1_f64(pg, src + i));
+  }
+}
+
+template <> // Fused multiply-add: a * b + c per lane.
+Vectorized<double> inline fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return svmad_f64_x(ptrue, a, b, c); // svmad(op1, op2, op3) = op1 * op2 + op3
+}
+
+template <> // Fused negated multiply-add: -(a * b) + c per lane.
+Vectorized<double> inline fnmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return svmsb_f64_x(ptrue, a, b, c); // svmsb(op1, op2, op3) = op3 - op1 * op2
+}
+
+template <> // Fused multiply-subtract: a * b - c per lane.
+Vectorized<double> inline fmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return svnmsb_f64_x(ptrue, a, b, c); // svnmsb(op1, op2, op3) = op1 * op2 - op3
+}
+
+template <> // Fused negated multiply-subtract: -(a * b) - c per lane.
+Vectorized<double> inline fnmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return svnmad_f64_x(ptrue, a, b, c); // svnmad(op1, op2, op3) = -(op1 * op2) - op3
+}
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..008b7bb711ad0888d8ba8fac509c6e8f31599c28
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_float.h
@@ -0,0 +1,760 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <cmath>
+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE)
+
+template <>
+struct is_vec_specialized_for<float> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<float> {
+ private:
+  vls_float32_t values;
+
+ public:
+  using value_type = float;
+  using size_type = int;
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(float);
+  }
+  Vectorized() {
+    values = svdup_n_f32(0);
+  }
+  Vectorized(svfloat32_t v) : values(v) {}
+  Vectorized(float val) {
+    values = svdup_n_f32(val);
+  }
+  template <
+      typename... Args,
+      typename = std::enable_if_t<(sizeof...(Args) == size())>>
+  Vectorized(Args... vals) {
+    __at_align__ float buffer[size()] = {vals...};
+    values = svld1_f32(ptrue, buffer);
+  }
+  operator svfloat32_t() const {
+    return values;
+  }
+  template <uint64_t mask>
+  static Vectorized<float> blend(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b) {
+    // Build an array of flags: each element is 1 if the corresponding bit in
+    // 'mask' is set, 0 otherwise.
+    __at_align__ int32_t flag_arr[size()];
+    for (int i = 0; i < size(); i++) {
+      flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
+    }
+    // Load the flag array into an SVE int32 vector.
+    svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
+    // Compare each lane of int_mask to 0; returns an svbool_t predicate where
+    // true indicates a nonzero flag.
+    svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
+    // Use svsel to select elements from b where the predicate is true, else
+    // from a.
+    svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
+    return Vectorized<float>(result);
+  }
+  static Vectorized<float> blendv(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      const Vectorized<float>& mask_) {
+    svbool_t mask =
+        svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK);
+    return svsel_f32(mask, b, a);
+  }
+  template <typename step_t>
+  static Vectorized<float> arange(
+      float base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    __at_align__ float buffer[size()];
+    for (int64_t i = 0; i < size(); i++) {
+      buffer[i] = base + i * step;
+    }
+    return svld1_f32(ptrue, buffer);
+  }
+  static Vectorized<float> set(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    } else if (count < size()) {
+      return svsel_f32(svwhilelt_b32(0ull, count), b, a);
+    }
+    return b;
+  }
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
+    svbool_t pg = svwhilelt_b32(0ull, count);
+    return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
+    } else {
+      svbool_t pg = svwhilelt_b32(0ull, count);
+      svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
+    }
+  }
+  const float& operator[](int idx) const = delete;
+  float& operator[](int idx) = delete;
+  int64_t zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    int64_t mask = 0;
+    __at_align__ int32_t mask_array[size()];
+
+    svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
+    svst1_s32(
+        ptrue,
+        mask_array,
+        svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
+    for (int64_t i = 0; i < size(); ++i) {
+      if (mask_array[i])
+        mask |= (1ull << i);
+    }
+    return mask;
+  }
+  Vectorized<float> isnan() const {
+    // NaN check
+    svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
+    return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+  bool has_inf_nan() const {
+    return svptest_any(
+        ptrue,
+        svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
+  }
+  Vectorized<float> map(float (*f)(float)) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (int64_t i = 0; i < size(); ++i) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<float> abs() const {
+    return svabs_f32_x(ptrue, values);
+  }
+  Vectorized<float> angle() const {
+    const auto nan_vec = svdup_n_f32(NAN);
+    const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
+    const auto pi = svdup_n_f32(c10::pi<float>);
+
+    const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
+    auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
+    angle = svsel_f32(nan_mask, nan_vec, angle);
+    return angle;
+  }
+  Vectorized<float> real() const {
+    return values;
+  }
+  Vectorized<float> imag() const {
+    return Vectorized<float>(0.f);
+  }
+  Vectorized<float> conj() const {
+    return values;
+  }
+  Vectorized<float> acos() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
+  }
+  Vectorized<float> acosh() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
+  }
+  Vectorized<float> asin() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
+  }
+  Vectorized<float> asinh() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
+  }
+  Vectorized<float> atan() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
+  }
+  Vectorized<float> atanh() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
+  }
+  Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
+      { return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (int64_t i = 0; i < size(); i++) {
+          tmp[i] = std::atan2(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      })} Vectorized<float> copysign(const Vectorized<float>& sign) const {
+
+      USE_SLEEF(
+          { return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
+          {
+            __at_align__ float tmp[size()];
+            __at_align__ float tmp_sign[size()];
+            store(tmp);
+            sign.store(tmp_sign);
+            for (int64_t i = 0; i < size(); ++i) {
+              tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
+            }
+            return loadu(tmp);
+          })} Vectorized<float> erf() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
+  }
+  Vectorized<float> erfc() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
+  }
+  Vectorized<float> erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized<float> exp() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
+  }
+  Vectorized<float> exp2() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
+  }
+  Vectorized<float> expm1() const {
+    return USE_SLEEF(
+        Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
+  }
+  // Implementation copied from Arm Optimized Routines:
+  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
+  Vectorized<float> exp_u20() const {
+    // special case to handle special inputs that are too large or too small
+    // i.e. where there's at least one element x, s.t. |x| >= 87.3...
+    svbool_t is_special_case = svacgt(svptrue_b32(), values, 0x1.5d5e2ap+6f);
+    if (svptest_any(svptrue_b32(), is_special_case)) {
+      return exp();
+    }
+    const svfloat32_t ln2_hi = svdup_n_f32(0x1.62e4p-1f);
+    const svfloat32_t ln2_lo = svdup_n_f32(0x1.7f7d1cp-20f);
+    const svfloat32_t c1 = svdup_n_f32(0.5f);
+    const svfloat32_t inv_ln2 = svdup_n_f32(0x1.715476p+0f);
+
+    const float shift = 0x1.803f8p17f;
+
+    /* n = round(x/(ln2/N)).  */
+    svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, values, shift);
+    svfloat32_t n = svsub_x(svptrue_b32(), z, shift);
+
+    /* r = x - n*ln2/N.  */
+    svfloat32_t r = values;
+    r = svmls_x(svptrue_b32(), r, n, ln2_hi);
+    r = svmls_x(svptrue_b32(), r, n, ln2_lo);
+
+    /* scale = 2^(n/N).  */
+    svfloat32_t scale = svexpa(svreinterpret_u32(z));
+
+    /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2.  */
+    svfloat32_t r2 = svmul_x(svptrue_b32(), r, r);
+    svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);
+    return svmla_x(svptrue_b32(), scale, scale, poly);
+  }
+  Vectorized<float> fexp_u20() const {
+    return exp_u20();
+  }
+  Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
+      { return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_q[size()];
+        store(tmp);
+        q.store(tmp_q);
+        for (int64_t i = 0; i < size(); ++i) {
+          tmp[i] = std::fmod(tmp[i], tmp_q[i]);
+        }
+        return loadu(tmp);
+      })} Vectorized<float> hypot(const Vectorized<float>& b) const {
+      USE_SLEEF(
+          { return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
+          {
+            __at_align__ float tmp[size()];
+            __at_align__ float tmp_b[size()];
+            store(tmp);
+            b.store(tmp_b);
+            for (int64_t i = 0; i < size(); i++) {
+              tmp[i] = std::hypot(tmp[i], tmp_b[i]);
+            }
+            return loadu(tmp);
+          })} Vectorized<float> i0() const {
+    return map(calc_i0);
+  }
+  Vectorized<float> i0e() const {
+    return map(calc_i0e);
+  }
+  // Runs the scalar calc_digamma helper on the vector through map() and
+  // returns whatever map() produces.
+  Vectorized<float> digamma() const { return map(calc_digamma); }
+  // Per-lane calc_igamma(self, x): spill both vectors, loop, reload.
+  Vectorized<float> igamma(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (int64_t lane = 0; lane < size(); ++lane)
+      self_lanes[lane] = calc_igamma(self_lanes[lane], x_lanes[lane]);
+    return loadu(self_lanes);
+  }
+  // Per-lane calc_igammac(self, x): spill both vectors, loop, reload.
+  Vectorized<float> igammac(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (int64_t lane = 0; lane < size(); ++lane)
+      self_lanes[lane] = calc_igammac(self_lanes[lane], x_lanes[lane]);
+    return loadu(self_lanes);
+  }
+  Vectorized<float> nextafter(const Vectorized<float>& b) const {
+    USE_SLEEF({ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); }, {
+      __at_align__ float lhs[size()]; // receives the lanes of *this
+      __at_align__ float rhs[size()]; // receives the lanes of b
+      store(lhs);
+      b.store(rhs);
+      for (int64_t k = 0; k < size(); ++k)
+        lhs[k] = std::nextafter(lhs[k], rhs[k]);
+      return loadu(lhs);
+    })
+  }
+  Vectorized<float> log() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::log
+        Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
+  }
+  Vectorized<float> log2() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::log2
+        Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
+  }
+  Vectorized<float> log10() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::log10
+        Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
+  }
+  Vectorized<float> log1p() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::log1p
+        Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
+  }
+  Vectorized<float> frac() const;
+  Vectorized<float> sin() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::sin
+        Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
+  }
+  Vectorized<float> sinh() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::sinh
+        Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
+  }
+  Vectorized<float> cos() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::cos
+        Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
+  }
+  Vectorized<float> cosh() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::cosh
+        Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
+  }
+  // Rounds every lane toward +infinity using svrintp_f32_x under the ptrue
+  // predicate.
+  Vectorized<float> ceil() const { return svrintp_f32_x(ptrue, values); }
+  // Rounds every lane toward -infinity using svrintm_f32_x under the ptrue
+  // predicate.
+  Vectorized<float> floor() const { return svrintm_f32_x(ptrue, values); }
+  // Negates every lane using svneg_f32_x under the ptrue predicate; the
+  // receiver itself is left untouched.
+  Vectorized<float> neg() const { return svneg_f32_x(ptrue, values); }
+  // Rounds each lane to an integral value with svrinti_f32_x. NOTE(review):
+  // svrinti follows the current rounding mode -- confirm that is intended.
+  Vectorized<float> round() const { return svrinti_f32_x(ptrue, values); }
+  Vectorized<float> tan() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::tan
+        Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
+  }
+  // Hyperbolic tangent of every lane, built on the exp_u20() routine.
+  //
+  // Implementation is picked from
+  // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
+  //
+  // Outline:
+  //   1. clamp the input to [-10, 10];
+  //   2. e = exp_u20(2 * clamped);
+  //   3. return (e - 1) / (e + 1).
+  //
+  // Every intrinsic below runs under the ptrue predicate in its zeroing (_z)
+  // form. Because the exponential comes from exp_u20(), the accuracy of the
+  // result is bounded by the accuracy of that routine.
+  //
+  // NOTE(review): for inputs near zero, e - 1 cancels most significant digits;
+  // confirm that precision is acceptable to callers.
+  Vectorized<float> tanh() const {
+    // Broadcast constants for the formula and for the clamp range.
+    const svfloat32_t one = svdup_n_f32(1.f);
+    const svfloat32_t two = svdup_n_f32(2.f);
+    const svfloat32_t lower_bound = svdup_n_f32(-10.f);
+    const svfloat32_t upper_bound = svdup_n_f32(10.f);
+
+    // Step 1: clamp. Bounding the input keeps exp(2x) from overflowing; tanh
+    // approaches +/-1 rapidly as the input grows, so limiting the range is
+    // used here to avoid numerical instability.
+    const svfloat32_t raised = svmax_f32_z(ptrue, values, lower_bound);
+    const svfloat32_t clamped = svmin_f32_z(ptrue, raised, upper_bound);
+
+    // Step 2: exp(2 * clamped). The product is wrapped in Vectorized<float>
+    // so exp_u20() can be called; the result converts back to svfloat32_t
+    // implicitly.
+    const svfloat32_t doubled = svmul_f32_z(ptrue, two, clamped);
+    const svfloat32_t exp2x = Vectorized<float>(doubled).exp_u20();
+
+    // Step 3: numerator exp(2x) - 1 and denominator exp(2x) + 1.
+    const svfloat32_t numerator = svsub_f32_z(ptrue, exp2x, one);
+    const svfloat32_t denominator = svadd_f32_z(ptrue, exp2x, one);
+
+    // The quotient is tanh of the clamped input; it converts to
+    // Vectorized<float> on return.
+    return svdiv_f32_z(ptrue, numerator, denominator);
+  }
+  // Rounds every lane toward zero using svrintz_f32_x under the ptrue
+  // predicate.
+  Vectorized<float> trunc() const { return svrintz_f32_x(ptrue, values); }
+  Vectorized<float> lgamma() const {
+    return USE_SLEEF( // 1st arg: SLEEF SVE kernel; 2nd: map() of std::lgamma
+        Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
+  }
+  // Takes the square root of every lane using svsqrt_f32_x under the ptrue
+  // predicate.
+  Vectorized<float> sqrt() const { return svsqrt_f32_x(ptrue, values); }
+  Vectorized<float> reciprocal() const { // ONE_F32 / values, lane by lane
+    return svdivr_f32_x(ptrue, values, ONE_F32); // svdivr: reversed divide
+  }
+  Vectorized<float> rsqrt() const { // ONE_F32 / sqrt(values), lane by lane
+    return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
+  }
+  Vectorized<float> pow(const Vectorized<float>& b) const {
+    USE_SLEEF({ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); }, {
+      __at_align__ float lhs[size()]; // receives the lanes of *this (bases)
+      __at_align__ float rhs[size()]; // receives the lanes of b (exponents)
+      store(lhs);
+      b.store(rhs);
+      for (int64_t k = 0; k < size(); k++)
+        lhs[k] = std::pow(lhs[k], rhs[k]);
+      return loadu(lhs);
+    })
+  }
+  // Comparisons: each operator below yields ALL_F32_TRUE_MASK in lanes where
+  // the test holds and ALL_F32_FALSE_MASK elsewhere. NOTE(review): NaN lanes
+  // should compare false for ==, <, <=, >, >= but true for != -- confirm.
+  Vectorized<float> operator==(const Vectorized<float>& other) const {
+    const svbool_t eq_lanes = svcmpeq_f32(ptrue, values, other); // lane test
+    return svsel_f32(eq_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> operator!=(const Vectorized<float>& other) const {
+    const svbool_t ne_lanes = svcmpne_f32(ptrue, values, other); // lane test
+    return svsel_f32(ne_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> operator<(const Vectorized<float>& other) const {
+    const svbool_t lt_lanes = svcmplt_f32(ptrue, values, other); // lane test
+    return svsel_f32(lt_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> operator<=(const Vectorized<float>& other) const {
+    const svbool_t le_lanes = svcmple_f32(ptrue, values, other); // lane test
+    return svsel_f32(le_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> operator>(const Vectorized<float>& other) const {
+    const svbool_t gt_lanes = svcmpgt_f32(ptrue, values, other); // lane test
+    return svsel_f32(gt_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> operator>=(const Vectorized<float>& other) const {
+    const svbool_t ge_lanes = svcmpge_f32(ptrue, values, other); // lane test
+    return svsel_f32(ge_lanes, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
+  }
+
+  Vectorized<float> eq(const Vectorized<float>& other) const;
+  Vectorized<float> ne(const Vectorized<float>& other) const;
+  Vectorized<float> gt(const Vectorized<float>& other) const;
+  Vectorized<float> ge(const Vectorized<float>& other) const;
+  Vectorized<float> lt(const Vectorized<float>& other) const;
+  Vectorized<float> le(const Vectorized<float>& other) const;
+};
+
+// Lane-wise a + b, computed by svadd_f32_x under the ptrue predicate.
+template <>
+Vectorized<float> inline operator+(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svadd_f32_x(ptrue, a, b);
+}
+
+// Lane-wise a - b, computed by svsub_f32_x under the ptrue predicate.
+template <>
+Vectorized<float> inline operator-(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svsub_f32_x(ptrue, a, b);
+}
+
+// Lane-wise a * b, computed by svmul_f32_x under the ptrue predicate.
+template <>
+Vectorized<float> inline operator*(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svmul_f32_x(ptrue, a, b);
+}
+
+// Lane-wise a / b, computed by svdiv_f32_x under the ptrue predicate.
+template <>
+Vectorized<float> inline operator/(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svdiv_f32_x(ptrue, a, b);
+}
+
+// Fractional part; defined out of line so it can use the subtraction operator.
+Vectorized<float> inline Vectorized<float>::frac() const {
+  return (*this) - trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN. Realized with a single svmax_f32_x call under the
+// ptrue predicate.
+template <>
+Vectorized<float> inline maximum(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svmax_f32_x(ptrue, a, b);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN. Realized with a single svmin_f32_x call under the
+// ptrue predicate.
+template <>
+Vectorized<float> inline minimum(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return svmin_f32_x(ptrue, a, b);
+}
+
+template <>
+Vectorized<float> inline clamp(
+    const Vectorized<float>& a, const Vectorized<float>& min,
+    const Vectorized<float>& max) {
+  const svfloat32_t floored = svmax_f32_x(ptrue, min, a); // lower bound first
+  return svmin_f32_x(ptrue, max, floored); // then the upper bound
+}
+
+// Caps every lane of a at the matching lane of max via svmin_f32_x.
+template <>
+Vectorized<float> inline clamp_max(
+    const Vectorized<float>& a, const Vectorized<float>& max) {
+  return svmin_f32_x(ptrue, max, a);
+}
+
+// Raises every lane of a to at least the matching lane of min via svmax_f32_x.
+template <>
+Vectorized<float> inline clamp_min(
+    const Vectorized<float>& a, const Vectorized<float>& min) {
+  return svmax_f32_x(ptrue, min, a);
+}
+
+template <>
+Vectorized<float> inline operator&(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  const svint32_t lhs = svreinterpret_s32_f32(a); // bit pattern of a
+  const svint32_t rhs = svreinterpret_s32_f32(b); // bit pattern of b
+  return svreinterpret_f32_s32(svand_s32_x(ptrue, lhs, rhs));
+}
+
+template <>
+Vectorized<float> inline operator|(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  const svint32_t lhs = svreinterpret_s32_f32(a); // bit pattern of a
+  const svint32_t rhs = svreinterpret_s32_f32(b); // bit pattern of b
+  return svreinterpret_f32_s32(svorr_s32_x(ptrue, lhs, rhs));
+}
+
+template <>
+Vectorized<float> inline operator^(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  const svint32_t lhs = svreinterpret_s32_f32(a); // bit pattern of a
+  const svint32_t rhs = svreinterpret_s32_f32(b); // bit pattern of b
+  return svreinterpret_f32_s32(sveor_s32_x(ptrue, lhs, rhs));
+}
+
+Vectorized<float> inline Vectorized<float>::eq(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this == other); // == mask AND 1.0f
+}
+
+Vectorized<float> inline Vectorized<float>::ne(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this != other); // != mask AND 1.0f
+}
+
+Vectorized<float> inline Vectorized<float>::gt(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this > other); // > mask AND 1.0f
+}
+
+Vectorized<float> inline Vectorized<float>::ge(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this >= other); // >= mask AND 1.0f
+}
+
+Vectorized<float> inline Vectorized<float>::lt(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this < other); // < mask AND 1.0f
+}
+
+Vectorized<float> inline Vectorized<float>::le(
+    const Vectorized<float>& other) const {
+  return Vectorized<float>(1.0f) & (*this <= other); // <= mask AND 1.0f
+}
+
+template <>
+inline void convert(const float* src, float* dst, int64_t n) {
+  const int64_t full_end = n - n % Vectorized<float>::size(); // whole vectors
+#pragma unroll
+  for (int64_t pos = 0; pos < full_end; pos += Vectorized<float>::size()) {
+    svst1_f32(ptrue, dst + pos, svldnt1_f32(ptrue, src + pos));
+  }
+#pragma unroll
+  for (int64_t pos = full_end; pos < n; pos += Vectorized<float>::size()) {
+    const svbool_t tail = svwhilelt_b32(pos, n); // lanes with index < n
+    svst1_f32(tail, dst + pos, svldnt1_f32(tail, src + pos));
+  }
+}
+
+template <>
+inline void convert(const float* src, at::Half* dst, int64_t n) {
+  const int64_t full_end = n - n % Vectorized<float>::size(); // whole vectors
+  const svbool_t body_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
+  const svbool_t body_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+#pragma unroll
+  for (int64_t pos = 0; pos < full_end; pos += Vectorized<float>::size()) {
+    const svfloat32_t wide = svldnt1_f32(body_32, src + pos); // fp32 input
+    svfloat16_t packed = svuzp1_f16(svcvt_f16_f32_x(ptrue, wide), ZERO_F16);
+    svst1_f16(body_16, reinterpret_cast<float16_t*>(dst) + pos, packed);
+  }
+#pragma unroll
+  for (int64_t pos = full_end; pos < n; pos += Vectorized<float>::size()) {
+    const svbool_t tail_16 = svwhilelt_b16(pos, n); // elements below n only
+    const svbool_t tail_32 = svwhilelt_b32(pos, n);
+    const svfloat32_t wide = svldnt1_f32(tail_32, src + pos);
+    svfloat16_t packed = svuzp1_f16(svcvt_f16_f32_x(ptrue, wide), ZERO_F16);
+    svst1_f16(tail_16, reinterpret_cast<float16_t*>(dst) + pos, packed);
+  }
+}
+
+// Half -> float: each vector of halves is zipped with ZERO_F16 before
+// svcvt_f32_f16_x widens it; a predicated pass handles the n % size() tail.
+template <>
+inline void convert(const at::Half* src, float* dst, int64_t n) {
+  const int64_t full_end = n - n % Vectorized<float>::size();
+  const float16_t* in = reinterpret_cast<const float16_t*>(src);
+  const svbool_t body_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
+  const svbool_t body_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+#pragma unroll
+  for (int64_t pos = 0; pos < full_end; pos += Vectorized<float>::size()) {
+    svfloat16_t spread = svzip1_f16(svldnt1_f16(body_16, in + pos), ZERO_F16);
+    svst1_f32(body_32, dst + pos, svcvt_f32_f16_x(ptrue, spread));
+  }
+#pragma unroll
+  for (int64_t pos = full_end; pos < n; pos += Vectorized<float>::size()) {
+    const svbool_t tail_16 = svwhilelt_b16(pos, n); // elements below n only
+    const svbool_t tail_32 = svwhilelt_b32(pos, n);
+    svfloat16_t spread = svzip1_f16(svldnt1_f16(tail_16, in + pos), ZERO_F16);
+    svst1_f32(tail_32, dst + pos, svcvt_f32_f16_x(ptrue, spread));
+  }
+}
+
+// bool -> float: bytes are widened to u32; nonzero -> ONE_F32, else ZERO_F32.
+template <>
+inline void convert(const bool* src, float* dst, int64_t n) {
+  const int64_t full_end = n - n % Vectorized<float>::size();
+  const uint8_t* in = reinterpret_cast<const uint8_t*>(src);
+  const svbool_t body_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
+  const svbool_t body_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
+#pragma unroll
+  for (int64_t pos = 0; pos < full_end; pos += Vectorized<float>::size()) {
+    const svuint8_t bytes = svldnt1_u8(body_8, in + pos);
+    const svuint32_t wide = svunpklo_u32(svunpklo_u16(bytes));
+    const svbool_t nonzero = svcmpne_u32(body_32, wide, ZERO_U32);
+    svst1_f32(body_32, dst + pos, svsel_f32(nonzero, ONE_F32, ZERO_F32));
+  }
+#pragma unroll
+  for (int64_t pos = full_end; pos < n; pos += Vectorized<float>::size()) {
+    const svbool_t tail_8 = svwhilelt_b8(pos, n); // elements below n only
+    const svbool_t tail_32 = svwhilelt_b32(pos, n);
+    const svuint8_t bytes = svldnt1_u8(tail_8, in + pos);
+    const svuint32_t wide = svunpklo_u32(svunpklo_u16(bytes));
+    const svbool_t nonzero = svcmpne_u32(tail_32, wide, ZERO_U32);
+    svst1_f32(tail_32, dst + pos, svsel_f32(nonzero, ONE_F32, ZERO_F32));
+  }
+}
+
+// a * b + c per lane, through the SVE intrinsic svmad_f32_x under ptrue.
+template <>
+Vectorized<float> inline fmadd(
+    const Vectorized<float>& a, const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  return svmad_f32_x(ptrue, a, b, c);
+}
+
+// c - a * b per lane, through the SVE intrinsic svmsb_f32_x under ptrue.
+template <>
+Vectorized<float> inline fnmadd(
+    const Vectorized<float>& a, const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  return svmsb_f32_x(ptrue, a, b, c);
+}
+
+// a * b - c per lane, through the SVE intrinsic svnmsb_f32_x under ptrue.
+template <>
+Vectorized<float> inline fmsub(
+    const Vectorized<float>& a, const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  return svnmsb_f32_x(ptrue, a, b, c);
+}
+
+// -(a * b) - c per lane, through the SVE intrinsic svnmad_f32_x under ptrue.
+template <>
+Vectorized<float> inline fnmsub(
+    const Vectorized<float>& a, const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  return svnmad_f32_x(ptrue, a, b, c);
+}
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dee484491f505993e1c523591b88747e782ede0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_int.h
@@ -0,0 +1,504 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/sve/sve_helper.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE)
+
+#define VEC_INT_SVE_TEMPLATE(vl, bit)                                         \
+  template <>                                                                 \
+  struct is_vec_specialized_for<int##bit##_t> : std::bool_constant<true> {};  \
+                                                                              \
+  template <>                                                                 \
+  class Vectorized<int##bit##_t> {                                            \
+   private:                                                                   \
+    vls_int##bit##_t values;                                                  \
+                                                                              \
+   public:                                                                    \
+    using value_type = int##bit##_t;                                          \
+    using size_type = int;                                                    \
+    static constexpr size_type size() {                                       \
+      return vl;                                                              \
+    }                                                                         \
+    Vectorized() {                                                            \
+      values = svdup_n_s##bit(0);                                             \
+    }                                                                         \
+    Vectorized(svint##bit##_t v) : values(v) {}                               \
+    Vectorized(int##bit##_t val) {                                            \
+      values = svdup_n_s##bit(val);                                           \
+    }                                                                         \
+    template <                                                                \
+        typename... Args,                                                     \
+        typename = std::enable_if_t<(sizeof...(Args) == size())>>             \
+    Vectorized(Args... vals) {                                                \
+      __at_align__ int##bit##_t buffer[size()] = {vals...};                   \
+      values = svld1_s##bit(ptrue, buffer);                                   \
+    }                                                                         \
+    operator svint##bit##_t() const {                                         \
+      return values;                                                          \
+    }                                                                         \
+    template <uint64_t mask>                                                  \
+    static Vectorized<int##bit##_t> blend(                                    \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b) {                                  \
+      __at_align__ int##bit##_t flag_arr[size()];                             \
+      for (int i = 0; i < size(); ++i) {                                      \
+        flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0;               \
+      }                                                                       \
+      svbool_t blend_mask = svcmpne_n_s##bit(                                 \
+          svptrue_b##bit(), svld1_s##bit(svptrue_b##bit(), flag_arr), 0);     \
+      return Vectorized<int##bit##_t>(                                        \
+          svsel_s##bit(blend_mask, b.values, a.values));                      \
+    }                                                                         \
+    static Vectorized<int##bit##_t> blendv(                                   \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b,                                    \
+        const Vectorized<int##bit##_t>& mask_) {                              \
+      svbool_t mask = svcmpeq_s##bit(ptrue, mask_, ALL_S##bit##_TRUE_MASK);   \
+      return svsel_s##bit(mask, b, a);                                        \
+    }                                                                         \
+    /* step sometimes requires a higher precision type (e.g., T=int,          \
+     * step_t=double) */                                                      \
+    template <typename step_t>                                                \
+    static Vectorized<int##bit##_t> arange(                                   \
+        int##bit##_t base = 0,                                                \
+        step_t step = static_cast<step_t>(1)) {                               \
+      __at_align__ int##bit##_t buffer[size()];                               \
+      for (int64_t i = 0; i < size(); i++) {                                  \
+        buffer[i] = base + i * step;                                          \
+      }                                                                       \
+      return svld1_s##bit(ptrue, buffer);                                     \
+    }                                                                         \
+    static Vectorized<int##bit##_t> set(                                      \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b,                                    \
+        int##bit##_t count = size()) {                                        \
+      if (count == 0) {                                                       \
+        return a;                                                             \
+      } else if (count < size()) {                                            \
+        return svsel_s##bit(svwhilelt_b##bit(0ull, count), b, a);             \
+      }                                                                       \
+      return b;                                                               \
+    }                                                                         \
+    static Vectorized<int##bit##_t> loadu(                                    \
+        const void* ptr,                                                      \
+        int64_t count = size()) {                                             \
+      if (count == size())                                                    \
+        return svld1_s##bit(                                                  \
+            ptrue, reinterpret_cast<const int##bit##_t*>(ptr));               \
+      svbool_t pg = svwhilelt_b##bit(0ull, count);                            \
+      return svld1_s##bit(pg, reinterpret_cast<const int##bit##_t*>(ptr));    \
+    }                                                                         \
+    void store(void* ptr, int64_t count = size()) const {                     \
+      if (count == size()) {                                                  \
+        svst1_s##bit(ptrue, reinterpret_cast<int##bit##_t*>(ptr), values);    \
+      } else {                                                                \
+        svbool_t pg = svwhilelt_b##bit(0ull, count);                          \
+        svst1_s##bit(pg, reinterpret_cast<int##bit##_t*>(ptr), values);       \
+      }                                                                       \
+    }                                                                         \
+    const int##bit##_t& operator[](int idx) const = delete;                   \
+    int##bit##_t& operator[](int idx) = delete;                               \
+    Vectorized<int##bit##_t> abs() const {                                    \
+      return svabs_s##bit##_x(ptrue, values);                                 \
+    }                                                                         \
+    Vectorized<int##bit##_t> real() const {                                   \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<int##bit##_t> imag() const {                                   \
+      return svdup_n_s##bit(0);                                               \
+    }                                                                         \
+    Vectorized<int##bit##_t> conj() const {                                   \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<int##bit##_t> frac() const;                                    \
+    Vectorized<int##bit##_t> neg() const {                                    \
+      return svneg_s##bit##_x(ptrue, values);                                 \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator==(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmpeq_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator!=(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmpne_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator<(                                       \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmplt_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator<=(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmple_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator>(                                       \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmpgt_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator>=(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      svbool_t mask = svcmpge_s##bit(ptrue, values, other);                   \
+      return svsel_s##bit(                                                    \
+          mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK);             \
+    }                                                                         \
+    Vectorized<int##bit##_t> eq(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> ne(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> gt(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> ge(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> lt(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> le(const Vectorized<int##bit##_t>& other) const; \
+  };                                                                          \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator+(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svadd_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator-(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svsub_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator*(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svmul_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline maximum(                                    \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svmax_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline minimum(                                    \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svmin_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline clamp(                                      \
+      const Vectorized<int##bit##_t>& a,                                      \
+      const Vectorized<int##bit##_t>& min,                                    \
+      const Vectorized<int##bit##_t>& max) {                                  \
+    return svmin_s##bit##_x(ptrue, max, svmax_s##bit##_x(ptrue, min, a));     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline clamp_max(                                  \
+      const Vectorized<int##bit##_t>& a,                                      \
+      const Vectorized<int##bit##_t>& max) {                                  \
+    return svmin_s##bit##_x(ptrue, max, a);                                   \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline clamp_min(                                  \
+      const Vectorized<int##bit##_t>& a,                                      \
+      const Vectorized<int##bit##_t>& min) {                                  \
+    return svmax_s##bit##_x(ptrue, min, a);                                   \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator&(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svand_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator|(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return svorr_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator^(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return sveor_s##bit##_x(ptrue, a, b);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  inline Vectorized<int##bit##_t> operator~(                                  \
+      const Vectorized<int##bit##_t>& a) {                                    \
+    return sveor_s##bit##_x(ptrue, a, svdup_n_s##bit(-1));                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::eq(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this == other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ne(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this != other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::gt(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this > other) & Vectorized<int##bit##_t>(1);                     \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ge(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this >= other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::lt(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this < other) & Vectorized<int##bit##_t>(1);                     \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::le(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this <= other) & Vectorized<int##bit##_t>(1);                    \
+  }
+
+VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int64_t), 64) // int64_t lanes
+VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int32_t), 32) // int32_t lanes
+VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int16_t), 16) // int16_t lanes
+VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int8_t), 8) // int8_t lanes
+
+// Scalar fallback for lane-wise integer division: both operands are spilled
+// to stack arrays, divided element by element, and the quotients reloaded.
+template <typename T>
+inline Vectorized<T> intdiv_nosve(
+    const Vectorized<T>& a, const Vectorized<T>& b) {
+  const auto lanes = Vectorized<T>::size();
+  T quotients[lanes];
+  T divisors[lanes];
+  a.store(quotients);
+  b.store(divisors);
+  for (int lane = 0; lane != lanes; ++lane) quotients[lane] /= divisors[lane];
+  return Vectorized<T>::loadu(quotients);
+}
+
+// Lane-wise signed 64-bit division via the SVE divide intrinsic.
+template <>
+inline Vectorized<int64_t> operator/(
+    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return svdiv_s64_x(ptrue, a, b);
+}
+
+// Lane-wise signed 32-bit division via the SVE divide intrinsic.
+template <>
+inline Vectorized<int32_t> operator/(
+    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return svdiv_s32_x(ptrue, a, b);
+}
+
+// int16_t lanes are divided through the scalar intdiv_nosve helper.
+template <>
+inline Vectorized<int16_t> operator/(
+    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return intdiv_nosve(a, b);
+}
+
+// int8_t lanes are divided through the scalar intdiv_nosve helper.
+template <>
+inline Vectorized<int8_t> operator/(
+    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return intdiv_nosve(a, b);
+}
+
+template <> // Widens n int32_t values read from src into int64_t values in dst.
+inline void convert(const int32_t* src, int64_t* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<int64_t>::size(); // elements left after the last full vector
+  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int64_t>::size()); // 32-bit-lane predicate for the loads
+  svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size()); // 64-bit-lane predicate for the stores
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size()) // full-width chunks
+    svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i)));
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) { // remaining `fraction` elements
+    pg_32 = svwhilelt_b32(i, n); // predicates rebuilt from the range [i, n)
+    pg_64 = svwhilelt_b64(i, n);
+    svst1_s64(pg_64, dst + i, svunpklo_s64(svldnt1_s32(pg_32, src + i)));
+  }
+}
+
+template <> // Converts n int64_t values read from src into float values in dst.
+inline void convert(const int64_t* src, float* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<int64_t>::size(); // elements left after the last full vector
+  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int64_t>::size()); // 32-bit-lane predicate for the stores
+  svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size()); // 64-bit-lane predicate for the loads
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size()) { // full-width chunks
+    svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i);
+    svfloat32_t src_vec_f32 = // NOTE(review): svuzp1 with ZERO_F32 presumably packs the converted floats contiguously; confirm against ACLE
+        svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32);
+    svst1_f32(pg_32, dst + i, src_vec_f32);
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) { // remaining `fraction` elements
+    pg_32 = svwhilelt_b32(i, n); // predicates rebuilt from the range [i, n)
+    pg_64 = svwhilelt_b64(i, n);
+    svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i);
+    svfloat32_t src_vec_f32 =
+        svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32);
+    svst1_f32(pg_32, dst + i, src_vec_f32);
+  }
+}
+
+template <> // Converts n int32_t values read from src into float values in dst.
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<int32_t>::size(); // elements left after the last full vector
+  svbool_t pg = svwhilelt_b32(0ull, Vectorized<int32_t>::size()); // 32-bit-lane predicate shared by load, convert and store
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<int32_t>::size()) { // full-width chunks
+    svint32_t src_vec = svldnt1_s32(pg, src + i);
+    svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec));
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<int32_t>::size()) { // remaining `fraction` elements
+    pg = svwhilelt_b32(i, n); // predicate rebuilt from the range [i, n)
+    svint32_t src_vec = svldnt1_s32(pg, src + i);
+    svst1_f32(pg, dst + i, svcvt_f32_s32_x(pg, src_vec));
+  }
+}
+
+template <> // Converts n bool values read from src into int64_t values in dst.
+inline void convert(const bool* src, int64_t* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<int64_t>::size(); // elements left after the last full vector
+  svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<int64_t>::size()); // 8-bit-lane predicate for the byte loads
+  svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized<int64_t>::size()); // 64-bit-lane predicate for compare and store
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<int64_t>::size()) { // full-width chunks
+    svuint8_t src_vec_u8 = // the bool buffer is read as raw uint8_t bytes
+        svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
+    svuint64_t src_vec_u64 = // three unpack steps: u8 -> u16 -> u32 -> u64
+        svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8)));
+    svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); // compares each lane against ZERO_U64
+    svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); // stores ONE_S64 where mask is set, else ZERO_S64
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<int64_t>::size()) { // remaining `fraction` elements
+    pg_8 = svwhilelt_b8(i, n); // predicates rebuilt from the range [i, n)
+    pg_64 = svwhilelt_b64(i, n);
+    svuint8_t src_vec_u8 =
+        svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
+    svuint64_t src_vec_u64 =
+        svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8)));
+    svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64);
+    svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64));
+  }
+}
+
+template <> // Converts n bool values read from src into int32_t values in dst.
+inline void convert(const bool* src, int32_t* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<int32_t>::size(); // elements left after the last full vector
+  svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<int32_t>::size()); // 8-bit-lane predicate for the byte loads
+  svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<int32_t>::size()); // 32-bit-lane predicate for compare and store
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<int32_t>::size()) { // full-width chunks
+    svuint8_t src_vec_u8 = // the bool buffer is read as raw uint8_t bytes
+        svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
+    svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); // two unpack steps: u8 -> u16 -> u32
+    svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); // compares each lane against ZERO_U32
+    svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); // stores ONE_S32 where mask is set, else ZERO_S32
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<int32_t>::size()) { // remaining `fraction` elements
+    pg_8 = svwhilelt_b8(i, n); // predicates rebuilt from the range [i, n)
+    pg_32 = svwhilelt_b32(i, n);
+    svuint8_t src_vec_u8 =
+        svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
+    svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
+    svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
+    svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32));
+  }
+}
+
+template <> // Converts n uint8_t values read from src into bool values in dst.
+inline void convert(const uint8_t* src, bool* dst, int64_t n) {
+  const int64_t fraction = n % Vectorized<uint8_t>::size(); // elements left after the last full vector
+  svbool_t pg = svwhilelt_b8(0ull, Vectorized<uint8_t>::size()); // 8-bit-lane predicate shared by load, compare and store
+#pragma unroll
+  for (int64_t i = 0; i < n - fraction; i += Vectorized<uint8_t>::size()) { // full-width chunks
+    svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); // compares each loaded byte against ZERO_U8
+    svst1_u8( // the bool buffer is written as raw uint8_t bytes
+        pg,
+        reinterpret_cast<uint8_t*>(dst) + i,
+        svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); // NOTE(review): constants are defined outside this view; confirm the byte values they hold
+  }
+#pragma unroll
+  for (int64_t i = n - fraction; i < n; i += Vectorized<uint8_t>::size()) { // remaining `fraction` elements
+    pg = svwhilelt_b8(i, n); // predicate rebuilt from the range [i, n)
+    svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8);
+    svst1_u8(
+        pg,
+        reinterpret_cast<uint8_t*>(dst) + i,
+        svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK));
+  }
+}
+
+// Lane-wise left shift of int64_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int64_t> operator<<(
+    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return svlsl_s64_x(ptrue, a, svreinterpret_u64_s64(b));
+}
+
+// Lane-wise left shift of int32_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int32_t> operator<<(
+    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return svlsl_s32_x(ptrue, a, svreinterpret_u32_s32(b));
+}
+
+// Lane-wise left shift of int16_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int16_t> operator<<(
+    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return svlsl_s16_x(ptrue, a, svreinterpret_u16_s16(b));
+}
+
+// Lane-wise left shift of int8_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int8_t> operator<<(
+    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return svlsl_s8_x(ptrue, a, svreinterpret_u8_s8(b));
+}
+
+// Lane-wise right shift (svasr) of int64_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int64_t> operator>>(
+    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return svasr_s64_x(ptrue, a, svreinterpret_u64_s64(b));
+}
+
+// Lane-wise right shift (svasr) of int32_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int32_t> operator>>(
+    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return svasr_s32_x(ptrue, a, svreinterpret_u32_s32(b));
+}
+
+// Lane-wise right shift (svasr) of int16_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int16_t> operator>>(
+    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return svasr_s16_x(ptrue, a, svreinterpret_u16_s16(b));
+}
+
+// Lane-wise right shift (svasr) of int8_t; counts in b are reinterpreted as unsigned.
+template <>
+inline Vectorized<int8_t> operator>>(
+    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
+}
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h
new file mode 100644
index 0000000000000000000000000000000000000000..98d45ba0790f208cb165d29974d99ff1547999b1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/sve/vec_qint.h
@@ -0,0 +1,611 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with SVE]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_SVE)
+
+// NOTE: These are low-performance implementations that keep their values in
+// a std::array and use scalar loops rather than SVE intrinsics, even though
+// this section is only compiled with CPU_CAPABILITY_SVE. This may not be an
+// issue, because they can simply act as a reference implementation.
+//
+// If in the future we need faster quantized kernels on SVE, we should
+// probably revisit these implementations
+
+template <
+    typename T,
+    typename float_vec_return_type_,
+    typename int_vec_return_type_,
+    int size_>
+struct VectorizedQuantizedConverter { // array-backed base providing storage and dequantization for a quantized type T
+  using size_type = int;
+  static constexpr size_type size() { // number of quantized elements held
+    return size_;
+  }
+
+  static constexpr int float_num_vecs() { // size() expressed in units of Vectorized<float>
+    return size() / Vectorized<float>::size();
+  }
+
+  static constexpr int int_num_vecs() { // size() expressed in units of Vectorized<int32_t>
+    return size() / Vectorized<int32_t>::size();
+  }
+
+  using float_vec_return_type = float_vec_return_type_;
+  using int_vec_return_type = int_vec_return_type_;
+
+  using value_type = typename T::underlying;
+  std::array<value_type, size_> vals; // raw underlying values, one per element
+
+  VectorizedQuantizedConverter(T val) { // broadcasts val.val_ into every element
+    for (size_t i = 0; i < size(); ++i) {
+      vals[i] = val.val_;
+    }
+  }
+
+  VectorizedQuantizedConverter(const void* ptr) { // copies size() elements from ptr
+    memcpy(vals.data(), ptr, sizeof(value_type) * size());
+  }
+
+  void store(void* ptr, int count = size()) const { // copies the first `count` elements to ptr
+    memcpy(ptr, vals.data(), count * sizeof(value_type));
+  }
+
+  float_vec_return_type dequantize( // dequantizes every element; scale_zp_premul is not read by this body
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    float_vec_return_type rv;
+    float tmp_scale[Vectorized<float>::size()];
+    float tmp_zero_point[Vectorized<float>::size()];
+    scale.store(tmp_scale); // spill per-lane scale and zero point so they can be indexed as scalars
+    zero_point.store(tmp_zero_point);
+    for (int i = 0; i < float_num_vecs(); ++i) { // one output float vector per iteration
+      float tmp_vals[Vectorized<float>::size()];
+      for (int j = 0; j < Vectorized<float>::size(); ++j) {
+        tmp_vals[j] = at::native::dequantize_val<T>(
+            tmp_scale[j],
+            tmp_zero_point[j],
+            T(vals[Vectorized<float>::size() * i + j])); // element j of the i-th float-vector-sized group
+      }
+      rv[i] = Vectorized<float>::loadu(tmp_vals);
+    }
+    return rv;
+  }
+
+  float_vec_return_type dequantize( // same computation as the three-argument overload above
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    float_vec_return_type rv;
+    float tmp_scale[Vectorized<float>::size()];
+    float tmp_zero_point[Vectorized<float>::size()];
+    scale.store(tmp_scale);
+    zero_point.store(tmp_zero_point);
+    for (int i = 0; i < float_num_vecs(); ++i) {
+      float tmp_vals[Vectorized<float>::size()];
+      for (int j = 0; j < Vectorized<float>::size(); ++j) {
+        tmp_vals[j] = at::native::dequantize_val<T>(
+            tmp_scale[j],
+            tmp_zero_point[j],
+            T(vals[Vectorized<float>::size() * i + j]));
+      }
+      rv[i] = Vectorized<float>::loadu(tmp_vals);
+    }
+    return rv;
+  }
+
+ protected:
+  VectorizedQuantizedConverter() {} // leaves vals uninitialized; only reachable from derived classes
+};
+
+template <> // Sets the is_vec_specialized_for trait to true for c10::qint32.
+struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};
+
+template <> // Vectorized specialization for c10::qint32, built on the array-backed converter base.
+struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
+                                     c10::qint32,
+                                     std::array<Vectorized<float>, 1>,
+                                     std::array<Vectorized<c10::qint32>, 1>,
+                                     VECTOR_WIDTH / 4> {
+  Vectorized() // forwards to the base default constructor, which leaves vals uninitialized
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            VECTOR_WIDTH / 4>() {}
+  Vectorized(c10::qint32 val) // broadcasts val to every element
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            VECTOR_WIDTH / 4>(val) {}
+  Vectorized(const void* ptr) // copies size() elements from ptr
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            VECTOR_WIDTH / 4>(ptr) {}
+#if 1 // array-backed loads are the active branch
+  static Vectorized<c10::qint32> loadu(const void* ptr) { // full load of size() elements
+    return Vectorized<c10::qint32>(ptr);
+  }
+
+  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) { // partial load; elements past `count` are zero
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+#else // disabled alternative that loads with SVE intrinsics
+  static Vectorized<c10::qint32> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size())
+      return svld1_s32(ptrue, reinterpret_cast<const int32_t*>(ptr));
+    svbool_t pg = svwhilelt_b32(0ull, count);
+    return svld1_s32(pg, reinterpret_cast<const int32_t*>(ptr));
+  }
+#endif
+  static Vectorized<c10::qint32> quantize( // quantizes float vectors to qint32; inverse_scale is not read by this body
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (int i = 0; i < float_num_vecs(); ++i) { // flatten the input vectors into one contiguous float buffer
+      rhs[i].store(
+          &float_vals[i * Vectorized<float>::size()],
+          Vectorized<float>::size());
+    }
+
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint32*)qvals.data(),
+        Vectorized<float>::size() * float_num_vecs());
+
+    return Vectorized<c10::qint32>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const { // element-wise max of the raw values
+    Vectorized<c10::qint32> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const { // element-wise min of the raw values
+    Vectorized<c10::qint32> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const { // values below zero_point are raised to it
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::qint32> relu6( // clamps each value between zero_point and q_six; note this member is not const
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) {
+    Vectorized<c10::qint32> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const { // element-wise vals - b.vals into the single result vector
+    int_vec_return_type retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval[0].vals[i] = vals[i] - b.vals[i];
+    }
+    return retval;
+  }
+
+  static Vectorized<c10::qint32> requantize_from_int( // rounds inp * multiplier with nearbyint, then adds zero_point
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized<c10::qint32> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] =
+          nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
+          zero_point;
+    }
+    return retval;
+  }
+};
+
+// Free-function maximum for qint32 vectors; defers to the member function.
+template <>
+inline Vectorized<c10::qint32> maximum(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+// Element-wise product of the raw underlying values of two qint32 vectors.
+template <>
+inline Vectorized<c10::qint32> operator*(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> product;
+  for (size_t lane = 0; lane < Vectorized<c10::qint32>::size(); ++lane) {
+    product.vals[lane] = a.vals[lane] * b.vals[lane];
+  }
+  return product;
+}
+
+// Element-wise sum of the raw underlying values of two qint32 vectors.
+template <>
+inline Vectorized<c10::qint32> operator+(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> sum;
+  for (size_t lane = 0; lane < Vectorized<c10::qint32>::size(); ++lane) {
+    sum.vals[lane] = a.vals[lane] + b.vals[lane];
+  }
+  return sum;
+}
+
+template <> // Sets the is_vec_specialized_for trait to true for c10::qint8.
+struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};
+
+template <> // Vectorized specialization for c10::qint8, built on the array-backed converter base.
+struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
+                                    c10::qint8,
+                                    std::array<Vectorized<float>, 4>,
+                                    std::array<Vectorized<c10::qint32>, 4>,
+                                    VECTOR_WIDTH> {
+  Vectorized() // forwards to the base default constructor, which leaves vals uninitialized
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>() {}
+  Vectorized(c10::qint8 val) // broadcasts val to every element
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>(val) {}
+  Vectorized(const void* ptr) // copies size() elements from ptr
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>(ptr) {}
+
+  static Vectorized<c10::qint8> loadu(const void* ptr) { // full load of size() elements
+    return Vectorized<c10::qint8>(ptr);
+  }
+
+  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) { // partial load; elements past `count` are zero
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+  static Vectorized<c10::qint8> quantize( // quantizes float vectors to qint8; inverse_scale is not read by this body
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (int i = 0; i < float_num_vecs(); ++i) { // flatten the input vectors into one contiguous float buffer
+      rhs[i].store(
+          &float_vals[i * Vectorized<float>::size()],
+          Vectorized<float>::size());
+    }
+
+    at::native::quantize_vec<c10::qint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint8*)qvals.data(),
+        Vectorized<float>::size() * float_num_vecs());
+
+    return Vectorized<c10::qint8>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const { // element-wise max of the raw values
+    Vectorized<c10::qint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const { // element-wise min of the raw values
+    Vectorized<c10::qint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const { // values below zero_point are raised to it
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::qint8> relu6( // clamps each value between zero_point and q_six; note this member is not const
+      Vectorized<c10::qint8> zero_point,
+      Vectorized<c10::qint8> q_six) {
+    Vectorized<c10::qint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const { // element-wise difference, widened to int32_t
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs(); // elements carried by each result vector
+    for (size_t i = 0; i < int_num_vecs(); ++i) {
+      for (size_t j = 0; j < elem_per_int_vec; ++j) {
+        retval[i].vals[j] = // both operands are cast to int32_t before subtracting
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized<c10::qint8> requantize_from_int( // scales, rounds, offsets and clamps int32 inputs back to value_type
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs(); // elements read from each input vector
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::qint8> retval;
+    for (size_t i = 0; i < int_num_vecs(); ++i) {
+      for (size_t j = 0; j < elem_per_int_vec; ++j) {
+        int32_t rounded = // nearbyint of the scaled value, plus zero_point
+            nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] = // clamped to [min_val, max_val] before narrowing
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+// Free-function maximum for qint8 vectors; defers to the member function.
+template <>
+inline Vectorized<c10::qint8> maximum(
+    const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+template <> // Sets the is_vec_specialized_for trait to true for c10::quint8.
+struct is_vec_specialized_for<c10::quint8> : std::bool_constant<true> {};
+
+template <> // Vectorized specialization for c10::quint8, built on the array-backed converter base.
+struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
+                                     c10::quint8,
+                                     std::array<Vectorized<float>, 4>,
+                                     std::array<Vectorized<c10::qint32>, 4>,
+                                     VECTOR_WIDTH> {
+  Vectorized() // forwards to the base default constructor, which leaves vals uninitialized
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>() {}
+  Vectorized(c10::quint8 val) // broadcasts val to every element
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>(val) {}
+  Vectorized(const void* ptr) // copies size() elements from ptr
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            VECTOR_WIDTH>(ptr) {}
+#if 1 // array-backed loads are the active branch
+  static Vectorized<c10::quint8> loadu(const void* ptr) { // full load of size() elements
+    return Vectorized<c10::quint8>(ptr);
+  }
+
+  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) { // partial load; elements past `count` are zero
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+#else // disabled alternative that loads with SVE intrinsics
+  static Vectorized<c10::quint8> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size())
+      return svld1_u8(ptrue, reinterpret_cast<const uint8_t*>(ptr));
+    svbool_t pg = svwhilelt_b8(0ull, count);
+    return svld1_u8(pg, reinterpret_cast<const uint8_t*>(ptr));
+  }
+#endif
+  static Vectorized<c10::quint8> quantize( // quantizes float vectors to quint8; inverse_scale is not read by this body
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (int i = 0; i < float_num_vecs(); ++i) { // flatten the input vectors into one contiguous float buffer
+      rhs[i].store(
+          &float_vals[i * Vectorized<float>::size()],
+          Vectorized<float>::size());
+    }
+
+    at::native::quantize_vec<c10::quint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::quint8*)qvals.data(),
+        Vectorized<float>::size() * float_num_vecs());
+
+    return Vectorized<c10::quint8>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const { // element-wise max of the raw values
+    Vectorized<c10::quint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const { // element-wise min of the raw values
+    Vectorized<c10::quint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const { // values below zero_point are raised to it
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::quint8> relu6( // clamps each value between zero_point and q_six; note this member is not const
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) {
+    Vectorized<c10::quint8> retval;
+    for (size_t i = 0; i < size(); ++i) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const { // element-wise difference, widened to int32_t
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs(); // elements carried by each result vector
+    for (size_t i = 0; i < int_num_vecs(); ++i) {
+      for (size_t j = 0; j < elem_per_int_vec; ++j) {
+        retval[i].vals[j] = // both operands are cast to int32_t before subtracting
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized<c10::quint8> requantize_from_int( // scales, rounds, offsets and clamps int32 inputs back to value_type
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs(); // elements read from each input vector
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::quint8> retval;
+    for (size_t i = 0; i < int_num_vecs(); ++i) {
+      for (size_t j = 0; j < elem_per_int_vec; ++j) {
+        int32_t rounded = // nearbyint of the scaled value, plus zero_point
+            nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] = // clamped to [min_val, max_val] before narrowing
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+// Free-function maximum for quint8 vectors; defers to the member function.
+template <>
+inline Vectorized<c10::quint8> maximum(
+    const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#endif // defined(CPU_CAPABILITY_SVE)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h
new file mode 100644
index 0000000000000000000000000000000000000000..766f980da7088f7f7f830bf84299de836e361837
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128.h
@@ -0,0 +1,22 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// ARM NEON uses 128-bit vector registers.
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+#ifdef __aarch64__
+#if !defined(CPU_CAPABILITY_SVE)
+#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
+#include <ATen/cpu/vec/vec128/vec128_double_neon.h>
+#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
+#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
+#include <ATen/cpu/vec/vec128/vec128_int_aarch64.h>
+#include <ATen/cpu/vec/vec128/vec128_uint_aarch64.h>
+#endif
+
+#include <ATen/cpu/vec/vec128/vec128_convert.h>
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ae7920fa4a90b434bfba8238c96926bcc522f96
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
@@ -0,0 +1,703 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
+#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/bit_cast.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+// Following vec128_half_neon.h, we only support aarch64.
+#if !defined(C10_MOBILE) && defined(__aarch64__)
+#ifdef __BIG_ENDIAN__
+#error "Big endian is not supported."
+#endif
+
+// GCC does not properly optimize bf16 operators, so native bf16 arithmetic is
+// only enabled on clang >= 19. __clang_major__ is tested behind
+// defined(__clang__) so other compilers never evaluate an undefined macro
+// (which would trip -Wundef); the resulting value is unchanged.
+#if defined(__ARM_FEATURE_BF16) && defined(__clang__) && (__clang_major__ >= 19)
+#define BF16_ARITHMETIC_SUPPORTED() 1
+#else
+#define BF16_ARITHMETIC_SUPPORTED() 0
+#endif
+
+// Unlike the float16_t family of types, bfloat16_t is not available
+// when we're not targeting bfloat16 hardware support on some
+// platforms (but not Mac, so we have to be careful not to shadow the
+// definitions in case they are actually there!). (See
+// https://godbolt.org/z/orv6e94n4 ) So, we need to handle it as
+// uint16_t in that case.
+// Generates the at_*_bf16 helper functions on top of the NEON intrinsics named
+// with `vec_suffix`. The reinterpret helpers hand their argument back untouched
+// when the at_bfloat16 vector type is already the uint16 vector type, and call
+// the vreinterpret*_bf16 intrinsics otherwise.
+#define IMPLEMENT_AT_BF16_SHIM(vec_suffix)                               \
+  inline at_bfloat16x4_t at_vget_low_bf16(at_bfloat16x8_t reg) {         \
+    return vget_low_##vec_suffix(reg);                                   \
+  }                                                                      \
+                                                                         \
+  inline at_bfloat16x4_t at_vget_high_bf16(at_bfloat16x8_t reg) {        \
+    return vget_high_##vec_suffix(reg);                                  \
+  }                                                                      \
+                                                                         \
+  inline at_bfloat16x8_t at_vcombine_bf16(                               \
+      at_bfloat16x4_t lo_half, at_bfloat16x4_t hi_half) {                \
+    return vcombine_##vec_suffix(lo_half, hi_half);                      \
+  }                                                                      \
+                                                                         \
+  inline at_bfloat16x8_t at_vdupq_n_bf16(at_bfloat16_t scalar) {         \
+    return vdupq_n_##vec_suffix(scalar);                                 \
+  }                                                                      \
+                                                                         \
+  inline at_bfloat16x8_t at_vld1q_bf16(const at_bfloat16_t* src) {       \
+    return vld1q_##vec_suffix(src);                                      \
+  }                                                                      \
+                                                                         \
+  inline void at_vst1q_bf16(at_bfloat16_t* dst, at_bfloat16x8_t reg) {   \
+    vst1q_##vec_suffix(dst, reg);                                        \
+  }                                                                      \
+                                                                         \
+  template <typename T>                                                  \
+  inline at_bfloat16x8_t at_vreinterpretq_bf16_u16(T bits) {             \
+    if constexpr (std::is_same_v<at_bfloat16x8_t, uint16x8_t>) {         \
+      return bits;                                                       \
+    } else {                                                             \
+      return vreinterpretq_bf16_u16(bits);                               \
+    }                                                                    \
+  }                                                                      \
+  template <typename T>                                                  \
+  inline at_bfloat16x4_t at_vreinterpret_bf16_u16(T bits) {              \
+    if constexpr (std::is_same_v<at_bfloat16x4_t, uint16x4_t>) {         \
+      return bits;                                                       \
+    } else {                                                             \
+      return vreinterpret_bf16_u16(bits);                                \
+    }                                                                    \
+  }                                                                      \
+  template <typename T>                                                  \
+  inline uint16x8_t at_vreinterpretq_u16_bf16(T reg) {                   \
+    if constexpr (std::is_same_v<at_bfloat16x8_t, uint16x8_t>) {         \
+      return reg;                                                        \
+    } else {                                                             \
+      return vreinterpretq_u16_bf16(reg);                                \
+    }                                                                    \
+  }                                                                      \
+  template <typename T>                                                  \
+  inline uint16x4_t at_vreinterpret_u16_bf16(T reg) {                    \
+    if constexpr (std::is_same_v<at_bfloat16x4_t, uint16x4_t>) {         \
+      return reg;                                                        \
+    } else {                                                             \
+      return vreinterpret_u16_bf16(reg);                                 \
+    }                                                                    \
+  }
+
+#ifdef __ARM_FEATURE_BF16 // bf16 NEON types exist: alias them directly
+using at_bfloat16x8_t = bfloat16x8_t;
+using at_bfloat16x4_t = bfloat16x4_t;
+using at_bfloat16_t = bfloat16_t;
+IMPLEMENT_AT_BF16_SHIM(bf16) // shims forward to the *_bf16 intrinsics
+#define at_vsetq_lane_bf16 vsetq_lane_bf16
+#define at_vgetq_lane_bf16 vgetq_lane_bf16
+#else // no bf16 NEON types: carry the raw 16-bit patterns in uint16 lanes
+using at_bfloat16x8_t = uint16x8_t;
+using at_bfloat16x4_t = uint16x4_t;
+using at_bfloat16_t = uint16_t;
+IMPLEMENT_AT_BF16_SHIM(u16) // shims forward to the *_u16 intrinsics
+#define at_vsetq_lane_bf16 vsetq_lane_u16
+#define at_vgetq_lane_bf16 vgetq_lane_u16
+#endif // __ARM_FEATURE_BF16
+
+// One step of a lane-by-lane blend: `index` names the lane to write into the
+// accumulated register and `mask_val` picks which input supplies it. Only the
+// partial specializations on mask_val define impl; the primary template just
+// declares it.
+template <int index, bool mask_val>
+struct BlendBFloat16Regs {
+  static at_bfloat16x8_t impl(
+      const at_bfloat16x8_t& if_clear,
+      const at_bfloat16x8_t& if_set,
+      at_bfloat16x8_t& partial);
+};
+
+// mask_val == true: lane `index` of `b` is written into lane `index` of `res`.
+template <int index>
+struct BlendBFloat16Regs<index, true> {
+  static at_bfloat16x8_t impl(
+      const at_bfloat16x8_t& a,
+      const at_bfloat16x8_t& b,
+      at_bfloat16x8_t& res) {
+    const auto picked = at_vgetq_lane_bf16(b, index);
+    return at_vsetq_lane_bf16(picked, res, index);
+  }
+};
+
+// mask_val == false: lane `index` of `a` is written into lane `index` of `res`.
+template <int index>
+struct BlendBFloat16Regs<index, false> {
+  static at_bfloat16x8_t impl(
+      const at_bfloat16x8_t& a,
+      const at_bfloat16x8_t& b,
+      at_bfloat16x8_t& res) {
+    const auto picked = at_vgetq_lane_bf16(a, index);
+    return at_vsetq_lane_bf16(picked, res, index);
+  }
+};
+
+// Marks c10::BFloat16 as having a dedicated Vectorized specialization.
+template <>
+struct is_vec_specialized_for<c10::BFloat16> : std::true_type {};
+
+// NEON specialization of Vectorized for c10::BFloat16: eight bf16 lanes held in
+// one at_bfloat16x8_t register. Operations without a dedicated bf16 code path
+// widen both halves to float, apply the Vectorized<float> operation, and narrow
+// the results back to bf16.
+template <>
+class Vectorized<c10::BFloat16> : public Vectorized16<
+                                      at_bfloat16x8_t,
+                                      c10::BFloat16,
+                                      BlendBFloat16Regs,
+                                      Vectorized<c10::BFloat16>> {
+  using Base = Vectorized16<
+      at_bfloat16x8_t,
+      c10::BFloat16,
+      BlendBFloat16Regs,
+      Vectorized<c10::BFloat16>>;
+  friend Base;
+  // The free conversion functions use the private convert_* helpers below.
+  friend std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
+      const Vectorized<c10::BFloat16>& a);
+  friend Vectorized<c10::BFloat16> convert_float_bfloat16(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b);
+
+ private:
+  // Scalar fallback: spill both vectors to aligned buffers, apply `f` to each
+  // pair of lanes, and reload the result.
+  Vectorized<c10::BFloat16> map2(
+      const Vectorized<c10::BFloat16>& second,
+      c10::BFloat16 (*const f)(c10::BFloat16, c10::BFloat16)) const {
+    __at_align__ c10::BFloat16 tmp_first[size()];
+    __at_align__ c10::BFloat16 tmp_second[size()];
+    store(tmp_first); // store this to tmp_first
+    second.store(tmp_second);
+    for (const auto i : c10::irange(size())) {
+      tmp_first[i] = f(tmp_first[i], tmp_second[i]);
+    }
+    return loadu(tmp_first);
+  }
+
+  // Widen four bf16 lanes to float. Without the bf16 intrinsics the 16-bit
+  // pattern is zero-extended to 32 bits and shifted into the upper half of the
+  // float bit pattern.
+  static float32x4_t convert_f32_bf16(at_bfloat16x4_t bf16) {
+#ifdef __ARM_FEATURE_BF16
+    return vcvt_f32_bf16(bf16);
+#else
+    int32x4_t shift = vdupq_n_s32(16);
+    return vreinterpretq_f32_u32(vshlq_u32(vmovl_u16(bf16), shift));
+#endif // __ARM_FEATURE_BF16
+  }
+
+  // Narrow four float lanes to bf16. The fallback adds 0x7FFF plus the lowest
+  // kept bit before dropping the low 16 bits (round-to-nearest-even) and
+  // replaces lanes that were NaN with the bf16 pattern 0x7FC0.
+  static at_bfloat16x4_t convert_bf16_f32(const Vectorized<float>& f32) {
+#ifdef __ARM_FEATURE_BF16
+    return vcvt_bf16_f32(f32);
+#else
+    static_assert(std::is_same_v<uint16x4_t, at_bfloat16x4_t>);
+    uint32x4_t as_uint32 = vreinterpretq_u32_f32(f32);
+    uint32x4_t rounding_bias = vaddq_u32(
+        vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)),
+        vdupq_n_u32(0x7FFF));
+    at_bfloat16x4_t rounded =
+        vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16);
+    const auto bf16_nan = vdup_n_u16(0x7FC0);
+    return vbsl_u16(
+        vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded);
+#endif // __ARM_FEATURE_BF16
+  }
+
+  // Apply a nullary Vectorized<float> member function to both float halves and
+  // narrow the two results back into one bf16 vector.
+  Vectorized<c10::BFloat16> map_with_vec_float_method(
+      Vectorized<float> (Vectorized<float>::*m)() const) const {
+    float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values));
+    float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values));
+    Vectorized<float> mv0 = (Vectorized<float>(v00).*m)();
+    Vectorized<float> mv1 = (Vectorized<float>(v01).*m)();
+    at_bfloat16x4_t r00 = convert_bf16_f32(mv0);
+    at_bfloat16x4_t r01 = convert_bf16_f32(mv1);
+    return Vectorized<c10::BFloat16>(at_vcombine_bf16(r00, r01));
+  }
+
+  // Binary counterpart of map_with_vec_float_method: `m` is applied to the
+  // matching float halves of *this and `second`.
+  Vectorized<c10::BFloat16> map2_with_vec_float_method(
+      const Vectorized<c10::BFloat16>& second,
+      Vectorized<float> (Vectorized<float>::*m)(const Vectorized<float>&)
+          const) const {
+    float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values));
+    float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values));
+    float32x4_t second_v00 = convert_f32_bf16(at_vget_low_bf16(second.values));
+    float32x4_t second_v01 = convert_f32_bf16(at_vget_high_bf16(second.values));
+    Vectorized<float> mv0 = (Vectorized<float>(v00).*m)(second_v00);
+    Vectorized<float> mv1 = (Vectorized<float>(v01).*m)(second_v01);
+    at_bfloat16x4_t r00 = convert_bf16_f32(mv0);
+    at_bfloat16x4_t r01 = convert_bf16_f32(mv1);
+    return Vectorized<c10::BFloat16>(at_vcombine_bf16(r00, r01));
+  }
+
+  // Like map2_with_vec_float_method, but for operations whose float result is
+  // a per-lane bitmask: the 32-bit lanes are narrowed to 16 bits as raw bits
+  // instead of going through a float -> bf16 conversion.
+  Vectorized<c10::BFloat16> map2_bitmask_with_vec_float_method(
+      const Vectorized<c10::BFloat16>& second,
+      Vectorized<float> (Vectorized<float>::*m)(const Vectorized<float>&)
+          const) const {
+    float32x4_t v00 = convert_f32_bf16(at_vget_low_bf16(values));
+    float32x4_t v01 = convert_f32_bf16(at_vget_high_bf16(values));
+    float32x4_t second_v00 = convert_f32_bf16(at_vget_low_bf16(second.values));
+    float32x4_t second_v01 = convert_f32_bf16(at_vget_high_bf16(second.values));
+    Vectorized<float> mv0 = (Vectorized<float>(v00).*m)(second_v00);
+    Vectorized<float> mv1 = (Vectorized<float>(v01).*m)(second_v01);
+    // Assume the operator returns a bitmask, not "real" floats, and
+    // just narrow the bits. All-ones is a NaN and will get mangled by
+    // conversion!
+    at_bfloat16x4_t r00 =
+        at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0)));
+    at_bfloat16x4_t r01 =
+        at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1)));
+    return Vectorized<c10::BFloat16>(at_vcombine_bf16(r00, r01));
+  }
+
+ public:
+  using Vectorized16::Vectorized16;
+
+  Vectorized() = default;
+
+  // Broadcast one bf16 value (given as BFloat16 or as float) to all lanes.
+  Vectorized(c10::BFloat16 val)
+      : Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
+  Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
+  // Build a vector from eight explicit lane values, val0 first.
+  Vectorized(
+      value_type val0,
+      value_type val1,
+      value_type val2,
+      value_type val3,
+      value_type val4,
+      value_type val5,
+      value_type val6,
+      value_type val7)
+      : Vectorized16(at_bfloat16x8_t{
+            c10::bit_cast<at_bfloat16_t>(val0.x),
+            c10::bit_cast<at_bfloat16_t>(val1.x),
+            c10::bit_cast<at_bfloat16_t>(val2.x),
+            c10::bit_cast<at_bfloat16_t>(val3.x),
+            c10::bit_cast<at_bfloat16_t>(val4.x),
+            c10::bit_cast<at_bfloat16_t>(val5.x),
+            c10::bit_cast<at_bfloat16_t>(val6.x),
+            c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
+
+  // Bitwise select driven by `mask`: bits set in the mask take the
+  // corresponding bits of `b`, clear bits take those of `a`.
+  static Vectorized<c10::BFloat16> blendv(
+      const Vectorized<c10::BFloat16>& a,
+      const Vectorized<c10::BFloat16>& b,
+      const Vectorized<c10::BFloat16>& mask) {
+    // NOTE: blendv has the same problems as it does for Half; see comments in
+    // vec128_half_neon.h.
+    Vectorized<c10::BFloat16> vec(mask.values);
+    vec.values = at_vreinterpretq_bf16_u16(vbslq_u16(
+        at_vreinterpretq_u16_bf16(vec.values),
+        at_vreinterpretq_u16_bf16(b.values),
+        at_vreinterpretq_u16_bf16(a.values)));
+    return vec;
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`. `count` is not range-checked here.
+  static Vectorized<c10::BFloat16> set(
+      const Vectorized<c10::BFloat16>& a,
+      const Vectorized<c10::BFloat16>& b,
+      int64_t count = size()) {
+    uint16_t pre_mask[size()] = {0};
+    for (int i = 0; i < count; i++) {
+      pre_mask[i] = 0xFFFF;
+    }
+    uint16x8_t mask = vld1q_u16(pre_mask);
+
+    Vectorized<c10::BFloat16> vec(at_vreinterpretq_bf16_u16(vbslq_u16(
+        mask,
+        at_vreinterpretq_u16_bf16(b.values),
+        at_vreinterpretq_u16_bf16(a.values))));
+
+    return vec;
+  }
+  // Load `count` elements from `ptr`. A full-width load reads the register
+  // directly; a partial load goes through a zeroed temporary, so lanes at or
+  // beyond `count` are zero.
+  static Vectorized<c10::BFloat16> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size()) {
+      return at_vld1q_bf16(reinterpret_cast<const at_bfloat16_t*>(ptr));
+    }
+    __at_align__ at_bfloat16_t tmp_values[size()];
+    std::memset(tmp_values, 0, sizeof(tmp_values));
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const at_bfloat16_t*>(ptr),
+        count * sizeof(at_bfloat16_t));
+    return at_vld1q_bf16(reinterpret_cast<const at_bfloat16_t*>(tmp_values));
+  }
+  // Store the first `count` lanes to `ptr`; a partial store spills the whole
+  // register to a temporary and copies only the requested prefix.
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      at_vst1q_bf16(reinterpret_cast<at_bfloat16_t*>(ptr), values);
+      return;
+    } else {
+      at_bfloat16_t tmp_values[size()];
+      at_vst1q_bf16(reinterpret_cast<at_bfloat16_t*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(at_bfloat16_t));
+    }
+  }
+  // Per-lane NaN test returning a bitmask: all bits set for NaN lanes, all
+  // bits clear otherwise.
+  Vectorized<c10::BFloat16> isnan() const {
+    // NOTE: we could make this faster by doing vectorized checks of
+    // exponent/payload bits.
+    __at_align__ c10::BFloat16 tmp[size()];
+    __at_align__ c10::BFloat16 res[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      if (_isnan(tmp[i])) {
+        std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(c10::BFloat16));
+      } else {
+        std::memset(static_cast<void*>(&res[i]), 0, sizeof(c10::BFloat16));
+      }
+    }
+    return loadu(res);
+  }
+  // True when at least one lane is NaN or infinite.
+  bool has_inf_nan() const {
+    __at_align__ c10::BFloat16 tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      if (_isnan(tmp[i]) || _isinf(tmp[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+// Helper macros local to this class body; both are #undef'ed before the class
+// closes so they do not leak into files that include this header.
+#define DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(name)    \
+  Vectorized name() const {                                     \
+    return map_with_vec_float_method(&Vectorized<float>::name); \
+  }
+
+#define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \
+  Vectorized name(const Vectorized& other) const {               \
+    return map2_bitmask_with_vec_float_method(                   \
+        other, &Vectorized<float>::name);                        \
+  }
+
+  // Defined after operator- is available (see the out-of-class definition).
+  Vectorized frac() const;
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
+
+#ifdef __ARM_FEATURE_BF16
+  // Flip sign bit
+  Vectorized<c10::BFloat16> neg() const {
+    return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
+  }
+  // Fast reciprocal is fine because we are truncating results
+  Vectorized<c10::BFloat16> reciprocal() const {
+    auto x = vcvtq_low_f32_bf16(values);
+    auto y = vcvtq_high_f32_bf16(values);
+    x = vrecpeq_f32(x);
+    y = vrecpeq_f32(y);
+    return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
+  }
+  // Clearing the sign bit
+  Vectorized<c10::BFloat16> abs() const {
+    return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
+  }
+#else
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
+#endif
+
+// These functions are optimized on clang-21+
+#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
+  Vectorized<c10::BFloat16> operator==(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values == other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator!=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values != other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator<(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values < other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator<=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values <= other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator>(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values > other.values;
+  }
+
+  Vectorized<c10::BFloat16> operator>=(
+      const Vectorized<c10::BFloat16>& other) const {
+    return values >= other.values;
+  }
+#else
+  // Otherwise compare in float and keep the resulting per-lane bitmask.
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
+  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
+#endif
+
+// Undefine exactly the two helper macros defined above. The comparison macro
+// must be named here: undefining a differently named (never defined) macro
+// would leave it visible to every file that includes this header.
+#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
+#undef DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD
+
+  // Comparison variants defined out of class, once operator& is available.
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+}; // Vectorized<c10::BFloat16>
+
+// Widen one bf16 vector into two float vectors: the first tuple element is
+// converted from the low half of `a`, the second from the high half.
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
+    const Vectorized<c10::BFloat16>& a) {
+  using BF16Vec = Vectorized<c10::BFloat16>;
+  static_assert(BF16Vec::size() == 2 * Vectorized<float>::size());
+  at_bfloat16x8_t raw = a;
+  float32x4_t low_f32 = BF16Vec::convert_f32_bf16(at_vget_low_bf16(raw));
+  float32x4_t high_f32 = BF16Vec::convert_f32_bf16(at_vget_high_bf16(raw));
+  return {Vectorized<float>(low_f32), Vectorized<float>(high_f32)};
+}
+// Narrow two float vectors into one bf16 vector: `a` supplies the low half of
+// the result and `b` the high half.
+inline Vectorized<c10::BFloat16> convert_float_bfloat16(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  using BF16Vec = Vectorized<c10::BFloat16>;
+  static_assert(BF16Vec::size() == 2 * Vectorized<float>::size());
+  at_bfloat16x4_t low_half = BF16Vec::convert_bf16_f32(a);
+  at_bfloat16x4_t high_half = BF16Vec::convert_bf16_f32(b);
+  return BF16Vec(at_vcombine_bf16(low_half, high_half));
+}
+
+// Evaluate a binary bf16 operation in float: widen both operands, apply `op`
+// to the matching low and high halves, then narrow the two results to bf16.
+template <typename Op>
+Vectorized<c10::BFloat16> binary_operator_via_float(
+    Op op,
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  const auto [lhs_low, lhs_high] = convert_bfloat16_float(a);
+  const auto [rhs_low, rhs_high] = convert_bfloat16_float(b);
+  const Vectorized<float> out_low = op(lhs_low, rhs_low);
+  const Vectorized<float> out_high = op(lhs_high, rhs_high);
+  return convert_float_bfloat16(out_low, out_high);
+}
+
+// bf16 addition: a native vector add when BF16_ARITHMETIC_SUPPORTED(),
+// otherwise computed in float and converted back.
+template <>
+Vectorized<c10::BFloat16> inline operator+(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t lhs = a;
+  bfloat16x8_t rhs = b;
+  return lhs + rhs;
+#else
+  return binary_operator_via_float(std::plus<Vectorized<float>>{}, a, b);
+#endif
+}
+
+// bf16 subtraction: a native vector subtract when BF16_ARITHMETIC_SUPPORTED(),
+// otherwise computed in float and converted back.
+template <>
+Vectorized<c10::BFloat16> inline operator-(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t lhs = a;
+  bfloat16x8_t rhs = b;
+  return lhs - rhs;
+#else
+  return binary_operator_via_float(std::minus<Vectorized<float>>{}, a, b);
+#endif
+}
+
+// bf16 multiplication: a native vector multiply when
+// BF16_ARITHMETIC_SUPPORTED(), otherwise computed in float and converted back.
+template <>
+Vectorized<c10::BFloat16> inline operator*(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t lhs = a;
+  bfloat16x8_t rhs = b;
+  return lhs * rhs;
+#else
+  return binary_operator_via_float(std::multiplies<Vectorized<float>>{}, a, b);
+#endif
+}
+
+// bf16 division: a native vector divide when BF16_ARITHMETIC_SUPPORTED(),
+// otherwise computed in float and converted back.
+template <>
+Vectorized<c10::BFloat16> inline operator/(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t numerator = a;
+  bfloat16x8_t denominator = b;
+  return numerator / denominator;
+#else
+  return binary_operator_via_float(std::divides<Vectorized<float>>{}, a, b);
+#endif
+}
+
+// frac: the value minus its truncation. Defined out of class because it needs
+// the bf16 operator- declared above.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::frac() const {
+  const Vectorized<c10::BFloat16> whole_part = this->trunc();
+  return *this - whole_part;
+}
+
+// Element-wise bf16 maximum, evaluated through the float overload of maximum.
+template <>
+Vectorized<c10::BFloat16> inline maximum(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  // The cast selects the Vectorized<float> overload to pass along as the op.
+  using FloatBinaryFn =
+      Vectorized<float> (*)(const Vectorized<float>&, const Vectorized<float>&);
+  return binary_operator_via_float(static_cast<FloatBinaryFn>(&maximum), a, b);
+}
+
+// Element-wise bf16 minimum, evaluated through the float overload of minimum.
+template <>
+Vectorized<c10::BFloat16> inline minimum(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  // The cast selects the Vectorized<float> overload to pass along as the op.
+  using FloatBinaryFn =
+      Vectorized<float> (*)(const Vectorized<float>&, const Vectorized<float>&);
+  return binary_operator_via_float(static_cast<FloatBinaryFn>(&minimum), a, b);
+}
+
+// Clamp `a` to [min, max]: apply the lower bound first, then the upper bound.
+template <>
+Vectorized<c10::BFloat16> inline clamp(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& min,
+    const Vectorized<c10::BFloat16>& max) {
+  const Vectorized<c10::BFloat16> floored = maximum(min, a);
+  return minimum(max, floored);
+}
+
+// Apply only an upper bound to `a`.
+template <>
+Vectorized<c10::BFloat16> inline clamp_max(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& max) {
+  const Vectorized<c10::BFloat16> capped = minimum(max, a);
+  return capped;
+}
+
+// Apply only a lower bound to `a`.
+template <>
+Vectorized<c10::BFloat16> inline clamp_min(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& min) {
+  const Vectorized<c10::BFloat16> floored = maximum(min, a);
+  return floored;
+}
+
+// Bitwise AND of the raw 16-bit lane patterns.
+template <>
+Vectorized<c10::BFloat16> inline operator&(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  const uint16x8_t lhs_bits = at_vreinterpretq_u16_bf16(a);
+  const uint16x8_t rhs_bits = at_vreinterpretq_u16_bf16(b);
+  return Vectorized<c10::BFloat16>(
+      at_vreinterpretq_bf16_u16(vandq_u16(lhs_bits, rhs_bits)));
+}
+
+// Bitwise OR of the raw 16-bit lane patterns.
+template <>
+Vectorized<c10::BFloat16> inline operator|(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  const uint16x8_t lhs_bits = at_vreinterpretq_u16_bf16(a);
+  const uint16x8_t rhs_bits = at_vreinterpretq_u16_bf16(b);
+  return Vectorized<c10::BFloat16>(
+      at_vreinterpretq_bf16_u16(vorrq_u16(lhs_bits, rhs_bits)));
+}
+
+// Bitwise XOR of the raw 16-bit lane patterns.
+template <>
+Vectorized<c10::BFloat16> inline operator^(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b) {
+  const uint16x8_t lhs_bits = at_vreinterpretq_u16_bf16(a);
+  const uint16x8_t rhs_bits = at_vreinterpretq_u16_bf16(b);
+  return Vectorized<c10::BFloat16>(
+      at_vreinterpretq_bf16_u16(veorq_u16(lhs_bits, rhs_bits)));
+}
+
+// operator== masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, matching lanes keep the bit pattern of 1 and the rest are 0.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::eq(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this == other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// operator!= masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, differing lanes keep the bit pattern of 1 and the rest are 0.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::ne(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this != other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// operator> masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, lanes where *this > other keep the bit pattern of 1.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::gt(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this > other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// operator>= masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, lanes where *this >= other keep the bit pattern of 1.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::ge(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this >= other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// operator< masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, lanes where *this < other keep the bit pattern of 1.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::lt(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this < other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// operator<= masked with a vector of ones: given an all-ones / all-zeros
+// comparison mask, lanes where *this <= other keep the bit pattern of 1.
+inline Vectorized<c10::BFloat16> Vectorized<c10::BFloat16>::le(
+    const Vectorized<c10::BFloat16>& other) const {
+  const Vectorized<c10::BFloat16> lane_mask = *this <= other;
+  return lane_mask & Vectorized<c10::BFloat16>(1);
+}
+
+// Computes a * b + c per lane (as a separate multiply and add, not a fused
+// operation).
+template <>
+Vectorized<c10::BFloat16> inline fmadd(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b,
+    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t multiplicand = a;
+  bfloat16x8_t multiplier = b;
+  bfloat16x8_t addend = c;
+  return multiplicand * multiplier + addend;
+#else
+  // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16!  Also,
+  // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
+  // elements, not the bottom and top half, so they don't seem
+  // particularly useful here. Ideally we would include dot product in
+  // the Vectorized interface...
+  const Vectorized<c10::BFloat16> product = a * b;
+  return product + c;
+#endif
+}
+
+// Computes (-a) * b + c per lane.
+template <>
+Vectorized<c10::BFloat16> inline fnmadd(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b,
+    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t multiplicand = a;
+  bfloat16x8_t multiplier = b;
+  bfloat16x8_t addend = c;
+  return (-multiplicand) * multiplier + addend;
+#else
+  // See NOTE [BF16 FMA] above.
+  return -a * b + c;
+#endif
+}
+
+// Computes a * b - c per lane.
+template <>
+Vectorized<c10::BFloat16> inline fmsub(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b,
+    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t multiplicand = a;
+  bfloat16x8_t multiplier = b;
+  bfloat16x8_t subtrahend = c;
+  return multiplicand * multiplier - subtrahend;
+#else
+  // See NOTE [BF16 FMA] above.
+  const Vectorized<c10::BFloat16> product = a * b;
+  return product - c;
+#endif
+}
+
+// Computes (-a) * b - c per lane.
+template <>
+Vectorized<c10::BFloat16> inline fnmsub(
+    const Vectorized<c10::BFloat16>& a,
+    const Vectorized<c10::BFloat16>& b,
+    const Vectorized<c10::BFloat16>& c) {
+#if BF16_ARITHMETIC_SUPPORTED()
+  bfloat16x8_t multiplicand = a;
+  bfloat16x8_t multiplier = b;
+  bfloat16x8_t subtrahend = c;
+  return (-multiplicand) * multiplier - subtrahend;
+#else
+  // See NOTE [BF16 FMA] above.
+  return -a * b - c;
+#endif
+}
+
+#endif // !defined(C10_MOBILE) && defined(__aarch64__)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..da9fb21eb24e3e9ad179fea82ad1ce6d242bc1a3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_convert.h
@@ -0,0 +1,383 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_convert.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
+
+// Enable auto-vectorization for clang-17+
+// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
+#if defined(__clang__) && (__clang_major__ >= 17)
+
+// Converts `n` elements from `src` to `dst` with one static_cast per
+// element. The loop is kept as a plain counted loop over __restrict
+// pointers; per the note above this section it is meant to be
+// auto-vectorized by the compiler.
+template <typename from_type, typename to_type>
+inline void convertImpl(
+    const from_type* __restrict src,
+    to_type* __restrict dst,
+    int64_t n) {
+  // NOTE(review): n is cast to unsigned, so a negative n would become a
+  // very large trip count; callers are presumably expected to pass n >= 0.
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dst[i] = static_cast<to_type>(src[i]);
+  }
+}
+
+// Converts `n` bools from `src` into `dst`, writing to_type(1) for any
+// non-zero source byte and to_type(0) otherwise.
+template <typename to_type>
+inline void convertFromBool(
+    const bool* __restrict src,
+    to_type* __restrict dst,
+    int64_t n) {
+  // The bools are read through a uint8_t pointer and compared against 0, so
+  // any non-zero byte pattern counts as true.
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
+  }
+}
+
+// Converts `n` values from `src` into bools in `dst`: an element becomes
+// true exactly when it compares unequal to from_type(0).
+template <typename from_type>
+inline void convertToBool(
+    const from_type* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  // The destination is written through a uint8_t pointer as the bytes 1 / 0.
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
+  }
+}
+
+// Emits an explicit specialization of convert() for (from_type, to_type)
+// that forwards to convertImpl.
+#define CONVERT_TEMPLATE(from_type, to_type)                           \
+  template <>                                                          \
+  inline void convert(const from_type* src, to_type* dst, int64_t n) { \
+    return convertImpl<from_type, to_type>(src, dst, n);               \
+  }
+
+// Emits convert(const bool*, to_type*, n) forwarding to convertFromBool.
+// Unlike CONVERT_TEMPLATE this expands to a plain overload: there is no
+// `template <>` prefix.
+#define CONVERT_FROM_BOOL_TEMPLATE(to_type)                       \
+  inline void convert(const bool* src, to_type* dst, int64_t n) { \
+    return convertFromBool<to_type>(src, dst, n);                 \
+  }
+
+// Emits convert(const from_type*, bool*, n) forwarding to convertToBool.
+// Like CONVERT_FROM_BOOL_TEMPLATE, this expands to a plain overload.
+#define CONVERT_TO_BOOL_TEMPLATE(from_type)                         \
+  inline void convert(const from_type* src, bool* dst, int64_t n) { \
+    return convertToBool<from_type>(src, dst, n);                   \
+  }
+
+// Instantiations: every pairing of {uint8_t, int8_t, int16_t, int32_t,
+// int64_t, float, double} as source with the same set as destination
+// (uint8_t sources have no int8_t.. restriction; all seven destinations
+// are listed), plus each source type to bool, and bool to each of the
+// seven types.
+CONVERT_TEMPLATE(uint8_t, uint8_t)
+CONVERT_TEMPLATE(uint8_t, int8_t)
+CONVERT_TEMPLATE(uint8_t, int16_t)
+CONVERT_TEMPLATE(uint8_t, int32_t)
+CONVERT_TEMPLATE(uint8_t, int64_t)
+CONVERT_TEMPLATE(uint8_t, float)
+CONVERT_TEMPLATE(uint8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(uint8_t)
+CONVERT_TEMPLATE(int8_t, uint8_t)
+CONVERT_TEMPLATE(int8_t, int8_t)
+CONVERT_TEMPLATE(int8_t, int16_t)
+CONVERT_TEMPLATE(int8_t, int32_t)
+CONVERT_TEMPLATE(int8_t, int64_t)
+CONVERT_TEMPLATE(int8_t, float)
+CONVERT_TEMPLATE(int8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int8_t)
+CONVERT_TEMPLATE(int16_t, uint8_t)
+CONVERT_TEMPLATE(int16_t, int8_t)
+CONVERT_TEMPLATE(int16_t, int16_t)
+CONVERT_TEMPLATE(int16_t, int32_t)
+CONVERT_TEMPLATE(int16_t, int64_t)
+CONVERT_TEMPLATE(int16_t, float)
+CONVERT_TEMPLATE(int16_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int16_t)
+CONVERT_TEMPLATE(int32_t, uint8_t)
+CONVERT_TEMPLATE(int32_t, int8_t)
+CONVERT_TEMPLATE(int32_t, int16_t)
+CONVERT_TEMPLATE(int32_t, int32_t)
+CONVERT_TEMPLATE(int32_t, int64_t)
+CONVERT_TEMPLATE(int32_t, float)
+CONVERT_TEMPLATE(int32_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int32_t)
+CONVERT_TEMPLATE(int64_t, uint8_t)
+CONVERT_TEMPLATE(int64_t, int8_t)
+CONVERT_TEMPLATE(int64_t, int16_t)
+CONVERT_TEMPLATE(int64_t, int32_t)
+CONVERT_TEMPLATE(int64_t, int64_t)
+CONVERT_TEMPLATE(int64_t, float)
+CONVERT_TEMPLATE(int64_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int64_t)
+CONVERT_TEMPLATE(float, uint8_t)
+CONVERT_TEMPLATE(float, int8_t)
+CONVERT_TEMPLATE(float, int16_t)
+CONVERT_TEMPLATE(float, int32_t)
+CONVERT_TEMPLATE(float, int64_t)
+CONVERT_TEMPLATE(float, float)
+CONVERT_TEMPLATE(float, double)
+CONVERT_TO_BOOL_TEMPLATE(float)
+CONVERT_TEMPLATE(double, uint8_t)
+CONVERT_TEMPLATE(double, int8_t)
+CONVERT_TEMPLATE(double, int16_t)
+CONVERT_TEMPLATE(double, int32_t)
+CONVERT_TEMPLATE(double, int64_t)
+CONVERT_TEMPLATE(double, float)
+CONVERT_TEMPLATE(double, double)
+CONVERT_TO_BOOL_TEMPLATE(double)
+CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int16_t)
+CONVERT_FROM_BOOL_TEMPLATE(int32_t)
+CONVERT_FROM_BOOL_TEMPLATE(int64_t)
+CONVERT_FROM_BOOL_TEMPLATE(float)
+CONVERT_FROM_BOOL_TEMPLATE(double)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+// Emits a convert() specialization from at::Half to to_type. The at::Half
+// buffer is reinterpreted as float16_t so that convertImpl performs the
+// cast on the native half type.
+#define CONVERT_FROM_FP16_TEMPLATE(to_type)                            \
+  template <>                                                          \
+  inline void convert(const at::Half* src, to_type* dst, int64_t n) {  \
+    const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src); \
+    return convertImpl<float16_t, to_type>(srcPtr, dst, n);            \
+  }
+
+// Emits a convert() specialization from from_type to at::Half, writing
+// through a float16_t view of the destination buffer.
+#define CONVERT_TO_FP16_TEMPLATE(from_type)                             \
+  template <>                                                           \
+  inline void convert(const from_type* src, at::Half* dst, int64_t n) { \
+    float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);              \
+    return convertImpl<from_type, float16_t>(src, dstPtr, n);           \
+  }
+
+// Instantiations: at::Half to each integer / floating type (including the
+// raw float16_t), and each integer / floating type to at::Half.
+CONVERT_FROM_FP16_TEMPLATE(uint8_t)
+CONVERT_FROM_FP16_TEMPLATE(int8_t)
+CONVERT_FROM_FP16_TEMPLATE(int16_t)
+CONVERT_FROM_FP16_TEMPLATE(int32_t)
+CONVERT_FROM_FP16_TEMPLATE(int64_t)
+CONVERT_FROM_FP16_TEMPLATE(float16_t)
+CONVERT_FROM_FP16_TEMPLATE(float)
+CONVERT_FROM_FP16_TEMPLATE(double)
+CONVERT_TO_FP16_TEMPLATE(uint8_t)
+CONVERT_TO_FP16_TEMPLATE(int8_t)
+CONVERT_TO_FP16_TEMPLATE(int16_t)
+CONVERT_TO_FP16_TEMPLATE(int32_t)
+CONVERT_TO_FP16_TEMPLATE(int64_t)
+CONVERT_TO_FP16_TEMPLATE(float)
+CONVERT_TO_FP16_TEMPLATE(double)
+
+// Converts `n` bools from `src` to at::Half in `dst`: any non-zero source
+// byte becomes 1.0, a zero byte becomes 0. Both buffers are accessed
+// through reinterpreted pointers (uint8_t for the bools, float16_t for the
+// halves).
+inline void convertBoolToFp16Impl(
+    const bool* __restrict src,
+    at::Half* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
+  }
+}
+
+// convert() specialization for bool -> at::Half; forwards to
+// convertBoolToFp16Impl.
+template <>
+inline void convert(const bool* src, at::Half* dst, int64_t n) {
+  return convertBoolToFp16Impl(src, dst, n);
+}
+
+// Converts `n` at::Half values from `src` to bools in `dst`: an element
+// becomes true (byte 1) when its float16_t value compares unequal to 0.0,
+// otherwise false (byte 0).
+inline void convertFp16ToBoolImpl(
+    const at::Half* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
+  }
+}
+
+// convert() specialization for at::Half -> bool; forwards to
+// convertFp16ToBoolImpl.
+template <>
+inline void convert(const at::Half* src, bool* dst, int64_t n) {
+  return convertFp16ToBoolImpl(src, dst, n);
+}
+
+#endif
+
+// Converts `n` BFloat16 values from `src` to to_type in `dst`. Each element
+// is first widened to float and then static_cast to the destination type.
+template <typename to_type>
+inline void convertFromBf16Impl(
+    const c10::BFloat16* __restrict src,
+    to_type* __restrict dst,
+    int64_t n) {
+  const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    // The 16 stored bits become the upper half of a 32-bit float pattern;
+    // the lower 16 bits are zero.
+    uint32_t tmp = static_cast<uint32_t>(srcPtr[i]) << 16;
+    float tmpF;
+    // Bit-copy into a float rather than casting the integer value.
+    __builtin_memcpy(&tmpF, &tmp, sizeof(float));
+    dst[i] = static_cast<to_type>(tmpF);
+  }
+}
+// Emits a convert() specialization from c10::BFloat16 to to_type that
+// forwards to convertFromBf16Impl.
+#define CONVERT_FROM_BF16_TEMPLATE(to_type)                                \
+  template <>                                                              \
+  inline void convert(const c10::BFloat16* src, to_type* dst, int64_t n) { \
+    return convertFromBf16Impl<to_type>(src, dst, n);                      \
+  }
+
+// Instantiations: BFloat16 to each integer / floating type; the float16_t
+// target is only emitted when FP16 vector arithmetic is available.
+CONVERT_FROM_BF16_TEMPLATE(uint8_t)
+CONVERT_FROM_BF16_TEMPLATE(int8_t)
+CONVERT_FROM_BF16_TEMPLATE(int16_t)
+CONVERT_FROM_BF16_TEMPLATE(int32_t)
+CONVERT_FROM_BF16_TEMPLATE(int64_t)
+CONVERT_FROM_BF16_TEMPLATE(float)
+CONVERT_FROM_BF16_TEMPLATE(double)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+CONVERT_FROM_BF16_TEMPLATE(float16_t)
+#endif
+
+#ifdef __ARM_FEATURE_BF16
+
+// clang-[17, 20] crashes when autovectorizing static cast to bf16
+// Below is a workaround to have some vectorization
+// Works decently well for smaller int types
+//
+// Converts `n` elements of from_type in `src` to BFloat16 in `dst`.
+// Elements are processed 16 at a time with explicit NEON conversions; the
+// remaining n % 16 elements are handled by a scalar tail loop.
+template <typename from_type>
+inline void convertToBf16Impl(
+    const from_type* __restrict src,
+    c10::BFloat16* __restrict dst,
+    uint64_t n) {
+  bfloat16_t* dstPtr = reinterpret_cast<bfloat16_t*>(dst);
+  // Largest multiple of 16 that does not exceed n.
+  uint64_t loopBound = n - (n % 16);
+  uint64_t i = 0;
+  for (; i < loopBound; i += 16) {
+    // Fill four float32x4_t registers lane by lane with the source values
+    // cast to float.
+    float32x4_t a, b, c, d;
+    a[0] = static_cast<float>(src[i]);
+    a[1] = static_cast<float>(src[i + 1]);
+    a[2] = static_cast<float>(src[i + 2]);
+    a[3] = static_cast<float>(src[i + 3]);
+    b[0] = static_cast<float>(src[i + 4]);
+    b[1] = static_cast<float>(src[i + 5]);
+    b[2] = static_cast<float>(src[i + 6]);
+    b[3] = static_cast<float>(src[i + 7]);
+    c[0] = static_cast<float>(src[i + 8]);
+    c[1] = static_cast<float>(src[i + 9]);
+    c[2] = static_cast<float>(src[i + 10]);
+    c[3] = static_cast<float>(src[i + 11]);
+    d[0] = static_cast<float>(src[i + 12]);
+    d[1] = static_cast<float>(src[i + 13]);
+    d[2] = static_cast<float>(src[i + 14]);
+    d[3] = static_cast<float>(src[i + 15]);
+
+    // Each store writes 8 bf16 values: the first register of the pair goes
+    // through the "low" conversion and the second through the "high" one.
+    vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b));
+    vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d));
+  }
+
+  // Scalar tail; vectorization, interleaving and unrolling are explicitly
+  // disabled for this loop.
+#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
+  for (; i < n; i++) {
+    float a = static_cast<float>(src[i]);
+    dstPtr[i] = vcvth_bf16_f32(a);
+  }
+}
+
+// Emits a convert() specialization from from_type to c10::BFloat16 that
+// forwards to convertToBf16Impl. The int64_t count is implicitly converted
+// to the uint64_t parameter of convertToBf16Impl.
+#define CONVERT_TO_BF16_TEMPLATE(from_type)                                  \
+  template <>                                                                \
+  inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \
+    return convertToBf16Impl<from_type>(src, dst, n);                        \
+  }
+
+// Instantiated only for the integer types of 32 bits or fewer.
+CONVERT_TO_BF16_TEMPLATE(uint8_t)
+CONVERT_TO_BF16_TEMPLATE(int8_t)
+CONVERT_TO_BF16_TEMPLATE(int16_t)
+CONVERT_TO_BF16_TEMPLATE(int32_t)
+
+#endif
+
+// Converts `n` bools from `src` to BFloat16 in `dst` by writing raw 16-bit
+// patterns: 0x3f80 (1.0) for any non-zero source byte, 0 otherwise.
+inline void convertBoolToBfloat16Impl(
+    const bool* __restrict src,
+    c10::BFloat16* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
+  }
+}
+
+// convert() specialization for bool -> c10::BFloat16; forwards to
+// convertBoolToBfloat16Impl.
+template <>
+inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
+  return convertBoolToBfloat16Impl(src, dst, n);
+}
+
+// Converts `n` BFloat16 values from `src` to bools in `dst` by inspecting
+// the raw 16-bit patterns: an element is false only when every bit other
+// than the sign bit is zero (so both +0 and -0 map to false).
+inline void convertBfloat16ToBoolImpl(
+    const c10::BFloat16* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    // Check if all non-sign bits are 0
+    bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
+    dstPtr[i] = isBf16Zero ? 0 : 1;
+  }
+}
+
+// convert() specialization for c10::BFloat16 -> bool; forwards to
+// convertBfloat16ToBoolImpl.
+template <>
+inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
+  return convertBfloat16ToBoolImpl(src, dst, n);
+}
+
+#endif
+
+// VecConvert from one vector of an 8-bit integer type to ONE float vector.
+// Delegates to convert_int8_half_register_to_float (defined elsewhere);
+// presumably only part of the source register is converted, as the helper
+// name suggests -- confirm against its definition.
+template <typename src_t>
+struct VecConvert<
+    float,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
+    return convert_int8_half_register_to_float(src[0]);
+  }
+};
+// VecConvert from one vector of an 8-bit integer type to TWO float vectors.
+// Delegates to convert_int8_to_float (defined elsewhere), which yields a
+// pair that is repacked into a VectorizedN<float, 2>.
+template <typename src_t>
+struct VecConvert<
+    float,
+    2,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
+    const auto [v0, v1] = convert_int8_to_float(src[0]);
+    return VectorizedN<float, 2>(v0, v1);
+  }
+};
+
+// VecConvert from one BFloat16 vector (8 lanes) to two float vectors.
+// The 16-bit patterns are widened to 32 bits and shifted left by 16 so that
+// they occupy the upper half of each float's bit pattern.
+template <>
+struct VecConvert<float, 2, BFloat16, 1> {
+  static inline VectorizedN<float, 2> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 2> result;
+    // Load the eight raw 16-bit values straight from the source vector.
+    uint16x8_t u16_8 = vld1q_u16(reinterpret_cast<const uint16_t*>(&src[0]));
+    auto u16_low1 = vget_low_u16(u16_8);
+    auto u16_high1 = vget_high_u16(u16_8);
+    // Low four lanes -> result[0], high four lanes -> result[1].
+    float32x4_t f32x4_0 =
+        vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16));
+    float32x4_t f32x4_1 =
+        vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16));
+    result[0] = f32x4_0;
+    result[1] = f32x4_1;
+    return result;
+  }
+};
+// Half register to full register.
+// Converts only the first four BFloat16 lanes of the source vector into a
+// single float vector, using the same widen-and-shift-by-16 bit trick as
+// the two-vector specialization.
+template <>
+struct VecConvert<float, 1, BFloat16, 1> {
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 1> result;
+    // NOTE: despite its name, u16_8 is a uint16x4_t and holds four lanes.
+    uint16x4_t u16_8 = vld1_u16(reinterpret_cast<const uint16_t*>(&src[0]));
+    float32x4_t f32x4_0 =
+        vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16));
+    result[0] = f32x4_0;
+    return result;
+  }
+};
+
+#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..f27f9b272224af260be8b9d25ce1b0f2d2f7be90
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_double_neon.h
@@ -0,0 +1,591 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+#include <cmath>
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+// Trait specialization marking double as having a dedicated Vectorized
+// implementation in this capability namespace.
+template <>
+struct is_vec_specialized_for<double> : std::bool_constant<true> {};
+
+// NEON (aarch64) specialization of Vectorized for double: two lanes held in
+// a single float64x2_t register.
+//
+// Transcendental functions go through USE_SLEEF(sleef_expr, fallback_expr):
+// the first argument is used when SLEEF is enabled, otherwise the scalar
+// fallback is applied lane by lane via map() (unary) or map2() (binary).
+template <>
+class Vectorized<double> {
+ private:
+  float64x2_t values;
+
+ public:
+  using value_type = double;
+  using size_type = int;
+  // Number of double lanes in one vector.
+  static constexpr size_type size() {
+    return 2;
+  }
+  // Default construction zero-fills both lanes.
+  Vectorized() {
+    values = vdupq_n_f64(0.0);
+  }
+  Vectorized(float64x2_t v) : values(v) {}
+  // Broadcasts `val` to both lanes.
+  Vectorized(double val) {
+    values = vdupq_n_f64(val);
+  }
+  // Lane-wise construction; only enabled for exactly size() arguments.
+  template <
+      typename... Args,
+      typename = std::enable_if_t<(sizeof...(Args) == size())>>
+  Vectorized(Args... vals) {
+    __at_align__ double buffer[size()] = {vals...};
+    values = vld1q_f64(buffer);
+  }
+  operator float64x2_t() const {
+    return values;
+  }
+  // Compile-time blend: bit i of `mask` selects lane i from b, otherwise the
+  // lane is taken from a.
+  template <int64_t mask>
+  static Vectorized<double> blend(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b) {
+    // Build an array of flags: each bit of element is 1 if the corresponding
+    // bit in 'mask' is set, 0 otherwise.
+    uint64x2_t maskArray = {
+        (mask & 1ULL) ? 0xFFFFFFFFFFFFFFFF : 0,
+        (mask & 2ULL) ? 0xFFFFFFFFFFFFFFFF : 0};
+    // Use BSL to select elements from b where the mask is 1, else from a
+    return vbslq_f64(maskArray, b.values, a.values);
+  }
+  // Runtime blend: the bit pattern of `mask_` drives a bitwise select
+  // between b (mask bits set) and a (mask bits clear).
+  static Vectorized<double> blendv(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      const Vectorized<double>& mask_) {
+    return vbslq_f64(vreinterpretq_u64_f64(mask_.values), b.values, a.values);
+  }
+  // Returns {base, base + step}.
+  template <typename step_t>
+  static Vectorized<double> arange(
+      double base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return {base, base + static_cast<double>(step)};
+  }
+  // Takes the first `count` lanes from b and the rest from a: count == 0
+  // returns a, count >= 2 returns b, any other count returns {b[0], a[1]}.
+  static inline Vectorized<double> set(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    } else if (count >= 2) {
+      return b;
+    } else {
+      float64x2_t c = {b.values[0], a.values[1]};
+      return c;
+    }
+  }
+  // Loads `count` doubles from `ptr`. count == size() loads both lanes;
+  // count == 1 loads one double and zeroes the upper lane; any other count
+  // yields an all-zero vector without reading memory.
+  static Vectorized<double> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size()) {
+      return vld1q_f64(reinterpret_cast<const double*>(ptr));
+    } else if (count == 1) {
+      float64x1_t x = vld1_f64(reinterpret_cast<const double*>(ptr));
+      float64x1_t z = {0.0};
+      return vcombine_f64(x, z);
+    } else {
+      return vdupq_n_f64(0.0);
+    }
+  }
+  // Stores `count` lanes to `ptr`. Only count == size() and count == 1 write
+  // anything; other counts are a no-op.
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      vst1q_f64(reinterpret_cast<double*>(ptr), values);
+    } else if (count == 1) {
+      vst1_f64(reinterpret_cast<double*>(ptr), vget_low_f64(values));
+    }
+  }
+  // Per-lane indexing is intentionally unavailable.
+  const double& operator[](int idx) const = delete;
+  double& operator[](int idx) = delete;
+  int64_t zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    uint64x2_t cmpReg = vceqzq_f64(values);
+    uint64x2_t mask = {1, 2};
+    uint64x2_t res = vandq_u64(cmpReg, mask);
+    return res[0] | res[1];
+  }
+  // Lane mask that is set where the lane is NaN: the self-equality compare
+  // is inverted, and only NaN compares unequal to itself.
+  Vectorized<double> isnan() const {
+    // NaN check
+    return vreinterpretq_f64_u32(
+        vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, values))));
+  }
+  // True if any lane is +/-inf or NaN: x - x is NaN exactly for those
+  // inputs, so the NaN mask of the difference is tested.
+  bool has_inf_nan() const {
+    Vectorized<double> x = vsubq_f64(values, values);
+    float64x2_t r = x.isnan();
+    uint64x2_t u = vreinterpretq_u64_f64(r);
+    return u[0] | u[1];
+  }
+  // Applies the scalar function `f` to each lane.
+  Vectorized<double> map(double (*f)(double)) const {
+    float64x2_t result;
+    result[0] = f(values[0]);
+    result[1] = f(values[1]);
+    return result;
+  }
+  // Applies the binary scalar function `f` lane-wise to (*this, second).
+  Vectorized<double> map2(
+      const Vectorized<double>& second,
+      double (*const f)(double, double)) const {
+    float64x2_t result;
+    result[0] = f(values[0], second.values[0]);
+    result[1] = f(values[1], second.values[1]);
+    return result;
+  }
+  Vectorized<double> abs() const {
+    return vabsq_f64(values);
+  }
+  // pi for negative lanes, 0 for the others; NaN lanes keep their NaN.
+  Vectorized<double> angle() const {
+    auto zero = Vectorized<double>(0.0);
+    auto pi = Vectorized<double>(c10::pi<double>);
+    auto tmp = blendv(zero, pi, vreinterpretq_f64_u64(vcltzq_f64(values)));
+    return blendv(tmp, *this, isnan());
+  }
+  Vectorized<double> real() const {
+    return *this;
+  }
+  Vectorized<double> imag() const {
+    return Vectorized<double>(0.0);
+  }
+  Vectorized<double> conj() const {
+    return *this;
+  }
+  Vectorized<double> acos() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_acosd2_u10(values)), map(std::acos));
+  }
+  Vectorized<double> acosh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_acoshd2_u10(values)), map(std::acosh));
+  }
+  Vectorized<double> asin() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_asind2_u10(values)), map(std::asin));
+  }
+  Vectorized<double> asinh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_asinhd2_u10(values)), map(std::asinh));
+  }
+  Vectorized<double> atan() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_atand2_u10(values)), map(std::atan));
+  }
+  Vectorized<double> atanh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_atanhd2_u10(values)), map(std::atanh));
+  }
+  // Binary math functions: the non-SLEEF fallback applies the <cmath>
+  // function lane by lane through map2().
+  Vectorized<double> atan2(const Vectorized<double>& b) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_atan2d2_u10(values, b)),
+        map2(b, std::atan2));
+  }
+  Vectorized<double> copysign(const Vectorized<double>& sign) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_copysignd2(values, sign)),
+        map2(sign, std::copysign));
+  }
+  Vectorized<double> erf() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_erfd2_u10(values)), map(std::erf));
+  }
+  Vectorized<double> erfc() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_erfcd2_u15(values)), map(std::erfc));
+  }
+  Vectorized<double> exp() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_expd2_u10(values)), map(std::exp));
+  }
+  Vectorized<double> exp2() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_exp2d2_u10(values)), map(std::exp2));
+  }
+  Vectorized<double> expm1() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_expm1d2_u10(values)), map(std::expm1));
+  }
+  Vectorized<double> fmod(const Vectorized<double>& q) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_fmodd2(values, q)), map2(q, std::fmod));
+  }
+  Vectorized<double> hypot(const Vectorized<double>& b) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_hypotd2_u05(values, b)),
+        map2(b, std::hypot));
+  }
+  Vectorized<double> i0() const {
+    return map(calc_i0);
+  }
+  Vectorized<double> nextafter(const Vectorized<double>& b) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_nextafterd2(values, b)),
+        map2(b, std::nextafter));
+  }
+  Vectorized<double> log() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_logd2_u10(values)), map(std::log));
+  }
+  Vectorized<double> log2() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log2d2_u10(values)), map(std::log2));
+  }
+  Vectorized<double> log10() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log10d2_u10(values)), map(std::log10));
+  }
+  Vectorized<double> log1p() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_log1pd2_u10(values)), map(std::log1p));
+  }
+  // Defined out of line, after operator- is available.
+  Vectorized<double> frac() const;
+  Vectorized<double> sin() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_sind2_u10(values)), map(std::sin));
+  }
+  Vectorized<double> sinh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_sinhd2_u10(values)), map(std::sinh));
+  }
+  Vectorized<double> cos() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_cosd2_u10(values)), map(std::cos));
+  }
+  Vectorized<double> cosh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_coshd2_u10(values)), map(std::cosh));
+  }
+  Vectorized<double> pow(const Vectorized<double>& b) const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_powd2_u10(values, b)), map2(b, std::pow));
+  }
+  Vectorized<double> tan() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_tand2_u10(values)), map(std::tan));
+  }
+  Vectorized<double> tanh() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_tanhd2_u10(values)), map(std::tanh));
+  }
+  Vectorized<double> lgamma() const {
+    return USE_SLEEF(
+        Vectorized<double>(Sleef_lgammad2_u10(values)), map(std::lgamma));
+  }
+  Vectorized<double> erfinv() const {
+    return map(calc_erfinv);
+  }
+  // No reduced-precision variant for double: both forward to exp().
+  Vectorized<double> exp_u20() const {
+    return exp();
+  }
+  Vectorized<double> fexp_u20() const {
+    return exp();
+  }
+  Vectorized<double> i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized<double> digamma() const {
+    return map(calc_digamma);
+  }
+  // Lane-wise calc_igamma(this, x), computed through aligned scratch
+  // buffers.
+  Vectorized<double> igamma(const Vectorized<double>& x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (int64_t i = 0; i < size(); i++) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  // Lane-wise calc_igammac(this, x), computed through aligned scratch
+  // buffers.
+  Vectorized<double> igammac(const Vectorized<double>& x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (int64_t i = 0; i < size(); i++) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> ceil() const {
+    return vrndpq_f64(values);
+  }
+  Vectorized<double> floor() const {
+    return vrndmq_f64(values);
+  }
+  Vectorized<double> neg() const {
+    return vnegq_f64(values);
+  }
+  Vectorized<double> round() const {
+    return vrndiq_f64(values);
+  }
+  Vectorized<double> trunc() const {
+    return vrndq_f64(values);
+  }
+  Vectorized<double> sqrt() const {
+    return vsqrtq_f64(values);
+  }
+  // Computed with a full division rather than a reciprocal estimate.
+  Vectorized<double> reciprocal() const {
+    return vdivq_f64(vdupq_n_f64(1.0), values);
+  }
+  Vectorized<double> rsqrt() const {
+    return vdivq_f64(vdupq_n_f64(1.0), vsqrtq_f64(values));
+  }
+  // Horizontal sum of both lanes.
+  double reduce_add() const {
+    return vaddvq_f64(values);
+  }
+  // Horizontal maximum of both lanes.
+  double reduce_max() const {
+    return vmaxvq_f64(values);
+  }
+  // Comparison operators return the raw per-lane compare mask reinterpreted
+  // as double (not 1.0 / 0.0). The eq()/ne()/gt()/ge()/lt()/le() members
+  // declared below are the numeric variants.
+  Vectorized<double> operator==(const Vectorized<double>& other) const {
+    return Vectorized<double>(
+        vreinterpretq_f64_u64(vceqq_f64(values, other.values)));
+  }
+
+  // Bitwise inverse of the equality mask.
+  Vectorized<double> operator!=(const Vectorized<double>& other) const {
+    float64x2_t r0 = vreinterpretq_f64_u32(
+        vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, other.values))));
+    return Vectorized<double>(r0);
+  }
+
+  Vectorized<double> operator<(const Vectorized<double>& other) const {
+    return Vectorized<double>(
+        vreinterpretq_f64_u64(vcltq_f64(values, other.values)));
+  }
+
+  Vectorized<double> operator<=(const Vectorized<double>& other) const {
+    return Vectorized<double>(
+        vreinterpretq_f64_u64(vcleq_f64(values, other.values)));
+  }
+
+  Vectorized<double> operator>(const Vectorized<double>& other) const {
+    return Vectorized<double>(
+        vreinterpretq_f64_u64(vcgtq_f64(values, other.values)));
+  }
+
+  Vectorized<double> operator>=(const Vectorized<double>& other) const {
+    return Vectorized<double>(
+        vreinterpretq_f64_u64(vcgeq_f64(values, other.values)));
+  }
+
+  // Numeric comparisons; defined out of line once operator& is available.
+  Vectorized<double> eq(const Vectorized<double>& other) const;
+  Vectorized<double> ne(const Vectorized<double>& other) const;
+  Vectorized<double> gt(const Vectorized<double>& other) const;
+  Vectorized<double> ge(const Vectorized<double>& other) const;
+  Vectorized<double> lt(const Vectorized<double>& other) const;
+  Vectorized<double> le(const Vectorized<double>& other) const;
+};
+
+// Lane-wise addition of two double vectors.
+template <>
+Vectorized<double> inline operator+(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vaddq_f64(a, b);
+}
+
+// Lane-wise subtraction (a - b) of two double vectors.
+template <>
+Vectorized<double> inline operator-(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vsubq_f64(a, b);
+}
+
+// Lane-wise multiplication of two double vectors.
+template <>
+Vectorized<double> inline operator*(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vmulq_f64(a, b);
+}
+
+// Lane-wise division (a / b) of two double vectors.
+template <>
+Vectorized<double> inline operator/(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vdivq_f64(a, b);
+}
+
+// frac. Implement this here so we can use subtraction
+// Returns the fractional part of each lane, computed as x - trunc(x).
+Vectorized<double> inline Vectorized<double>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754-2019 `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline maximum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vmaxq_f64(a, b);
+}
+
+// Implements the IEEE 754-2019 `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline minimum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vminq_f64(a, b);
+}
+
+// Lane-wise clamp of `a` to [min, max]: the lower bound is applied first,
+// then the upper bound, i.e. min(max, max(min, a)).
+template <>
+Vectorized<double> inline clamp(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min,
+    const Vectorized<double>& max) {
+  return vminq_f64(max, vmaxq_f64(min, a));
+}
+
+// Lane-wise upper clamp: returns min(max, a).
+template <>
+Vectorized<double> inline clamp_max(
+    const Vectorized<double>& a,
+    const Vectorized<double>& max) {
+  return vminq_f64(max, a);
+}
+
+// Lane-wise lower clamp: returns max(min, a).
+template <>
+Vectorized<double> inline clamp_min(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min) {
+  return vmaxq_f64(min, a);
+}
+
+// Bitwise AND of the raw 64-bit lane patterns of a and b.
+template <>
+Vectorized<double> inline operator&(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vreinterpretq_f64_u64(
+      vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
+
+// Bitwise OR of the raw 64-bit lane patterns of a and b.
+template <>
+Vectorized<double> inline operator|(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vreinterpretq_f64_u64(
+      vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
+
+// Bitwise XOR of the raw 64-bit lane patterns of a and b.
+template <>
+Vectorized<double> inline operator^(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return vreinterpretq_f64_u64(
+      veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
+
+// Numeric equality: the compare mask from operator== is ANDed with the bit
+// pattern of 1.0, giving 1.0 in lanes where it holds and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::eq(
+    const Vectorized<double>& other) const {
+  return (*this == other) & Vectorized<double>(1.0);
+}
+
+// Numeric inequality: the compare mask from operator!= is ANDed with the
+// bit pattern of 1.0, giving 1.0 in lanes where it holds and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::ne(
+    const Vectorized<double>& other) const {
+  return (*this != other) & Vectorized<double>(1.0);
+}
+
+// Numeric greater-than: the compare mask from operator> is ANDed with the
+// bit pattern of 1.0, giving 1.0 in lanes where it holds and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::gt(
+    const Vectorized<double>& other) const {
+  return (*this > other) & Vectorized<double>(1.0);
+}
+
+// Numeric greater-or-equal: the compare mask from operator>= is ANDed with
+// the bit pattern of 1.0, giving 1.0 in lanes where it holds and 0.0
+// elsewhere.
+inline Vectorized<double> Vectorized<double>::ge(
+    const Vectorized<double>& other) const {
+  return (*this >= other) & Vectorized<double>(1.0);
+}
+
+// Numeric less-than: the compare mask from operator< is ANDed with the bit
+// pattern of 1.0, giving 1.0 in lanes where it holds and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::lt(
+    const Vectorized<double>& other) const {
+  return (*this < other) & Vectorized<double>(1.0);
+}
+
+// Numeric less-or-equal: the compare mask from operator<= is ANDed with the
+// bit pattern of 1.0, giving 1.0 in lanes where it holds and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::le(
+    const Vectorized<double>& other) const {
+  return (*this <= other) & Vectorized<double>(1.0);
+}
+
+// Fused multiply-add: returns a * b + c. Note that vfmaq_f64 takes the
+// accumulator (c) as its first argument.
+template <>
+Vectorized<double> inline fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return vfmaq_f64(c, a, b);
+}
+
+// Fused negated multiply-add: returns -(a * b) + c, expressed as
+// vfmsq_f64(c, a, b), i.e. c - a * b.
+template <>
+Vectorized<double> inline fnmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return vfmsq_f64(c, a, b);
+}
+
+// Fused multiply-subtract: returns a * b - c, expressed as a fused
+// multiply-add onto the negated c.
+template <>
+Vectorized<double> inline fmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return vfmaq_f64(vnegq_f64(c), a, b);
+}
+
+// Fused negated multiply-subtract: returns -(a * b) - c, expressed as a
+// fused multiply-subtract from the negated c.
+template <>
+Vectorized<double> inline fnmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return vfmsq_f64(vnegq_f64(c), a, b);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6f047f86fc4f62fc82e24506f688e7d39a92214
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_float_neon.h
@@ -0,0 +1,661 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#endif
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
+
+// Sleef offers vectorized versions of some transcendentals
+// such as sin, cos, tan, etc.
+// However, for now we opt for the STL, since we are not building
+// with Sleef for mobile yet.
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+// Right now this contains only the aarch64 implementation.
+// aarch32 is not currently supported, for the following two reasons:
+// 1. Due to differences in the ISA between aarch32 and aarch64, intrinsics
+//    that work for aarch64 don't work for aarch32.
+// 2. Android NDK r21 has problems with compiling aarch32.
+//    Clang seg faults.
+//    https://github.com/android/ndk/issues/1248
+//    https://bugs.llvm.org/show_bug.cgi?id=45824
+// Most likely we will do aarch32 support with inline asm.
+#if defined(__aarch64__)
+
+#ifdef __BIG_ENDIAN__
+#error "Big endian is not supported."
+#endif
+
+#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
+// Per-lane blend step used by Vectorized<float>::blend. `index` is the lane
+// being written and `mask_val` selects which input supplies it. The primary
+// template only declares impl(); the partial specializations on `mask_val`
+// provide the definitions.
+template <int index, bool mask_val>
+struct BlendRegs {
+  static float32x4_t impl(
+      const float32x4_t& a,
+      const float32x4_t& b,
+      float32x4_t& res);
+};
+
+// mask_val == true: returns a copy of `res` with lane `index` replaced by the
+// same lane of `b`. `a` is unused and `res` itself is not modified.
+template <int index>
+struct BlendRegs<index, true> {
+  static float32x4_t impl(
+      const float32x4_t& a,
+      const float32x4_t& b,
+      float32x4_t& res) {
+    const float picked = vgetq_lane_f32(b, index);
+    return vsetq_lane_f32(picked, res, index);
+  }
+};
+
+// mask_val == false: returns a copy of `res` with lane `index` replaced by the
+// same lane of `a`. `b` is unused and `res` itself is not modified.
+template <int index>
+struct BlendRegs<index, false> {
+  static float32x4_t impl(
+      const float32x4_t& a,
+      const float32x4_t& b,
+      float32x4_t& res) {
+    const float picked = vgetq_lane_f32(a, index);
+    return vsetq_lane_f32(picked, res, index);
+  }
+};
+
+// Trait flag: float has a dedicated Vectorized specialization (defined below).
+template <>
+struct is_vec_specialized_for<float> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<float> {
+ private:
+  float32x4_t values;
+
+ public:
+  using value_type = float;
+  using size_type = int;
+  // Number of float lanes held in `values` (a float32x4_t).
+  static constexpr size_type size() {
+    return 4;
+  }
+  // Default: every lane is zero.
+  Vectorized() : values(vmovq_n_f32(0)) {}
+  // Wrap an existing NEON register.
+  Vectorized(float32x4_t v) : values(v) {}
+  // Broadcast one scalar to every lane.
+  Vectorized(float val) : values(vdupq_n_f32(val)) {}
+  // Lanes 0..3 from four scalars, in order.
+  Vectorized(float val0, float val1, float val2, float val3)
+      : values{val0, val1, val2, val3} {}
+  // Lanes 0..3 from a four-element array.
+  Vectorized(float (&arr)[4]) : Vectorized(arr[0], arr[1], arr[2], arr[3]) {}
+  // Implicit conversion back to the raw NEON register, so a Vectorized<float>
+  // can be passed directly to intrinsics.
+  operator float32x4_t() const {
+    return values;
+  }
+  // Compile-time blend: bit i of `mask` (i = 0..3) selects lane i from `b`
+  // when set and from `a` when clear.
+  template <int64_t mask>
+  static Vectorized<float> blend(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b) {
+    Vectorized<float> result;
+    float32x4_t& lanes = result.values;
+    lanes = BlendRegs<0, (mask & 0x01) != 0>::impl(a.values, b.values, lanes);
+    lanes = BlendRegs<1, (mask & 0x02) != 0>::impl(a.values, b.values, lanes);
+    lanes = BlendRegs<2, (mask & 0x04) != 0>::impl(a.values, b.values, lanes);
+    lanes = BlendRegs<3, (mask & 0x08) != 0>::impl(a.values, b.values, lanes);
+    return result;
+  }
+  // Run-time blend: takes bits from `b` where `mask` has a 1 bit and from `a`
+  // where it has a 0 bit.
+  static Vectorized<float> blendv(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      const Vectorized<float>& mask) {
+    // TODO
+    // NB: vbslq_f32 selects bit by bit, so to pick whole lanes every 32-bit
+    // lane of the mask must be either all zeros or all ones. This is not
+    // checked; an assert would help but would affect performance.
+    const uint32x4_t select_bits = vreinterpretq_u32_f32(mask.values);
+    return Vectorized<float>(vbslq_f32(select_bits, b.values, a.values));
+  }
+  // Returns {base, base + step, base + 2 * step, base + 3 * step}.
+  template <typename step_t>
+  static Vectorized<float> arange(
+      float base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    const Vectorized<float> lane_index(0, 1, 2, 3);
+    const Vectorized<float> stride(step);
+    const Vectorized<float> start(base);
+    // lane_index * stride + start
+    return fmadd(lane_index, stride, start);
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`:
+  //   count == 0            -> a
+  //   count in [1, size())  -> lanes [0, count) from b, the rest from a
+  //   any other count       -> b
+  static Vectorized<float> set(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    }
+    if (count < 0 || count >= size()) {
+      return b;
+    }
+    // Build the lane-select mask in registers rather than keeping
+    // function-local `static` masks: this header must not define static data
+    // (see the note at the top of the file), and a local static may also
+    // require a guarded initialisation check on each call.
+    const uint32x4_t lane_ids = {0, 1, 2, 3};
+    const uint32x4_t take_b =
+        vcltq_u32(lane_ids, vdupq_n_u32(static_cast<uint32_t>(count)));
+    return Vectorized<float>(vbslq_f32(take_b, b.values, a.values));
+  }
+  // Loads `count` floats from `ptr` (no alignment required). With fewer than
+  // size() elements, the remaining lanes are filled with zero.
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
+    const float* src = reinterpret_cast<const float*>(ptr);
+    if (count == size()) {
+      return vld1q_f32(src);
+    }
+    // Partial load: stage through a zeroed buffer so the full-width load
+    // never reads past the caller's data.
+    __at_align__ float padded[size()] = {};
+    std::memcpy(padded, src, count * sizeof(float));
+    return vld1q_f32(padded);
+  }
+  // Writes the first `count` lanes to `ptr`. A partial store is staged
+  // through a temporary so that only `count` floats are written to `ptr`.
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      vst1q_f32(reinterpret_cast<float*>(ptr), values);
+      return;
+    }
+    float spill[size()];
+    vst1q_f32(spill, values);
+    std::memcpy(ptr, spill, count * sizeof(float));
+  }
+  // Very slow lane access: spills the whole register to memory and reads
+  // one element back.
+  // Only required because vec256_qint refers to this.
+  // Once we specialize that implementation for ARM
+  // this should be removed. TODO (kimishpatel)
+  float operator[](int idx) const {
+    __at_align__ float lanes[size()];
+    store(lanes);
+    return lanes[idx];
+  }
+  // Non-const overload; returns by value as well, so it simply forwards to
+  // the const version.
+  float operator[](int idx) {
+    return static_cast<const Vectorized<float>&>(*this)[idx];
+  }
+  // Returns a 4-bit mask in which bit i is set when lane i compares equal to
+  // zero.
+  int zero_mask() const {
+    // Lane i is shifted left by i so that it lands on bit i of the sum.
+    const int32x4_t lane_shift = {0, 1, 2, 3};
+    const uint32x4_t zero_lanes = vceqzq_f32(values);
+    // Reduce each all-ones compare result to a single 1 before shifting.
+    const uint32x4_t lane_flag = vandq_u32(zero_lanes, vdupq_n_u32(1));
+    return vaddvq_u32(vshlq_u32(lane_flag, lane_shift));
+  }
+  // Mask of NaN lanes: a lane is NaN exactly when it does not compare equal
+  // to itself, so invert the self-comparison.
+  Vectorized<float> isnan() const {
+    const uint32x4_t equals_self = vceqq_f32(values, values);
+    return vreinterpretq_f32_u32(vmvnq_u32(equals_self));
+  }
+  // True when at least one lane satisfies _isnan or _isinf. Scalar check on a
+  // spilled copy of the register.
+  bool has_inf_nan() const {
+    __at_align__ float lanes[size()];
+    store(lanes);
+    for (const float lane : lanes) {
+      if (_isnan(lane) || _isinf(lane)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Applies the scalar function `f` to every lane and returns the results as
+  // a new vector. Scalar fallback for operations without a vector form.
+  Vectorized<float> map(float (*const f)(float)) const {
+    __at_align__ float lanes[size()];
+    store(lanes);
+    for (float& lane : lanes) {
+      lane = f(lane);
+    }
+    return loadu(lanes);
+  }
+  // Binary version of map(): lane i of the result is
+  // f(this[i], second[i]).
+  Vectorized<float> map2(
+      const Vectorized<float>& second,
+      float (*const f)(float, float)) const {
+    __at_align__ float lhs[size()];
+    __at_align__ float rhs[size()];
+    store(lhs);
+    second.store(rhs);
+    for (int lane = 0; lane < size(); ++lane) {
+      lhs[lane] = f(lhs[lane], rhs[lane]);
+    }
+    return loadu(lhs);
+  }
+  // Lane-wise absolute value.
+  Vectorized<float> abs() const {
+    const float32x4_t magnitude = vabsq_f32(values);
+    return Vectorized<float>(magnitude);
+  }
+  // Angle of a real number: pi for lanes below zero, 0 for the other
+  // non-NaN lanes, and the input itself (NaN) for NaN lanes.
+  Vectorized<float> angle() const {
+    const Vectorized<float> zero(0);
+    const Vectorized<float> pi(c10::pi<float>);
+    const Vectorized<float> zero_or_pi = blendv(zero, pi, *this < zero);
+    // NaN lanes are taken from *this so the NaN propagates.
+    return blendv(zero_or_pi, *this, isnan());
+  }
+  // Complex-number accessors for a real-valued vector: the real part and the
+  // conjugate are the vector itself, the imaginary part is zero.
+  Vectorized<float> real() const {
+    return *this;
+  }
+  Vectorized<float> imag() const {
+    return Vectorized<float>(0.f);
+  }
+  Vectorized<float> conj() const {
+    return *this;
+  }
+// Defines a const member `name()` that applies a unary math function to every
+// lane. USE_SLEEF picks the implementation at preprocessing time:
+// `sleef_name(values)` when AT_BUILD_ARM_VEC256_WITH_SLEEF is defined,
+// otherwise the scalar `std::name` applied lane by lane through map().
+#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(      \
+    name, sleef_name)                                                        \
+  Vectorized<float> name() const {                                           \
+    return USE_SLEEF(Vectorized<float>(sleef_name(values)), map(std::name)); \
+  }
+
+// Shorthand for functions whose Sleef symbol follows the
+// Sleef_<name>f4_u10 naming pattern.
+#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name)      \
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \
+      name, Sleef_##name##f4_u10)
+
+  // Inverse trigonometric and inverse hyperbolic functions.
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acos)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acosh)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(asin)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(asinh)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atan)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atanh)
+
+// Binary counterpart of the unary macro: defines `name(arg)` as either
+// `sleef_name(values, arg.values)` (Sleef builds) or the scalar `std::name`
+// applied lane by lane through map2().
+#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \
+    name, sleef_name)                                                    \
+  Vectorized<float> name(const Vectorized<float>& arg) const {           \
+    return USE_SLEEF(                                                    \
+        Vectorized<float>(sleef_name(values, arg.values)),               \
+        map2(arg, std::name));                                           \
+  }
+
+// Shorthand for functions whose Sleef symbol follows the
+// Sleef_<name>f4_u10 naming pattern.
+#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(name)      \
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \
+      name, Sleef_##name##f4_u10)
+
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(atan2)
+  // copysign's Sleef symbol has no accuracy suffix, so it is named explicitly.
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
+      copysign,
+      Sleef_copysignf4)
+  // Declared here; defined out of line further down in this header.
+  Vectorized<float> erf() const;
+  // erfc maps to Sleef_erfcf4_u15 instead of the default _u10 naming pattern.
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
+      erfc,
+      Sleef_erfcf4_u15)
+  // No vector implementation: calc_erfinv is applied lane by lane.
+  Vectorized<float> erfinv() const {
+    return map(calc_erfinv);
+  }
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
+  // Fast polynomial exp() on four float lanes.
+  // Implementation copied from Arm Optimized Routine
+  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+  // NOTE(review): the `_u20` suffix presumably denotes the accuracy bound of
+  // the approximation -- confirm against the upstream routine.
+  inline Vectorized<float> vexpq_f32_u20() const {
+    // Special case: if any lane has |input| > 87.3..., fall back to exp() for
+    // the whole vector (Sleef or scalar std::exp, depending on the build).
+    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
+    // vcagtq_f32 compares absolute values: |values| > |special_bound|.
+    uint32x4_t cmp = vcagtq_f32(values, special_bound);
+    // Adding the two 64-bit halves of the mask is non-zero iff any lane hit.
+    if (vpaddd_u64(vreinterpretq_u64_u32(cmp)) != 0) {
+      return exp();
+    }
+
+    const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
+    // ln2 is split into a high and a low part for the range reduction.
+    const float ln2_hi = 0x1.62e4p-1f;
+    const float ln2_lo = 0x1.7f7d1cp-20f;
+    const float c0 = 0x1.0e4020p-7f;
+    const float c2 = 0x1.555e66p-3f;
+    // Packed into one register so the *_laneq_* intrinsics below can pick
+    // each scalar by lane index (0: ln2_hi, 1: ln2_lo, 2: c0, 3: c2).
+    const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};
+
+    // Bit pattern of 1.0f; adding (n << 23) to it yields 2^n.
+    const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
+    const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
+    const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
+    const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);
+
+    /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+
+    // n = round(x / ln2); r = x - n * ln2_hi - n * ln2_lo.
+    float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2));
+    float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0);
+    r = vfmsq_laneq_f32(r, n, ln2_c02, 1);
+    // scale = 2^n, built by placing n in the float exponent field.
+    uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23);
+    float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias));
+
+    // poly(r) = c4 * r + (c3 + c2 * r + (c1 + c0 * r) * r^2) * r^2
+    float32x4_t r2 = vmulq_f32(r, r);
+    float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2);
+    float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3);
+    q = vfmaq_f32(q, p, r2);
+    p = vmulq_f32(c4, r);
+    float32x4_t poly = vfmaq_f32(p, q, r2);
+
+    // scale * (1 + poly) expressed as a single fused multiply-add.
+    return vfmaq_f32(scale, poly, scale);
+  }
+  // exp_u20 forwards to the polynomial kernel vexpq_f32_u20().
+  Vectorized<float> exp_u20() const {
+    return vexpq_f32_u20();
+  }
+  // fexp_u20 forwards to exp_u20().
+  Vectorized<float> fexp_u20() const {
+    return exp_u20();
+  }
+  // fmod and hypot name their Sleef symbols explicitly because they do not
+  // follow the default _u10 pattern.
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
+      fmod,
+      Sleef_fmodf4)
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
+      hypot,
+      Sleef_hypotf4_u05)
+  // The special functions below have no vector implementation here; the
+  // scalar calc_* helpers are applied lane by lane via map() / map2().
+  Vectorized<float> i0() const {
+    return map(calc_i0);
+  }
+  Vectorized<float> i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized<float> digamma() const {
+    return map(calc_digamma);
+  }
+  // *this supplies the first argument of calc_igamma and `x` the second.
+  Vectorized<float> igamma(const Vectorized<float>& x) const {
+    return map2(x, calc_igamma);
+  }
+  // *this supplies the first argument of calc_igammac and `x` the second.
+  Vectorized<float> igammac(const Vectorized<float>& x) const {
+    return map2(x, calc_igammac);
+  }
+  // Logarithms (Sleef or scalar std:: fallback, see the macro definition).
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log10)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log1p)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log2)
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
+      nextafter,
+      Sleef_nextafterf4)
+  // Declared here; defined after operator- so it can use subtraction.
+  Vectorized<float> frac() const;
+  // Trigonometric and hyperbolic functions.
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sin)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sinh)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cos)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cosh)
+  // ceil and floor apply the scalar at::native helpers lane by lane.
+  Vectorized<float> ceil() const {
+    return map(at::native::ceil_impl);
+  }
+  Vectorized<float> floor() const {
+    return map(at::native::floor_impl);
+  }
+  // Lane-wise negation.
+  Vectorized<float> neg() const {
+    return Vectorized<float>(vnegq_f32(values));
+  }
+  Vectorized<float> round() const {
+    // We do not use std::round because we would like to round midway numbers to
+    // the nearest even integer.
+    return map(at::native::round_impl);
+  }
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tan)
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tanh)
+  // Lane-wise rounding via vrndq_f32 (round toward zero).
+  Vectorized<float> trunc() const {
+    return Vectorized<float>(vrndq_f32(values));
+  }
+  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(lgamma)
+  // Lane-wise square root.
+  Vectorized<float> sqrt() const {
+    const float32x4_t roots = vsqrtq_f32(values);
+    return Vectorized<float>(roots);
+  }
+  // Lane-wise 1 / x, computed with a full division.
+  Vectorized<float> reciprocal() const {
+    const float32x4_t ones = vdupq_n_f32(1.0f);
+    return Vectorized<float>(vdivq_f32(ones, values));
+  }
+  // Lane-wise 1 / sqrt(x), composed from sqrt() and reciprocal().
+  Vectorized<float> rsqrt() const {
+    const Vectorized<float> root = this->sqrt();
+    return root.reciprocal();
+  }
+  DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(pow)
+  // Lane-wise comparisons. Each returns the NEON compare result reinterpreted
+  // as floats: a lane has all bits set where the comparison holds and all
+  // bits clear where it does not.
+  Vectorized<float> operator==(const Vectorized<float>& other) const {
+    const uint32x4_t mask = vceqq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(mask));
+  }
+
+  Vectorized<float> operator!=(const Vectorized<float>& other) const {
+    // There is no "not equal" compare here; invert the equality mask.
+    const uint32x4_t equal_mask = vceqq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(vmvnq_u32(equal_mask)));
+  }
+
+  Vectorized<float> operator<(const Vectorized<float>& other) const {
+    const uint32x4_t mask = vcltq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(mask));
+  }
+
+  Vectorized<float> operator<=(const Vectorized<float>& other) const {
+    const uint32x4_t mask = vcleq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(mask));
+  }
+
+  Vectorized<float> operator>(const Vectorized<float>& other) const {
+    const uint32x4_t mask = vcgtq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(mask));
+  }
+
+  Vectorized<float> operator>=(const Vectorized<float>& other) const {
+    const uint32x4_t mask = vcgeq_f32(values, other.values);
+    return Vectorized<float>(vreinterpretq_f32_u32(mask));
+  }
+
+  // Numeric (1.0f / 0.0f) variants of the comparisons above. Defined out of
+  // line, after operator& for Vectorized<float> is available.
+  Vectorized<float> eq(const Vectorized<float>& other) const;
+  Vectorized<float> ne(const Vectorized<float>& other) const;
+  Vectorized<float> gt(const Vectorized<float>& other) const;
+  Vectorized<float> ge(const Vectorized<float>& other) const;
+  Vectorized<float> lt(const Vectorized<float>& other) const;
+  Vectorized<float> le(const Vectorized<float>& other) const;
+};
+
+// Lane-wise addition.
+template <>
+Vectorized<float> inline operator+(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t sum = vaddq_f32(a, b);
+  return Vectorized<float>(sum);
+}
+
+// Lane-wise subtraction.
+template <>
+Vectorized<float> inline operator-(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t difference = vsubq_f32(a, b);
+  return Vectorized<float>(difference);
+}
+
+// Lane-wise multiplication.
+template <>
+Vectorized<float> inline operator*(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t product = vmulq_f32(a, b);
+  return Vectorized<float>(product);
+}
+
+// Lane-wise division.
+template <>
+Vectorized<float> inline operator/(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t quotient = vdivq_f32(a, b);
+  return Vectorized<float>(quotient);
+}
+
+// frac: x - trunc(x). Implemented here, outside the class, so that
+// operator- for Vectorized<float> is already available.
+inline Vectorized<float> Vectorized<float>::frac() const {
+  const Vectorized<float> whole = this->trunc();
+  return *this - whole;
+}
+
+// Lane-wise maximum via vmaxq_f32. Per the Arm definition of that
+// instruction, a NaN in either input lane produces NaN.
+template <>
+Vectorized<float> inline maximum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t larger = vmaxq_f32(a, b);
+  return Vectorized<float>(larger);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<float> inline minimum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const float32x4_t smaller = vminq_f32(a, b);
+  return Vectorized<float>(smaller);
+}
+
+// Clamps every lane of `a` to [min, max]: the lower bound is applied first,
+// then the upper bound.
+template <>
+Vectorized<float> inline clamp(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min,
+    const Vectorized<float>& max) {
+  const Vectorized<float> lower_bounded = maximum(min, a);
+  return minimum(max, lower_bounded);
+}
+
+// Caps every lane of `a` at `max`.
+template <>
+Vectorized<float> inline clamp_max(
+    const Vectorized<float>& a,
+    const Vectorized<float>& max) {
+  const Vectorized<float> capped = minimum(max, a);
+  return capped;
+}
+
+// Raises every lane of `a` to at least `min`.
+template <>
+Vectorized<float> inline clamp_min(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min) {
+  const Vectorized<float> floored = maximum(min, a);
+  return floored;
+}
+
+// Bitwise operators. NEON has no bitwise ops on float vectors, so both
+// operands are reinterpreted as uint32 lanes, combined, and reinterpreted
+// back. No numeric conversion takes place.
+template <>
+Vectorized<float> inline operator&(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const uint32x4_t lhs_bits = vreinterpretq_u32_f32(a);
+  const uint32x4_t rhs_bits = vreinterpretq_u32_f32(b);
+  return Vectorized<float>(vreinterpretq_f32_u32(vandq_u32(lhs_bits, rhs_bits)));
+}
+
+template <>
+Vectorized<float> inline operator|(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const uint32x4_t lhs_bits = vreinterpretq_u32_f32(a);
+  const uint32x4_t rhs_bits = vreinterpretq_u32_f32(b);
+  return Vectorized<float>(vreinterpretq_f32_u32(vorrq_u32(lhs_bits, rhs_bits)));
+}
+
+template <>
+Vectorized<float> inline operator^(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const uint32x4_t lhs_bits = vreinterpretq_u32_f32(a);
+  const uint32x4_t rhs_bits = vreinterpretq_u32_f32(b);
+  return Vectorized<float>(vreinterpretq_f32_u32(veorq_u32(lhs_bits, rhs_bits)));
+}
+
+// Numeric comparison helpers: the all-ones / all-zeros mask produced by the
+// matching comparison operator is ANDed with the bit pattern of 1.0f, so each
+// lane becomes 1.0f where the comparison holds and 0.0f where it does not.
+inline Vectorized<float> Vectorized<float>::eq(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this == other;
+  return mask & one;
+}
+
+inline Vectorized<float> Vectorized<float>::ne(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this != other;
+  return mask & one;
+}
+
+inline Vectorized<float> Vectorized<float>::gt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this > other;
+  return mask & one;
+}
+
+inline Vectorized<float> Vectorized<float>::ge(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this >= other;
+  return mask & one;
+}
+
+inline Vectorized<float> Vectorized<float>::lt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this < other;
+  return mask & one;
+}
+
+inline Vectorized<float> Vectorized<float>::le(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> mask = *this <= other;
+  return mask & one;
+}
+
+// Fused multiply-add on float lanes: a * b + c.
+template <>
+Vectorized<float> inline fmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  // vfmaq_f32 takes the accumulator first: acc + x * y.
+  const float32x4_t fused = vfmaq_f32(c, a, b);
+  return Vectorized<float>(fused);
+}
+
+// Negated fused multiply-add on float lanes: -(a * b) + c.
+template <>
+Vectorized<float> inline fnmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  // vfmsq_f32 takes the accumulator first: acc - x * y.
+  const float32x4_t fused = vfmsq_f32(c, a, b);
+  return Vectorized<float>(fused);
+}
+
+// Fused multiply-subtract on float lanes: a * b - c.
+template <>
+Vectorized<float> inline fmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  // Fold the negation into the addend, as the Vectorized<double>
+  // specialization does, instead of negating the fused result. Negating
+  // afterwards, -(c - a * b), returns -0.0 when a * b == c, whereas
+  // a * b - c is +0.0 under round-to-nearest.
+  return Vectorized<float>(vfmaq_f32(vnegq_f32(c), a, b));
+}
+
+// Negated fused multiply-subtract on float lanes: -(a * b) - c.
+template <>
+Vectorized<float> inline fnmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  // Fold the negation into the addend, as the Vectorized<double>
+  // specialization does, instead of negating the fused result. Negating
+  // afterwards, -(c + a * b), returns -0.0 when a * b == -c, whereas
+  // -(a * b) - c is +0.0 under round-to-nearest.
+  return Vectorized<float>(vfmsq_f32(vnegq_f32(c), a, b));
+}
+
+// erf via a rational approximation (the coefficients match Abramowitz &
+// Stegun formula 7.1.26):
+//   erf(x) = sign(x) * (1 - r(t) * t * exp(-x * x)),  t = 1 / (1 + p * |x|)
+// where r is a degree-4 polynomial in t.
+inline Vectorized<float> Vectorized<float>::erf() const {
+  // Sign-bit mask (bit pattern of -0.0f) and approximation coefficients.
+  const Vectorized<float> sign_bit(-0.f);
+  const Vectorized<float> one(1.0f);
+  const Vectorized<float> p(0.3275911f);
+  const Vectorized<float> a1(0.254829592f);
+  const Vectorized<float> a2(-0.284496736f);
+  const Vectorized<float> a3(1.421413741f);
+  const Vectorized<float> a4(-1.453152027f);
+  const Vectorized<float> a5(1.061405429f);
+  // Keep only the sign of x; it is re-applied to the result at the end.
+  auto x_sign = sign_bit & *this;
+  auto x_abs = this->abs();
+  // t = 1 / (p * |x| + 1)
+  auto denom = fmadd(p, x_abs, one);
+  auto t = one / denom;
+  // r = a5 * t^4 + a4 * t^3 + a3 * t^2 + a2 * t + a1, by Horner's scheme
+  auto r = fmadd(a5, t, a4);
+  r = fmadd(r, t, a3);
+  r = fmadd(r, t, a2);
+  r = fmadd(r, t, a1);
+  // -exp(-x * x): XOR with the sign bit flips the sign of every lane.
+  auto x_squared = (*this) * (*this);
+  auto neg_x_squared = x_squared ^ sign_bit;
+  auto exp_term = neg_x_squared.vexpq_f32_u20();
+  auto neg_exp_term = exp_term ^ sign_bit;
+  // 1 - r * t * exp(-x * x), evaluated as (t * -exp(-x * x)) * r + 1
+  auto scaled = t * neg_exp_term;
+  auto magnitude = fmadd(scaled, r, one);
+  // erf is odd: restore the sign of the input.
+  return magnitude ^ x_sign;
+}
+#undef DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC
+#undef DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC
+#endif /* defined(__aarch64__) */
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d5a95e2fc54ae704bb019f50ae8347a6be93938
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_int_aarch64.h
@@ -0,0 +1,799 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+
+// VEC_INT_NEON_TEMPLATE(vl, bit) stamps out the Vectorized<int{bit}_t>
+// specialization, which wraps one NEON value of type int{bit}x{vl}_t, and
+// marks the element type via is_vec_specialized_for.
+//
+// Inside the class, the simple members are defined inline (abs, neg,
+// reduce_add, blendv, and the ==, <, <=, >, >= operators); the scalar
+// constructor, loadu, store, blend, arange, set, reduce_max and operator!=
+// are only declared here and are defined further down in this header.
+//
+// The comparison operators return the raw NEON compare result reinterpreted
+// as signed lanes, while eq/ne/gt/ge/lt/le AND that result with 1.
+//
+// The macro also emits the operator+, -, &, |, ^ specializations.
+// NOTE: comments placed inside the macro body must use /* */ because every
+// line ends in a continuation backslash.
+#define VEC_INT_NEON_TEMPLATE(vl, bit)                                        \
+  template <>                                                                 \
+  struct is_vec_specialized_for<int##bit##_t> : std::bool_constant<true> {};  \
+                                                                              \
+  template <>                                                                 \
+  class Vectorized<int##bit##_t> {                                            \
+    using neon_type = int##bit##x##vl##_t;                                    \
+                                                                              \
+   private:                                                                   \
+    neon_type values;                                                         \
+                                                                              \
+   public:                                                                    \
+    using value_type = int##bit##_t;                                          \
+    using size_type = int;                                                    \
+    static constexpr size_type size() {                                       \
+      return vl;                                                              \
+    }                                                                         \
+    Vectorized() {                                                            \
+      values = vdupq_n_s##bit(0);                                             \
+    }                                                                         \
+    Vectorized(neon_type v) : values(v) {}                                    \
+    Vectorized(int##bit##_t val);                                             \
+    template <                                                                \
+        typename... Args,                                                     \
+        typename = std::enable_if_t<(sizeof...(Args) == size())>>             \
+    Vectorized(Args... vals) {                                                \
+      __at_align__ int##bit##_t buffer[size()] = {vals...};                   \
+      values = vld1q_s##bit(buffer);                                          \
+    }                                                                         \
+    operator neon_type() const {                                              \
+      return values;                                                          \
+    }                                                                         \
+    static Vectorized<int##bit##_t> loadu(                                    \
+        const void* ptr,                                                      \
+        int64_t count = size());                                              \
+    void store(void* ptr, int64_t count = size()) const;                      \
+    template <int64_t mask>                                                   \
+    static Vectorized<int##bit##_t> blend(                                    \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b);                                   \
+    static Vectorized<int##bit##_t> blendv(                                   \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b,                                    \
+        const Vectorized<int##bit##_t>& mask_) {                              \
+      return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \
+    }                                                                         \
+    template <typename step_t>                                                \
+    static Vectorized<int##bit##_t> arange(                                   \
+        value_type base = 0,                                                  \
+        step_t step = static_cast<step_t>(1));                                \
+    static Vectorized<int##bit##_t> set(                                      \
+        const Vectorized<int##bit##_t>& a,                                    \
+        const Vectorized<int##bit##_t>& b,                                    \
+        int64_t count = size());                                              \
+    const int##bit##_t& operator[](int idx) const = delete;                   \
+    int##bit##_t& operator[](int idx) = delete;                               \
+    Vectorized<int##bit##_t> abs() const {                                    \
+      return vabsq_s##bit(values);                                            \
+    }                                                                         \
+    Vectorized<int##bit##_t> real() const {                                   \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<int##bit##_t> imag() const {                                   \
+      return vdupq_n_s##bit(0);                                               \
+    }                                                                         \
+    Vectorized<int##bit##_t> conj() const {                                   \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<int##bit##_t> neg() const {                                    \
+      return vnegq_s##bit(values);                                            \
+    }                                                                         \
+    int##bit##_t reduce_add() const {                                         \
+      return vaddvq_s##bit(values);                                           \
+    }                                                                         \
+    int##bit##_t reduce_max() const;                                          \
+    Vectorized<int##bit##_t> operator==(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      return Vectorized<value_type>(                                          \
+          vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator!=(                                      \
+        const Vectorized<int##bit##_t>& other) const;                         \
+    Vectorized<int##bit##_t> operator<(                                       \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      return Vectorized<value_type>(                                          \
+          vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator<=(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      return Vectorized<value_type>(                                          \
+          vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator>(                                       \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      return Vectorized<value_type>(                                          \
+          vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \
+    }                                                                         \
+    Vectorized<int##bit##_t> operator>=(                                      \
+        const Vectorized<int##bit##_t>& other) const {                        \
+      return Vectorized<value_type>(                                          \
+          vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \
+    }                                                                         \
+    Vectorized<int##bit##_t> eq(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> ne(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> gt(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> ge(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> lt(const Vectorized<int##bit##_t>& other) const; \
+    Vectorized<int##bit##_t> le(const Vectorized<int##bit##_t>& other) const; \
+  };                                                                          \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator+(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return vaddq_s##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator-(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return vsubq_s##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator&(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return vandq_s##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator|(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return vorrq_s##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<int##bit##_t> inline operator^(                                  \
+      const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
+    return veorq_s##bit(a, b);                                                \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::eq(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this == other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ne(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this != other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::gt(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this > other) & Vectorized<int##bit##_t>(1);                     \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ge(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this >= other) & Vectorized<int##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::lt(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this < other) & Vectorized<int##bit##_t>(1);                     \
+  }                                                                           \
+  Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::le(               \
+      const Vectorized<int##bit##_t>& other) const {                          \
+    return (*this <= other) & Vectorized<int##bit##_t>(1);                    \
+  }
+
+// Instantiate the specialization for each signed lane width (lanes, bits).
+VEC_INT_NEON_TEMPLATE(2, 64)
+VEC_INT_NEON_TEMPLATE(4, 32)
+VEC_INT_NEON_TEMPLATE(8, 16)
+VEC_INT_NEON_TEMPLATE(16, 8)
+
+// Horizontal maximum over all int32 lanes (vmaxvq_s32).
+inline int32_t Vectorized<int32_t>::reduce_max() const {
+  const int32_t largest = vmaxvq_s32(values);
+  return largest;
+}
+
+// Horizontal maximum over all int16 lanes (vmaxvq_s16).
+inline int16_t Vectorized<int16_t>::reduce_max() const {
+  const int16_t largest = vmaxvq_s16(values);
+  return largest;
+}
+
+// Horizontal maximum over all int8 lanes (vmaxvq_s8).
+inline int8_t Vectorized<int8_t>::reduce_max() const {
+  const int8_t largest = vmaxvq_s8(values);
+  return largest;
+}
+
+// Lane-wise product of two int32 vectors (vmulq_s32).
+template <>
+Vectorized<int32_t> inline operator*(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t lhs = a;
+  const int32x4_t rhs = b;
+  return vmulq_s32(lhs, rhs);
+}
+
+// Lane-wise product of two int16 vectors (vmulq_s16).
+template <>
+Vectorized<int16_t> inline operator*(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const int16x8_t lhs = a;
+  const int16x8_t rhs = b;
+  return vmulq_s16(lhs, rhs);
+}
+
+// Lane-wise product of two int8 vectors (vmulq_s8).
+template <>
+Vectorized<int8_t> inline operator*(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const int8x16_t lhs = a;
+  const int8x16_t rhs = b;
+  return vmulq_s8(lhs, rhs);
+}
+
+// Bitwise NOT of every int64 lane, using the compiler's vector `~` operator
+// on the raw int64x2_t value.
+template <>
+inline Vectorized<int64_t> operator~(const Vectorized<int64_t>& a) {
+  const int64x2_t bits = a;
+  const int64x2_t flipped = ~bits;
+  return flipped;
+}
+
+// Bitwise NOT of every int32 lane (vmvnq_s32).
+template <>
+inline Vectorized<int32_t> operator~(const Vectorized<int32_t>& a) {
+  const int32x4_t bits = a;
+  return vmvnq_s32(bits);
+}
+
+// Bitwise NOT of every int16 lane (vmvnq_s16).
+template <>
+inline Vectorized<int16_t> operator~(const Vectorized<int16_t>& a) {
+  const int16x8_t bits = a;
+  return vmvnq_s16(bits);
+}
+
+// Bitwise NOT of every int8 lane (vmvnq_s8).
+template <>
+inline Vectorized<int8_t> operator~(const Vectorized<int8_t>& a) {
+  const int8x16_t bits = a;
+  return vmvnq_s8(bits);
+}
+
+// Lane-wise inequality, computed as the bitwise complement of operator==.
+inline Vectorized<int64_t> Vectorized<int64_t>::operator!=(
+    const Vectorized<int64_t>& other) const {
+  const Vectorized<int64_t> equal_mask = (*this == other);
+  return ~equal_mask;
+}
+
+// Lane-wise inequality, computed as the bitwise complement of operator==.
+inline Vectorized<int32_t> Vectorized<int32_t>::operator!=(
+    const Vectorized<int32_t>& other) const {
+  const Vectorized<int32_t> equal_mask = (*this == other);
+  return ~equal_mask;
+}
+
+// Lane-wise inequality, computed as the bitwise complement of operator==.
+inline Vectorized<int16_t> Vectorized<int16_t>::operator!=(
+    const Vectorized<int16_t>& other) const {
+  const Vectorized<int16_t> equal_mask = (*this == other);
+  return ~equal_mask;
+}
+
+// Lane-wise inequality, computed as the bitwise complement of operator==.
+inline Vectorized<int8_t> Vectorized<int8_t>::operator!=(
+    const Vectorized<int8_t>& other) const {
+  const Vectorized<int8_t> equal_mask = (*this == other);
+  return ~equal_mask;
+}
+
+// Lane-wise minimum of two int32 vectors (vminq_s32).
+template <>
+Vectorized<int32_t> inline minimum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t lhs = a;
+  const int32x4_t rhs = b;
+  return vminq_s32(lhs, rhs);
+}
+
+// Lane-wise minimum of two int16 vectors (vminq_s16).
+template <>
+Vectorized<int16_t> inline minimum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const int16x8_t lhs = a;
+  const int16x8_t rhs = b;
+  return vminq_s16(lhs, rhs);
+}
+
+// Lane-wise minimum of two int8 vectors (vminq_s8).
+template <>
+Vectorized<int8_t> inline minimum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const int8x16_t lhs = a;
+  const int8x16_t rhs = b;
+  return vminq_s8(lhs, rhs);
+}
+
+// Lane-wise maximum of two int32 vectors (vmaxq_s32).
+template <>
+Vectorized<int32_t> inline maximum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t lhs = a;
+  const int32x4_t rhs = b;
+  return vmaxq_s32(lhs, rhs);
+}
+
+// Lane-wise maximum of two int16 vectors (vmaxq_s16).
+template <>
+Vectorized<int16_t> inline maximum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const int16x8_t lhs = a;
+  const int16x8_t rhs = b;
+  return vmaxq_s16(lhs, rhs);
+}
+
+// Lane-wise maximum of two int8 vectors (vmaxq_s8).
+template <>
+Vectorized<int8_t> inline maximum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const int8x16_t lhs = a;
+  const int8x16_t rhs = b;
+  return vmaxq_s8(lhs, rhs);
+}
+
+// Compile-time blend: bit i of `mask` chooses the source of lane i
+// (bit set -> lane from b, bit clear -> lane from a).
+template <int64_t mask>
+Vectorized<int64_t> Vectorized<int64_t>::blend(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  constexpr uint64_t kTakeB = 0xFFFFFFFFFFFFFFFF;
+  constexpr uint64_t kTakeA = 0;
+  const uint64x2_t lane_select = {
+      (mask & 1LL) ? kTakeB : kTakeA, (mask & 2LL) ? kTakeB : kTakeA};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s64(lane_select, b.values, a.values);
+}
+
+// Compile-time blend: bit i of `mask` chooses the source of lane i
+// (bit set -> lane from b, bit clear -> lane from a).
+template <int64_t mask>
+Vectorized<int32_t> Vectorized<int32_t>::blend(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  constexpr uint32_t kTakeB = 0xFFFFFFFF;
+  constexpr uint32_t kTakeA = 0;
+  const uint32x4_t lane_select = {
+      (mask & 1LL) ? kTakeB : kTakeA,
+      (mask & 2LL) ? kTakeB : kTakeA,
+      (mask & 4LL) ? kTakeB : kTakeA,
+      (mask & 8LL) ? kTakeB : kTakeA};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s32(lane_select, b.values, a.values);
+}
+
+// Compile-time blend: bit i of `mask` chooses the source of lane i
+// (bit set -> lane from b, bit clear -> lane from a).
+template <int64_t mask>
+Vectorized<int16_t> Vectorized<int16_t>::blend(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  constexpr uint16_t kTakeB = 0xFFFF;
+  constexpr uint16_t kTakeA = 0;
+  const uint16x8_t lane_select = {
+      (mask & 1LL) ? kTakeB : kTakeA,
+      (mask & 2LL) ? kTakeB : kTakeA,
+      (mask & 4LL) ? kTakeB : kTakeA,
+      (mask & 8LL) ? kTakeB : kTakeA,
+      (mask & 16LL) ? kTakeB : kTakeA,
+      (mask & 32LL) ? kTakeB : kTakeA,
+      (mask & 64LL) ? kTakeB : kTakeA,
+      (mask & 128LL) ? kTakeB : kTakeA};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s16(lane_select, b.values, a.values);
+}
+
+// Compile-time blend: bit i of `mask` chooses the source of lane i
+// (bit set -> lane from b, bit clear -> lane from a).
+template <int64_t mask>
+Vectorized<int8_t> Vectorized<int8_t>::blend(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  constexpr uint8_t kTakeB = 0xFF;
+  constexpr uint8_t kTakeA = 0;
+  const uint8x16_t lane_select = {
+      (mask & 1LL) ? kTakeB : kTakeA,
+      (mask & 2LL) ? kTakeB : kTakeA,
+      (mask & 4LL) ? kTakeB : kTakeA,
+      (mask & 8LL) ? kTakeB : kTakeA,
+      (mask & 16LL) ? kTakeB : kTakeA,
+      (mask & 32LL) ? kTakeB : kTakeA,
+      (mask & 64LL) ? kTakeB : kTakeA,
+      (mask & 128LL) ? kTakeB : kTakeA,
+      (mask & 256LL) ? kTakeB : kTakeA,
+      (mask & 512LL) ? kTakeB : kTakeA,
+      (mask & 1024LL) ? kTakeB : kTakeA,
+      (mask & 2048LL) ? kTakeB : kTakeA,
+      (mask & 4096LL) ? kTakeB : kTakeA,
+      (mask & 8192LL) ? kTakeB : kTakeA,
+      (mask & 16384LL) ? kTakeB : kTakeA,
+      (mask & 32768LL) ? kTakeB : kTakeA};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s8(lane_select, b.values, a.values);
+}
+
+// VEC_INT_NEON_OPS(vl, bit) defines, out of line, the broadcast constructor
+// and the loadu/store members declared by VEC_INT_NEON_TEMPLATE. For
+// loadu/store, `count` is the number of leading lanes moved; a count equal
+// to size() takes the direct full-register path.
+#define VEC_INT_NEON_OPS(vl, bit)                                             \
+  inline Vectorized<int##bit##_t>::Vectorized(int##bit##_t val)               \
+      : values(vdupq_n_s##bit(val)) {}                                        \
+  inline Vectorized<int##bit##_t> Vectorized<int##bit##_t>::loadu(            \
+      const void* ptr, int64_t count) {                                       \
+    const auto* src = reinterpret_cast<const int##bit##_t*>(ptr);             \
+    if (count == size()) {                                                    \
+      return vld1q_s##bit(src);                                               \
+    }                                                                         \
+    /* Partial load: stage through a zero-filled buffer so lanes at */        \
+    /* index >= count read back as 0. */                                      \
+    __at_align__ int##bit##_t staging[size()] = {};                           \
+    std::memcpy(staging, src, count * sizeof(int##bit##_t));                  \
+    return vld1q_s##bit(staging);                                             \
+  }                                                                           \
+  inline void Vectorized<int##bit##_t>::store(void* ptr, int64_t count)       \
+      const {                                                                 \
+    if (count == size()) {                                                    \
+      vst1q_s##bit(reinterpret_cast<int##bit##_t*>(ptr), values);             \
+      return;                                                                 \
+    }                                                                         \
+    /* Partial store: spill every lane to scratch memory, then copy */        \
+    /* only the first `count` lanes to ptr. */                                \
+    int##bit##_t spilled[size()];                                             \
+    vst1q_s##bit(spilled, values);                                            \
+    std::memcpy(ptr, spilled, count * sizeof(int##bit##_t));                  \
+  }
+
+// Emit the definitions for each signed lane width (lanes, bits).
+VEC_INT_NEON_OPS(2, 64)
+VEC_INT_NEON_OPS(4, 32)
+VEC_INT_NEON_OPS(8, 16)
+VEC_INT_NEON_OPS(16, 8)
+
+// Lane-wise product of two int64 vectors, written with the compiler's vector
+// `*` operator on the raw int64x2_t values.
+template <>
+Vectorized<int64_t> inline operator*(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t lhs = a;
+  const int64x2_t rhs = b;
+  const int64x2_t product = lhs * rhs;
+  return product;
+}
+
+// Lane-wise quotient of two int64 vectors, written with the compiler's
+// vector `/` operator on the raw int64x2_t values.
+template <>
+Vectorized<int64_t> inline operator/(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t dividend = a;
+  const int64x2_t divisor = b;
+  const int64x2_t quotient = dividend / divisor;
+  return quotient;
+}
+
+// Lane-wise quotient of two int32 vectors, written with the compiler's
+// vector `/` operator on the raw int32x4_t values.
+template <>
+Vectorized<int32_t> inline operator/(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t dividend = a;
+  const int32x4_t divisor = b;
+  const int32x4_t quotient = dividend / divisor;
+  return quotient;
+}
+
+// Horizontal maximum of the two int64 lanes, compared as scalars.
+inline int64_t Vectorized<int64_t>::reduce_max() const {
+  const int64_t first = values[0];
+  const int64_t second = values[1];
+  return (first < second) ? second : first;
+}
+
+// Lane-wise minimum of two int64 vectors, computed one lane at a time with
+// std::min.
+template <>
+Vectorized<int64_t> inline minimum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t lhs = a;
+  const int64x2_t rhs = b;
+  const int64_t lane0 = std::min(lhs[0], rhs[0]);
+  const int64_t lane1 = std::min(lhs[1], rhs[1]);
+  return {lane0, lane1};
+}
+
+// Lane-wise maximum of two int64 vectors, computed one lane at a time with
+// std::max.
+template <>
+Vectorized<int64_t> inline maximum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t lhs = a;
+  const int64x2_t rhs = b;
+  const int64_t lane0 = std::max(lhs[0], rhs[0]);
+  const int64_t lane1 = std::max(lhs[1], rhs[1]);
+  return {lane0, lane1};
+}
+
+// Builds {base, base + step}: lane i holds base + i * step.
+template <typename step_t>
+inline Vectorized<int64_t> Vectorized<int64_t>::arange(
+    int64_t base,
+    step_t step) {
+  const int64x2_t lane_index = {0, 1};
+  const Vectorized<int64_t> start(base);
+  const Vectorized<int64_t> stride(step);
+  const int64x2_t offsets = lane_index * stride.values;
+  return start.values + offsets;
+}
+
+// Builds a ramp where lane i holds base + i * step, via one multiply-add
+// (vmlaq_s32).
+template <typename step_t>
+inline Vectorized<int32_t> Vectorized<int32_t>::arange(
+    int32_t base,
+    step_t step) {
+  const int32x4_t lane_index = {0, 1, 2, 3};
+  const Vectorized<int32_t> start(base);
+  const Vectorized<int32_t> stride(step);
+  return vmlaq_s32(start, lane_index, stride);
+}
+
+// Builds a ramp where lane i holds base + i * step, via one multiply-add
+// (vmlaq_s16).
+template <typename step_t>
+inline Vectorized<int16_t> Vectorized<int16_t>::arange(
+    int16_t base,
+    step_t step) {
+  const int16x8_t lane_index = {0, 1, 2, 3, 4, 5, 6, 7};
+  const Vectorized<int16_t> start(base);
+  const Vectorized<int16_t> stride(step);
+  return vmlaq_s16(start, lane_index, stride);
+}
+
+// Builds a ramp where lane i holds base + i * step, via one multiply-add
+// (vmlaq_s8).
+template <typename step_t>
+inline Vectorized<int8_t> Vectorized<int8_t>::arange(int8_t base, step_t step) {
+  const int8x16_t lane_index = {
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  const Vectorized<int8_t> start(base);
+  const Vectorized<int8_t> stride(step);
+  return vmlaq_s8(start, lane_index, stride);
+}
+
+// Lane-wise right shift of int64 lanes. The shift amounts are compared as
+// unsigned values and capped at 63, so negative or over-wide amounts become
+// a shift by 63.
+template <>
+Vectorized<int64_t> inline operator>>(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t value = a;
+  const int64x2_t amount = b;
+  const uint64x2_t raw = vreinterpretq_u64_s64(amount);
+  constexpr uint64_t kMaxShift = 63;
+  const uint64x2_t capped = {
+      (raw[0] < kMaxShift) ? raw[0] : kMaxShift,
+      (raw[1] < kMaxShift) ? raw[1] : kMaxShift};
+  return value >> vreinterpretq_s64_u64(capped);
+}
+
+// Lane-wise right shift of int32 lanes. The shift amounts are compared as
+// unsigned values and capped at 31, so negative or over-wide amounts become
+// a shift by 31.
+template <>
+Vectorized<int32_t> inline operator>>(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t value = a;
+  const int32x4_t amount = b;
+  const uint32x4_t capped =
+      vminq_u32(vreinterpretq_u32_s32(amount), vdupq_n_u32(31));
+  return value >> vreinterpretq_s32_u32(capped);
+}
+
+// Lane-wise right shift of int16 lanes. The shift amounts are compared as
+// unsigned values and capped at 15, so negative or over-wide amounts become
+// a shift by 15.
+template <>
+Vectorized<int16_t> inline operator>>(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const int16x8_t value = a;
+  const int16x8_t amount = b;
+  const uint16x8_t capped =
+      vminq_u16(vreinterpretq_u16_s16(amount), vdupq_n_u16(15));
+  return value >> vreinterpretq_s16_u16(capped);
+}
+
+// Lane-wise right shift of int8 lanes. The shift amounts are compared as
+// unsigned values and capped at 7, so negative or over-wide amounts become
+// a shift by 7.
+template <>
+Vectorized<int8_t> inline operator>>(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const int8x16_t value = a;
+  const int8x16_t amount = b;
+  const uint8x16_t capped =
+      vminq_u8(vreinterpretq_u8_s8(amount), vdupq_n_u8(7));
+  return value >> vreinterpretq_s8_u8(capped);
+}
+
+// Lane-wise left shift of int64 lanes via vshlq_s64. The shift amounts are
+// compared as unsigned values and capped at 64 (the lane width) --
+// presumably so negative or over-wide amounts shift every bit out; confirm
+// against the vshlq semantics.
+template <>
+Vectorized<int64_t> inline operator<<(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const int64x2_t amount = b;
+  const uint64x2_t raw = vreinterpretq_u64_s64(amount);
+  constexpr uint64_t kLaneBits = 64;
+  const uint64x2_t capped = {
+      (raw[0] < kLaneBits) ? raw[0] : kLaneBits,
+      (raw[1] < kLaneBits) ? raw[1] : kLaneBits};
+  return vshlq_s64(a, vreinterpretq_s64_u64(capped));
+}
+
+// Lane-wise left shift of int32 lanes via vshlq_s32. The shift amounts are
+// compared as unsigned values and capped at 32 (the lane width) --
+// presumably so negative or over-wide amounts shift every bit out; confirm
+// against the vshlq semantics.
+template <>
+Vectorized<int32_t> inline operator<<(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const int32x4_t amount = b;
+  const uint32x4_t capped =
+      vminq_u32(vreinterpretq_u32_s32(amount), vdupq_n_u32(32));
+  return vshlq_s32(a, vreinterpretq_s32_u32(capped));
+}
+
+// Lane-wise left shift of int16 lanes via vshlq_s16. The shift amounts are
+// compared as unsigned values and capped at 16 (the lane width) --
+// presumably so negative or over-wide amounts shift every bit out; confirm
+// against the vshlq semantics.
+template <>
+Vectorized<int16_t> inline operator<<(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const int16x8_t amount = b;
+  const uint16x8_t capped =
+      vminq_u16(vreinterpretq_u16_s16(amount), vdupq_n_u16(16));
+  return vshlq_s16(a, vreinterpretq_s16_u16(capped));
+}
+
+// Lane-wise left shift of int8 lanes via vshlq_s8. The shift amounts are
+// compared as unsigned values and capped at 8 (the lane width) --
+// presumably so negative or over-wide amounts shift every bit out; confirm
+// against the vshlq semantics.
+template <>
+Vectorized<int8_t> inline operator<<(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const int8x16_t amount = b;
+  const uint8x16_t capped =
+      vminq_u8(vreinterpretq_u8_s8(amount), vdupq_n_u8(8));
+  return vshlq_s8(a, vreinterpretq_s8_u8(capped));
+}
+
+// Returns a vector whose first `count` lanes come from `b` and whose
+// remaining lanes come from `a`.
+//
+// count <= 0 yields `a` and count >= 2 yields `b`. Negative counts are
+// treated like 0, which matches the 32/16/8-bit set() specializations in this
+// header (their lane selectors are all zero when count < 0). Previously a
+// negative count fell through to the count == 1 case and took lane 0 from
+// `b`.
+inline Vectorized<int64_t> Vectorized<int64_t>::set(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b,
+    int64_t count) {
+  if (count <= 0) {
+    return a;
+  }
+  if (count >= 2) {
+    return b;
+  }
+  // count == 1: lane 0 from b, lane 1 from a.
+  const int64x2_t mixed = {b.values[0], a.values[1]};
+  return mixed;
+}
+
+// Returns a vector whose lanes [0, count) come from `b` and whose remaining
+// lanes come from `a`.
+inline Vectorized<int32_t> Vectorized<int32_t>::set(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b,
+    int64_t count) {
+  if (count == 0) {
+    return a;
+  }
+  if (count >= 4) {
+    return b;
+  }
+  // Per-lane selector: lane i is all-ones when i < count. The last lane is
+  // always zero because count < 4 on this path.
+  const uint32x4_t take_b = {
+      (count > 0LL) ? 0xFFFFFFFF : 0,
+      (count > 1LL) ? 0xFFFFFFFF : 0,
+      (count > 2LL) ? 0xFFFFFFFF : 0,
+      0};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s32(take_b, b.values, a.values);
+}
+
+// Returns a vector whose lanes [0, count) come from `b` and whose remaining
+// lanes come from `a`.
+inline Vectorized<int16_t> Vectorized<int16_t>::set(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b,
+    int64_t count) {
+  if (count == 0) {
+    return a;
+  }
+  if (count >= 8) {
+    return b;
+  }
+  // Per-lane selector: lane i is all-ones when i < count. The last lane is
+  // always zero because count < 8 on this path.
+  const uint16x8_t take_b = {
+      static_cast<uint16_t>((count > 0LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 1LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 2LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 3LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 4LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 5LL) ? 0xFFFF : 0),
+      static_cast<uint16_t>((count > 6LL) ? 0xFFFF : 0),
+      0};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s16(take_b, b.values, a.values);
+}
+
+// Returns a vector whose lanes [0, count) come from `b` and whose remaining
+// lanes come from `a`.
+inline Vectorized<int8_t> Vectorized<int8_t>::set(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b,
+    int64_t count) {
+  if (count == 0) {
+    return a;
+  }
+  if (count >= 16) {
+    return b;
+  }
+  // Per-lane selector: lane i is all-ones when i < count. The last lane is
+  // always zero because count < 16 on this path.
+  const uint8x16_t take_b = {
+      static_cast<uint8_t>((count > 0LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 1LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 2LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 3LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 4LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 5LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 6LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 7LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 8LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 9LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 10LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 11LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 12LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 13LL) ? 0xFF : 0),
+      static_cast<uint8_t>((count > 14LL) ? 0xFF : 0),
+      0};
+  // BSL takes bits from b where the selector is 1 and from a elsewhere.
+  return vbslq_s8(take_b, b.values, a.values);
+}
+
+// Lane-wise quotient of int16 vectors: widen each half to int32, divide with
+// the int32 operator/, then narrow the quotients back to int16.
+template <>
+Vectorized<int16_t> inline operator/(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const Vectorized<int32_t> a_lo = vmovl_s16(vget_low_s16(a));
+  const Vectorized<int32_t> b_lo = vmovl_s16(vget_low_s16(b));
+  const Vectorized<int32_t> a_hi = vmovl_high_s16(a);
+  const Vectorized<int32_t> b_hi = vmovl_high_s16(b);
+  const int32x4_t q_lo = a_lo / b_lo;
+  const int32x4_t q_hi = a_hi / b_hi;
+  // vuzp1q keeps the even-indexed 16-bit lanes of the two inputs, i.e. one
+  // 16-bit piece of every 32-bit quotient, low half first.
+  return vuzp1q_s16(vreinterpretq_s16_s32(q_lo), vreinterpretq_s16_s32(q_hi));
+}
+
+// Lane-wise quotient of int8 vectors: widen each half to int16, divide with
+// the int16 operator/, then narrow the quotients back to int8.
+template <>
+Vectorized<int8_t> inline operator/(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const Vectorized<int16_t> a_lo = vmovl_s8(vget_low_s8(a));
+  const Vectorized<int16_t> b_lo = vmovl_s8(vget_low_s8(b));
+  const Vectorized<int16_t> a_hi = vmovl_high_s8(a);
+  const Vectorized<int16_t> b_hi = vmovl_high_s8(b);
+  const int16x8_t q_lo = a_lo / b_lo;
+  const int16x8_t q_hi = a_hi / b_hi;
+  // vuzp1q keeps the even-indexed 8-bit lanes of the two inputs, i.e. one
+  // 8-bit piece of every 16-bit quotient, low half first.
+  return vuzp1q_s8(vreinterpretq_s8_s16(q_lo), vreinterpretq_s8_s16(q_hi));
+}
+
+// Clamps each lane of `a`: first raise it to at least `min`, then cap the
+// result at `max`.
+template <>
+Vectorized<int64_t> inline clamp(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min,
+    const Vectorized<int64_t>& max) {
+  const Vectorized<int64_t> raised = maximum(min, a);
+  return minimum(max, raised);
+}
+
+// Clamps each lane of `a`: first raise it to at least `min`, then cap the
+// result at `max`.
+template <>
+Vectorized<int32_t> inline clamp(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min,
+    const Vectorized<int32_t>& max) {
+  const Vectorized<int32_t> raised = maximum(min, a);
+  return minimum(max, raised);
+}
+
+// Clamps each lane of `a`: first raise it to at least `min`, then cap the
+// result at `max`.
+template <>
+Vectorized<int16_t> inline clamp(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min,
+    const Vectorized<int16_t>& max) {
+  const Vectorized<int16_t> raised = maximum(min, a);
+  return minimum(max, raised);
+}
+
+// Clamps each lane of `a`: first raise it to at least `min`, then cap the
+// result at `max`.
+template <>
+Vectorized<int8_t> inline clamp(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min,
+    const Vectorized<int8_t>& max) {
+  const Vectorized<int8_t> raised = maximum(min, a);
+  return minimum(max, raised);
+}
+
+// Caps each lane of `a` at the corresponding lane of `max`.
+template <>
+Vectorized<int64_t> inline clamp_max(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& max) {
+  const Vectorized<int64_t> capped = minimum(max, a);
+  return capped;
+}
+
+// Caps each lane of `a` at the corresponding lane of `max`.
+template <>
+Vectorized<int32_t> inline clamp_max(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& max) {
+  const Vectorized<int32_t> capped = minimum(max, a);
+  return capped;
+}
+
+// Caps each lane of `a` at the corresponding lane of `max`.
+template <>
+Vectorized<int16_t> inline clamp_max(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& max) {
+  const Vectorized<int16_t> capped = minimum(max, a);
+  return capped;
+}
+
+// Caps each lane of `a` at the corresponding lane of `max`.
+template <>
+Vectorized<int8_t> inline clamp_max(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& max) {
+  const Vectorized<int8_t> capped = minimum(max, a);
+  return capped;
+}
+
+// Raises each lane of `a` to at least the corresponding lane of `min`.
+template <>
+Vectorized<int64_t> inline clamp_min(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min) {
+  const Vectorized<int64_t> raised = maximum(min, a);
+  return raised;
+}
+
+// Raises each lane of `a` to at least the corresponding lane of `min`.
+template <>
+Vectorized<int32_t> inline clamp_min(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min) {
+  const Vectorized<int32_t> raised = maximum(min, a);
+  return raised;
+}
+
+// Raises each lane of `a` to at least the corresponding lane of `min`.
+template <>
+Vectorized<int16_t> inline clamp_min(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min) {
+  const Vectorized<int16_t> raised = maximum(min, a);
+  return raised;
+}
+
+// Raises each lane of `a` to at least the corresponding lane of `min`.
+template <>
+Vectorized<int8_t> inline clamp_min(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min) {
+  const Vectorized<int8_t> raised = maximum(min, a);
+  return raised;
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8c811704314cceb401a0ed793a219332977fded
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_uint_aarch64.h
@@ -0,0 +1,383 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+// Stamps out Vectorized<uint{bit}_t> over uint{bit}x{vl}_t plus its +, -, &, |, ^ and eq/ne/gt/ge/lt/le.
+#define VEC_UINT_NEON_TEMPLATE(vl, bit)                                       \
+  template <>                                                                 \
+  struct is_vec_specialized_for<uint##bit##_t> : std::bool_constant<true> {}; \
+                                                                              \
+  template <>                                                                 \
+  class Vectorized<uint##bit##_t> {                                           \
+    using neon_type = uint##bit##x##vl##_t;                                   \
+                                                                              \
+   private:                                                                   \
+    neon_type values;                                                         \
+                                                                              \
+   public:                                                                    \
+    using value_type = uint##bit##_t;                                         \
+    using size_type = int;                                                    \
+    static constexpr size_type size() {                                       \
+      return vl;                                                              \
+    }                                                                         \
+    Vectorized() {                                                            \
+      values = vdupq_n_u##bit(0);                                             \
+    }                                                                         \
+    Vectorized(neon_type v) : values(v) {}                                    \
+    Vectorized(uint##bit##_t val);                                            \
+    template <                                                                \
+        typename... Args,                                                     \
+        typename = std::enable_if_t<(sizeof...(Args) == size())>>             \
+    Vectorized(Args... vals) {                                                \
+      __at_align__ uint##bit##_t buffer[size()] = {vals...};                  \
+      values = vld1q_u##bit(buffer);                                          \
+    }                                                                         \
+    operator neon_type() const {                                              \
+      return values;                                                          \
+    }                                                                         \
+    static Vectorized<uint##bit##_t> loadu(                                   \
+        const void* ptr,                                                      \
+        uint64_t count = size());                                             \
+    void store(void* ptr, uint64_t count = size()) const;                     \
+    template <uint64_t mask>                                                  \
+    static Vectorized<uint##bit##_t> blend(                                   \
+        const Vectorized<uint##bit##_t>& a,                                   \
+        const Vectorized<uint##bit##_t>& b);                                  \
+    static Vectorized<uint##bit##_t> blendv(                                  \
+        const Vectorized<uint##bit##_t>& a,                                   \
+        const Vectorized<uint##bit##_t>& b,                                   \
+        const Vectorized<uint##bit##_t>& mask_) {                             \
+      return vbslq_u##bit(mask_.values, b, a);                                \
+    }                                                                         \
+    template <typename step_t>                                                \
+    static Vectorized<uint##bit##_t> arange(                                  \
+        value_type base = 0,                                                  \
+        step_t step = static_cast<step_t>(1));                                \
+    static Vectorized<uint##bit##_t> set(                                     \
+        const Vectorized<uint##bit##_t>& a,                                   \
+        const Vectorized<uint##bit##_t>& b,                                   \
+        uint64_t count = size());                                             \
+    const uint##bit##_t& operator[](uint idx) const = delete;                 \
+    uint##bit##_t& operator[](uint idx) = delete;                             \
+    Vectorized<uint##bit##_t> abs() const {                                   \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<uint##bit##_t> real() const {                                  \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<uint##bit##_t> imag() const {                                  \
+      return vdupq_n_u##bit(0);                                               \
+    }                                                                         \
+    Vectorized<uint##bit##_t> conj() const {                                  \
+      return values;                                                          \
+    }                                                                         \
+    Vectorized<uint##bit##_t> neg() const {                                   \
+      return vreinterpretq_u##bit##_s##bit(                                   \
+          vnegq_s##bit(vreinterpretq_s##bit##_u##bit(values)));               \
+    }                                                                         \
+    uint##bit##_t reduce_add() const {                                        \
+      return vaddvq_u##bit(values);                                           \
+    }                                                                         \
+    uint##bit##_t reduce_max() const;                                         \
+    Vectorized<uint##bit##_t> operator==(                                     \
+        const Vectorized<uint##bit##_t>& other) const {                       \
+      return Vectorized<value_type>(vceqq_u##bit(values, other.values));      \
+    }                                                                         \
+    Vectorized<uint##bit##_t> operator!=(                                     \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> operator<(                                      \
+        const Vectorized<uint##bit##_t>& other) const {                       \
+      return Vectorized<value_type>(vcltq_u##bit(values, other.values));      \
+    }                                                                         \
+    Vectorized<uint##bit##_t> operator<=(                                     \
+        const Vectorized<uint##bit##_t>& other) const {                       \
+      return Vectorized<value_type>(vcleq_u##bit(values, other.values));      \
+    }                                                                         \
+    Vectorized<uint##bit##_t> operator>(                                      \
+        const Vectorized<uint##bit##_t>& other) const {                       \
+      return Vectorized<value_type>(vcgtq_u##bit(values, other.values));      \
+    }                                                                         \
+    Vectorized<uint##bit##_t> operator>=(                                     \
+        const Vectorized<uint##bit##_t>& other) const {                       \
+      return Vectorized<value_type>(vcgeq_u##bit(values, other.values));      \
+    }                                                                         \
+    Vectorized<uint##bit##_t> eq(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> ne(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> gt(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> ge(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> lt(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+    Vectorized<uint##bit##_t> le(                                             \
+        const Vectorized<uint##bit##_t>& other) const;                        \
+  };                                                                          \
+  template <>                                                                 \
+  Vectorized<uint##bit##_t> inline operator+(                                 \
+      const Vectorized<uint##bit##_t>& a,                                     \
+      const Vectorized<uint##bit##_t>& b) {                                   \
+    return vaddq_u##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<uint##bit##_t> inline operator-(                                 \
+      const Vectorized<uint##bit##_t>& a,                                     \
+      const Vectorized<uint##bit##_t>& b) {                                   \
+    return vsubq_u##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<uint##bit##_t> inline operator&(                                 \
+      const Vectorized<uint##bit##_t>& a,                                     \
+      const Vectorized<uint##bit##_t>& b) {                                   \
+    return vandq_u##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<uint##bit##_t> inline operator|(                                 \
+      const Vectorized<uint##bit##_t>& a,                                     \
+      const Vectorized<uint##bit##_t>& b) {                                   \
+    return vorrq_u##bit(a, b);                                                \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<uint##bit##_t> inline operator^(                                 \
+      const Vectorized<uint##bit##_t>& a,                                     \
+      const Vectorized<uint##bit##_t>& b) {                                   \
+    return veorq_u##bit(a, b);                                                \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::eq(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this == other) & Vectorized<uint##bit##_t>(1);                   \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ne(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this != other) & Vectorized<uint##bit##_t>(1);                   \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::gt(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this > other) & Vectorized<uint##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ge(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this >= other) & Vectorized<uint##bit##_t>(1);                   \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::lt(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this < other) & Vectorized<uint##bit##_t>(1);                    \
+  }                                                                           \
+  Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::le(             \
+      const Vectorized<uint##bit##_t>& other) const {                         \
+    return (*this <= other) & Vectorized<uint##bit##_t>(1);                   \
+  }
+// Instantiate for uint8_t: 16 lanes held in a uint8x16_t.
+VEC_UINT_NEON_TEMPLATE(16, 8)
+// Horizontal maximum across all 16 lanes (vmaxvq_u8).
+inline uint8_t Vectorized<uint8_t>::reduce_max() const {
+  return vmaxvq_u8(values);
+}
+// Lane-wise multiply via vmulq_u8.
+template <>
+Vectorized<uint8_t> inline operator*(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  return vmulq_u8(a, b);
+}
+// Lane-wise bitwise NOT via vmvnq_u8.
+template <>
+inline Vectorized<uint8_t> operator~(const Vectorized<uint8_t>& a) {
+  return vmvnq_u8(a);
+}
+// Inequality mask: bitwise complement of the operator== result.
+inline Vectorized<uint8_t> Vectorized<uint8_t>::operator!=(
+    const Vectorized<uint8_t>& other) const {
+  return ~(*this == other);
+}
+// Lane-wise unsigned minimum via vminq_u8.
+template <>
+Vectorized<uint8_t> inline minimum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  return vminq_u8(a, b);
+}
+// Lane-wise unsigned maximum via vmaxq_u8.
+template <>
+Vectorized<uint8_t> inline maximum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  return vmaxq_u8(a, b);
+}
+// Compile-time blend: lane i comes from b when bit i of `mask` is set, else from a.
+template <uint64_t mask>
+Vectorized<uint8_t> Vectorized<uint8_t>::blend(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  // Expand the bit mask into a lane mask: lane i is 0xFF when bit i of 'mask'
+  // is set, 0 otherwise.
+  uint8x16_t maskArray = {
+      (mask & 1LL) ? 0xFF : 0,
+      (mask & 2LL) ? 0xFF : 0,
+      (mask & 4LL) ? 0xFF : 0,
+      (mask & 8LL) ? 0xFF : 0,
+      (mask & 16LL) ? 0xFF : 0,
+      (mask & 32LL) ? 0xFF : 0,
+      (mask & 64LL) ? 0xFF : 0,
+      (mask & 128LL) ? 0xFF : 0,
+      (mask & 256LL) ? 0xFF : 0,
+      (mask & 512LL) ? 0xFF : 0,
+      (mask & 1024LL) ? 0xFF : 0,
+      (mask & 2048LL) ? 0xFF : 0,
+      (mask & 4096LL) ? 0xFF : 0,
+      (mask & 8192LL) ? 0xFF : 0,
+      (mask & 16384LL) ? 0xFF : 0,
+      (mask & 32768LL) ? 0xFF : 0};
+  // Use BSL to select lanes from b where the lane mask is set, else from a
+  return vbslq_u8(maskArray, b.values, a.values);
+}
+// Broadcast ctor plus loadu/store; a partial `count` goes through a size()-element temporary.
+#define VEC_UINT_NEON_OPS(vl, bit)                                             \
+  inline Vectorized<uint##bit##_t>::Vectorized(uint##bit##_t val) {            \
+    values = vdupq_n_u##bit(val);                                              \
+  }                                                                            \
+  inline Vectorized<uint##bit##_t> Vectorized<uint##bit##_t>::loadu(           \
+      const void* ptr, uint64_t count) {                                       \
+    if (count == size()) {                                                     \
+      return vld1q_u##bit(reinterpret_cast<const uint##bit##_t*>(ptr));        \
+    } else {                                                                   \
+      __at_align__ uint##bit##_t tmp_values[size()];                           \
+      for (const auto i : c10::irange(size())) {                               \
+        tmp_values[i] = 0;                                                     \
+      }                                                                        \
+      std::memcpy(                                                             \
+          tmp_values,                                                          \
+          reinterpret_cast<const uint##bit##_t*>(ptr),                         \
+          count * sizeof(uint##bit##_t));                                      \
+      return vld1q_u##bit(reinterpret_cast<const uint##bit##_t*>(tmp_values)); \
+    }                                                                          \
+  }                                                                            \
+  inline void Vectorized<uint##bit##_t>::store(void* ptr, uint64_t count)      \
+      const {                                                                  \
+    if (count == size()) {                                                     \
+      vst1q_u##bit(reinterpret_cast<uint##bit##_t*>(ptr), values);             \
+    } else {                                                                   \
+      uint##bit##_t tmp_values[size()];                                        \
+      vst1q_u##bit(reinterpret_cast<uint##bit##_t*>(tmp_values), values);      \
+      std::memcpy(ptr, tmp_values, count * sizeof(uint##bit##_t));             \
+    }                                                                          \
+  }
+// Emit these definitions for Vectorized<uint8_t>.
+VEC_UINT_NEON_OPS(16, 8)
+// Returns lanes base + i * step for i = 0..15, computed with vmlaq_u8.
+template <typename step_t>
+inline Vectorized<uint8_t> Vectorized<uint8_t>::arange(
+    uint8_t base,
+    step_t step) {
+  const Vectorized<uint8_t> base_vec(base);
+  const Vectorized<uint8_t> step_vec(step);
+  const uint8x16_t step_sizes = {
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return vmlaq_u8(base_vec, step_sizes, step_vec);
+}
+// Lane-wise logical right shift; shift counts >= 8 yield 0.
+template <>
+Vectorized<uint8_t> inline operator>>(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  // USHL by a negated count is defined for a full-width (8) shift, unlike `>>`.
+  uint8x16_t bound = vdupq_n_u8(8);
+  uint8x16_t z = vminq_u8(b, bound);
+  return vshlq_u8(a, vnegq_s8(vreinterpretq_s8_u8(z)));
+}
+// Lane-wise left shift; counts are capped at 8 before being passed to vshlq_u8.
+template <>
+Vectorized<uint8_t> inline operator<<(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  uint8x16_t bound = vdupq_n_u8(8);
+  uint8x16_t z = vminq_u8(b, bound);
+  return vshlq_u8(a, vreinterpretq_s8_u8(z));
+}
+// Returns a vector whose first `count` lanes come from b and the rest from a.
+inline Vectorized<uint8_t> Vectorized<uint8_t>::set(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b,
+    uint64_t count) {
+  if (count == 0) {
+    return a;
+  } else if (count >= 16) {
+    return b;
+  } else {
+    // Build a lane mask: lane i is 0xFF when i < count, 0 otherwise (count is
+    // in 1..15 on this branch, so the last lane is always 0).
+    uint8x16_t maskArray = {
+        static_cast<uint8_t>((count >= 1LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 2LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 3LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 4LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 5LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 6LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 7LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 8LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 9LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 10LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 11LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 12LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 13LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 14LL) ? 0xFF : 0),
+        static_cast<uint8_t>((count >= 15LL) ? 0xFF : 0),
+        0};
+
+    // Use BSL to select lanes from b where the lane mask is set, else from a
+    return vbslq_u8(maskArray, b.values, a.values);
+  }
+}
+// Lane-wise integer division using the compiler's vector-extension `/` operator.
+template <>
+Vectorized<uint8_t> inline operator/(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  uint8x16_t x = a;
+  uint8x16_t y = b;
+  return x / y;
+}
+// Clamps each lane into [min, max] as minimum(max, maximum(min, a)).
+template <>
+Vectorized<uint8_t> inline clamp(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min,
+    const Vectorized<uint8_t>& max) {
+  return minimum(max, maximum(min, a));
+}
+// Upper clamp for uint8_t lanes; forwards to minimum(max, a).
+template <>
+Vectorized<uint8_t> inline clamp_max(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& max) {
+  return minimum(max, a);
+}
+// Lower clamp for uint8_t lanes; forwards to maximum(min, a).
+template <>
+Vectorized<uint8_t> inline clamp_min(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min) {
+  return maximum(min, a);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h
new file mode 100644
index 0000000000000000000000000000000000000000..6745dd7eb2a1f371b45d5e21fe2f52276cf864db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h
@@ -0,0 +1,435 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+#include <ATen/cpu/vec/vec_base.h>
+#if !(                                                 \
+    defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
+    defined(CPU_CAPABILITY_ZVECTOR))
+#if defined(CPU_CAPABILITY_SVE256)
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#else
+// clang-format off
+#include <ATen/cpu/vec/vec256/vec256_float.h>
+#include <ATen/cpu/vec/vec256/vec256_double.h>
+#include <ATen/cpu/vec/vec256/vec256_int.h>
+#include <ATen/cpu/vec/vec256/vec256_qint.h>
+#endif
+#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
+#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
+#endif
+#include <ATen/cpu/vec/vec256/vec256_half.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
+// clang-format on
+#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
+#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
+#else
+// clang-format off
+#include <ATen/cpu/vec/vec256/zarch/vec256_zarch.h>
+#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
+#include <ATen/cpu/vec/vec256/vec256_half.h>
+// clang-format on
+#endif
+
+#include <ATen/cpu/vec/vec256/vec256_convert.h>
+#include <ATen/cpu/vec/vec256/vec256_mask.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+namespace at::vec {
+
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
+inline namespace CPU_CAPABILITY {
+// Stream the raw val_ of quantized scalars; 8-bit payloads are cast to int/unsigned first.
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  stream << val.val_;
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  stream << static_cast<unsigned int>(val.val_);
+  return stream;
+}
+// Prints a vector as "vec[e0, e1, ...]" after storing its lanes to a stack buffer.
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  T buf[Vectorized<T>::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    if (i != 0) {
+      stream << ", ";
+    }
+    stream << buf[i];
+  }
+  stream << ']';
+  return stream;
+}
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Reinterprets the 256 bits of a double vector as float lanes (_mm256_castpd_ps).
+template <>
+inline Vectorized<float> cast<float, double>(const Vectorized<double>& src) {
+  return _mm256_castpd_ps(src);
+}
+// Reinterprets the 256 bits of a float vector as double lanes (_mm256_castps_pd).
+template <>
+inline Vectorized<double> cast<double, float>(const Vectorized<float>& src) {
+  return _mm256_castps_pd(src);
+}
+// Reinterprets the 256 bits of an int32 vector as float lanes (_mm256_castsi256_ps).
+template <>
+inline Vectorized<float> cast<float, int32_t>(const Vectorized<int32_t>& src) {
+  return _mm256_castsi256_ps(src);
+}
+// Reinterprets the 256 bits of an int64 vector as double lanes (_mm256_castsi256_pd).
+template <>
+inline Vectorized<double> cast<double, int64_t>(
+    const Vectorized<int64_t>& src) {
+  return _mm256_castsi256_pd(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#ifndef _MSC_VER
+// Not built with MSVC (it copes poorly with these overloads). gather<scale>: scale must be 1, 2, 4 or 8.
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        double>> inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
+  return _mm256_i64gather_pd(base_addr, vindex, scale);
+}
+// float/int32 counterpart of the overload above, backed by _mm256_i32gather_ps.
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        float>> inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
+  return _mm256_i32gather_ps(base_addr, vindex, scale);
+}
+#endif
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#ifndef _MSC_VER
+// Not built with MSVC (it copes poorly with these overloads). mask_gather<scale>: scale must be 1, 2, 4 or 8.
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>> inline mask_gather(
+        const Vectorized<double>& src,
+        const double* base_addr,
+        const Vectorized<int64_t>& vindex,
+        Vectorized<double>& mask) {
+  return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale);
+}
+// float/int32 counterpart of the overload above, backed by _mm256_mask_i32gather_ps.
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>> inline mask_gather(
+        const Vectorized<float>& src,
+        const float* base_addr,
+        const Vectorized<int32_t>& vindex,
+        Vectorized<float>& mask) {
+  return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
+}
+#endif
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// double -> int64 lanes: add a magic constant as doubles, then subtract its bit pattern as integers.
+// Only works for inputs in the range: [-2^51, 2^51]
+// From: https://stackoverflow.com/a/41148578
+template <>
+Vectorized<int64_t> inline convert_to_int_of_same_size<double>(
+    const Vectorized<double>& src) {
+  auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000));
+  return _mm256_sub_epi64(
+      _mm256_castpd_si256(x),
+      _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
+}
+// float -> int32 lanes via _mm256_cvttps_epi32 (truncating conversion).
+template <>
+Vectorized<int32_t> inline convert_to_int_of_same_size<float>(
+    const Vectorized<float>& src) {
+  return _mm256_cvttps_epi32(src);
+}
+// int64 -> double lanes, assembled from the low and high 32-bit halves with magic constants.
+// From: https://stackoverflow.com/a/41148578
+template <>
+Vectorized<double> inline convert_to_fp_of_same_size<double>(
+    const Vectorized<int64_t>& src) {
+  __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 */
+  __m256i magic_i_hi32 =
+      _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 */
+  __m256i magic_i_all =
+      _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 */
+  __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
+
+  __m256i v_lo = _mm256_blend_epi32(
+      magic_i_lo, src, 0b01010101); /* v_low = low32 + 2^52 */
+  __m256i v_hi = _mm256_srli_epi64(src, 32);
+  v_hi = _mm256_xor_si256(
+      v_hi, magic_i_hi32); /* v_hi = high32*2^32 + 2^84 + 2^63 */
+  /* int64 = low32 + high32*2^32 = v_hi + v_lo - 2^52 - 2^63 - 2^84 */
+  __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
+  __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
+  return result;
+}
+// int32 -> float lanes via _mm256_cvtepi32_ps.
+template <>
+Vectorized<float> inline convert_to_fp_of_same_size<float>(
+    const Vectorized<int32_t>& src) {
+  return _mm256_cvtepi32_ps(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Interleaves two 4-lane double vectors into {a0, b0, a1, b1} and {a2, b2, a3, b3}.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3}
+  //   b = {b0, b1, b2, b3}
+
+  // swap lanes:
+  //   a_swapped = {a0, a1, b0, b1}
+  //   b_swapped = {a2, a3, b2, b3}
+  auto a_swapped =
+      _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2.   4 bits apart
+  auto b_swapped =
+      _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3.   4 bits apart
+
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1}
+  //          {a2, b2, a3, b3}
+  return std::make_pair(
+      _mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3
+      _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3
+}
+// Interleaves two 8-lane float vectors into {a0, b0, ..., a3, b3} and {a4, b4, ..., a7, b7}.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}
+
+  // swap lanes:
+  //   a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3}
+  //   b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7}
+  // TODO: can we support caching this?
+  auto a_swapped =
+      _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2.   4 bits apart
+  auto b_swapped =
+      _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3.   4 bits apart
+
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5, a6, b6, a7, b7}
+  const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  return std::make_pair(
+      _mm256_permutevar8x32_ps(a_swapped, group_ctrl),
+      _mm256_permutevar8x32_ps(b_swapped, group_ctrl));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Inverse of interleave2<double>: splits interleaved pairs back into {a0..a3} and {b0..b3}.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+
+  // group cols crossing lanes:
+  //   a_grouped = {a0, a1, b0, b1}
+  //   b_grouped = {a2, a3, b2, b3}
+  auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000); // 0, 2, 1, 3
+  auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000); // 0, 2, 1, 3
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3}
+  //          {b0, b1, b2, b3}
+  return std::make_pair(
+      _mm256_permute2f128_pd(
+          a_grouped, b_grouped, 0b0100000), // 0, 2.   4 bits apart
+      _mm256_permute2f128_pd(
+          a_grouped, b_grouped, 0b0110001)); // 1, 3.   4 bits apart
+}
+// Inverse of interleave2<float>: splits interleaved pairs back into {a0..a7} and {b0..b7}.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+
+  // group cols crossing lanes:
+  //   a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3}
+  //   b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7}
+  // TODO: can we support caching this?
+  const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
+  auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl);
+  auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl);
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7}
+  return std::make_pair(
+      _mm256_permute2f128_ps(
+          a_grouped, b_grouped, 0b0100000), // 0, 2.   4 bits apart
+      _mm256_permute2f128_ps(
+          a_grouped, b_grouped, 0b0110001)); // 1, 3.   4 bits apart
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Reverses the 8 float lanes with a cross-lane permute.
+template <>
+inline Vectorized<float> flip(const Vectorized<float>& v) {
+  const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_ps(v, mask_float);
+}
+// Reverses the 4 double lanes with a single 64-bit permute.
+template <>
+inline Vectorized<double> flip(const Vectorized<double>& v) {
+  return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+// Reverses the 4 int64 lanes with a single 64-bit permute.
+template <>
+inline Vectorized<int64_t> flip(const Vectorized<int64_t>& v) {
+  return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+// Reverses the 8 int32 lanes with a cross-lane permute.
+template <>
+inline Vectorized<int32_t> flip(const Vectorized<int32_t>& v) {
+  const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_epi32(v, mask_int32);
+}
+// Reverses the 16 int16 lanes: byte shuffle inside each 128-bit half, then swap the halves.
+template <>
+inline Vectorized<int16_t> flip(const Vectorized<int16_t>& v) {
+  const __m256i mask = _mm256_set_epi8(
+      1,
+      0,
+      3,
+      2,
+      5,
+      4,
+      7,
+      6,
+      9,
+      8,
+      11,
+      10,
+      13,
+      12,
+      15,
+      14,
+      1,
+      0,
+      3,
+      2,
+      5,
+      4,
+      7,
+      6,
+      9,
+      8,
+      11,
+      10,
+      13,
+      12,
+      15,
+      14);
+  auto reversed = _mm256_shuffle_epi8(v, mask);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+// Reverses all 32 bytes: byte shuffle inside each 128-bit half, then swap the halves.
+inline __m256i flip8(const __m256i& v) {
+  const __m256i mask_int8 = _mm256_set_epi8(
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15);
+  auto reversed = _mm256_shuffle_epi8(v, mask_int8);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+// Lane reversal for int8_t vectors; forwards to flip8.
+template <>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t>& v) {
+  return flip8(v);
+}
+// Lane reversal for uint8_t vectors; forwards to flip8.
+template <>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t>& v) {
+  return flip8(v);
+}
+// Logical AND of two bool vectors, done as one 256-bit bitwise AND over their byte storage.
+inline Vectorized<bool> operator&&(
+    const Vectorized<bool>& self,
+    const Vectorized<bool>& other) {
+  const __m256i* self_ = reinterpret_cast<const __m256i*>(self.as_bytes());
+  const __m256i* other_ = reinterpret_cast<const __m256i*>(other.as_bytes());
+  __m256i out = _mm256_and_si256(*self_, *other_);
+  Vectorized<bool> ret;
+  std::memcpy(ret, &out, ret.size() * sizeof(bool));
+  return ret;
+}
+
+#endif // (defined(CPU_CAPABILITY_AVX2)
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a585884e36ebdb20ef32ef8dc0e9f82d02895ba
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_16bit_float.h
@@ -0,0 +1,837 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+// Used for shared functions and classes for vec256_bfloat16.h and
+// vec256_half.h. Any functions/classes that are common between those two files
+// should be defined here. Any non-shared functions/classes should be defined in
+// the respective files.
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+#if defined(CPU_CAPABILITY_AVX2)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+#ifndef SLEEF_CONST
+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
+#define SLEEF_CONST const
+#else
+#define SLEEF_CONST
+#endif
+#define SLEEF_CONST_OLD SLEEF_CONST
+#else
+#define SLEEF_CONST_OLD
+#endif
+
+// bfloat16 conversion
+static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { // widen the 8 bf16 values in a to 8 floats in o
+  o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)); // zero-extend each 16-bit value to 32 bits, then shift it into the upper half
+}
+
+static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) { // widen 16 bf16 values to two float vectors
+  __m128i lo = _mm256_extractf128_si256(a, 0); // lower 128 bits: elements 0-7
+  __m128i hi = _mm256_extractf128_si256(a, 1); // upper 128 bits: elements 8-15
+  cvtbf16_fp32(lo, o1); // o1 receives the floats for elements 0-7
+  cvtbf16_fp32(hi, o2); // o2 receives the floats for elements 8-15
+}
+
+static inline __m128i cvtfp32_bf16(const __m256& src) { // narrow 8 floats to 8 bf16 values, rounding to nearest-even
+  __m256i value = _mm256_castps_si256(src); // raw float bits
+  __m256i nan = _mm256_set1_epi32(0xffff); // bit pattern written for NaN inputs
+  __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(src, src, _CMP_ORD_Q)); // all-ones where src is not NaN
+  __m256i ones = _mm256_set1_epi32(0x1);
+  __m256i vec_bias = _mm256_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_value = _mm256_and_si256(_mm256_srli_epi32(value, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_value = _mm256_add_epi32(t_value, vec_bias);
+  // input += rounding_bias;
+  t_value = _mm256_add_epi32(t_value, value);
+  // input = input >> 16;
+  t_value = _mm256_srli_epi32(t_value, 16);
+  // Check NaN before converting back to bf16
+  t_value = _mm256_blendv_epi8(nan, t_value, mask); // keep the rounded value where ordered, otherwise 0xffff
+  t_value =
+      _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-3] t[0-3]
+  t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11     01     10     00
+  return _mm256_castsi256_si128(t_value); // after the permute the low 128 bits hold t[0-7]
+}
+
+static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { // narrow 16 floats (a, then b) to 16 bf16 values
+  __m256i lo = _mm256_castps_si256(a); // raw bits of the first 8 floats
+  __m256i hi = _mm256_castps_si256(b); // raw bits of the last 8 floats
+  __m256i nan = _mm256_set1_epi32(0xffff); // bit pattern written for NaN inputs
+  __m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q)); // all-ones where a is not NaN
+  __m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q)); // all-ones where b is not NaN
+  __m256i ones = _mm256_set1_epi32(0x1);
+  __m256i vec_bias = _mm256_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_lo = _mm256_and_si256(_mm256_srli_epi32(lo, 16), ones);
+  auto t_hi = _mm256_and_si256(_mm256_srli_epi32(hi, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_lo = _mm256_add_epi32(t_lo, vec_bias);
+  t_hi = _mm256_add_epi32(t_hi, vec_bias);
+  // input += rounding_bias;
+  t_lo = _mm256_add_epi32(t_lo, lo);
+  t_hi = _mm256_add_epi32(t_hi, hi);
+  // input = input >> 16;
+  t_lo = _mm256_srli_epi32(t_lo, 16);
+  t_hi = _mm256_srli_epi32(t_hi, 16);
+  // Check NaN before converting back to bf16
+  t_lo = _mm256_blendv_epi8(nan, t_lo, mask_lo);
+  t_hi = _mm256_blendv_epi8(nan, t_hi, mask_hi);
+
+  t_lo = _mm256_packus_epi32(
+      t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-3] t_lo[0-3]
+  return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11        01        10 00
+}
+
+static inline __m256i merge_compare_result(const __m256& a, const __m256& b) { // pack two 32-bit-lane results into 16 16-bit lanes
+  __m256i lo = _mm256_castps_si256(a);
+  __m256i hi = _mm256_castps_si256(b);
+  lo = _mm256_srli_epi32(lo, 16); // keep only the upper 16 bits of every 32-bit lane
+  hi = _mm256_srli_epi32(hi, 16);
+  auto out = _mm256_packus_epi32(lo, hi); // packs per 128-bit lane, so a and b come out interleaved by lane
+  return _mm256_permute4x64_epi64(out, 0xd8); // reorder 64-bit chunks so a's elements come first, then b's
+}
+
+// float16 conversion
+static inline void cvtfp16_fp32(const __m128i& a, __m256& o) { // widen the 8 half-precision values in a to 8 floats in o
+  o = _mm256_cvtph_ps(a); // hardware half -> float conversion
+}
+
+static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { // widen 16 half values to two float vectors
+  __m128i lo = _mm256_extractf128_si256(a, 0); // lower 128 bits: elements 0-7
+  __m128i hi = _mm256_extractf128_si256(a, 1); // upper 128 bits: elements 8-15
+  cvtfp16_fp32(lo, o1); // o1 receives the floats for elements 0-7
+  cvtfp16_fp32(hi, o2); // o2 receives the floats for elements 8-15
+}
+
+static inline __m128i cvtfp32_fp16(const __m256& src) { // narrow 8 floats to 8 half-precision values
+  return _mm256_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // round-to-nearest mode with exceptions suppressed
+}
+
+static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { // narrow 16 floats (a, then b) to 16 half values
+  __m128i lo =
+      _mm256_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // a -> elements 0-7
+  __m128i hi =
+      _mm256_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // b -> elements 8-15
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); // lo in the low 128 bits, hi inserted into the high 128 bits
+}
+
+// dtype conversion between float16/bfloat16 and float32
+template < // primary template: 8-element conversion of a reduced-precision type T to float
+    typename T,
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0> // only participates for reduced floating-point T
+inline void cvt_to_fp32(const __m128i& a, __m256& o); // declared only; behaviour comes from the specializations below
+template <>
+inline void cvt_to_fp32<BFloat16>(const __m128i& a, __m256& o) {
+  cvtbf16_fp32(a, o); // bfloat16 path
+}
+template <>
+inline void cvt_to_fp32<Half>(const __m128i& a, __m256& o) {
+  cvtfp16_fp32(a, o); // float16 path
+}
+
+template < // primary template: 16-element conversion of a reduced-precision type T to two float vectors
+    typename T,
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0> // only participates for reduced floating-point T
+inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2); // declared only; behaviour comes from the specializations below
+template <>
+inline void cvt_to_fp32<BFloat16>(const __m256i& a, __m256& o1, __m256& o2) {
+  cvtbf16_fp32(a, o1, o2); // bfloat16 path: o1 = elements 0-7, o2 = elements 8-15
+}
+template <>
+inline void cvt_to_fp32<Half>(const __m256i& a, __m256& o1, __m256& o2) {
+  cvtfp16_fp32(a, o1, o2); // float16 path: o1 = elements 0-7, o2 = elements 8-15
+}
+
+template < // primary template: narrow two float vectors back to 16 values of type T
+    typename T,
+    bool is_compare_op = false, // true when a and b hold comparison masks rather than numeric values
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline __m256i cvt_from_fp32(const __m256& a, const __m256& b); // declared only; behaviour comes from the specializations below
+template <>
+inline __m256i cvt_from_fp32<BFloat16, false>(
+    const __m256& a,
+    const __m256& b) {
+  return cvtfp32_bf16(a, b); // numeric bf16 result: rounded conversion
+}
+template <>
+inline __m256i cvt_from_fp32<BFloat16, true>(const __m256& a, const __m256& b) {
+  return merge_compare_result(a, b); // mask result: take the upper 16 bits of each lane without rounding
+}
+template <>
+inline __m256i cvt_from_fp32<Half, false>(const __m256& a, const __m256& b) {
+  return cvtfp32_fp16(a, b); // numeric fp16 result
+}
+template <>
+inline __m256i cvt_from_fp32<Half, true>(const __m256& a, const __m256& b) {
+  return cvtfp32_fp16(a, b); // for Half, compare masks use the same conversion as numeric values
+}
+
+template <typename T>
+class Vectorized16 {
+  static_assert(
+      is_reduced_floating_point_v<T>,
+      "Support only float16 and bfloat16.");
+
+ protected:
+  __m256i values;
+
+ public:
+  using value_type = uint16_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized16() {}
+  Vectorized16(__m256i v) : values(v) {}
+  Vectorized16(T val) {
+    value_type uw = val.x;
+    values = _mm256_set1_epi16(uw);
+  }
+  Vectorized16(
+      T val1,
+      T val2,
+      T val3,
+      T val4,
+      T val5,
+      T val6,
+      T val7,
+      T val8,
+      T val9,
+      T val10,
+      T val11,
+      T val12,
+      T val13,
+      T val14,
+      T val15,
+      T val16) {
+    values = _mm256_setr_epi16(
+        val1.x,
+        val2.x,
+        val3.x,
+        val4.x,
+        val5.x,
+        val6.x,
+        val7.x,
+        val8.x,
+        val9.x,
+        val10.x,
+        val11.x,
+        val12.x,
+        val13.x,
+        val14.x,
+        val15.x,
+        val16.x);
+  }
+  operator __m256i() const {
+    return values;
+  }
+  T& operator[](int idx) = delete;
+  const T& operator[](int idx) const = delete;
+  int zero_mask() const {
+    // Returns a 32-bit mask with one bit per byte: every 16-bit element whose
+    // bits are all zero sets two adjacent 1-bits; other elements set 0-bits.
+    __m256i cmp = _mm256_cmpeq_epi16(values, _mm256_set1_epi16(0)); // integer compare on raw bits, so 0x8000 (-0.0) is not counted as zero
+    return _mm256_movemask_epi8(cmp); // gathers the top bit of each of the 32 bytes
+  }
+  static Vectorized<T> loadu(const void* ptr, int16_t count = size()) { // load `count` 16-bit elements starting at ptr
+    if (count == size())
+      return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); // full-width unaligned load
+
+    __at_align__ int16_t tmp_values[size()]; // staging buffer for a partial load
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+    for (const auto i : c10::irange(count, size())) {
+      tmp_values[i] = 0; // zero-fill the elements in [count, size())
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); // NOTE(review): assumes 0 <= count <= size() -- confirm with callers
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp_values));
+  }
+  void store(void* ptr, int count = size()) const { // write the first `count` 16-bit elements to ptr
+    if (count == size()) {
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); // full-width unaligned store
+    } else if (count > 0) { // count <= 0 writes nothing
+      __at_align__ int16_t tmp_values[size()]; // spill the whole vector, then copy only the requested prefix
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); // NOTE(review): assumes count <= size() -- confirm with callers
+    }
+  }
+  template <int64_t mask> // compile-time selector: bit i set means element i is taken from b
+  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    __at_align__ int16_t tmp_values[size()]; // start from a copy of a, then overwrite the selected elements
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi16(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi16(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi16(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi16(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi16(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi16(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi16(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi16(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi16(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi16(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi16(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi16(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi16(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi16(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi16(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi16(b.values, 15);
+    return loadu(tmp_values); // mask bits above bit 15 are never tested
+  }
+  static Vectorized<T> blendv( // runtime blend of a and b under a vector mask
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      const Vectorized<T>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values); // selects per byte on the mask byte's top bit: set -> b, clear -> a
+  }
+  template <typename step_t>
+  static Vectorized<T> arange( // builds the 16 values base + i * step for i = 0..15
+      T base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>( // each argument is evaluated separately, then passed to the 16-element constructor
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  static Vectorized<T> set( // result takes its first `count` elements from b and the rest from a
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      int64_t count = size()) {
+    switch (count) { // each case uses the blend mask (1 << count) - 1
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b; // any count outside 0..15 (including negative values) yields b unchanged
+  }
+
+  // The 'const' qualifier on the return type has no effect, but Sleef declares
+  // its functions this way. For example, the signature of `Sleef_exp2f8_u10`
+  // is `const __m256 (__m256)`, so the resulting warning is silenced here.
+  C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers")
+  Vectorized<T> map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { // apply a float-vector function to all 16 elements
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi); // widen to two float vectors (elements 0-7 and 8-15)
+    const auto o1 = vop(lo);
+    const auto o2 = vop(hi);
+    return cvt_from_fp32<T>(o1, o2); // narrow the two results back to T
+  }
+  C10_DIAGNOSTIC_POP()
+  Vectorized<T> isnan() const { // per-element NaN test, returned as a 16-bit mask per element
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    lo = _mm256_cmp_ps(lo, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); // unordered against 0.0f is true exactly when the element is NaN
+    hi = _mm256_cmp_ps(hi, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
+    return merge_compare_result(lo, hi); // pack the upper 16 bits of each 32-bit mask lane
+  }
+  Vectorized<T> abs() const { // absolute value computed directly on the 16-bit representation
+    return _mm256_andnot_si256(_mm256_set1_epi16(0x8000), values); // (~0x8000) & values clears the sign bit of every element
+  }
+  Vectorized<T> angle() const { // per element: 0 if not negative, pi if negative, NaN if NaN
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto angle_lambda = [](__m256 values_2) { // computes the angle for one vector of 8 floats
+      const auto zero_vec = _mm256_set1_ps(0.f);
+      const auto nan_vec = _mm256_set1_ps(NAN);
+      const auto not_nan_mask = _mm256_cmp_ps(values_2, values_2, _CMP_EQ_OQ); // all-ones where the input is not NaN
+      const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); // inverts the mask: true only where not_nan_mask is zero
+      const auto pi = _mm256_set1_ps(c10::pi<float>);
+
+      const auto neg_mask = _mm256_cmp_ps(values_2, zero_vec, _CMP_LT_OQ); // strictly less than zero, so -0.0 is not negative here
+      auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask);
+      angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); // NaN inputs override the 0/pi choice
+      return angle;
+    };
+    auto o1 = angle_lambda(lo);
+    auto o2 = angle_lambda(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> real() const {
+    return *this;
+  }
+  Vectorized<T> imag() const {
+    return _mm256_set1_epi16(0);
+  }
+  Vectorized<T> conj() const {
+    return *this;
+  }
+  Vectorized<T> acos() const {
+    return map(Sleef_acosf8_u10);
+  }
+  Vectorized<T> acosh() const {
+    return map(Sleef_acoshf8_u10);
+  }
+  Vectorized<T> asin() const {
+    return map(Sleef_asinf8_u10);
+  }
+  Vectorized<T> atan() const {
+    return map(Sleef_atanf8_u10);
+  }
+  Vectorized<T> atanh() const {
+    return map(Sleef_atanhf8_u10);
+  }
+  Vectorized<T> atan2(const Vectorized<T>& b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_atan2f8_u10(lo, b1);
+    auto o2 = Sleef_atan2f8_u10(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> copysign(const Vectorized<T>& sign) const { // magnitude from *this, sign bit from `sign`
+    // copy sign bit (0x8000) from sign and remaining bits from values
+    __m256i mask_value = _mm256_set1_epi32(~0x80008000); // every bit except the sign bit of each 16-bit element
+    __m256i mask_signbit = _mm256_set1_epi32(0x80008000); // sign bit of both 16-bit elements in each 32-bit lane
+    return Vectorized<T>(_mm256_or_si256(
+        _mm256_and_si256(values, mask_value),
+        _mm256_and_si256(sign, mask_signbit))); // pure bit operation: no conversion to fp32
+  }
+  Vectorized<T> erf() const {
+    return map(Sleef_erff8_u10);
+  }
+  Vectorized<T> erfc() const {
+    return map(Sleef_erfcf8_u15);
+  }
+  Vectorized<T> erfinv() const { // applies calc_erfinv to each element, one scalar at a time
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2]; // scalar scratch buffers for the two float halves
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_erfinv(tmp1[i]); // calc_erfinv is defined outside this chunk
+      tmp2[i] = calc_erfinv(tmp2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2); // narrow the results back to T
+  }
+  Vectorized<T> exp() const {
+    return map(Sleef_expf8_u10);
+  }
+  Vectorized<T> exp2() const {
+    return map(Sleef_exp2f8_u10);
+  }
+  Vectorized<T> expm1() const {
+    return map(Sleef_expm1f8_u10);
+  }
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }
+  Vectorized<T> exp_u20() const {
+    return exp();
+  }
+  Vectorized<T> fmod(const Vectorized<T>& q) const {
+    __m256 x_lo, x_hi;
+    cvt_to_fp32<T>(values, x_lo, x_hi);
+    __m256 q_lo, q_hi;
+    cvt_to_fp32<T>(q.values, q_lo, q_hi);
+    auto o1 = Sleef_fmodf8(x_lo, q_lo);
+    auto o2 = Sleef_fmodf8(x_hi, q_hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> hypot(const Vectorized<T>& b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_hypotf8_u05(lo, b1);
+    auto o2 = Sleef_hypotf8_u05(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> i0() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_i0(tmp1[i]);
+      tmp2[i] = calc_i0(tmp2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> i0e() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_i0e(tmp1[i]);
+      tmp2[i] = calc_i0e(tmp2[i]);
+    }
+    const auto o1 = _mm256_loadu_ps(tmp1);
+    const auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> digamma() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_digamma(tmp1[i]);
+      tmp2[i] = calc_digamma(tmp2[i]);
+    }
+    const auto o1 = _mm256_loadu_ps(tmp1);
+    const auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> igamma(const Vectorized<T>& x) const {
+    __m256 lo, hi;
+    __m256 xlo, xhi;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmpx1), xlo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+
+  Vectorized<T> igammac(const Vectorized<T>& x) const {
+    __m256 lo, hi;
+    __m256 xlo, xhi;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmpx1), xlo);
+    _mm256_storeu_ps(reinterpret_cast<float*>(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> log() const {
+    return map(Sleef_logf8_u10);
+  }
+  Vectorized<T> log2() const {
+    return map(Sleef_log2f8_u10);
+  }
+  Vectorized<T> log10() const {
+    return map(Sleef_log10f8_u10);
+  }
+  Vectorized<T> log1p() const {
+    return map(Sleef_log1pf8_u10);
+  }
+  Vectorized<T> sin() const {
+    return map(Sleef_sinf8_u10);
+  }
+  Vectorized<T> sinh() const {
+    return map(Sleef_sinhf8_u10);
+  }
+  Vectorized<T> cos() const {
+    return map(Sleef_cosf8_u10);
+  }
+  Vectorized<T> cosh() const {
+    return map(Sleef_coshf8_u10);
+  }
+  Vectorized<T> ceil() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm256_ceil_ps(lo);
+    auto o2 = _mm256_ceil_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> floor() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm256_floor_ps(lo);
+    auto o2 = _mm256_floor_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> neg() const { // negation computed directly on the 16-bit representation
+    return _mm256_xor_si256(values, _mm256_set1_epi16(0x8000)); // XOR flips the sign bit of every element
+  }
+  Vectorized<T> round() const { // rounds each element to an integral value in fp32, then narrows back to T
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 =
+        _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // nearest-integer mode, exceptions suppressed
+    auto o2 =
+        _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> tan() const {
+    return map(Sleef_tanf8_u10);
+  }
+  Vectorized<T> tanh() const {
+    return map(Sleef_tanhf8_u10);
+  }
+  Vectorized<T> trunc() const { // drops the fractional part of each element (rounds toward zero)
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); // toward-zero mode, exceptions suppressed
+    auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> lgamma() const {
+    return map(Sleef_lgammaf8_u10);
+  }
+  Vectorized<T> sqrt() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm256_sqrt_ps(lo);
+    auto o2 = _mm256_sqrt_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> reciprocal() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto ones = _mm256_set1_ps(1);
+    auto o1 = _mm256_div_ps(ones, lo);
+    auto o2 = _mm256_div_ps(ones, hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> rsqrt() const {
+    __m256 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto ones = _mm256_set1_ps(1);
+    auto o1 = _mm256_div_ps(ones, _mm256_sqrt_ps(lo));
+    auto o2 = _mm256_div_ps(ones, _mm256_sqrt_ps(hi));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> pow(const Vectorized<T>& b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_powf8_u10(lo, b1);
+    auto o2 = Sleef_powf8_u10(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+
+ private:
+  template <typename Op, typename VectorizedType> // Op: callable taking two __m256 and returning a mask
+  Vectorized<T> inline binary_compare(const VectorizedType& b, Op op) const { // shared body of the comparison operators
+    __m256 a_lo, a_hi;
+    __m256 b_lo, b_hi;
+    cvt_to_fp32<T>(values, a_lo, a_hi); // compare in fp32: widen both operands first
+    cvt_to_fp32<T>(b.values, b_lo, b_hi);
+    auto o1 = op(a_lo, b_lo); // elements 0-7
+    auto o2 = op(a_hi, b_hi); // elements 8-15
+    return cvt_from_fp32<T, /*is_compare_op*/ true>(o1, o2); // use the compare-specific narrowing for T
+  }
+
+ public:
+  Vectorized<T> inline operator>(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_GT_OQ);
+    });
+  }
+  Vectorized<T> inline operator<(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_LT_OQ);
+    });
+  }
+  Vectorized<T> inline operator>=(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_GE_OQ);
+    });
+  }
+  Vectorized<T> inline operator<=(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_LE_OQ);
+    });
+  }
+  Vectorized<T> inline operator==(const Vectorized16<T>& other) const { // element-wise equality mask
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); // ordered predicate: false whenever either operand is NaN
+    });
+  }
+  Vectorized<T> inline operator!=(const Vectorized16<T>& other) const { // element-wise inequality mask
+    return binary_compare(other, [](__m256 x, __m256 y) {
+      return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ); // unordered predicate: true whenever either operand is NaN
+    });
+  }
+};
+
+template <typename T, typename Op> // Op: callable taking two __m256 and returning __m256
+static inline Vectorized<T> binary_op_as_fp32( // run a binary float operation on two 16-element reduced-precision vectors
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvt_to_fp32<T>(__m256i(a), a_lo, a_hi); // widen both operands to fp32 halves
+  cvt_to_fp32<T>(__m256i(b), b_lo, b_hi);
+  auto o1 = op(a_lo, b_lo); // elements 0-7
+  auto o2 = op(a_hi, b_hi); // elements 8-15
+  return cvt_from_fp32<T>(o1, o2); // numeric narrowing back to T (is_compare_op defaults to false)
+}
+
+#define CONVERT_VECTORIZED_INIT(type, name)                     \
+  inline std::tuple<Vectorized<float>, Vectorized<float>>       \
+      convert_##name##_float(const Vectorized<type>& a) {       \
+    __m256 o1, o2;                                              \
+    cvt_to_fp32<type>(__m256i(a), o1, o2);                      \
+    return std::make_tuple(o1, o2);                             \
+  }                                                             \
+  inline Vectorized<type> convert_float_##name(                 \
+      const Vectorized<float>& a, const Vectorized<float>& b) { \
+    return cvt_from_fp32<type>(__m256(a), __m256(b));           \
+  }
+
+#define LOAD_FP32_VECTORIZED_INIT(type, name)                               \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out) {                           \
+    auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));  \
+    __m256 out_values;                                                      \
+    cvt_to_fp32<type>(values, out_values);                                  \
+    out = out_values;                                                       \
+  }                                                                         \
+                                                                            \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+    auto vec = Vectorized<type>::loadu(data);                               \
+    __m256 out1_values, out2_values;                                        \
+    cvt_to_fp32<type>(vec, out1_values, out2_values);                       \
+    out1 = out1_values;                                                     \
+    out2 = out2_values;                                                     \
+  }
+
+#else // CPU_CAPABILITY_AVX2
+
+#define CONVERT_NON_VECTORIZED_INIT(type, name)                     \
+  inline std::tuple<Vectorized<float>, Vectorized<float>>           \
+      convert_##name##_float(const Vectorized<type>& a) {           \
+    constexpr int64_t K = Vectorized<type>::size();                 \
+    __at_align__ float arr[K];                                      \
+    __at_align__ type arr2[K];                                      \
+    a.store(arr2);                                                  \
+    convert(arr2, arr, K);                                          \
+    return std::make_tuple(                                         \
+        Vectorized<float>::loadu(arr),                              \
+        Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
+  }                                                                 \
+  inline Vectorized<type> convert_float_##name(                     \
+      const Vectorized<float>& a, const Vectorized<float>& b) {     \
+    constexpr int64_t K = Vectorized<type>::size();                 \
+    __at_align__ float arr[K];                                      \
+    __at_align__ type arr2[K];                                      \
+    a.store(arr);                                                   \
+    b.store(arr + Vectorized<float>::size());                       \
+    convert(arr, arr2, K);                                          \
+    return Vectorized<type>::loadu(arr2);                           \
+  }
+
+#define LOAD_FP32_NON_VECTORIZED_INIT(type, name)                           \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out) {                           \
+    __at_align__ float values[Vectorized<float>::size()];                   \
+    for (const auto k : c10::irange(Vectorized<float>::size())) {           \
+      values[k] = data[k];                                                  \
+    }                                                                       \
+    out = Vectorized<float>::loadu(values);                                 \
+  }                                                                         \
+                                                                            \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+    load_fp32_from_##name(data, out1);                                      \
+    data += Vectorized<float>::size();                                      \
+    load_fp32_from_##name(data, out2);                                      \
+  }
+
+#endif // CPU_CAPABILITY_AVX2
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fec6b9b7b59a2ba50b720c71b4146992b665084
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -0,0 +1,285 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/vec256/vec256_16bit_float.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+template <>
+struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
+
+template <> // AVX2 specialization (inside #if defined(CPU_CAPABILITY_AVX2))
+class Vectorized<BFloat16> : public Vectorized16<BFloat16> {
+ public:
+  using Vectorized16::Vectorized16; // inherit the base-class constructors
+  // Scalar element type of this vector.
+  using value_type = BFloat16;
+  // Declared only; defined below the free operators because it uses operator-.
+  Vectorized<BFloat16> frac() const;
+  // Comparisons, defined later in this header as (comparison result) & 1.0f.
+  Vectorized<BFloat16> eq(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ne(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> gt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ge(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> lt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+};
+
+// Lane-wise a + b; the addition itself runs on __m256 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator+(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  const auto add_ps = [](const __m256& p, const __m256& q) { return _mm256_add_ps(p, q); };
+  return binary_op_as_fp32(a, b, add_ps);
+}
+// Lane-wise a - b; the subtraction runs on __m256 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator-(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  const auto sub_ps = [](const __m256& p, const __m256& q) { return _mm256_sub_ps(p, q); };
+  return binary_op_as_fp32(a, b, sub_ps);
+}
+// Lane-wise a * b; the multiply runs on __m256 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator*(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  const auto mul_ps = [](const __m256& p, const __m256& q) { return _mm256_mul_ps(p, q); };
+  return binary_op_as_fp32(a, b, mul_ps);
+}
+// Lane-wise a / b; the division runs on __m256 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator/(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  const auto div_ps = [](const __m256& p, const __m256& q) { return _mm256_div_ps(p, q); };
+  return binary_op_as_fp32(a, b, div_ps);
+}
+Vectorized<BFloat16> inline operator&(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm256_and_si256(a, b); // bitwise AND of the raw 256-bit registers
+}
+Vectorized<BFloat16> inline operator|(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm256_or_si256(a, b); // bitwise OR of the raw 256-bit registers
+}
+Vectorized<BFloat16> inline operator^(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm256_xor_si256(a, b); // bitwise XOR of the raw 256-bit registers
+}
+
+// eq: the operator== result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::eq(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this == other);
+}
+// ne: the operator!= result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ne(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this != other);
+}
+// gt: the operator> result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::gt(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this > other);
+}
+// ge: the operator>= result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ge(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this >= other);
+}
+// lt: the operator< result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::lt(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this < other);
+}
+// le: the operator<= result ANDed with a vector holding 1.0f in every lane.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::le(const Vectorized<BFloat16>& other) const {
+  return Vectorized<BFloat16>(1.0f) & (*this <= other);
+}
+
+// frac: x - trunc(x). Defined out of class so the operator- above is usable.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::frac() const {
+  return *this - this->trunc();
+}
+
+// NaN-propagating maximum (the IEEE 754-2019 `maximum` operation): a lane is
+// NaN whenever either input lane is NaN.
+template <>
+Vectorized<BFloat16> inline maximum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  // Split each operand into two fp32 halves via cvtbf16_fp32.
+  __m256 lhs_lo, lhs_hi, rhs_lo, rhs_hi;
+  cvtbf16_fp32(__m256i(a), lhs_lo, lhs_hi);
+  cvtbf16_fp32(__m256i(b), rhs_lo, rhs_hi);
+  // Low half: OR-ing in the all-ones "unordered" mask turns those lanes NaN.
+  const __m256 unord_lo = _mm256_cmp_ps(lhs_lo, rhs_lo, _CMP_UNORD_Q);
+  const __m256 res_lo = _mm256_or_ps(_mm256_max_ps(lhs_lo, rhs_lo), unord_lo);
+  // High half, same recipe.
+  const __m256 unord_hi = _mm256_cmp_ps(lhs_hi, rhs_hi, _CMP_UNORD_Q);
+  const __m256 res_hi = _mm256_or_ps(_mm256_max_ps(lhs_hi, rhs_hi), unord_hi);
+  // Pack the two fp32 halves back into one bf16 vector.
+  return cvtfp32_bf16(res_lo, res_hi);
+}
+
+// NaN-propagating minimum (the IEEE 754-2019 `minimum` operation): a lane is
+// NaN whenever either input lane is NaN.
+template <>
+Vectorized<BFloat16> inline minimum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  // Split each operand into two fp32 halves via cvtbf16_fp32.
+  __m256 lhs_lo, lhs_hi, rhs_lo, rhs_hi;
+  cvtbf16_fp32(__m256i(a), lhs_lo, lhs_hi);
+  cvtbf16_fp32(__m256i(b), rhs_lo, rhs_hi);
+  // Low half: OR-ing in the all-ones "unordered" mask turns those lanes NaN.
+  const __m256 unord_lo = _mm256_cmp_ps(lhs_lo, rhs_lo, _CMP_UNORD_Q);
+  const __m256 res_lo = _mm256_or_ps(_mm256_min_ps(lhs_lo, rhs_lo), unord_lo);
+  // High half, same recipe.
+  const __m256 unord_hi = _mm256_cmp_ps(lhs_hi, rhs_hi, _CMP_UNORD_Q);
+  const __m256 res_hi = _mm256_or_ps(_mm256_min_ps(lhs_hi, rhs_hi), unord_hi);
+  // Pack the two fp32 halves back into one bf16 vector.
+  return cvtfp32_bf16(res_lo, res_hi);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min,
+    const Vectorized<BFloat16>& max) {
+  // Computes min(max, max(min, a)) on the fp32 halves of each operand.
+  __m256 val_lo, val_hi, floor_lo, floor_hi, ceil_lo, ceil_hi;
+  cvtbf16_fp32(__m256i(a), val_lo, val_hi);
+  cvtbf16_fp32(__m256i(min), floor_lo, floor_hi);
+  cvtbf16_fp32(__m256i(max), ceil_lo, ceil_hi);
+  // Operand order is kept as-is: it decides which input a NaN lane returns.
+  const __m256 raised_lo = _mm256_max_ps(floor_lo, val_lo);
+  const __m256 raised_hi = _mm256_max_ps(floor_hi, val_hi);
+  return cvtfp32_bf16(_mm256_min_ps(ceil_lo, raised_lo), _mm256_min_ps(ceil_hi, raised_hi));
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_max(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& max) {
+  // Upper clamp only: min(max, a) on each fp32 half, then repack to bf16.
+  __m256 val_lo, val_hi, ceil_lo, ceil_hi;
+  cvtbf16_fp32(__m256i(a), val_lo, val_hi);
+  cvtbf16_fp32(__m256i(max), ceil_lo, ceil_hi);
+  const __m256 capped_lo = _mm256_min_ps(ceil_lo, val_lo);
+  const __m256 capped_hi = _mm256_min_ps(ceil_hi, val_hi);
+  return cvtfp32_bf16(capped_lo, capped_hi);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_min(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min) {
+  // Lower clamp only: max(min, a) on each fp32 half, then repack to bf16.
+  __m256 val_lo, val_hi, floor_lo, floor_hi;
+  cvtbf16_fp32(__m256i(a), val_lo, val_hi);
+  cvtbf16_fp32(__m256i(min), floor_lo, floor_hi);
+  const __m256 raised_lo = _mm256_max_ps(floor_lo, val_lo);
+  const __m256 raised_hi = _mm256_max_ps(floor_hi, val_hi);
+  return cvtfp32_bf16(raised_lo, raised_hi);
+}
+
+template <>
+inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
+  // Same-type "conversion" is a plain copy: whole vectors first, then a tail.
+  const auto lanes = Vectorized<BFloat16>::size();
+  int64_t i;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (i = 0; i <= (n - lanes); i += lanes) {
+    const __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), chunk);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+inline void convert(const float* src, BFloat16* dst, int64_t n) {
+  // Vector body: two 8-float loads are packed into one bf16 vector per step.
+  const auto step = Vectorized<BFloat16>::size();
+  int64_t pos = 0;
+  for (; pos + step <= n; pos += step) {
+    const __m256 first = _mm256_loadu_ps(src + pos);
+    const __m256 second = _mm256_loadu_ps(src + pos + 8);
+    const __m256i packed = cvtfp32_bf16(first, second);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + pos), packed);
+  }
+  for (; pos < n; ++pos) { // scalar tail for the leftover elements
+    dst[pos] = c10::convert<BFloat16>(src[pos]);
+  }
+}
+
+template <>
+inline void convert(const double* src, BFloat16* dst, int64_t n) {
+  // Narrows 8 consecutive doubles into one fp32 vector (two 4-wide halves).
+  const auto load8_as_float = [](const double* p) -> __m256 {
+    const __m128 lower = _mm256_cvtpd_ps(_mm256_loadu_pd(p));
+    const __m128 upper = _mm256_cvtpd_ps(_mm256_loadu_pd(p + 4));
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(lower), upper, 1);
+  };
+
+  const auto step = Vectorized<BFloat16>::size();
+  int64_t pos = 0;
+  for (; pos + step <= n; pos += step) {
+    // Two groups of 8 doubles -> two fp32 vectors -> one bf16 vector.
+    const __m256 first = load8_as_float(src + pos);
+    const __m256 second = load8_as_float(src + pos + 8);
+    const __m256i packed = cvtfp32_bf16(first, second);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + pos), packed);
+  }
+  for (; pos < n; ++pos) { // scalar tail for the leftover elements
+    dst[pos] = c10::convert<BFloat16>(src[pos]);
+  }
+}
+
+template <>
+Vectorized<BFloat16> inline fmadd(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b,
+    const Vectorized<BFloat16>& c) {
+  // a * b + c via _mm256_fmadd_ps on each fp32 half, then one pack to bf16.
+  __m256 mul1_lo, mul1_hi, mul2_lo, mul2_hi;
+  __m256 add_lo, add_hi;
+  cvtbf16_fp32(__m256i(a), mul1_lo, mul1_hi);
+  cvtbf16_fp32(__m256i(b), mul2_lo, mul2_hi);
+  cvtbf16_fp32(__m256i(c), add_lo, add_hi);
+  const __m256 fused_lo = _mm256_fmadd_ps(mul1_lo, mul2_lo, add_lo);
+  const __m256 fused_hi = _mm256_fmadd_ps(mul1_hi, mul2_hi, add_hi);
+  return cvtfp32_bf16(fused_lo, fused_hi);
+}
+
+CONVERT_VECTORIZED_INIT(BFloat16, bfloat16)
+LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
+
+#else // defined(CPU_CAPABILITY_AVX2)
+
+#if !(                                                                      \
+    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE256))
+CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
+#endif
+
+LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
+#endif // defined(CPU_CAPABILITY_AVX2)
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8b68fdfc60003e8bf42dcaec98fdc02219bda15
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -0,0 +1,543 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+
+#if defined(CPU_CAPABILITY_AVX2)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+template <>
+struct is_vec_specialized_for<c10::complex<double>> : std::bool_constant<true> {
+};
+
+template <>
+class Vectorized<c10::complex<double>> {
+ private:
+  __m256d values;
+
+ public:
+  using value_type = c10::complex<double>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 2;
+  }
+  Vectorized() {
+    values = _mm256_setzero_pd();
+  }
+  Vectorized(__m256d v) : values(v) {}
+  Vectorized(c10::complex<double> val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value);
+  }
+  Vectorized(c10::complex<double> val1, c10::complex<double> val2) {
+    values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag());
+  }
+  operator __m256d() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<double>> blend(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    static_assert(mask > -1 && mask < 4, "Unexpected mask value");
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm256_blend_pd(a.values, b.values, 0x03);
+      case 2:
+        return _mm256_blend_pd(a.values, b.values, 0x0c);
+      case 3:
+        break;
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> blendv(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b,
+      const Vectorized<c10::complex<double>>& mask) {
+    // One mask per complex -> one per double (xy -> xxyy): unpacklo repeats
+    // the low (real-slot) element of every 128-bit pair.
+    return _mm256_blendv_pd(a.values, b.values, _mm256_unpacklo_pd(mask.values, mask.values));
+  }
+  template <typename step_t>
+  static Vectorized<c10::complex<double>> arange(
+      c10::complex<double> base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<double>>(base, base + step);
+  }
+  static Vectorized<c10::complex<double>> set(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    // Full-width request: a single unaligned 256-bit load.
+    if (count == size()) {
+      return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
+    }
+    // Partial load: stage `count` complex values in a zeroed, aligned buffer.
+    // The buffer is cleared with a loop rather than "={0}" so that lanes past
+    // `count` never carry stale memory into the result; see
+    // https://github.com/pytorch/pytorch/issues/32502 (gcc emits one
+    // instruction for the loop versus two for the initializer).
+    __at_align__ double staged[2 * size()];
+    for (const auto lane : c10::irange(2 * size())) {
+      staged[lane] = 0.0;
+    }
+    const auto* raw = reinterpret_cast<const double*>(ptr);
+    std::memcpy(staged, raw, count * sizeof(c10::complex<double>));
+    return _mm256_load_pd(staged);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) { // whole vector: single unaligned store
+      _mm256_storeu_pd(static_cast<double*>(ptr), values);
+    } else if (count > 0) { // partial: spill, then copy `count` complex values
+      double spill[2 * size()];
+      _mm256_storeu_pd(spill, values);
+      std::memcpy(ptr, spill, count * sizeof(c10::complex<double>));
+    }
+  }
+  const c10::complex<double>& operator[](int idx) const = delete;
+  c10::complex<double>& operator[](int idx) = delete;
+  Vectorized<c10::complex<double>> map(
+      c10::complex<double> (*const f)(const c10::complex<double>&)) const {
+    __at_align__ c10::complex<double> tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  __m256d abs_2_() const {
+    auto val_2 = _mm256_mul_pd(values, values); // a*a     b*b
+    return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b
+  }
+  __m256d abs_() const {
+    auto real = _mm256_movedup_pd(values); // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm256_permute_pd(values, 0xf); // imag imag
+    return Sleef_hypotd4_u05(real, imag); // abs  abs
+  }
+  Vectorized<c10::complex<double>> abs() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    return _mm256_and_pd(abs_(), real_mask); // abs     0
+  }
+  __m256d angle_() const {
+    // angle = atan2(b, a) for each a + bi; it lands in the second slot of a pair
+    auto b_a = _mm256_permute_pd(values, 0x05); // b        a
+    return Sleef_atan2d4_u10(values, b_a); // 90-angle angle
+  }
+  Vectorized<c10::complex<double>> angle() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    auto angle = _mm256_permute_pd(angle_(), 0x05); // angle    90-angle
+    return _mm256_and_pd(angle, real_mask); // angle    0
+  }
+  Vectorized<c10::complex<double>> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm256_setzero_pd();
+    auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
+    auto div = _mm256_div_pd(values, abs);
+    return _mm256_blendv_pd(div, zero, mask);
+  }
+  __m256d real_() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    return _mm256_and_pd(values, real_mask);
+  }
+  Vectorized<c10::complex<double>> real() const {
+    return real_();
+  }
+  __m256d imag_() const {
+    const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF));
+    return _mm256_and_pd(values, imag_mask);
+  }
+  Vectorized<c10::complex<double>> imag() const {
+    return _mm256_permute_pd(imag_(), 0x05); // b        a
+  }
+  __m256d conj_() const {
+    const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+    return _mm256_xor_pd(values, sign_mask); // a       -b
+  }
+  Vectorized<c10::complex<double>> conj() const {
+    return conj_();
+  }
+  Vectorized<c10::complex<double>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance. Here it is applied per element through map(std::log).
+    return map(std::log);
+  }
+  Vectorized<c10::complex<double>> log2() const {
+    const __m256d log2_ = _mm256_set1_pd(std::log(2));
+    return _mm256_div_pd(log(), log2_);
+  }
+  Vectorized<c10::complex<double>> log10() const {
+    const __m256d log10_ = _mm256_set1_pd(std::log(10));
+    return _mm256_div_pd(log(), log10_);
+  }
+  Vectorized<c10::complex<double>> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized<c10::complex<double>> asin() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // // asin(x)
+    // // = -i*ln(iz + sqrt(1 -z^2))
+    // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    // const __m256d one = _mm256_set1_pd(1);
+
+    // auto conj = conj_();
+    // auto b_a = _mm256_permute_pd(conj, 0x05);                         //-b a
+    // auto ab = _mm256_mul_pd(conj, b_a);                               //-ab
+    // -ab auto im = _mm256_add_pd(ab, ab); //-2ab      -2ab
+
+    // auto val_2 = _mm256_mul_pd(values, values);                       // a*a
+    // b*b auto re = _mm256_hsub_pd(val_2, _mm256_permute_pd(val_2, 0x05));  //
+    // a*a-b*b  b*b-a*a re = _mm256_sub_pd(one, re);
+
+    // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re +
+    // i*im) auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz +
+    // sqrt()) return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj();
+    // //-i*ln()
+    return map(std::asin);
+  }
+  Vectorized<c10::complex<double>> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    constexpr auto pi_2d = c10::pi<double> / 2;
+    const __m256d pi_2 = _mm256_setr_pd(pi_2d, 0.0, pi_2d, 0.0);
+    return _mm256_sub_pd(pi_2, asin());
+  }
+  Vectorized<c10::complex<double>> atan() const;
+  Vectorized<c10::complex<double>> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized<c10::complex<double>> exp() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // //exp(a + bi)
+    // // = exp(a)*(cos(b) + sin(b)i)
+    // auto exp = Sleef_expd4_u10(values); //exp(a)           exp(b) exp =
+    // _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A);   //exp(a)
+    // exp(a)
+
+    // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b),
+    // cos(b)] auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y,
+    // 0x05),
+    //                                sin_cos.x, 0x0A); //cos(b) sin(b)
+    // return _mm256_mul_pd(exp, cos_sin);
+    return map(std::exp);
+  }
+  Vectorized<c10::complex<double>> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m256d ln_2 = _mm256_set1_pd(c10::ln_2<double>);
+    Vectorized<c10::complex<double>> scaled_values =
+        _mm256_mul_pd(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized<c10::complex<double>> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized<c10::complex<double>> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<c10::complex<double>> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<c10::complex<double>> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<c10::complex<double>> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized<c10::complex<double>> ceil() const {
+    return _mm256_ceil_pd(values);
+  }
+  Vectorized<c10::complex<double>> floor() const {
+    return _mm256_floor_pd(values);
+  }
+  Vectorized<c10::complex<double>> neg() const {
+    auto zero = _mm256_setzero_pd();
+    return _mm256_sub_pd(zero, values);
+  }
+  Vectorized<c10::complex<double>> round() const {
+    return _mm256_round_pd(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<c10::complex<double>> tan() const {
+    return map(std::tan);
+  }
+  Vectorized<c10::complex<double>> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<c10::complex<double>> trunc() const {
+    return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<c10::complex<double>> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized<c10::complex<double>> reciprocal() const;
+  Vectorized<c10::complex<double>> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized<c10::complex<double>> pow(
+      const Vectorized<c10::complex<double>>& exp) const {
+    __at_align__ c10::complex<double> x_tmp[size()];
+    __at_align__ c10::complex<double> y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<c10::complex<double>> operator==(
+      const Vectorized<c10::complex<double>>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
+  }
+  Vectorized<c10::complex<double>> operator!=(
+      const Vectorized<c10::complex<double>>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
+  }
+  Vectorized<c10::complex<double>> operator<(
+      const Vectorized<c10::complex<double>>& /*unused*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator<=(
+      const Vectorized<c10::complex<double>>& /*unused*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator>(
+      const Vectorized<c10::complex<double>>& /*unused*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator>=(
+      const Vectorized<c10::complex<double>>& /*unused*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<c10::complex<double>> eq(
+      const Vectorized<c10::complex<double>>& other) const;
+  Vectorized<c10::complex<double>> ne(
+      const Vectorized<c10::complex<double>>& other) const;
+};
+
+template <>
+Vectorized<c10::complex<double>> inline operator+(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  return _mm256_add_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator-(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  return _mm256_sub_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator*(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // Writing the left operand as a + bi and the right one as c + di:
+  //   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
+  const __m256d straight = _mm256_mul_pd(a, b); // ac       bd
+  // Swap c/d inside each pair and negate the slot that now holds c.
+  const __m256d flip_imag = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+  const __m256d swapped = _mm256_permute_pd(b, 0x05); // d        c
+  const __m256d d_negc = _mm256_xor_pd(flip_imag, swapped); // d       -c
+  const __m256d crossed = _mm256_mul_pd(a, d_negc); // ad      -bc
+  // hsub subtracts within each pair: (ac - bd) and (ad - (-bc)).
+  return _mm256_hsub_pd(straight, crossed); // ac - bd  ad + bc
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator/(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // auto mask = _mm256_set1_pd(-0.f);
+  // auto fabs_cd = _mm256_andnot_pd(mask, b);     // |c|    |d|
+  // auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05);   // |d|    |c|
+  // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd,
+  // fabs_dc));  // 1/sc     1/sc auto a2 = _mm256_mul_pd(a, scale);         //
+  // a/sc     b/sc auto b2 = _mm256_mul_pd(b, scale);         // c/sc     d/sc
+  // auto acbd2 = _mm256_mul_pd(a2, b2);
+
+  // const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0);
+  // auto dc2 = _mm256_permute_pd(b2, 0x05);    // d/sc         c/sc
+  // dc2 = _mm256_xor_pd(sign_mask, dc2);       // -d/|c,d|        c/sc
+  // auto adbc2 = _mm256_mul_pd(a2, dc2);       //-ad/sc^2      bc/sc^2
+  // auto res2 = _mm256_hadd_pd(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // // get the denominator
+  // auto denom2 = Vectorized<c10::complex<double>>(b2).abs_2_();  //
+  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2 res2 = _mm256_div_pd(res2, denom2); return
+  // res2;
+  __at_align__ c10::complex<double>
+      tmp1[Vectorized<c10::complex<double>>::size()];
+  __at_align__ c10::complex<double>
+      tmp2[Vectorized<c10::complex<double>>::size()];
+  __at_align__ c10::complex<double>
+      out[Vectorized<c10::complex<double>>::size()];
+  a.store(tmp1);
+  b.store(tmp2);
+  for (const auto i : c10::irange(Vectorized<c10::complex<double>>::size())) {
+    out[i] = tmp1[i] / tmp2[i];
+  }
+  return _mm256_loadu_pd(reinterpret_cast<const double*>(out));
+}
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<double>> Vectorized<
+    c10::complex<double>>::reciprocal() const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // //re = (ac + bd)/abs_2() = c/abs_2()
+  // //im = (bc - ad)/abs_2() = d/abs_2()
+  // const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+  // auto c_d = _mm256_xor_pd(sign_mask, values);    //c       -d
+  // return _mm256_div_pd(c_d, abs_2_());
+  __at_align__ c10::complex<double> tmp[size()];
+  store(tmp);
+  for (const auto i : c10::irange(size())) {
+    tmp[i] = c10::complex<double>(1) / tmp[i];
+  }
+  return loadu(tmp);
+}
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::atan()
+    const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // // atan(x) = i/2 * ln((i + z)/(i - z))
+  // const __m256d i = _mm256_setr_pd(0.0, 1.0, 0.0, 1.0);
+  // const Vectorized i_half = _mm256_setr_pd(0.0, 0.5, 0.0, 0.5);
+
+  // auto sum = Vectorized(_mm256_add_pd(i, values));                      // a
+  // 1+b auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a       1-b auto
+  // ln = (sum/sub).log();                                        // ln((i +
+  // z)/(i - z)) return i_half*ln; // i/2*ln()
+  return map(std::atan);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline maximum(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // Ordering is by squared magnitude (abs_2_), not by real/imag components.
+  const __m256d norm_a = a.abs_2_();
+  const __m256d norm_b = b.abs_2_();
+  const __m256d take_b = _mm256_cmp_pd(norm_a, norm_b, _CMP_LT_OQ);
+  // An unordered compare yields all-ones, which is itself a NaN bit pattern.
+  const __m256d unordered = _mm256_cmp_pd(norm_a, norm_b, _CMP_UNORD_Q);
+  return _mm256_or_pd(_mm256_blendv_pd(a, b, take_b), unordered);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline minimum(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // Ordering is by squared magnitude (abs_2_), not by real/imag components.
+  const __m256d norm_a = a.abs_2_();
+  const __m256d norm_b = b.abs_2_();
+  const __m256d take_b = _mm256_cmp_pd(norm_a, norm_b, _CMP_GT_OQ);
+  // An unordered compare yields all-ones, which is itself a NaN bit pattern.
+  const __m256d unordered = _mm256_cmp_pd(norm_a, norm_b, _CMP_UNORD_Q);
+  return _mm256_or_pd(_mm256_blendv_pd(a, b, take_b), unordered);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator&(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  return _mm256_and_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator|(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  return _mm256_or_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator^(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  return _mm256_xor_pd(a, b);
+}
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::eq(
+    const Vectorized<c10::complex<double>>& other) const {
+  // operator== compares every double slot; a complex pair matches only when
+  // both its real and its imaginary slot match, hence the AND of the two.
+  const auto slot_eq = (*this == other);
+  const auto both = slot_eq.real() & slot_eq.imag();
+  return both & Vectorized<c10::complex<double>>(_mm256_set1_pd(1.0));
+}
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::ne(
+    const Vectorized<c10::complex<double>>& other) const {
+  // operator!= compares every double slot; a complex pair differs as soon as
+  // its real or its imaginary slot differs, hence the OR of the two.
+  const auto slot_ne = (*this != other);
+  const auto either = slot_ne.real() | slot_ne.imag();
+  return either & Vectorized<c10::complex<double>>(_mm256_set1_pd(1.0));
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d0530f038d32d5eebfd82269c1df7cd5ae5daa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -0,0 +1,625 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX2)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+template <>
+struct is_vec_specialized_for<c10::complex<float>> : std::bool_constant<true> {
+}; // trait is true: a Vectorized<c10::complex<float>> specialization follows
+
+template <>
+class Vectorized<c10::complex<float>> { // 4 complex<float> packed in one __m256
+ private:
+  __m256 values; // interleaved layout: re0 im0 re1 im1 re2 im2 re3 im3
+
+ public:
+  using value_type = c10::complex<float>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4; // counted in complex elements (8 floats)
+  }
+  Vectorized() {
+    values = _mm256_setzero_ps();
+  }
+  Vectorized(__m256 v) : values(v) {}
+  Vectorized(c10::complex<float> val) { // broadcast val to all 4 elements
+    float real_value = val.real();
+    float imag_value = val.imag();
+    values = _mm256_setr_ps(
+        real_value,
+        imag_value,
+        real_value,
+        imag_value,
+        real_value,
+        imag_value,
+        real_value,
+        imag_value);
+  }
+  Vectorized( // element-wise constructor: val1 is element 0 ... val4 is element 3
+      c10::complex<float> val1,
+      c10::complex<float> val2,
+      c10::complex<float> val3,
+      c10::complex<float> val4) {
+    values = _mm256_setr_ps(
+        val1.real(),
+        val1.imag(),
+        val2.real(),
+        val2.imag(),
+        val3.real(),
+        val3.imag(),
+        val4.real(),
+        val4.imag());
+  }
+  operator __m256() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<float>> blend(
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b) {
+    // Bit i of `mask` selects complex element i from b (otherwise from a).
+    static_assert(mask > -1 && mask < 16, "Unexpected mask range");
+    switch (mask) { // each complex bit is widened to two float bits: xy -> xxyy
+      case 0:
+        return a;
+      case 1:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x03); // b0000 0001 = b0000 0011
+      case 2:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x0C); // b0000 0010 = b0000 1100
+      case 3:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x0F); // b0000 0011 = b0000 1111
+      case 4:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x30); // b0000 0100 = b0011 0000
+      case 5:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x33); // b0000 0101 = b0011 0011
+      case 6:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x3C); // b0000 0110 = b0011 1100
+      case 7:
+        return _mm256_blend_ps(
+            a.values, b.values, 0x3F); // b0000 0111 = b0011 1111
+      case 8:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xC0); // b0000 1000 = b1100 0000
+      case 9:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xC3); // b0000 1001 = b1100 0011
+      case 10:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xCC); // b0000 1010 = b1100 1100
+      case 11:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xCF); // b0000 1011 = b1100 1111
+      case 12:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xF0); // b0000 1100 = b1111 0000
+      case 13:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xF3); // b0000 1101 = b1111 0011
+      case 14:
+        return _mm256_blend_ps(
+            a.values, b.values, 0xFC); // b0000 1110 = b1111 1100
+      default: // only mask == 15 remains: every element comes from b
+        break;
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<float>> blendv(
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b,
+      const Vectorized<c10::complex<float>>& mask) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values); // per 128-bit lane: m0 m0 m1 m1
+    return _mm256_blendv_ps(a.values, b.values, mask_); // picks b where the mask float's sign bit is set
+  }
+  template <typename step_t>
+  static Vectorized<c10::complex<float>> arange(
+      c10::complex<float> base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<float>>( // base, base+step, base+2*step, base+3*step
+        base,
+        base + step,
+        base + c10::complex<float>(2) * step,
+        base + c10::complex<float>(3) * step);
+  }
+  static Vectorized<c10::complex<float>> set(
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b,
+      int64_t count = size()) {
+    switch (count) { // first `count` elements come from b, the rest from a
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b; // any count outside 0..3 yields b unchanged
+  }
+  static Vectorized<c10::complex<float>> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size()) // full vector: one unaligned 256-bit load
+      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
+
+    __at_align__ float tmp_values[2 * size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2 * size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy( // copy only `count` complex values; the remaining floats stay zero
+        tmp_values,
+        reinterpret_cast<const float*>(ptr),
+        count * sizeof(c10::complex<float>));
+    return _mm256_load_ps(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) { // partial store goes through a stack buffer; count <= 0 writes nothing
+      float tmp_values[2 * size()];
+      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<float>));
+    }
+  }
+  const c10::complex<float>& operator[](int idx) const = delete; // indexing is deleted for this specialization
+  c10::complex<float>& operator[](int idx) = delete;
+  Vectorized<c10::complex<float>> map(
+      c10::complex<float> (*const f)(const c10::complex<float>&)) const {
+    __at_align__ c10::complex<float> tmp[size()]; // scalar fallback: spill, apply f per element, reload
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  __m256 abs_2_() const { // squared magnitude, duplicated into both slots of each element
+    auto val_2 = _mm256_mul_ps(values, values); // a*a     b*b
+    auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b
+    return _mm256_permute_ps(ret, 0xD8); // reorder hadd output so each sum sits in its own element
+  }
+  __m256 abs_() const { // magnitude, duplicated into both slots of each element
+    auto real = _mm256_moveldup_ps(values); // real real
+    auto imag = _mm256_movehdup_ps(values); // imag imag
+    return Sleef_hypotf8_u05(real, imag); // abs  abs
+  }
+  Vectorized<c10::complex<float>> abs() const {
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( // all-ones in real slots, zero in imag slots
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    return _mm256_and_ps(abs_(), real_mask); // abs     0
+  }
+  __m256 angle_() const {
+    // angle = atan2(b, a), computed lane-wise as atan2(values, swapped values)
+    auto b_a = _mm256_permute_ps(values, 0xB1); // b        a
+    return Sleef_atan2f8_u10(values, b_a); // 90-angle angle
+  }
+  Vectorized<c10::complex<float>> angle() const {
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( // all-ones in real slots, zero in imag slots
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle    90-angle
+    return _mm256_and_ps(angle, real_mask); // angle    0
+  }
+  Vectorized<c10::complex<float>> sgn() const { // z / |z|, with 0 where |z| == 0
+    auto abs = abs_();
+    auto zero = _mm256_setzero_ps();
+    auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
+    auto div = _mm256_div_ps(values, abs);
+    return _mm256_blendv_ps(div, zero, mask); // replace the 0/0 lanes with zero
+  }
+  __m256 real_() const { // keeps real parts in place, zeroes the imag slots
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    return _mm256_and_ps(values, real_mask);
+  }
+  Vectorized<c10::complex<float>> real() const {
+    return real_();
+  }
+  __m256 imag_() const { // keeps imag parts in place, zeroes the real slots
+    const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32(
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF));
+    return _mm256_and_ps(values, imag_mask);
+  }
+  Vectorized<c10::complex<float>> imag() const {
+    return _mm256_permute_ps(imag_(), 0xB1); // b        0 (imag moved into the real slot)
+  }
+  __m256 conj_() const {
+    const __m256 sign_mask =
+        _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // sign bit set only in imag slots
+    return _mm256_xor_ps(values, sign_mask); // a       -b
+  }
+  Vectorized<c10::complex<float>> conj() const {
+    return conj_();
+  }
+  Vectorized<c10::complex<float>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+  Vectorized<c10::complex<float>> log2() const {
+    const __m256 log2_ = _mm256_set1_ps(std::log(2));
+    return _mm256_div_ps(log(), log2_); // ln(z) / ln(2), dividing real and imag parts alike
+  }
+  Vectorized<c10::complex<float>> log10() const {
+    const __m256 log10_ = _mm256_set1_ps(std::log(10));
+    return _mm256_div_ps(log(), log10_); // ln(z) / ln(10), dividing real and imag parts alike
+  }
+  Vectorized<c10::complex<float>> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized<c10::complex<float>> asin() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // // asin(x)
+    // // = -i*ln(iz + sqrt(1 -z^2))
+    // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    // const __m256 one = _mm256_set1_ps(1);
+
+    // auto conj = conj_();
+    // auto b_a = _mm256_permute_ps(conj, 0xB1);                         //-b a
+    // auto ab = _mm256_mul_ps(conj, b_a);                               //-ab
+    // -ab auto im = _mm256_add_ps(ab, ab); //-2ab      -2ab
+
+    // auto val_2 = _mm256_mul_ps(values, values);                       // a*a
+    // b*b auto re = _mm256_hsub_ps(val_2, _mm256_permute_ps(val_2, 0xB1));  //
+    // a*a-b*b  b*b-a*a re = _mm256_permute_ps(re, 0xD8); re =
+    // _mm256_sub_ps(one, re);
+
+    // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re +
+    // i*im) auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz +
+    // sqrt()) return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj();
+    // //-i*ln()
+    return map(std::asin); // scalar per-element fallback is what actually runs
+  }
+  Vectorized<c10::complex<float>> acos() const {
+    return map(std::acos);
+  }
+  Vectorized<c10::complex<float>> atan() const;
+  Vectorized<c10::complex<float>> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized<c10::complex<float>> exp() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // //exp(a + bi)
+    // // = exp(a)*(cos(b) + sin(b)i)
+    // auto exp = Sleef_expf8_u10(values); //exp(a)           exp(b) exp =
+    // _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA);   //exp(a)
+    // exp(a)
+
+    // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b),
+    // cos(b)] auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y,
+    // 0xB1),
+    //                                sin_cos.x, 0xAA); //cos(b) sin(b)
+    // return _mm256_mul_ps(exp, cos_sin);
+    return map(std::exp); // scalar per-element fallback is what actually runs
+  }
+  Vectorized<c10::complex<float>> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m256 ln_2 = _mm256_set1_ps(c10::ln_2<float>);
+    Vectorized<c10::complex<float>> scaled_values = _mm256_mul_ps(values, ln_2); // real scalar times z: scale both parts
+    return scaled_values.exp();
+  }
+  Vectorized<c10::complex<float>> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized<c10::complex<float>> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<c10::complex<float>> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<c10::complex<float>> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<c10::complex<float>> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized<c10::complex<float>> ceil() const {
+    return _mm256_ceil_ps(values); // applied to real and imag parts independently
+  }
+  Vectorized<c10::complex<float>> floor() const {
+    return _mm256_floor_ps(values); // applied to real and imag parts independently
+  }
+  Vectorized<c10::complex<float>> neg() const {
+    auto zero = _mm256_setzero_ps();
+    return _mm256_sub_ps(zero, values); // 0 - z for every float lane
+  }
+  Vectorized<c10::complex<float>> round() const {
+    return _mm256_round_ps( // round-to-nearest on real and imag parts independently
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<c10::complex<float>> tan() const {
+    return map(std::tan);
+  }
+  Vectorized<c10::complex<float>> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<c10::complex<float>> trunc() const {
+    return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); // truncate each part toward zero
+  }
+  Vectorized<c10::complex<float>> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized<c10::complex<float>> reciprocal() const;
+  Vectorized<c10::complex<float>> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized<c10::complex<float>> pow(
+      const Vectorized<c10::complex<float>>& exp) const {
+    __at_align__ c10::complex<float> x_tmp[size()]; // scalar fallback: std::pow per element
+    __at_align__ c10::complex<float> y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<c10::complex<float>> operator==(
+      const Vectorized<c10::complex<float>>& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); // per-float mask: real and imag compared separately
+  }
+  Vectorized<c10::complex<float>> operator!=(
+      const Vectorized<c10::complex<float>>& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); // unordered predicate: a NaN operand compares as not-equal
+  }
+  Vectorized<c10::complex<float>> operator<(
+      const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers"); // ordering comparisons always fail this check
+  }
+  Vectorized<c10::complex<float>> operator<=(
+      const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<float>> operator>(
+      const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<float>> operator>=(
+      const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<c10::complex<float>> eq( // whole-element equality; defined after the class
+      const Vectorized<c10::complex<float>>& other) const;
+  Vectorized<c10::complex<float>> ne( // whole-element inequality; defined after the class
+      const Vectorized<c10::complex<float>>& other) const;
+};
+
+template <>
+Vectorized<c10::complex<float>> inline operator+(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm256_add_ps(a, b); // complex add is a plain lane-wise float add
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator-(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm256_sub_ps(a, b); // complex subtract is a plain lane-wise float subtract
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator*(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+  const __m256 sign_mask =
+      _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // sign bit set only in odd (imag) slots
+  auto ac_bd = _mm256_mul_ps(a, b); // ac       bd
+
+  auto d_c = _mm256_permute_ps(b, 0xB1); // d        c
+  d_c = _mm256_xor_ps(sign_mask, d_c); // d       -c
+  auto ad_bc = _mm256_mul_ps(a, d_c); // ad      -bc
+
+  auto ret = _mm256_hsub_ps(ac_bd, ad_bc); // ac - bd  ad + bc
+  ret = _mm256_permute_ps(ret, 0xD8); // hsub groups the re results before the im results per 128-bit lane; re-interleave
+  return ret;
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator/(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // auto mask = _mm256_set1_ps(-0.f);
+  // auto fabs_cd = _mm256_andnot_ps(mask, b);     // |c|    |d|
+  // auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc));  // 1/sc 1/sc
+  // auto a2 = _mm256_mul_ps(a, scale);         // a/sc     b/sc
+  // auto b2 = _mm256_mul_ps(b, scale);         // c/sc     d/sc
+  // auto acbd2 = _mm256_mul_ps(a2, b2);
+
+  // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0,
+  // -0.0, 0.0); auto dc2 = _mm256_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  // dc2 = _mm256_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  // auto adbc2 = _mm256_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  // auto res2 = _mm256_hadd_ps(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+  // res2 = _mm256_permute_ps(res2, 0xD8);
+
+  // // get the denominator
+  // auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  //
+  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2 res2 = _mm256_div_ps(res2, denom2); return
+  // res2;
+  __at_align__ c10::complex<float> // active path: spill both operands and divide per element
+      tmp1[Vectorized<c10::complex<float>>::size()];
+  __at_align__ c10::complex<float>
+      tmp2[Vectorized<c10::complex<float>>::size()];
+  __at_align__ c10::complex<float> out[Vectorized<c10::complex<float>>::size()];
+  a.store(tmp1);
+  b.store(tmp2);
+  for (const auto i : c10::irange(Vectorized<c10::complex<float>>::size())) {
+    out[i] = tmp1[i] / tmp2[i]; // scalar c10::complex division
+  }
+  return _mm256_loadu_ps(reinterpret_cast<const float*>(out));
+}
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<float>> Vectorized<
+    c10::complex<float>>::reciprocal() const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // //re = (ac + bd)/abs_2() = c/abs_2()
+  // //im = (bc - ad)/abs_2() = d/abs_2()
+  // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+  // 0.0, -0.0); auto c_d = _mm256_xor_ps(sign_mask, values);    //c       -d
+  // return _mm256_div_ps(c_d, abs_2_());
+  __at_align__ c10::complex<float> tmp[size()]; // active path: spill and compute 1/z per element
+  store(tmp);
+  for (const auto i : c10::irange(size())) {
+    tmp[i] = c10::complex<float>(1) / tmp[i]; // scalar c10::complex division
+  }
+  return loadu(tmp);
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::atan()
+    const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // // atan(x) = i/2 * ln((i + z)/(i - z))
+  // const __m256 i = _mm256_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  // const Vectorized i_half = _mm256_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
+  // 0.5);
+
+  // auto sum = Vectorized(_mm256_add_ps(i, values));                      // a
+  // 1+b auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a       1-b auto
+  // ln = (sum/sub).log();                                        // ln((i +
+  // z)/(i - z)) return i_half*ln; // i/2*ln()
+  return map(std::atan); // scalar per-element fallback is what actually runs
+}
+
+template <>
+Vectorized<c10::complex<float>> inline maximum(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  auto abs_a = a.abs_2_(); // elements are ordered by squared magnitude
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
+  auto max = _mm256_blendv_ps(a, b, mask); // take b where |a|^2 < |b|^2, otherwise a
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); // all-ones where either squared magnitude is NaN
+  return _mm256_or_ps(max, isnan);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline minimum(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  auto abs_a = a.abs_2_(); // elements are ordered by squared magnitude
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
+  auto min = _mm256_blendv_ps(a, b, mask); // take b where |a|^2 > |b|^2, otherwise a
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); // all-ones where either squared magnitude is NaN
+  return _mm256_or_ps(min, isnan);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator&(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm256_and_ps(a, b); // bitwise AND of the two 256-bit registers
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator|(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm256_or_ps(a, b); // bitwise OR of the two 256-bit registers
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator^(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm256_xor_ps(a, b); // bitwise XOR of the two 256-bit registers
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::eq(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto eq = (*this == other); // compares real and imag individually
+  // real() keeps the real-slot mask and imag() moves the imag-slot mask into
+  // the real slot, so the AND is all-ones only when both parts matched.
+  return (eq.real() & eq.imag()) &
+      Vectorized<c10::complex<float>>(_mm256_set1_ps(1.0f)); // mask -> 1.0f in the real slot, 0 in the imag slot
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::ne(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto ne = (*this != other); // compares real and imag individually
+  // real() keeps the real-slot mask and imag() moves the imag-slot mask into
+  // the real slot, so the OR is all-ones when either part differed.
+  return (ne.real() | ne.imag()) &
+      Vectorized<c10::complex<float>>(_mm256_set1_ps(1.0f)); // mask -> 1.0f in the real slot, 0 in the imag slot
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ea85701b7cbbef81f26709ea08be38cdea3e108
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_convert.h
@@ -0,0 +1,370 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_convert.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+template <>
+struct VecConvert<float, 1, BFloat16, 1> { // BFloat16 -> float, one vector in, one vector out
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 1> result;
+    __m256 value;
+    cvtbf16_fp32(_mm256_castsi256_si128(src[0]), value); // only the low 128 bits of src[0] are converted
+    result[0] = value;
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 1, Half, 1> { // Half -> float, one vector in, one vector out
+  static inline VectorizedN<float, 1> apply(const VectorizedN<Half, 1>& src) {
+    VectorizedN<float, 1> result;
+    __m256 value;
+    cvtfp16_fp32(_mm256_castsi256_si128(src[0]), value); // only the low 128 bits of src[0] are converted
+    result[0] = value;
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<BFloat16, 1, float, 1> { // float -> BFloat16, one vector in, one vector out
+  static inline VectorizedN<BFloat16, 1> apply(
+      const VectorizedN<float, 1>& src) {
+    VectorizedN<BFloat16, 1> result;
+    result[0] = _mm256_castsi128_si256(cvtfp32_bf16(src[0])); // 128-bit result placed in the low half; the cast leaves the upper half undefined
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<BFloat16, 1, float, 2> { // two float vectors -> one BFloat16 vector
+  static inline VectorizedN<BFloat16, 1> apply(
+      const VectorizedN<float, 2>& src) {
+    VectorizedN<BFloat16, 1> result;
+    result[0] = convert_float_bfloat16(src[0], src[1]); // delegates to the two-input helper
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 2, BFloat16, 1> { // one BFloat16 vector -> two float vectors
+  static inline VectorizedN<float, 2> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 2> result;
+    std::tie(result[0], result[1]) = convert_bfloat16_float(src[0]); // helper returns the pair of float vectors
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<Half, 1, float, 1> { // float -> Half, one vector in, one vector out
+  static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 1>& src) {
+    VectorizedN<Half, 1> result;
+    result[0] = _mm256_castsi128_si256(cvtfp32_fp16(src[0])); // 128-bit result placed in the low half; the cast leaves the upper half undefined
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<Half, 1, float, 2> { // two float vectors -> one Half vector
+  static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 2>& src) {
+    VectorizedN<Half, 1> result;
+    result[0] = convert_float_half(src[0], src[1]); // delegates to the two-input helper
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 2, Half, 1> { // one Half vector -> two float vectors
+  static inline VectorizedN<float, 2> apply(const VectorizedN<Half, 1>& src) {
+    VectorizedN<float, 2> result;
+    std::tie(result[0], result[1]) = convert_half_float(src[0]); // helper returns the pair of float vectors
+    return result;
+  }
+};
+
+template <>
+inline Vectorized<double> convert_to_fp_of_same_size<double>(
+    const Vectorized<int64_t>& src); // declaration only; called by the int64_t -> float conversion below
+
+template <>
+struct VecConvert<float, 1, int64_t, 2> { // two int64 vectors (4 lanes each) -> one float vector (8 lanes)
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    auto low_double = at::vec::convert_to_fp_of_same_size<double>(src[0]); // int64 -> double first
+    auto low = _mm256_cvtpd_ps(low_double); // 4 doubles -> 4 floats (128-bit)
+    auto high_double = at::vec::convert_to_fp_of_same_size<double>(src[1]);
+    auto high = _mm256_cvtpd_ps(high_double);
+    return Vectorized<float>(
+        _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1)); // low in lanes 0-3, high in lanes 4-7
+  }
+};
+
+template <>
+struct VecConvert<int64_t, 2, float, 1> { // one float vector (8 lanes) -> two int64 vectors (4 lanes each)
+  static inline VectorizedN<int64_t, 2> apply(
+      const VectorizedN<float, 1>& src) {
+    // Scalarization is the most reliable way of converting fp to int64 on AVX2.
+    // Check: https://stackoverflow.com/questions/41144668
+    float buffer[8]; // holds all 8 float lanes of src
+    src.store(buffer);
+    at::vec::VectorizedN<int64_t, 2> result;
+    result[0] = Vectorized<int64_t>( // lanes 0-3, converted with scalar static_cast
+        static_cast<int64_t>(buffer[0]),
+        static_cast<int64_t>(buffer[1]),
+        static_cast<int64_t>(buffer[2]),
+        static_cast<int64_t>(buffer[3]));
+    result[1] = Vectorized<int64_t>( // lanes 4-7
+        static_cast<int64_t>(buffer[4]),
+        static_cast<int64_t>(buffer[5]),
+        static_cast<int64_t>(buffer[6]),
+        static_cast<int64_t>(buffer[7]));
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, int64_t, 2> { // two int64 vectors -> one int32 vector, keeping the low 32 bits of each value
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0)); // per 128-bit lane: pick dwords 0 and 2 (the low halves)
+    auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 2, 0));
+    auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0)); // low 128 bits now hold src[0]'s four results
+    auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0)); // high 128 bits hold src[1]'s four results
+    return Vectorized<int32_t>(_mm256_blend_epi32(low_perm, high_perm, 0xF0)); // dwords 0-3 from low_perm, 4-7 from high_perm
+  }
+};
+
+template <>
+struct VecConvert<int64_t, 2, int32_t, 1> { // one int32 vector (8 lanes) -> two int64 vectors (4 lanes each)
+  static inline VectorizedN<int64_t, 2> apply(
+      const VectorizedN<int32_t, 1>& src) {
+    at::vec::VectorizedN<int64_t, 2> result;
+    result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src[0])); // sign-extend lanes 0-3
+    result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src[0], 1)); // sign-extend lanes 4-7
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, int8_t, 1> { // int8 -> int32
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<int8_t, 1>& src) {
+    auto src128 = _mm256_castsi256_si128(src[0]); // low 128 bits of the source
+    return Vectorized<int32_t>(_mm256_cvtepi8_epi32(src128)); // sign-extends the lowest 8 bytes to 8 int32 lanes
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, uint8_t, 1> { // uint8 -> int32
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<uint8_t, 1>& src) {
+    auto src128 = _mm256_castsi256_si128(src[0]); // low 128 bits of the source
+    return Vectorized<int32_t>(_mm256_cvtepu8_epi32(src128)); // zero-extends the lowest 8 bytes to 8 int32 lanes
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, float, 1> { // float -> int32
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<float, 1>& src) {
+    return Vectorized<int32_t>(_mm256_cvttps_epi32(src[0])); // cvtt = convert with truncation toward zero
+  }
+};
+
+template <>
+struct VecConvert<float, 1, int32_t, 1> { // int32 -> float
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<int32_t, 1>& src) {
+    return Vectorized<float>(_mm256_cvtepi32_ps(src[0])); // lane-wise signed int32 to float
+  }
+};
+
+template <>
+struct VecConvert<int16_t, 1, uint8_t, 1> { // uint8 -> int16
+  static inline VectorizedN<int16_t, 1> apply(
+      const VectorizedN<uint8_t, 1>& src) {
+    auto src128 = _mm256_castsi256_si128(src[0]); // low 128 bits of the source
+    return Vectorized<int16_t>(_mm256_cvtepu8_epi16(src128)); // zero-extends 16 bytes to 16 int16 lanes
+  }
+};
+
+template <typename dst_t, typename src_t>
+struct VecConvert< // reduced-precision float <-> 8-bit integer, in either direction
+    dst_t,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<
+        (is_reduced_floating_point_v<dst_t> && is_8bit_integer_v<src_t>) ||
+            (is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
+        void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
+    VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src); // go through float32 as the intermediate type
+    return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32);
+  }
+};
+
+template <typename dst_t>
+struct VecConvert< // two float vectors -> one 8-bit integer vector
+    dst_t,
+    1,
+    float,
+    2,
+    typename std::enable_if_t<is_8bit_integer_v<dst_t>, void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
+    at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]);
+    at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
+    __m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2)); // low 128 bits of vec2
+    __m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1); // vec1 low half | vec2 low half
+    // Shuffle [191:128] bit from combined in to [127:64] bit of result
+    __m256i result =
+        _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); // qword order 0,2,1,3
+    return at::vec::Vectorized<dst_t>(result);
+  }
+};
+
+template <typename dst_t>
+struct VecConvert< // one float vector -> one 8-bit integer vector
+    dst_t,
+    1,
+    float,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<dst_t>, void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
+    return convert_float_to_int8<dst_t>(src[0]); // thin wrapper over the helper
+  }
+};
+
+template <typename src_t>
+struct VecConvert< // one 8-bit integer vector -> two float vectors
+    float,
+    2,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
+    // Shuffle [127:64] bit from src[0] in to [191:128] bit of shuffled
+    __m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000); // qword order 0,2,1,3
+    __m256i src2 =
+        _mm256_castsi128_si256(_mm_castps_si128(_mm256_extractf128_ps(
+            _mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane
+                                                ));
+    return VectorizedN<float, 2>(
+        convert_int8_to_float<src_t>(src[0]), // first float vector from src[0] as-is
+        convert_int8_to_float<src_t>(src2)); // second from src2, whose low 64 bits are src[0] bits [127:64]
+  }
+};
+
+template <typename dst_t>
+struct VecConvert< // two int64 vectors -> one int8/uint8 vector
+    dst_t,
+    1,
+    int64_t,
+    2,
+    std::enable_if_t<
+        std::is_same_v<dst_t, int8_t> || std::is_same_v<dst_t, uint8_t>>> {
+  static inline VectorizedN<dst_t, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    return VecConvert<dst_t, 1, int32_t, 1>::apply( // two-step narrowing: int64 -> int32 -> 8-bit
+        VecConvert<int32_t, 1, int64_t, 2>::apply(src));
+  }
+};
+
+#endif /* defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) */
+
+#if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER))
+template <typename src_t>
+struct VecConvert< // one 8-bit integer vector -> one float vector
+    float,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
+    return convert_int8_to_float<src_t>(src[0]); // thin wrapper over the helper
+  }
+};
+#endif
+
+#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
+
+template <>
+struct VecConvert<float, 1, BFloat16, 1> { // SVE256 path: BFloat16 -> float
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 1> res;
+    // Load 16-bit unsigned integers from src into an SVE vector
+    svuint16_t u16x4 =
+        svld1_u16(svptrue_b16(), reinterpret_cast<const uint16_t*>(&src[0]));
+    // Zero-extend to 32-bit SVE does not have direct vmovl_u16 equivalent.
+    vls_uint32_t u32x4 =
+        svreinterpret_u32_u16(svzip1_u16(svdup_n_u16(0), u16x4)); // interleave a zero before each 16-bit value from the low half
+    // Reinterpret as float32
+    vls_float32_t f32x4 = svreinterpret_f32_u32(u32x4);
+    res[0] = Vectorized<float>(f32x4);
+    return res;
+  }
+};
+
+template <>
+struct VecConvert<float, 2, BFloat16, 1> { // SVE256 path: one BFloat16 vector -> two float vectors
+  static inline VectorizedN<float, 2> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 2> res;
+    std::tie(res[0], res[1]) = convert_bfloat16_float(src[0]); // helper returns the pair of float vectors
+    return res;
+  }
+};
+
+template <>
+struct VecConvert<BFloat16, 1, float, 2> { // SVE256 path: two float vectors -> one BFloat16 vector
+  static inline VectorizedN<BFloat16, 1> apply(
+      const VectorizedN<float, 2>& src) {
+    VectorizedN<BFloat16, 1> res;
+    res[0] = convert_float_bfloat16(src[0], src[1]); // delegates to the two-input helper
+    return res;
+  }
+};
+
+#endif // defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
+
+template <typename src_t>
+struct VecConvert< // generic reduced-precision float -> float, one vector out
+    float,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<is_reduced_floating_point_v<src_t>, void>> {
+  static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
+    auto [res_vec1, res_vec2] = convert_to_float<src_t>(src[0]); // helper yields two float vectors
+    return res_vec1; // only the first is returned; res_vec2 is discarded
+  }
+};
+
+template <typename dst_t>
+struct VecConvert< // generic float -> reduced-precision float, one vector in
+    dst_t,
+    1,
+    float,
+    1,
+    typename std::enable_if_t<is_reduced_floating_point_v<dst_t>, void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
+    return convert_from_float<dst_t>(src[0], src[0]); // helper takes two float vectors; the same one is passed twice
+  }
+};
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..34c34f62526d9cb2d5cd5ed9d8e396280ca608f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h
@@ -0,0 +1,531 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX2)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+template <>
+struct is_vec_specialized_for<double> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<double> { // AVX2 specialization: four double lanes held in one __m256d
+ private:
+  __m256d values; // the four lanes
+
+ public:
+  using value_type = double;
+  using size_type = int;
+  static constexpr size_type size() { // number of double lanes
+    return 4;
+  }
+  Vectorized() { // every lane zero
+    values = _mm256_setzero_pd();
+  }
+  Vectorized(__m256d v) : values(v) {} // non-explicit: a raw __m256d converts implicitly
+  Vectorized(double val) { // broadcast val to every lane
+    values = _mm256_set1_pd(val);
+  }
+  Vectorized(double val1, double val2, double val3, double val4) {
+    values = _mm256_setr_pd(val1, val2, val3, val4); // setr ordering: val1 is lane 0
+  }
+  operator __m256d() const { // lets a Vectorized be passed straight to intrinsics
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<double> blend( // compile-time select: bit i of mask set -> lane i from b, else from a
+      const Vectorized<double>& a,
+      const Vectorized<double>& b) {
+    return _mm256_blend_pd(a.values, b.values, mask);
+  }
+  static Vectorized<double> blendv( // run-time select: lane from b where the mask lane's sign bit is set
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      const Vectorized<double>& mask) {
+    return _mm256_blendv_pd(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<double> arange( // lane i = base + i * step
+      double base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<double>(
+        base, base + step, base + 2 * step, base + 3 * step);
+  }
+  static Vectorized<double> set( // first `count` lanes from b, remaining lanes from a
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      int64_t count = size()) {
+    switch (count) { // blend needs a compile-time mask, hence one case per count
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b; // any count outside 0..3 yields b unchanged
+  }
+  static Vectorized<double> loadu(const void* ptr, int64_t count = size()) { // unaligned load of `count` doubles
+    if (count == size())
+      return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+    __at_align__ double tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy( // only `count` doubles are read from ptr; the other lanes stay zero
+        tmp_values,
+        reinterpret_cast<const double*>(ptr),
+        count * sizeof(double));
+    return _mm256_load_pd(tmp_values); // aligned load is fine: tmp_values is declared __at_align__
+  }
+  void store(void* ptr, int count = size()) const { // writes the first `count` lanes to ptr
+    if (count == size()) {
+      _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) { // count <= 0 writes nothing
+      double tmp_values[size()];
+      _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values); // spill all lanes, then copy a prefix
+      std::memcpy(ptr, tmp_values, count * sizeof(double));
+    }
+  }
+  const double& operator[](int idx) const = delete; // per-lane indexing is deliberately unavailable
+  double& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ);
+    return _mm256_movemask_pd(cmp); // one bit per lane, taken from the lane's sign bit
+  }
+  Vectorized<double> isnan() const { // lane mask: all-ones where the lane is NaN
+    return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); // unordered vs 0.0 holds only for NaN
+  }
+  bool has_inf_nan() const { // true if any lane is +-inf or NaN
+    __m256d self_sub = _mm256_sub_pd(values, values); // x - x: zero for finite lanes, NaN for inf/NaN lanes
+    return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != // top bit of every byte; mask drops bits 3 and 7 of each 8-bit lane group (bit 7 is the sign bit)
+        0;
+  }
+  Vectorized<double> map(double (*const f)(double)) const { // scalar fallback: apply f to each lane via memory
+    __at_align__ double tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> abs() const {
+    auto mask = _mm256_set1_pd(-0.f); // -0.0 has only the sign bit set
+    return _mm256_andnot_pd(mask, values); // (~mask) & values clears the sign bit
+  }
+  Vectorized<double> angle() const { // per lane: pi where value < 0, NaN where value is NaN, else 0
+    const auto zero_vec = _mm256_set1_pd(0.f);
+    const auto nan_vec = _mm256_set1_pd(NAN);
+    const auto not_nan_mask = _mm256_cmp_pd(values, values, _CMP_EQ_OQ); // self-compare fails only for NaN
+    const auto nan_mask = _mm256_cmp_pd(not_nan_mask, zero_vec, _CMP_EQ_OQ); // true where not_nan_mask is all-zero bits
+    const auto pi = _mm256_set1_pd(c10::pi<double>);
+
+    const auto neg_mask = _mm256_cmp_pd(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm256_blendv_pd(zero_vec, pi, neg_mask);
+    angle = _mm256_blendv_pd(angle, nan_vec, nan_mask); // NaN lanes override the sign test
+    return angle;
+  }
+  Vectorized<double> real() const { // a real value is its own real part
+    return *this;
+  }
+  Vectorized<double> imag() const { // imaginary part of a real value is zero
+    return _mm256_set1_pd(0);
+  }
+  Vectorized<double> conj() const { // conjugation is the identity for real values
+    return *this;
+  }
+  Vectorized<double> acos() const { // Sleef_*d4_*: SLEEF routines on 4 doubles; uNN suffix is SLEEF's error-bound tag (see SLEEF docs)
+    return Vectorized<double>(Sleef_acosd4_u10(values));
+  }
+  Vectorized<double> acosh() const {
+    return Vectorized<double>(Sleef_acoshd4_u10(values));
+  }
+  Vectorized<double> asin() const {
+    return Vectorized<double>(Sleef_asind4_u10(values));
+  }
+  Vectorized<double> asinh() const {
+    return Vectorized<double>(Sleef_asinhd4_u10(values));
+  }
+  Vectorized<double> atan() const {
+    return Vectorized<double>(Sleef_atand4_u10(values));
+  }
+  Vectorized<double> atanh() const {
+    return Vectorized<double>(Sleef_atanhd4_u10(values));
+  }
+  Vectorized<double> atan2(const Vectorized<double>& b) const { // *this is the first argument, b the second
+    return Vectorized<double>(Sleef_atan2d4_u10(values, b));
+  }
+  Vectorized<double> copysign(const Vectorized<double>& sign) const {
+    return Vectorized<double>(Sleef_copysignd4(values, sign));
+  }
+  Vectorized<double> erf() const {
+    return Vectorized<double>(Sleef_erfd4_u10(values));
+  }
+  Vectorized<double> erfc() const {
+    return Vectorized<double>(Sleef_erfcd4_u15(values));
+  }
+  Vectorized<double> erfinv() const { // no vector routine used here: scalar calc_erfinv per lane
+    return map(calc_erfinv);
+  }
+  Vectorized<double> exp() const {
+    return Vectorized<double>(Sleef_expd4_u10(values));
+  }
+  Vectorized<double> exp2() const {
+    return Vectorized<double>(Sleef_exp2d4_u10(values));
+  }
+  Vectorized<double> expm1() const {
+    return Vectorized<double>(Sleef_expm1d4_u10(values));
+  }
+  Vectorized<double> exp_u20() const { // for double this simply forwards to exp()
+    return exp();
+  }
+  Vectorized<double> fexp_u20() const { // for double this simply forwards to exp()
+    return exp();
+  }
+  Vectorized<double> fmod(const Vectorized<double>& q) const {
+    return Vectorized<double>(Sleef_fmodd4(values, q));
+  }
+  Vectorized<double> hypot(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_hypotd4_u05(values, b));
+  }
+  Vectorized<double> i0() const { // scalar calc_i0 per lane
+    return map(calc_i0);
+  }
+  Vectorized<double> i0e() const { // scalar calc_i0e per lane
+    return map(calc_i0e);
+  }
+  Vectorized<double> digamma() const { // scalar calc_digamma per lane
+    return map(calc_digamma);
+  }
+  Vectorized<double> igamma(const Vectorized<double>& x) const { // lane i = calc_igamma(this[i], x[i])
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> igammac(const Vectorized<double>& x) const { // lane i = calc_igammac(this[i], x[i])
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> log() const {
+    return Vectorized<double>(Sleef_logd4_u10(values));
+  }
+  Vectorized<double> log2() const {
+    return Vectorized<double>(Sleef_log2d4_u10(values));
+  }
+  Vectorized<double> log10() const {
+    return Vectorized<double>(Sleef_log10d4_u10(values));
+  }
+  Vectorized<double> log1p() const {
+    return Vectorized<double>(Sleef_log1pd4_u10(values));
+  }
+  Vectorized<double> sin() const {
+    return Vectorized<double>(Sleef_sind4_u10(values));
+  }
+  Vectorized<double> sinh() const {
+    return Vectorized<double>(Sleef_sinhd4_u10(values));
+  }
+  Vectorized<double> cos() const {
+    return Vectorized<double>(Sleef_cosd4_u10(values));
+  }
+  Vectorized<double> cosh() const {
+    return Vectorized<double>(Sleef_coshd4_u10(values));
+  }
+  Vectorized<double> ceil() const {
+    return _mm256_ceil_pd(values);
+  }
+  Vectorized<double> floor() const {
+    return _mm256_floor_pd(values);
+  }
+  Vectorized<double> frac() const; // defined after the class, once operator- is available
+  Vectorized<double> neg() const {
+    return _mm256_xor_pd(_mm256_set1_pd(-0.), values); // XOR with -0.0 flips only the sign bit
+  }
+  Vectorized<double> nextafter(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_nextafterd4(values, b));
+  }
+  Vectorized<double> round() const { // round to nearest integer, floating-point exceptions suppressed
+    return _mm256_round_pd(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<double> tan() const {
+    return Vectorized<double>(Sleef_tand4_u10(values));
+  }
+  Vectorized<double> tanh() const {
+    return Vectorized<double>(Sleef_tanhd4_u10(values));
+  }
+  Vectorized<double> trunc() const { // round toward zero, floating-point exceptions suppressed
+    return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<double> lgamma() const {
+    return Vectorized<double>(Sleef_lgammad4_u10(values));
+  }
+  Vectorized<double> sqrt() const {
+    return _mm256_sqrt_pd(values);
+  }
+  Vectorized<double> reciprocal() const { // computed with a full division: 1 / x
+    return _mm256_div_pd(_mm256_set1_pd(1), values);
+  }
+  Vectorized<double> rsqrt() const { // computed as 1 / sqrt(x) with a full sqrt and division
+    return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values));
+  }
+  Vectorized<double> pow(const Vectorized<double>& b) const { // *this is the base, b the exponent
+    return Vectorized<double>(Sleef_powd4_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<double> operator==(const Vectorized<double>& other) const { // result is a lane mask (all-ones / all-zeros)
+    return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
+  }
+
+  Vectorized<double> operator!=(const Vectorized<double>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); // the one unordered predicate: true when an operand is NaN
+  }
+
+  Vectorized<double> operator<(const Vectorized<double>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ);
+  }
+
+  Vectorized<double> operator<=(const Vectorized<double>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ);
+  }
+
+  Vectorized<double> operator>(const Vectorized<double>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ);
+  }
+
+  Vectorized<double> operator>=(const Vectorized<double>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ);
+  }
+
+  Vectorized<double> eq(const Vectorized<double>& other) const; // eq..ge are only declared here; definitions are out of line
+  Vectorized<double> ne(const Vectorized<double>& other) const;
+  Vectorized<double> lt(const Vectorized<double>& other) const;
+  Vectorized<double> le(const Vectorized<double>& other) const;
+  Vectorized<double> gt(const Vectorized<double>& other) const;
+  Vectorized<double> ge(const Vectorized<double>& other) const;
+};
+
+// Lane-wise a + b.
+template <>
+Vectorized<double> inline operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d sum = _mm256_add_pd(a, b);
+  return sum;
+}
+
+// Lane-wise a - b.
+template <>
+Vectorized<double> inline operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d difference = _mm256_sub_pd(a, b);
+  return difference;
+}
+
+// Lane-wise a * b.
+template <>
+Vectorized<double> inline operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d product = _mm256_mul_pd(a, b);
+  return product;
+}
+
+// Lane-wise a / b.
+template <>
+Vectorized<double> inline operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d quotient = _mm256_div_pd(a, b);
+  return quotient;
+}
+
+// frac = x - trunc(x); defined outside the class so that operator- is usable.
+inline Vectorized<double> Vectorized<double>::frac() const {
+  return (*this) - trunc();
+}
+
+// NaN-propagating maximum (the IEEE 754 `maximum` operation): a result lane
+// is NaN whenever either input lane is NaN.
+template <>
+Vectorized<double> inline maximum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m256d larger = _mm256_max_pd(a, b);
+  // Lanes with a NaN operand compare unordered, which yields an all-ones mask;
+  // all-ones is itself a NaN bit pattern, so OR-ing forces NaN in those lanes.
+  return _mm256_or_pd(larger, _mm256_cmp_pd(a, b, _CMP_UNORD_Q));
+}
+
+// NaN-propagating minimum (the IEEE 754 `minimum` operation): a result lane
+// is NaN whenever either input lane is NaN.
+template <>
+Vectorized<double> inline minimum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m256d smaller = _mm256_min_pd(a, b);
+  // Lanes with a NaN operand compare unordered, which yields an all-ones mask;
+  // all-ones is itself a NaN bit pattern, so OR-ing forces NaN in those lanes.
+  return _mm256_or_pd(smaller, _mm256_cmp_pd(a, b, _CMP_UNORD_Q));
+}
+
+// Lane-wise min(max, max(min, a)); the intrinsic operand order is unchanged.
+template <>
+Vectorized<double> inline clamp(
+    const Vectorized<double>& a, const Vectorized<double>& min, const Vectorized<double>& max) {
+  const __m256d raised = _mm256_max_pd(min, a);
+  return _mm256_min_pd(max, raised);
+}
+
+// Lane-wise lower bound: max(min, a), operand order unchanged.
+template <>
+Vectorized<double> inline clamp_min(const Vectorized<double>& a, const Vectorized<double>& min) {
+  const __m256d raised = _mm256_max_pd(min, a);
+  return raised;
+}
+
+// Lane-wise upper bound: min(max, a), operand order unchanged.
+template <>
+Vectorized<double> inline clamp_max(const Vectorized<double>& a, const Vectorized<double>& max) {
+  const __m256d lowered = _mm256_min_pd(max, a);
+  return lowered;
+}
+
+// Bitwise AND of the raw lane bits.
+template <>
+Vectorized<double> inline operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d bits = _mm256_and_pd(a, b);
+  return bits;
+}
+
+// Bitwise OR of the raw lane bits.
+template <>
+Vectorized<double> inline operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d bits = _mm256_or_pd(a, b);
+  return bits;
+}
+
+// Bitwise XOR of the raw lane bits.
+template <>
+Vectorized<double> inline operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
+  const __m256d bits = _mm256_xor_pd(a, b);
+  return bits;
+}
+
+inline Vectorized<double> Vectorized<double>::eq(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this == other) & one;
+}
+
+inline Vectorized<double> Vectorized<double>::ne(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this != other) & one;
+}
+
+inline Vectorized<double> Vectorized<double>::gt(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this > other) & one;
+}
+
+inline Vectorized<double> Vectorized<double>::ge(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this >= other) & one;
+}
+
+inline Vectorized<double> Vectorized<double>::lt(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this < other) & one;
+}
+
+inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other) const {
+  const Vectorized<double> one(1.0); // mask & 1.0 -> 1.0 where the compare holds, 0.0 elsewhere
+  return (*this <= other) & one;
+}
+
+template <>
+inline void convert(const double* src, double* dst, int64_t n) {
+  // Copy whole vectors first, then finish the remainder one element at a time.
+  int64_t pos = 0;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos <= (n - Vectorized<double>::size()); pos += Vectorized<double>::size()) {
+    _mm256_storeu_pd(dst + pos, _mm256_loadu_pd(src + pos));
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos < n; ++pos) {
+    dst[pos] = src[pos];
+  }
+}
+
+#ifdef CPU_CAPABILITY_AVX2
+// Fused multiply-add per lane: a * b + c.
+template <>
+Vectorized<double> inline fmadd(
+    const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  const __m256d fused = _mm256_fmadd_pd(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-add per lane: -(a * b) + c.
+template <>
+Vectorized<double> inline fnmadd(
+    const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  const __m256d fused = _mm256_fnmadd_pd(a, b, c);
+  return fused;
+}
+
+// Fused multiply-subtract per lane: a * b - c.
+template <>
+Vectorized<double> inline fmsub(
+    const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  const __m256d fused = _mm256_fmsub_pd(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-subtract per lane: -(a * b) - c.
+template <>
+Vectorized<double> inline fnmsub(
+    const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  const __m256d fused = _mm256_fnmsub_pd(a, b, c);
+  return fused;
+}
+#endif
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a2cbb07006467f5eded6893f5aadf4d68e93053
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h
@@ -0,0 +1,847 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX2)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+template <>
+struct is_vec_specialized_for<float> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<float> {
+ private:
+  __m256 values;
+
+ public:
+  using value_type = float;
+  using size_type = int;
+  // Number of float lanes held by one vector.
+  static constexpr size_type size() {
+    return 8;
+  }
+  // Zero-filled vector.
+  Vectorized() : values(_mm256_setzero_ps()) {}
+  // Adopts an existing register value.
+  Vectorized(__m256 v) : values(v) {}
+  // Broadcasts val into every lane.
+  Vectorized(float val) : values(_mm256_set1_ps(val)) {}
+  // Lane-by-lane construction; val1 is the first argument of _mm256_setr_ps.
+  Vectorized(
+      float val1,
+      float val2,
+      float val3,
+      float val4,
+      float val5,
+      float val6,
+      float val7,
+      float val8)
+      : values(_mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8)) {}
+  // Builds the vector from an 8-element array by delegating to the
+  // lane-by-lane constructor.
+  Vectorized(const float (&arr)[8])
+      : Vectorized(
+            arr[0], arr[1],
+            arr[2], arr[3],
+            arr[4], arr[5],
+            arr[6], arr[7]) {}
+  // Exposes the underlying register so the object can be handed directly to
+  // intrinsics.
+  operator __m256() const {
+    return values;
+  }
+  // Compile-time lane selection: bit i of mask set takes lane i from b,
+  // otherwise lane i comes from a.
+  template <int64_t mask>
+  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return Vectorized<float>(_mm256_blend_ps(a.values, b.values, mask));
+  }
+  // Run-time lane selection: a lane is taken from b where the corresponding
+  // mask lane has its sign bit set, and from a otherwise.
+  static Vectorized<float> blendv(
+      const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& mask) {
+    return Vectorized<float>(_mm256_blendv_ps(a.values, b.values, mask.values));
+  }
+  template <typename step_t>
+  static Vectorized<float> arange( // lane i = base + i * step, for i = 0..7
+      float base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<float>( // each term is evaluated in the promoted type of float and step_t, then narrowed to float
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized<float> set( // first `count` lanes from b, remaining lanes from a
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      int64_t count = size()) {
+    switch (count) { // blend needs a compile-time mask, hence one case per count; mask = 2^count - 1
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b; // any count outside 0..7 yields b unchanged
+  }
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) { // unaligned load of `count` floats
+    if (count == size())
+      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
+    __at_align__ float tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy( // only `count` floats are read from ptr; the other lanes stay zero
+        tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
+    return _mm256_loadu_ps(tmp_values);
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    // A full-width store writes straight to ptr. A partial store is staged in
+    // a stack buffer so that only the first count floats reach ptr.
+    if (count == size()) { _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values); return; }
+    if (count <= 0) { return; }
+    float staged[size()];
+    _mm256_storeu_ps(reinterpret_cast<float*>(staged), values);
+    std::memcpy(ptr, staged, count * sizeof(float));
+  }
+  const float& operator[](int idx) const = delete;
+  float& operator[](int idx) = delete;
+  int zero_mask() const {
+    // Bit i of the result is 1 when lane i compares equal to zero and 0
+    // otherwise.
+    const __m256 zeros = _mm256_set1_ps(0.0f);
+    return _mm256_movemask_ps(_mm256_cmp_ps(values, zeros, _CMP_EQ_OQ));
+  }
+  Vectorized<float> isnan() const { // lane mask: unordered-vs-zero holds only for NaN lanes
+    return Vectorized<float>(_mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q));
+  }
+
+  bool has_inf_nan() const { // true if any lane is +-inf or NaN
+    __m256 self_sub = _mm256_sub_ps(values, values); // x - x: zero for finite lanes, NaN for inf/NaN lanes
+    return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != // top bit of every byte; mask drops the bit taken from each lane's sign bit
+        0;
+  }
+
+  Vectorized<float> map(float (*const f)(float)) const { // spill lanes, apply f to each, reload
+    __at_align__ float lanes[size()];
+    store(lanes);
+    for (int k = 0; k < size(); ++k) {
+      lanes[k] = f(lanes[k]);
+    }
+    return loadu(lanes);
+  }
+  Vectorized<float> abs() const {
+    const __m256 sign_bit = _mm256_set1_ps(-0.f); // -0.0f has only the sign bit set
+    return Vectorized<float>(_mm256_andnot_ps(sign_bit, values)); // (~sign_bit) & values
+  }
+  Vectorized<float> angle() const { // per lane: pi where value < 0, NaN where value is NaN, else 0
+    const auto zero_vec = _mm256_set1_ps(0.f);
+    const auto nan_vec = _mm256_set1_ps(NAN);
+    const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); // self-compare fails only for NaN
+    const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); // true where not_nan_mask is all-zero bits
+    const auto pi = _mm256_set1_ps(c10::pi<float>);
+
+    const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask);
+    angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); // NaN lanes override the sign test
+    return angle;
+  }
+  Vectorized<float> real() const { // a real value is its own real part
+    return Vectorized<float>(values);
+  }
+  Vectorized<float> imag() const { // the imaginary part of a real value is zero
+    return Vectorized<float>(_mm256_set1_ps(0.f));
+  }
+  Vectorized<float> conj() const { // conjugation leaves a real value unchanged
+    return Vectorized<float>(values);
+  }
+  Vectorized<float> acos() const {
+    return Sleef_acosf8_u10(values); // arccosine per lane
+  }
+  Vectorized<float> acosh() const {
+    return Sleef_acoshf8_u10(values); // inverse hyperbolic cosine per lane
+  }
+  Vectorized<float> asin() const {
+    return Sleef_asinf8_u10(values); // arcsine per lane
+  }
+  Vectorized<float> asinh() const {
+    return Sleef_asinhf8_u10(values); // inverse hyperbolic sine per lane
+  }
+  Vectorized<float> atan() const {
+    return Sleef_atanf8_u10(values); // arctangent per lane
+  }
+  Vectorized<float> atanh() const {
+    return Sleef_atanhf8_u10(values); // inverse hyperbolic tangent per lane
+  }
+  Vectorized<float> atan2(const Vectorized<float>& b) const {
+    return Sleef_atan2f8_u10(values, b); // *this is the first argument, b the second
+  }
+  Vectorized<float> copysign(const Vectorized<float>& sign) const {
+    return Sleef_copysignf8(values, sign); // *this is the first argument, sign the second
+  }
+  Vectorized<float> erf() const { // closed-form approximation of erf; NOTE(review): constants look like Abramowitz-Stegun 7.1.26 - confirm
+    // constants
+    const auto neg_zero_vec = _mm256_set1_ps(-0.f); // sign-bit-only pattern, used as a sign mask
+    const auto one_vec = _mm256_set1_ps(1.0f);
+    const auto p = _mm256_set1_ps(0.3275911f);
+    const auto p1 = _mm256_set1_ps(0.254829592f);
+    const auto p2 = _mm256_set1_ps(-0.284496736f);
+    const auto p3 = _mm256_set1_ps(1.421413741f);
+    const auto p4 = _mm256_set1_ps(-1.453152027f);
+    const auto p5 = _mm256_set1_ps(1.061405429f);
+    // sign(x)
+    auto sign_mask = _mm256_and_ps(neg_zero_vec, values); // keeps only each lane's sign bit
+    auto abs_vec = _mm256_xor_ps(sign_mask, values); // clears the sign bit -> |x|
+    // t = 1 / (p * abs(x) + 1)
+    auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec);
+    auto t = _mm256_div_ps(one_vec, tmp0);
+    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
+    auto tmp1 = _mm256_fmadd_ps(p5, t, p4); // Horner evaluation, highest coefficient first
+    auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3);
+    auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2);
+    auto r = _mm256_fmadd_ps(tmp3, t, p1);
+    // - exp(- x * x)
+    auto pow_2 = _mm256_mul_ps(values, values);
+    auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2); // flip the sign bit: -(x * x)
+    // auto tmp4 = exp(neg_pow_2);
+    auto tmp4 = Vectorized<float>(Sleef_expf8_u10(neg_pow_2));
+    auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4); // -exp(-x * x)
+    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
+    auto tmp6 = _mm256_mul_ps(tmp5, t);
+    auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec); // 1 - r * t * exp(-x * x)
+    return _mm256_xor_ps(sign_mask, tmp7); // re-apply the sign of x
+  }
+  Vectorized<float> erfc() const {
+    return Sleef_erfcf8_u15(values); // complementary error function per lane
+  }
+  Vectorized<float> erfinv() const {
+    return this->map(calc_erfinv); // scalar calc_erfinv applied lane by lane
+  }
+  Vectorized<float> exp() const {
+    return Sleef_expf8_u10(values); // e^x per lane
+  }
+  Vectorized<float> exp2() const {
+    return Sleef_exp2f8_u10(values); // 2^x per lane
+  }
+  Vectorized<float> expm1() const {
+    return Sleef_expm1f8_u10(values); // e^x - 1 per lane
+  }
+  Vectorized<float> fexp_u20() const { // fast approximate exp built directly on the float bit layout
+    const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f); // c0..c3: coefficients of the degree-3 correction polynomial
+    const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
+    const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
+    const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
+
+    const __m256 vec_exp_log2ef =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
+
+    const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2)); // 2^23 (log2(2) == 1): shift past the 23 mantissa bits
+    const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f); // 127 * 2^23: the exponent bias, already shifted
+
+    const __m256 vec_ln_flt_min =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); // bit pattern of the lower input bound
+    const __m256 vec_ln_flt_max =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); // bit pattern of the upper input bound
+    const __m256 vec_inf = _mm256_set1_ps(INFINITY);
+    const __m256 zero = _mm256_setzero_ps();
+
+    // exp(x) = 2**(x * log2(e))
+    //        = 2**xi * 2**xf   - the trick: use the IEEE floating point
+    //        representation directly, mapping xi to the exponent field and
+    //        xf to the mantissa field
+    //  the 2**xf part is handled by a degree-3 polynomial correction that is
+    //  evaluated with Horner's method
+    // compute the min/max for the mask
+    // Masks
+    __m256 mask_too_small =
+        _mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
+    __m256 mask_too_large =
+        _mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
+
+    // transformation with log2(e)
+    auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
+    auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src)); // xf = src - floor(src)
+
+    // compute polynomial using Horner Scheme
+    auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
+    vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
+    vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
+
+    vec_src = _mm256_sub_ps(vec_src, vec_res); // subtract the polynomial correction from the scaled input
+    // the trick: 2^23 * vec_src + 127 * 2^23, read as an integer, is the bit
+    auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
+    // pattern of the approximated result; truncate it to a 32-bit integer
+    __m256i casted_integer = _mm256_cvttps_epi32(tmp);
+    // bitwise to float for the final transformation
+    auto result = _mm256_castsi256_ps(casted_integer);
+    // boundary condition
+    // Set to 0 where x < ln(FLT_MIN)
+    result = _mm256_blendv_ps(result, zero, mask_too_small);
+    // Set to +inf where x > ln(FLT_MAX)
+    result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
+    // final interpretation to float
+    return result;
+  }
+
+  Vectorized<float> exp_u20() const {
+    // A faster version of exp with ULP=20
+    const __m256 vec_factorial_1 =
+        _mm256_set1_ps(0.999999701f); // 1/factorial(1)
+    const __m256 vec_factorial_2 =
+        _mm256_set1_ps(0.499991506f); // 1/factorial(2)
+    const __m256 vec_factorial_3 =
+        _mm256_set1_ps(0.166676521f); // 1/factorial(3)
+    const __m256 vec_factorial_4 =
+        _mm256_set1_ps(0.0418978221f); // 1/factorial(4)
+    const __m256 vec_factorial_5 =
+        _mm256_set1_ps(0.00828929059f); // 1/factorial(5)
+    const __m256 vec_exp_log2ef =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
+    const __m256 vec_half = _mm256_set1_ps(0.5f);
+    const __m256 vec_one = _mm256_set1_ps(1.f);
+    const __m256 vec_zero = _mm256_set1_ps(0.f);
+    const __m256 vec_two = _mm256_set1_ps(2.f);
+    const __m256 vec_ln2f =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
+    const __m256 vec_ln_flt_min =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); // bit pattern of the lower input bound
+    const __m256 vec_ln_flt_max =
+        _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); // bit pattern of the upper input bound
+    const __m256i vec_127 = _mm256_set1_epi32(0x0000007f); // 127, added to n before it is shifted into the exponent field
+    const int n_mantissa_bits = 23;
+
+    // exp(x) =
+    // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
+    // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression
+
+    auto less_ln_flt_min_mask =
+        _mm256_cmp_ps(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/); // lanes below the lower bound; forced to 0 further down
+    auto vec_src = _mm256_min_ps(values, vec_ln_flt_max); // clamp the input into [vec_ln_flt_min, vec_ln_flt_max]
+    vec_src = _mm256_max_ps(vec_src, vec_ln_flt_min);
+
+    // fx = floorf(x * log2ef + 0.5)
+    auto vec_fx = _mm256_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);
+    vec_fx = _mm256_floor_ps(vec_fx);
+
+    // x = x - fx * ln2
+    auto vec_exp_poly = _mm256_fnmadd_ps(vec_fx, vec_ln2f, vec_src); // remainder r
+
+    // compute polynomial
+    auto vec_res =
+        _mm256_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4); // Horner evaluation, highest coefficient first
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_one);
+
+    // compute 2^(n-1)
+    auto vec_exp_number = _mm256_sub_ps(vec_fx, vec_one);
+    auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
+    auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); // add the exponent bias
+    vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); // move it into the exponent field
+    auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i);
+    vec_two_pow_n =
+        _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); // too-small inputs produce 0
+
+    // y = y * 2^n
+    vec_res = _mm256_mul_ps(vec_res, vec_two_pow_n);
+    vec_res = _mm256_mul_ps(vec_res, vec_two); // makes up for using 2^(n-1) above; presumably keeps the largest n inside the exponent field - TODO confirm
+    return vec_res;
+  }
+  Vectorized<float> fmod(const Vectorized<float>& q) const {
+    return Sleef_fmodf8(values, q); // *this is the first argument, q the second
+  }
+  Vectorized<float> log() const {
+    return Sleef_logf8_u10(values); // natural logarithm per lane
+  }
+  Vectorized<float> log2() const {
+    return Sleef_log2f8_u10(values); // base-2 logarithm per lane
+  }
+  Vectorized<float> log10() const {
+    return Sleef_log10f8_u10(values); // base-10 logarithm per lane
+  }
+  Vectorized<float> log1p() const {
+    return Sleef_log1pf8_u10(values); // log(1 + x) per lane
+  }
+  Vectorized<float> frac() const;
+  Vectorized<float> sin() const {
+    return Sleef_sinf8_u35(values); // note the u35 variant, unlike most wrappers here
+  }
+  Vectorized<float> sinh() const {
+    return Sleef_sinhf8_u10(values); // hyperbolic sine per lane
+  }
+  Vectorized<float> cos() const {
+    return Sleef_cosf8_u35(values); // note the u35 variant, unlike most wrappers here
+  }
+  Vectorized<float> cosh() const {
+    return Sleef_coshf8_u10(values); // hyperbolic cosine per lane
+  }
+  Vectorized<float> ceil() const {
+    return Vectorized<float>(_mm256_ceil_ps(values)); // round each lane up
+  }
+  Vectorized<float> floor() const {
+    return Vectorized<float>(_mm256_floor_ps(values)); // round each lane down
+  }
+  Vectorized<float> hypot(const Vectorized<float>& b) const {
+    return Vectorized<float>(Sleef_hypotf8_u05(values, b));
+  }
+  Vectorized<float> i0() const {
+    return map(calc_i0);
+  }
+  // Applies the scalar `calc_i0e` to each lane through `map`.
+  Vectorized<float> i0e() const {
+    const auto mapped = map(calc_i0e);
+    return mapped;
+  }
+  // Applies the scalar `calc_digamma` to each lane through `map`.
+  Vectorized<float> digamma() const {
+    const auto mapped = map(calc_digamma);
+    return mapped;
+  }
+  // Evaluates `calc_igamma(self, x)` one lane at a time: both vectors are
+  // spilled to aligned scratch arrays, combined element-wise, then reloaded.
+  Vectorized<float> igamma(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (int lane = 0; lane < size(); ++lane) {
+      self_lanes[lane] = calc_igamma(self_lanes[lane], x_lanes[lane]);
+    }
+    return loadu(self_lanes);
+  }
+  // Evaluates `calc_igammac(self, x)` one lane at a time: both vectors are
+  // spilled to aligned scratch arrays, combined element-wise, then reloaded.
+  Vectorized<float> igammac(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (int lane = 0; lane < size(); ++lane) {
+      self_lanes[lane] = calc_igammac(self_lanes[lane], x_lanes[lane]);
+    }
+    return loadu(self_lanes);
+  }
+  // Negates every lane by XOR-ing with -0.0f, which flips only the sign bit.
+  Vectorized<float> neg() const {
+    const __m256 sign_bit = _mm256_set1_ps(-0.f);
+    return _mm256_xor_ps(sign_bit, values);
+  }
+  // Applies SLEEF's 8-lane nextafter to `values` and `b`.
+  Vectorized<float> nextafter(const Vectorized<float>& b) const {
+    const __m256 stepped = Sleef_nextafterf8(values, b);
+    return Vectorized<float>(stepped);
+  }
+  // Rounds each lane to the nearest integer value, with floating-point
+  // exceptions suppressed (`_MM_FROUND_NO_EXC`).
+  Vectorized<float> round() const {
+    const __m256 nearest = _mm256_round_ps(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return nearest;
+  }
+  // Tangent per lane via SLEEF's `_u10` variant.
+  Vectorized<float> tan() const {
+    const __m256 tangent = Sleef_tanf8_u10(values);
+    return Vectorized<float>(tangent);
+  }
+  // Hyperbolic tangent per lane via SLEEF's `_u10` variant.
+  Vectorized<float> tanh() const {
+    const __m256 result = Sleef_tanhf8_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Rounds each lane toward zero, with floating-point exceptions suppressed.
+  Vectorized<float> trunc() const {
+    const __m256 toward_zero =
+        _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return toward_zero;
+  }
+  // Log-gamma per lane via SLEEF's `_u10` variant.
+  Vectorized<float> lgamma() const {
+    const __m256 result = Sleef_lgammaf8_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Square root of every lane via `_mm256_sqrt_ps`.
+  Vectorized<float> sqrt() const {
+    const __m256 root = _mm256_sqrt_ps(values);
+    return root;
+  }
+  // 1 / x per lane using a full-precision divide.
+  Vectorized<float> reciprocal() const {
+    const __m256 one = _mm256_set1_ps(1);
+    return _mm256_div_ps(one, values);
+  }
+  // 1 / sqrt(x) per lane, computed as a full-precision sqrt followed by a
+  // full-precision divide.
+  Vectorized<float> rsqrt() const {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 root = _mm256_sqrt_ps(values);
+    return _mm256_div_ps(one, root);
+  }
+  // Raises each lane of `values` to the matching lane of `b` via SLEEF.
+  Vectorized<float> pow(const Vectorized<float>& b) const {
+    const __m256 powered = Sleef_powf8_u10(values, b);
+    return Vectorized<float>(powered);
+  }
+  // Horizontal sum of all eight lanes: three swap-and-add steps (128-bit
+  // halves, 64-bit pairs, 32-bit neighbours) leave the total in every lane,
+  // and lane 0 is returned.
+  float reduce_add() const {
+    // 128-bit shuffle
+    const __m256 halves_swapped = _mm256_permute2f128_ps(values, values, 0x1);
+    const __m256 sum_halves = _mm256_add_ps(values, halves_swapped);
+    // 64-bit shuffle
+    const __m256 pairs_swapped = _mm256_shuffle_ps(sum_halves, sum_halves, 0x4E);
+    const __m256 sum_pairs = _mm256_add_ps(sum_halves, pairs_swapped);
+    // 32-bit shuffle
+    const __m256 neighbours_swapped =
+        _mm256_shuffle_ps(sum_pairs, sum_pairs, 0xB1);
+    const __m256 total = _mm256_add_ps(sum_pairs, neighbours_swapped);
+    return _mm256_cvtss_f32(total);
+  }
+  // Horizontal maximum of all eight lanes: three swap-and-max steps (128-bit
+  // halves, 64-bit pairs, 32-bit neighbours), then lane 0 is returned.
+  float reduce_max() const {
+    // 128-bit shuffle
+    const __m256 halves_swapped = _mm256_permute2f128_ps(values, values, 0x1);
+    const __m256 max_halves = _mm256_max_ps(values, halves_swapped);
+    // 64-bit shuffle
+    const __m256 pairs_swapped = _mm256_shuffle_ps(max_halves, max_halves, 0x4E);
+    const __m256 max_pairs = _mm256_max_ps(max_halves, pairs_swapped);
+    // 32-bit shuffle
+    const __m256 neighbours_swapped =
+        _mm256_shuffle_ps(max_pairs, max_pairs, 0xB1);
+    const __m256 overall = _mm256_max_ps(max_pairs, neighbours_swapped);
+    return _mm256_cvtss_f32(overall);
+  }
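+  // Comparisons use the ordered, quiet `_CMP_**_OQ` predicates, except
+  // `operator!=`, which uses the unordered `_CMP_NEQ_UQ`.
+  //   `O`: get false if an operand is NaN (`U`: get true if an operand is NaN)
+  //   `Q`: do not raise if an operand is NaN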
+  // Lane mask: all-ones where equal; NaN operands compare false.
+  Vectorized<float> operator==(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
+    return mask;
+  }
+
+  // Lane mask: all-ones where not equal; the unordered predicate also makes
+  // lanes with a NaN operand compare true.
+  Vectorized<float> operator!=(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ);
+    return mask;
+  }
+
+  // Lane mask: all-ones where this < other; NaN operands compare false.
+  Vectorized<float> operator<(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_LT_OQ);
+    return mask;
+  }
+
+  // Lane mask: all-ones where this <= other; NaN operands compare false.
+  Vectorized<float> operator<=(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_LE_OQ);
+    return mask;
+  }
+
+  // Lane mask: all-ones where this > other; NaN operands compare false.
+  Vectorized<float> operator>(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_GT_OQ);
+    return mask;
+  }
+
+  // Lane mask: all-ones where this >= other; NaN operands compare false.
+  Vectorized<float> operator>=(const Vectorized<float>& other) const {
+    const __m256 mask = _mm256_cmp_ps(values, other.values, _CMP_GE_OQ);
+    return mask;
+  }
+
+  // Numeric comparisons, defined out of line after the class. Each one ANDs
+  // the matching operator's lane mask with 1.0f, so a lane holds 1.0f where
+  // the comparison is true and 0.0f where it is false.
+  Vectorized<float> eq(const Vectorized<float>& other) const;
+  Vectorized<float> ne(const Vectorized<float>& other) const;
+  Vectorized<float> gt(const Vectorized<float>& other) const;
+  Vectorized<float> ge(const Vectorized<float>& other) const;
+  Vectorized<float> lt(const Vectorized<float>& other) const;
+  Vectorized<float> le(const Vectorized<float>& other) const;
+};
+
+// Lane-wise float addition.
+template <>
+Vectorized<float> inline operator+(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 sum = _mm256_add_ps(a, b);
+  return sum;
+}
+
+// Lane-wise float subtraction (a - b).
+template <>
+Vectorized<float> inline operator-(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 difference = _mm256_sub_ps(a, b);
+  return difference;
+}
+
+// Lane-wise float multiplication.
+template <>
+Vectorized<float> inline operator*(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 product = _mm256_mul_ps(a, b);
+  return product;
+}
+
+// Lane-wise float division (a / b).
+template <>
+Vectorized<float> inline operator/(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 quotient = _mm256_div_ps(a, b);
+  return quotient;
+}
+
+// frac: x - trunc(x). Defined out of line so the operator- specialization
+// declared earlier in this file can be used.
+inline Vectorized<float> Vectorized<float>::frac() const {
+  const auto whole_part = this->trunc();
+  return *this - whole_part;
+}
+
+// IEEE 754-2019 `maximum`: the lane-wise larger value, with NaN propagated
+// whenever either input lane is NaN.
+template <>
+Vectorized<float> inline maximum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 larger = _mm256_max_ps(a, b);
+  // All-ones in every lane where a or b is NaN (unordered compare).
+  const __m256 unordered = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
+  // An all-ones bit pattern is itself a NaN, so OR-ing forces NaN there.
+  return _mm256_or_ps(larger, unordered);
+}
+
+// IEEE 754-2019 `minimum`: the lane-wise smaller value, with NaN propagated
+// whenever either input lane is NaN.
+template <>
+Vectorized<float> inline minimum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 smaller = _mm256_min_ps(a, b);
+  // All-ones in every lane where a or b is NaN (unordered compare).
+  const __m256 unordered = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
+  // An all-ones bit pattern is itself a NaN, so OR-ing forces NaN there.
+  return _mm256_or_ps(smaller, unordered);
+}
+
+// Clamps each lane of `a` into [min, max]: first raise to `min`, then cap at
+// `max`. The operand order of the two intrinsics is kept as-is.
+template <>
+Vectorized<float> inline clamp(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min,
+    const Vectorized<float>& max) {
+  const __m256 raised = _mm256_max_ps(min, a);
+  return _mm256_min_ps(max, raised);
+}
+
+// Caps each lane of `a` at the matching lane of `max`.
+template <>
+Vectorized<float> inline clamp_max(
+    const Vectorized<float>& a,
+    const Vectorized<float>& max) {
+  const __m256 capped = _mm256_min_ps(max, a);
+  return capped;
+}
+
+// Raises each lane of `a` to at least the matching lane of `min`.
+template <>
+Vectorized<float> inline clamp_min(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min) {
+  const __m256 raised = _mm256_max_ps(min, a);
+  return raised;
+}
+
+// Bitwise AND of the raw float bit patterns.
+template <>
+Vectorized<float> inline operator&(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 bits = _mm256_and_ps(a, b);
+  return bits;
+}
+
+// Bitwise OR of the raw float bit patterns.
+template <>
+Vectorized<float> inline operator|(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 bits = _mm256_or_ps(a, b);
+  return bits;
+}
+
+// Bitwise XOR of the raw float bit patterns.
+template <>
+Vectorized<float> inline operator^(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const __m256 bits = _mm256_xor_ps(a, b);
+  return bits;
+}
+
+// Numeric ==: 1.0f in lanes where equal, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::eq(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this == other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Numeric !=: 1.0f in lanes where operator!= is true, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::ne(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this != other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Numeric >: 1.0f in lanes where this > other, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::gt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this > other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Numeric >=: 1.0f in lanes where this >= other, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::ge(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this >= other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Numeric <: 1.0f in lanes where this < other, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::lt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this < other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Numeric <=: 1.0f in lanes where this <= other, 0.0f elsewhere.
+inline Vectorized<float> Vectorized<float>::le(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> mask = (*this <= other);
+  return mask & Vectorized<float>(1.0f);
+}
+
+// Copies `n` floats from `src` to `dst`: whole vectors first using unaligned
+// AVX loads/stores, then the remaining tail one element at a time.
+template <>
+inline void convert(const float* src, float* dst, int64_t n) {
+  constexpr auto kLanes = Vectorized<float>::size();
+  int64_t pos;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (pos = 0; pos <= (n - kLanes); pos += kLanes) {
+    const __m256 chunk = _mm256_loadu_ps(src + pos);
+    _mm256_storeu_ps(dst + pos, chunk);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos < n; pos++) {
+    dst[pos] = src[pos];
+  }
+}
+
+// Fused multiply-add: a * b + c per lane.
+template <>
+Vectorized<float> inline fmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const __m256 fused = _mm256_fmadd_ps(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-add: -(a * b) + c per lane.
+template <>
+Vectorized<float> inline fnmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const __m256 fused = _mm256_fnmadd_ps(a, b, c);
+  return fused;
+}
+
+// Fused multiply-subtract: a * b - c per lane.
+template <>
+Vectorized<float> inline fmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const __m256 fused = _mm256_fmsub_ps(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-subtract: -(a * b) - c per lane.
+template <>
+Vectorized<float> inline fnmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const __m256 fused = _mm256_fnmsub_ps(a, b, c);
+  return fused;
+}
+
+// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
+// Used by Inductor CPP codegen for micro gemm
+//
+// Transposes an 8x8 float tile held in eight registers, in place. `input[i]`
+// is row i on entry and column i of the original tile on return. The work is
+// done in three stages: interleave 32-bit elements of adjacent rows,
+// interleave 64-bit pairs, then recombine 128-bit halves. The layout diagrams
+// below name the rows a..h and show each stage's result.
+inline void transpose_block(at::vec::VectorizedN<float, 8>& input) {
+  __m256 temp0[8];
+  // unpacking and interleaving 32-bit elements
+  // a0  b0  a1  b1  a4  b4  a5  b5
+  // a2  b2  a3  b3  a6  b6  a7  b7
+  // c0  d0  c1  d1 ...
+  // c2  d2  c3  d3 ...
+  // e0  f0  e1  f1 ...
+  // e2  f2  e3  f3 ...
+  // g0  h0  g1  h1 ...
+  // g2  h2  g3  h3 ...
+  temp0[0] = _mm256_unpacklo_ps(input[0], input[1]);
+  temp0[1] = _mm256_unpackhi_ps(input[0], input[1]);
+  temp0[2] = _mm256_unpacklo_ps(input[2], input[3]);
+  temp0[3] = _mm256_unpackhi_ps(input[2], input[3]);
+  temp0[4] = _mm256_unpacklo_ps(input[4], input[5]);
+  temp0[5] = _mm256_unpackhi_ps(input[4], input[5]);
+  temp0[6] = _mm256_unpacklo_ps(input[6], input[7]);
+  temp0[7] = _mm256_unpackhi_ps(input[6], input[7]);
+
+  __m256 temp1[8];
+  // unpacking and interleaving 64-bit elements
+  // (the float registers are reinterpreted as doubles so that each unpack
+  // moves a pair of floats as one unit)
+  //  a0  b0  c0  d0  a4  b4  c4  d4
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  e0  f0  g0  h0  e4  f4  g4  h4
+  //  e1  f1  g1  h1 ...
+  //  e2  f2  g2  h2 ...
+  //  e3  f3  g3  h3 ...
+  temp1[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(
+      _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2])));
+  temp1[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(
+      _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2])));
+  temp1[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(
+      _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3])));
+  temp1[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(
+      _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3])));
+  temp1[4] = _mm256_castpd_ps(_mm256_unpacklo_pd(
+      _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6])));
+  temp1[5] = _mm256_castpd_ps(_mm256_unpackhi_pd(
+      _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6])));
+  temp1[6] = _mm256_castpd_ps(_mm256_unpacklo_pd(
+      _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7])));
+  temp1[7] = _mm256_castpd_ps(_mm256_unpackhi_pd(
+      _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7])));
+
+  //  shuffle 128-bits (composed of 4 32-bit elements)
+  //  (0x20 joins the low 128-bit halves of the two operands, 0x31 joins
+  //  their high halves)
+  //  a0  b0  c0  d0  e0  f0  g0  h0
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  a4  b4  c4  d4 ...
+  //  a5  b5  c5  d5 ...
+  //  a6  b6  c6  d6 ...
+  //  a7  b7  c7  d7 ...
+  input[0] = _mm256_permute2f128_ps(temp1[0], temp1[4], 0x20);
+  input[1] = _mm256_permute2f128_ps(temp1[1], temp1[5], 0x20);
+  input[2] = _mm256_permute2f128_ps(temp1[2], temp1[6], 0x20);
+  input[3] = _mm256_permute2f128_ps(temp1[3], temp1[7], 0x20);
+  input[4] = _mm256_permute2f128_ps(temp1[0], temp1[4], 0x31);
+  input[5] = _mm256_permute2f128_ps(temp1[1], temp1[5], 0x31);
+  input[6] = _mm256_permute2f128_ps(temp1[2], temp1[6], 0x31);
+  input[7] = _mm256_permute2f128_ps(temp1[3], temp1[7], 0x31);
+}
+
+// Used by Inductor CPP codegen
+// Transposes one 8x8 float tile: loads eight rows spaced `ld_src` elements
+// apart, transposes them in registers, and stores eight rows spaced `ld_dst`
+// elements apart.
+template <>
+inline void transpose_mxn<float, 8, 8>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  // One register per source row (a..h), eight floats each.
+  at::vec::VectorizedN<float, 8> rows;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm256_loadu_ps(&src[r * ld_src]);
+  }
+
+  transpose_block(rows);
+
+  // After the in-register transpose, rows[r] holds source column r.
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (int r = 0; r < 8; ++r) {
+    _mm256_storeu_ps(&dst[r * ld_dst], rows[r]);
+  }
+}
+
+// Transposes a 16x16 float tile as four 8x8 tiles: the quadrant at block
+// (row, col) of `src` is transposed into block (col, row) of `dst`.
+template <>
+inline void transpose_mxn<float, 16, 16>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  for (int block_row = 0; block_row < 2; ++block_row) {
+    for (int block_col = 0; block_col < 2; ++block_col) {
+      const float* src_tile = src + block_row * 8 * ld_src + block_col * 8;
+      float* dst_tile = dst + block_col * 8 * ld_dst + block_row * 8;
+      transpose_mxn<float, 8, 8>(src_tile, ld_src, dst_tile, ld_dst);
+    }
+  }
+}
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5d95b014801a22c7eec6b9295baa51a66f0fd2c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_half.h
@@ -0,0 +1,285 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/vec256/vec256_16bit_float.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#ifdef CPU_CAPABILITY_AVX2
+
+// Trait specialization: reports `true` for Half, matching the explicit
+// Vectorized<Half> specialization that follows under CPU_CAPABILITY_AVX2.
+template <>
+struct is_vec_specialized_for<Half> : std::bool_constant<true> {};
+
+// AVX2 specialization for Half. It derives from Vectorized16<Half> and adds
+// only frac() and the numeric comparison helpers declared here.
+template <>
+class Vectorized<Half> : public Vectorized16<Half> {
+ public:
+  // Inherit every Vectorized16 constructor.
+  using Vectorized16::Vectorized16;
+
+  using value_type = Half;
+
+  // x - trunc(x); defined out of line below, once operator- is declared.
+  Vectorized<Half> frac() const;
+
+  // Numeric comparisons, defined out of line below: each ANDs the matching
+  // comparison operator's result with Vectorized<Half>(1.0f).
+  Vectorized<Half> eq(const Vectorized<Half>& other) const;
+  Vectorized<Half> ne(const Vectorized<Half>& other) const;
+  Vectorized<Half> gt(const Vectorized<Half>& other) const;
+  Vectorized<Half> ge(const Vectorized<Half>& other) const;
+  Vectorized<Half> lt(const Vectorized<Half>& other) const;
+  Vectorized<Half> le(const Vectorized<Half>& other) const;
+};
+
+// Half addition, delegated to binary_op_as_fp32 with an fp32 add.
+Vectorized<Half> inline operator+(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const auto add_fp32 = [](const __m256& x, const __m256& y) {
+    return _mm256_add_ps(x, y);
+  };
+  return binary_op_as_fp32(a, b, add_fp32);
+}
+// Half subtraction, delegated to binary_op_as_fp32 with an fp32 subtract.
+Vectorized<Half> inline operator-(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const auto sub_fp32 = [](const __m256& x, const __m256& y) {
+    return _mm256_sub_ps(x, y);
+  };
+  return binary_op_as_fp32(a, b, sub_fp32);
+}
+// Half multiplication, delegated to binary_op_as_fp32 with an fp32 multiply.
+Vectorized<Half> inline operator*(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const auto mul_fp32 = [](const __m256& x, const __m256& y) {
+    return _mm256_mul_ps(x, y);
+  };
+  return binary_op_as_fp32(a, b, mul_fp32);
+}
+// Half division, delegated to binary_op_as_fp32 with an fp32 divide.
+Vectorized<Half> inline operator/(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const auto div_fp32 = [](const __m256& x, const __m256& y) {
+    return _mm256_div_ps(x, y);
+  };
+  return binary_op_as_fp32(a, b, div_fp32);
+}
+// Bitwise AND over the raw 256-bit register.
+Vectorized<Half> inline operator&(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const __m256i bits = _mm256_and_si256(a, b);
+  return bits;
+}
+// Bitwise OR over the raw 256-bit register.
+Vectorized<Half> inline operator|(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const __m256i bits = _mm256_or_si256(a, b);
+  return bits;
+}
+// Bitwise XOR over the raw 256-bit register.
+Vectorized<Half> inline operator^(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  const __m256i bits = _mm256_xor_si256(a, b);
+  return bits;
+}
+
+// Numeric ==: the operator== result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::eq(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this == other);
+  return mask & Vectorized<Half>(1.0f);
+}
+// Numeric !=: the operator!= result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::ne(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this != other);
+  return mask & Vectorized<Half>(1.0f);
+}
+// Numeric >: the operator> result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::gt(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this > other);
+  return mask & Vectorized<Half>(1.0f);
+}
+// Numeric >=: the operator>= result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::ge(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this >= other);
+  return mask & Vectorized<Half>(1.0f);
+}
+// Numeric <: the operator< result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::lt(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this < other);
+  return mask & Vectorized<Half>(1.0f);
+}
+// Numeric <=: the operator<= result AND-ed with a vector of 1.0.
+inline Vectorized<Half> Vectorized<Half>::le(
+    const Vectorized<Half>& other) const {
+  const auto mask = (*this <= other);
+  return mask & Vectorized<Half>(1.0f);
+}
+
+// frac: x - trunc(x). Defined out of line so the Half operator- declared
+// earlier in this file can be used.
+inline Vectorized<Half> Vectorized<Half>::frac() const {
+  const auto whole_part = this->trunc();
+  return *this - whole_part;
+}
+
+// IEEE 754-2019 `maximum` for Half: computed in fp32 on the two halves of
+// the vector, with NaN propagated whenever either input lane is NaN.
+template <>
+Vectorized<Half> inline maximum(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  __m256 lhs_lo, lhs_hi;
+  __m256 rhs_lo, rhs_hi;
+  cvtfp16_fp32(__m256i(a), lhs_lo, lhs_hi);
+  cvtfp16_fp32(__m256i(b), rhs_lo, rhs_hi);
+  const __m256 larger_lo = _mm256_max_ps(lhs_lo, rhs_lo);
+  const __m256 larger_hi = _mm256_max_ps(lhs_hi, rhs_hi);
+  // All-ones wherever either operand lane is NaN.
+  const __m256 unordered_lo = _mm256_cmp_ps(lhs_lo, rhs_lo, _CMP_UNORD_Q);
+  const __m256 unordered_hi = _mm256_cmp_ps(lhs_hi, rhs_hi, _CMP_UNORD_Q);
+  // An all-ones bit pattern is itself a NaN, so OR-ing forces NaN there.
+  const __m256 out_lo = _mm256_or_ps(larger_lo, unordered_lo);
+  const __m256 out_hi = _mm256_or_ps(larger_hi, unordered_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// IEEE 754-2019 `minimum` for Half: computed in fp32 on the two halves of
+// the vector, with NaN propagated whenever either input lane is NaN.
+template <>
+Vectorized<Half> inline minimum(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  __m256 lhs_lo, lhs_hi;
+  __m256 rhs_lo, rhs_hi;
+  cvtfp16_fp32(__m256i(a), lhs_lo, lhs_hi);
+  cvtfp16_fp32(__m256i(b), rhs_lo, rhs_hi);
+  const __m256 smaller_lo = _mm256_min_ps(lhs_lo, rhs_lo);
+  const __m256 smaller_hi = _mm256_min_ps(lhs_hi, rhs_hi);
+  // All-ones wherever either operand lane is NaN.
+  const __m256 unordered_lo = _mm256_cmp_ps(lhs_lo, rhs_lo, _CMP_UNORD_Q);
+  const __m256 unordered_hi = _mm256_cmp_ps(lhs_hi, rhs_hi, _CMP_UNORD_Q);
+  // An all-ones bit pattern is itself a NaN, so OR-ing forces NaN there.
+  const __m256 out_lo = _mm256_or_ps(smaller_lo, unordered_lo);
+  const __m256 out_hi = _mm256_or_ps(smaller_hi, unordered_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// Clamps each Half lane of `a` into [min, max], computing in fp32 on the two
+// halves of the vector: raise to `min` first, then cap at `max`.
+template <>
+Vectorized<Half> inline clamp(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& min,
+    const Vectorized<Half>& max) {
+  __m256 val_lo, val_hi;
+  __m256 lower_lo, lower_hi;
+  __m256 upper_lo, upper_hi;
+  cvtfp16_fp32(__m256i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m256i(min), lower_lo, lower_hi);
+  cvtfp16_fp32(__m256i(max), upper_lo, upper_hi);
+  const __m256 raised_lo = _mm256_max_ps(lower_lo, val_lo);
+  const __m256 raised_hi = _mm256_max_ps(lower_hi, val_hi);
+  const __m256 out_lo = _mm256_min_ps(upper_lo, raised_lo);
+  const __m256 out_hi = _mm256_min_ps(upper_hi, raised_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// Caps each Half lane of `a` at the matching lane of `max`, computing in
+// fp32 on the two halves of the vector.
+template <>
+Vectorized<Half> inline clamp_max(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& max) {
+  __m256 val_lo, val_hi;
+  __m256 upper_lo, upper_hi;
+  cvtfp16_fp32(__m256i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m256i(max), upper_lo, upper_hi);
+  const __m256 out_lo = _mm256_min_ps(upper_lo, val_lo);
+  const __m256 out_hi = _mm256_min_ps(upper_hi, val_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// Raises each Half lane of `a` to at least the matching lane of `min`,
+// computing in fp32 on the two halves of the vector.
+template <>
+Vectorized<Half> inline clamp_min(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& min) {
+  __m256 val_lo, val_hi;
+  __m256 lower_lo, lower_hi;
+  cvtfp16_fp32(__m256i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m256i(min), lower_lo, lower_hi);
+  const __m256 out_lo = _mm256_max_ps(lower_lo, val_lo);
+  const __m256 out_hi = _mm256_max_ps(lower_hi, val_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// Copies `n` Half values from `src` to `dst`: whole 256-bit vectors first
+// using unaligned loads/stores, then the remaining tail element by element.
+template <>
+inline void convert(const Half* src, Half* dst, int64_t n) {
+  constexpr auto kLanes = Vectorized<Half>::size();
+  int64_t pos;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (pos = 0; pos <= (n - kLanes); pos += kLanes) {
+    const __m256i chunk =
+        _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + pos)));
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + pos)), chunk);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos < n; pos++) {
+    dst[pos] = src[pos];
+  }
+}
+
+// Converts `n` floats to Half. Each vector iteration loads two groups of
+// eight floats, packs them with cvtfp32_fp16 and stores one Half vector;
+// leftover elements go through the scalar c10::convert.
+template <>
+inline void convert(const float* src, Half* dst, int64_t n) {
+  constexpr auto kLanes = Vectorized<Half>::size();
+  int64_t pos = 0;
+  for (; pos + kLanes <= n; pos += kLanes) {
+    const __m256 first_eight = _mm256_loadu_ps(&src[pos]);
+    const __m256 second_eight = _mm256_loadu_ps(&src[pos + 8]);
+
+    const __m256i packed = cvtfp32_fp16(first_eight, second_eight);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[pos]), packed);
+  }
+  for (; pos < n; pos++) {
+    dst[pos] = c10::convert<Half>(src[pos]);
+  }
+}
+
+// Converts `n` doubles to Half. Each vector iteration narrows sixteen doubles
+// to two float vectors, packs them with cvtfp32_fp16 and stores one Half
+// vector; leftover elements go through the scalar c10::convert.
+template <>
+inline void convert(const double* src, Half* dst, int64_t n) {
+  // Builds one 8-float vector from eight consecutive doubles: two 4-wide
+  // double->float conversions joined as the low and high 128-bit halves.
+  const auto narrow_eight = [](const double* from) -> __m256 {
+    const __m128 low_four = _mm256_cvtpd_ps(_mm256_loadu_pd(from));
+    const __m128 high_four = _mm256_cvtpd_ps(_mm256_loadu_pd(from + 4));
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(low_four), high_four, 1);
+  };
+
+  constexpr auto kLanes = Vectorized<Half>::size();
+  int64_t pos = 0;
+  for (; pos + kLanes <= n; pos += kLanes) {
+    const __m256 first_eight = narrow_eight(&src[pos]);
+    const __m256 second_eight = narrow_eight(&src[pos + 8]);
+
+    const __m256i packed = cvtfp32_fp16(first_eight, second_eight);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[pos]), packed);
+  }
+  for (; pos < n; pos++) {
+    dst[pos] = c10::convert<Half>(src[pos]);
+  }
+}
+
+// Fused multiply-add for Half (a * b + c), computed in fp32 on the two
+// halves of the vector and converted back to Half once at the end.
+template <>
+Vectorized<Half> inline fmadd(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b,
+    const Vectorized<Half>& c) {
+  __m256 mul_lhs_lo, mul_lhs_hi;
+  __m256 mul_rhs_lo, mul_rhs_hi;
+  __m256 addend_lo, addend_hi;
+  cvtfp16_fp32(__m256i(a), mul_lhs_lo, mul_lhs_hi);
+  cvtfp16_fp32(__m256i(b), mul_rhs_lo, mul_rhs_hi);
+  cvtfp16_fp32(__m256i(c), addend_lo, addend_hi);
+  const __m256 out_lo = _mm256_fmadd_ps(mul_lhs_lo, mul_rhs_lo, addend_lo);
+  const __m256 out_hi = _mm256_fmadd_ps(mul_lhs_hi, mul_rhs_hi, addend_hi);
+  return cvtfp32_fp16(out_lo, out_hi);
+}
+
+// AVX2 path: instantiate the vectorized Half<->float helpers.
+// NOTE(review): both macros are defined outside this file (presumably in
+// vec256_16bit_float.h) — confirm exactly which functions they generate.
+CONVERT_VECTORIZED_INIT(Half, half)
+LOAD_FP32_VECTORIZED_INIT(Half, fp16)
+
+#else // defined(CPU_CAPABILITY_AVX2)
+
+#if !(                                                                      \
+    defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
+    !defined(CPU_CAPABILITY_SVE256))
+CONVERT_NON_VECTORIZED_INIT(Half, half)
+#endif
+
+LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
+#endif // defined(CPU_CAPABILITY_AVX2)
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb2866dfc45192365a6d31495ccfdfe9fe5c1a98
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h
@@ -0,0 +1,2327 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#ifdef CPU_CAPABILITY_AVX2
+
+// Common base of the AVX2 integer Vectorized specializations in this file:
+// it owns the raw 256-bit integer register and converts implicitly back to
+// `__m256i`.
+struct Vectorizedi {
+ protected:
+  __m256i values;
+
+  // Bitwise NOT: XOR with an all-ones register flips every bit of `v`.
+  static inline __m256i invert(const __m256i& v) {
+    return _mm256_xor_si256(_mm256_set1_epi64x(-1), v);
+  }
+
+ public:
+  // Default construction zeroes all 256 bits.
+  Vectorizedi() : values(_mm256_setzero_si256()) {}
+  Vectorizedi(__m256i v) : values(v) {}
+  operator __m256i() const {
+    return values;
+  }
+};
+
+#else
+
+// Non-AVX2 builds: empty placeholder so the name Vectorizedi is always
+// defined.
+struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined
+
+#endif // CPU_CAPABILITY_AVX2
+
+#ifdef CPU_CAPABILITY_AVX2
+
+// Trait specialization: reports `true` for int64_t, matching the explicit
+// Vectorized<int64_t> specialization that follows.
+template <>
+struct is_vec_specialized_for<int64_t> : std::bool_constant<true> {};
+
+// AVX2 specialization holding four int64_t lanes in one __m256i (the
+// register itself lives in the Vectorizedi base as `values`).
+template <>
+class Vectorized<int64_t> : public Vectorizedi {
+ private:
+  // NOTE(review): declared only; no use of `ones` is visible in this class.
+  static const Vectorized<int64_t> ones;
+
+ public:
+  using value_type = int64_t;
+  using size_type = int;
+  // Number of int64_t lanes in the vector.
+  static constexpr size_type size() {
+    return 4;
+  }
+  using Vectorizedi::Vectorizedi;
+  // Default construction zeroes all lanes.
+  Vectorized() {
+    values = _mm256_setzero_si256();
+  }
+  // Broadcasts `v` to every lane.
+  Vectorized(int64_t v) {
+    values = _mm256_set1_epi64x(v);
+  }
+  // val1 goes to lane 0, val4 to lane 3 (the `setr` variant takes arguments
+  // in memory order).
+  Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) {
+    values = _mm256_setr_epi64x(val1, val2, val3, val4);
+  }
+  // Compile-time blend: lane i comes from `b` when bit i of `mask` is set,
+  // otherwise from `a`. Implemented through a scratch array rather than a
+  // single blend intrinsic.
+  template <int64_t mask>
+  static Vectorized<int64_t> blend(
+      Vectorized<int64_t> a,
+      Vectorized<int64_t> b) {
+    __at_align__ int64_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi64(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi64(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi64(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi64(b.values, 3);
+    return loadu(tmp_values);
+  }
+  // Run-time blend. `_mm256_blendv_epi8` selects per byte on each mask
+  // byte's top bit, so this presumably expects every mask lane to be
+  // all-ones or all-zeros (e.g. a comparison result) — verify against callers.
+  static Vectorized<int64_t> blendv(
+      const Vectorized<int64_t>& a,
+      const Vectorized<int64_t>& b,
+      const Vectorized<int64_t>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  // Returns {base, base + step, base + 2 * step, base + 3 * step}.
+  template <typename step_t>
+  static Vectorized<int64_t> arange(
+      int64_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int64_t>(
+        base, base + step, base + 2 * step, base + 3 * step);
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`. Any `count` outside 0..3 returns `b`.
+  static Vectorized<int64_t> set(
+      Vectorized<int64_t> a,
+      Vectorized<int64_t> b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  // Unaligned load of a full vector (four int64_t values) from `ptr`.
+  static Vectorized<int64_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  // Partial load: copies `count` elements from `ptr` into a scratch array
+  // pre-filled with 1, then loads the whole array. `count` is not checked
+  // against size() here.
+  static Vectorized<int64_t> loadu(const void* ptr, int64_t count) {
+    __at_align__ int64_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to one using "={1}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 1;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
+    return loadu(tmp_values);
+  }
+  // Stores the first `count` lanes to `ptr`. A full-width store goes straight
+  // to memory; a partial one goes through a scratch array. Nothing is written
+  // when `count` is zero or negative.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr does not need to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int64_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
+    }
+  }
+  // Per-lane indexing is deliberately unavailable.
+  const int64_t& operator[](int idx) const = delete;
+  int64_t& operator[](int idx) = delete;
+  // Absolute value via two's complement: `is_larger` is all-ones (-1) in
+  // negative lanes, so XOR flips their bits and subtracting -1 adds one;
+  // non-negative lanes pass through unchanged.
+  Vectorized<int64_t> abs() const {
+    auto zero = _mm256_set1_epi64x(0);
+    auto is_larger = _mm256_cmpgt_epi64(zero, values);
+    auto inverse = _mm256_xor_si256(values, is_larger);
+    return _mm256_sub_epi64(inverse, is_larger);
+  }
+  // Complex-number accessors for a real type: real() and conj() are the
+  // identity and imag() is zero.
+  Vectorized<int64_t> real() const {
+    return *this;
+  }
+  Vectorized<int64_t> imag() const {
+    return _mm256_set1_epi64x(0);
+  }
+  Vectorized<int64_t> conj() const {
+    return *this;
+  }
+  // Declared here; defined outside this class body.
+  Vectorized<int64_t> neg() const;
+  // Comparison operators return a lane mask. Only equality and signed
+  // greater-than intrinsics are used; the other four are built by swapping
+  // operands and/or inverting the result with invert().
+  Vectorized<int64_t> operator==(const Vectorized<int64_t>& other) const {
+    return _mm256_cmpeq_epi64(values, other.values);
+  }
+  Vectorized<int64_t> operator!=(const Vectorized<int64_t>& other) const {
+    return invert(_mm256_cmpeq_epi64(values, other.values));
+  }
+  Vectorized<int64_t> operator<(const Vectorized<int64_t>& other) const {
+    return _mm256_cmpgt_epi64(other.values, values);
+  }
+  Vectorized<int64_t> operator<=(const Vectorized<int64_t>& other) const {
+    return invert(_mm256_cmpgt_epi64(values, other.values));
+  }
+  Vectorized<int64_t> operator>(const Vectorized<int64_t>& other) const {
+    return _mm256_cmpgt_epi64(values, other.values);
+  }
+  Vectorized<int64_t> operator>=(const Vectorized<int64_t>& other) const {
+    return invert(_mm256_cmpgt_epi64(other.values, values));
+  }
+
+  // Named comparison helpers; declared here, defined outside this class body.
+  Vectorized<int64_t> eq(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> ne(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> gt(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> ge(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> lt(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> le(const Vectorized<int64_t>& other) const;
+};
+
+// Trait specialization: reports `true` for int32_t, matching the explicit
+// Vectorized<int32_t> specialization that follows.
+template <>
+struct is_vec_specialized_for<int32_t> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<int32_t> : public Vectorizedi {
+ private:
+  static const Vectorized<int32_t> ones;
+
+ public:
+  using value_type = int32_t;
+  static constexpr int size() {
+    return 8;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int32_t v) {
+    values = _mm256_set1_epi32(v);
+  }
+  Vectorized(
+      int32_t val1,
+      int32_t val2,
+      int32_t val3,
+      int32_t val4,
+      int32_t val5,
+      int32_t val6,
+      int32_t val7,
+      int32_t val8) {
+    values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  template <int64_t mask>
+  static Vectorized<int32_t> blend(
+      Vectorized<int32_t> a,
+      Vectorized<int32_t> b) {
+    return _mm256_blend_epi32(a, b, mask);
+  }
+  static Vectorized<int32_t> blendv(
+      const Vectorized<int32_t>& a,
+      const Vectorized<int32_t>& b,
+      const Vectorized<int32_t>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<int32_t> arange(
+      int32_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int32_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized<int32_t> set(
+      Vectorized<int32_t> a,
+      Vectorized<int32_t> b,
+      int32_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<int32_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  // Partial load: copies `count` int32_t elements from `ptr` into the low
+  // lanes; every lane that is not overwritten holds 1.
+  static Vectorized<int32_t> loadu(const void* ptr, int32_t count) {
+    __at_align__ int32_t padded[size()];
+    // Pre-fill so that uninitialized memory can never influence the result
+    // (see https://github.com/pytorch/pytorch/issues/32502). An explicit loop
+    // is used instead of an aggregate initializer because, per the original
+    // note, gcc compiles the loop to fewer instructions.
+    for (int lane = 0; lane < size(); ++lane) {
+      padded[lane] = 1;
+    }
+    const auto nbytes = count * sizeof(int32_t);
+    std::memcpy(padded, ptr, nbytes);
+    return loadu(padded);
+  }
+  // Writes the first `count` lanes to `ptr`. A full-width store goes straight
+  // to memory; a partial one is staged through a local buffer. Nothing is
+  // written when `count` is zero or negative.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // _mm256_storeu_si256 has no alignment requirement on ptr.
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+      return;
+    }
+    if (count <= 0) {
+      return;
+    }
+    __at_align__ int32_t staged[size()];
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(staged), values);
+    std::memcpy(ptr, staged, count * sizeof(int32_t));
+  }
+  // Element access by index is deliberately disabled (both overloads are
+  // deleted); lanes have to be read back through store().
+  const int32_t& operator[](int idx) const = delete;
+  int32_t& operator[](int idx) = delete;
+  // Lane-wise absolute value via _mm256_abs_epi32.
+  Vectorized<int32_t> abs() const {
+    const __m256i magnitudes = _mm256_abs_epi32(values);
+    return magnitudes;
+  }
+  // Real part of an integer vector: a copy of the vector itself.
+  Vectorized<int32_t> real() const {
+    return Vectorized<int32_t>(*this);
+  }
+  // Imaginary part of an integer vector: every lane is 0.
+  Vectorized<int32_t> imag() const {
+    const __m256i zeros = _mm256_set1_epi32(0);
+    return zeros;
+  }
+  // Complex conjugate of an integer vector: a copy of the vector itself.
+  Vectorized<int32_t> conj() const {
+    return Vectorized<int32_t>(*this);
+  }
+  // Lane-wise negation. Only declared here; the definition appears later in
+  // the file, after operator- is available.
+  Vectorized<int32_t> neg() const;
+  // Horizontal sum of all eight lanes, computed as a three-step tree: fold
+  // the two 128-bit halves, then 64-bit pairs, then adjacent 32-bit lanes.
+  // The total ends up in every lane; lane 0 is returned.
+  int32_t reduce_add() const {
+    // Swap the 128-bit halves and add.
+    const __m256i halves_swapped = _mm256_permute2f128_si256(values, values, 0x1);
+    const __m256i sum128 = _mm256_add_epi32(values, halves_swapped);
+    // Swap 64-bit pairs inside each half and add.
+    const __m256i sum64 =
+        _mm256_add_epi32(sum128, _mm256_shuffle_epi32(sum128, 0x4E));
+    // Swap neighbouring 32-bit lanes and add.
+    const __m256i sum32 =
+        _mm256_add_epi32(sum64, _mm256_shuffle_epi32(sum64, 0xB1));
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
+  }
+  // Horizontal signed maximum of all eight lanes, using the same three-step
+  // fold as reduce_add (128-bit halves, 64-bit pairs, 32-bit neighbours).
+  // The maximum ends up in every lane; lane 0 is returned.
+  int32_t reduce_max() const {
+    // Swap the 128-bit halves and take the max.
+    const __m256i halves_swapped = _mm256_permute2f128_si256(values, values, 0x1);
+    const __m256i max128 = _mm256_max_epi32(values, halves_swapped);
+    // Swap 64-bit pairs inside each half and take the max.
+    const __m256i max64 =
+        _mm256_max_epi32(max128, _mm256_shuffle_epi32(max128, 0x4E));
+    // Swap neighbouring 32-bit lanes and take the max.
+    const __m256i max32 =
+        _mm256_max_epi32(max64, _mm256_shuffle_epi32(max64, 0xB1));
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(max32));
+  }
+  // Lane-wise equality; returns the mask produced by _mm256_cmpeq_epi32.
+  Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi32(values, other.values);
+    return same;
+  }
+  // Lane-wise inequality: the equality mask passed through invert().
+  Vectorized<int32_t> operator!=(const Vectorized<int32_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi32(values, other.values);
+    return invert(same);
+  }
+  // Lane-wise signed less-than, expressed as "other > this".
+  Vectorized<int32_t> operator<(const Vectorized<int32_t>& other) const {
+    const __m256i other_greater = _mm256_cmpgt_epi32(other.values, values);
+    return other_greater;
+  }
+  // Lane-wise signed less-or-equal, expressed as "not (this > other)".
+  Vectorized<int32_t> operator<=(const Vectorized<int32_t>& other) const {
+    const __m256i self_greater = _mm256_cmpgt_epi32(values, other.values);
+    return invert(self_greater);
+  }
+  // Lane-wise signed greater-than via _mm256_cmpgt_epi32.
+  Vectorized<int32_t> operator>(const Vectorized<int32_t>& other) const {
+    const __m256i self_greater = _mm256_cmpgt_epi32(values, other.values);
+    return self_greater;
+  }
+  // Lane-wise signed greater-or-equal, expressed as "not (other > this)".
+  Vectorized<int32_t> operator>=(const Vectorized<int32_t>& other) const {
+    const __m256i other_greater = _mm256_cmpgt_epi32(other.values, values);
+    return invert(other_greater);
+  }
+  // Named comparison helpers. They are only declared here; the definitions
+  // are outside this part of the file.
+  // NOTE(review): presumably these differ from the operators above in the
+  // value they put in each lane (e.g. 0/1 instead of an all-ones mask) --
+  // confirm against the definitions.
+  Vectorized<int32_t> eq(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> ne(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> gt(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> ge(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> lt(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> le(const Vectorized<int32_t>& other) const;
+};
+
+// Converts `n` int32_t values at `src` to float at `dst`. Full vectors are
+// converted with _mm256_cvtepi32_ps (int32_t and float have the same size, so
+// one load feeds one store); the leftover tail is converted element by
+// element.
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  const int64_t lanes = Vectorized<int32_t>::size();
+  // Last start index at which a whole vector still fits (negative if none).
+  const int64_t last_full = n - lanes;
+  int64_t pos = 0;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; pos <= last_full; pos += lanes) {
+    const __m256i ints =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + pos));
+    _mm256_storeu_ps(dst + pos, _mm256_cvtepi32_ps(ints));
+  }
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; pos < n; ++pos) {
+    dst[pos] = static_cast<float>(src[pos]);
+  }
+}
+
+// Converts `n` int32_t values at `src` to double at `dst`. int32_t is half
+// the size of double, so each step loads 128 bits of input and widens them
+// with _mm256_cvtepi32_pd into one 256-bit store; the leftover tail is
+// converted element by element.
+template <>
+inline void convert(const int32_t* src, double* dst, int64_t n) {
+  const int64_t lanes = Vectorized<double>::size();
+  // Last start index at which a whole vector still fits (negative if none).
+  const int64_t last_full = n - lanes;
+  int64_t pos = 0;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; pos <= last_full; pos += lanes) {
+    const __m128i ints =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + pos));
+    _mm256_storeu_pd(dst + pos, _mm256_cvtepi32_pd(ints));
+  }
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; pos < n; ++pos) {
+    dst[pos] = static_cast<double>(src[pos]);
+  }
+}
+
+// Trait flag: a dedicated Vectorized specialization exists for int16_t.
+template <>
+struct is_vec_specialized_for<int16_t> : std::true_type {};
+
+// AVX2 specialization of Vectorized for int16_t. size() reports 16 lanes and
+// every operation below works on `values` through _mm256_* intrinsics.
+// `values` and `invert` are not declared in this class; presumably they are
+// inherited from Vectorizedi -- confirm against its definition.
+template <>
+class Vectorized<int16_t> : public Vectorizedi {
+ private:
+  // Declared here and defined elsewhere; not referenced in this class body.
+  static const Vectorized<int16_t> ones;
+
+ public:
+  using value_type = int16_t;
+  // Number of int16_t lanes per vector.
+  static constexpr int size() {
+    return 16;
+  }
+  using Vectorizedi::Vectorizedi;
+  // Default constructor: does not assign `values` itself.
+  Vectorized() {}
+  // Broadcasts `v` to all 16 lanes.
+  Vectorized(int16_t v) {
+    values = _mm256_set1_epi16(v);
+  }
+  // Lane-by-lane constructor. _mm256_setr_epi16 takes its arguments in
+  // memory order, so val1 is lane 0 and val16 is lane 15.
+  Vectorized(
+      int16_t val1,
+      int16_t val2,
+      int16_t val3,
+      int16_t val4,
+      int16_t val5,
+      int16_t val6,
+      int16_t val7,
+      int16_t val8,
+      int16_t val9,
+      int16_t val10,
+      int16_t val11,
+      int16_t val12,
+      int16_t val13,
+      int16_t val14,
+      int16_t val15,
+      int16_t val16) {
+    values = _mm256_setr_epi16(
+        val1,
+        val2,
+        val3,
+        val4,
+        val5,
+        val6,
+        val7,
+        val8,
+        val9,
+        val10,
+        val11,
+        val12,
+        val13,
+        val14,
+        val15,
+        val16);
+  }
+  // Compile-time lane selection: lane i of the result is taken from `b` when
+  // bit i of `mask` is set and from `a` otherwise. Implemented by spilling
+  // `a` to a temporary array, overwriting the selected lanes with values
+  // extracted from `b`, and reloading.
+  template <int64_t mask>
+  static Vectorized<int16_t> blend(
+      Vectorized<int16_t> a,
+      Vectorized<int16_t> b) {
+    __at_align__ int16_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi16(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi16(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi16(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi16(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi16(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi16(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi16(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi16(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi16(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi16(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi16(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi16(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi16(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi16(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi16(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi16(b.values, 15);
+    return loadu(tmp_values);
+  }
+  // Run-time select driven by a mask vector. _mm256_blendv_epi8 works per
+  // byte (the top bit of each mask byte picks `b`, per the Intel intrinsics
+  // guide), so `mask` is presumably expected to be uniform within each
+  // 16-bit lane, as the comparison operators below produce.
+  static Vectorized<int16_t> blendv(
+      const Vectorized<int16_t>& a,
+      const Vectorized<int16_t>& b,
+      const Vectorized<int16_t>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  // Builds the ramp base, base + step, ..., base + 15 * step. Each lane
+  // expression is converted to int16_t when passed to the constructor.
+  template <typename step_t>
+  static Vectorized<int16_t> arange(
+      int16_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int16_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`. Each blend mask has its low `count` bits
+  // set. Any `count` outside 0..15 (including the default) returns `b`.
+  static Vectorized<int16_t> set(
+      Vectorized<int16_t> a,
+      Vectorized<int16_t> b,
+      int16_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
+  // Loads a full 256-bit vector from `ptr` with the unaligned load intrinsic.
+  static Vectorized<int16_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  // Partial load: copies `count` int16_t elements from `ptr` into the low
+  // lanes; every lane that is not overwritten holds 1.
+  static Vectorized<int16_t> loadu(const void* ptr, int16_t count) {
+    __at_align__ int16_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to one using "={1}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 1;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
+    return loadu(tmp_values);
+  }
+  // Writes the first `count` lanes to `ptr`. A full-width store goes straight
+  // to memory; a partial one is staged through a local buffer. Nothing is
+  // written when `count` is zero or negative.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int16_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
+    }
+  }
+  // Element access by index is deliberately disabled.
+  const int16_t& operator[](int idx) const = delete;
+  int16_t& operator[](int idx) = delete;
+  // Lane-wise absolute value via _mm256_abs_epi16.
+  Vectorized<int16_t> abs() const {
+    return _mm256_abs_epi16(values);
+  }
+  // Complex-number helpers for an integer vector: real() and conj() return
+  // the vector itself, imag() returns all zeros.
+  Vectorized<int16_t> real() const {
+    return *this;
+  }
+  Vectorized<int16_t> imag() const {
+    return _mm256_set1_epi16(0);
+  }
+  Vectorized<int16_t> conj() const {
+    return *this;
+  }
+  // Lane-wise negation; declared here, defined later in the file.
+  Vectorized<int16_t> neg() const;
+  // Comparison operators. Each returns the per-lane mask produced by the
+  // signed 16-bit compare intrinsics; !=, <= and >= pass the complementary
+  // compare through invert().
+  Vectorized<int16_t> operator==(const Vectorized<int16_t>& other) const {
+    return _mm256_cmpeq_epi16(values, other.values);
+  }
+  Vectorized<int16_t> operator!=(const Vectorized<int16_t>& other) const {
+    return invert(_mm256_cmpeq_epi16(values, other.values));
+  }
+  Vectorized<int16_t> operator<(const Vectorized<int16_t>& other) const {
+    return _mm256_cmpgt_epi16(other.values, values);
+  }
+  Vectorized<int16_t> operator<=(const Vectorized<int16_t>& other) const {
+    return invert(_mm256_cmpgt_epi16(values, other.values));
+  }
+  Vectorized<int16_t> operator>(const Vectorized<int16_t>& other) const {
+    return _mm256_cmpgt_epi16(values, other.values);
+  }
+  Vectorized<int16_t> operator>=(const Vectorized<int16_t>& other) const {
+    return invert(_mm256_cmpgt_epi16(other.values, values));
+  }
+
+  // Named comparison helpers; only declared here, definitions are outside
+  // this class. NOTE(review): presumably they differ from the operators in
+  // the value placed in each lane -- confirm against the definitions.
+  Vectorized<int16_t> eq(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> ne(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> gt(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> ge(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> lt(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> le(const Vectorized<int16_t>& other) const;
+};
+
+// Shared AVX2 implementation for the two 8-bit element types. It provides
+// construction, blend/blendv, arange, set, load/store and the trivial
+// complex helpers for 32 lanes; Vectorized<int8_t> and Vectorized<uint8_t>
+// derive from it. `values` is not declared here; presumably it is inherited
+// from Vectorizedi -- confirm against its definition.
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+      "Only int8_t/uint8_t are supported");
+
+ protected:
+  // Declared here and defined elsewhere; not referenced in this class body.
+  static const Vectorized<T> ones;
+
+ public:
+  using value_type = T;
+  // Number of 8-bit lanes per vector.
+  static constexpr int size() {
+    return 32;
+  }
+  using Vectorizedi::Vectorizedi;
+  // Default constructor: does not assign `values` itself.
+  Vectorized8() {}
+  // Broadcasts `v` to all 32 lanes.
+  Vectorized8(T v) {
+    values = _mm256_set1_epi8(v);
+  }
+  // Lane-by-lane constructor. _mm256_setr_epi8 takes its arguments in memory
+  // order, so val1 is lane 0 and val32 is lane 31.
+  Vectorized8(
+      T val1,
+      T val2,
+      T val3,
+      T val4,
+      T val5,
+      T val6,
+      T val7,
+      T val8,
+      T val9,
+      T val10,
+      T val11,
+      T val12,
+      T val13,
+      T val14,
+      T val15,
+      T val16,
+      T val17,
+      T val18,
+      T val19,
+      T val20,
+      T val21,
+      T val22,
+      T val23,
+      T val24,
+      T val25,
+      T val26,
+      T val27,
+      T val28,
+      T val29,
+      T val30,
+      T val31,
+      T val32) {
+    values = _mm256_setr_epi8(
+        val1,
+        val2,
+        val3,
+        val4,
+        val5,
+        val6,
+        val7,
+        val8,
+        val9,
+        val10,
+        val11,
+        val12,
+        val13,
+        val14,
+        val15,
+        val16,
+        val17,
+        val18,
+        val19,
+        val20,
+        val21,
+        val22,
+        val23,
+        val24,
+        val25,
+        val26,
+        val27,
+        val28,
+        val29,
+        val30,
+        val31,
+        val32);
+  }
+  // Compile-time lane selection: lane i of the result is taken from `b` when
+  // bit i of `mask` is set and from `a` otherwise. Implemented by spilling
+  // `a` to a temporary array, overwriting the selected lanes with bytes
+  // extracted from `b`, and reloading.
+  template <int64_t mask>
+  static Vectorized<T> blend(Vectorized<T> a, Vectorized<T> b) {
+    __at_align__ T tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi8(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi8(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi8(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi8(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi8(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi8(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi8(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi8(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi8(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi8(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi8(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi8(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi8(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi8(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi8(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi8(b.values, 15);
+    if (mask & 0x010000)
+      tmp_values[16] = _mm256_extract_epi8(b.values, 16);
+    if (mask & 0x020000)
+      tmp_values[17] = _mm256_extract_epi8(b.values, 17);
+    if (mask & 0x040000)
+      tmp_values[18] = _mm256_extract_epi8(b.values, 18);
+    if (mask & 0x080000)
+      tmp_values[19] = _mm256_extract_epi8(b.values, 19);
+    if (mask & 0x100000)
+      tmp_values[20] = _mm256_extract_epi8(b.values, 20);
+    if (mask & 0x200000)
+      tmp_values[21] = _mm256_extract_epi8(b.values, 21);
+    if (mask & 0x400000)
+      tmp_values[22] = _mm256_extract_epi8(b.values, 22);
+    if (mask & 0x800000)
+      tmp_values[23] = _mm256_extract_epi8(b.values, 23);
+    if (mask & 0x1000000)
+      tmp_values[24] = _mm256_extract_epi8(b.values, 24);
+    if (mask & 0x2000000)
+      tmp_values[25] = _mm256_extract_epi8(b.values, 25);
+    if (mask & 0x4000000)
+      tmp_values[26] = _mm256_extract_epi8(b.values, 26);
+    if (mask & 0x8000000)
+      tmp_values[27] = _mm256_extract_epi8(b.values, 27);
+    if (mask & 0x10000000)
+      tmp_values[28] = _mm256_extract_epi8(b.values, 28);
+    if (mask & 0x20000000)
+      tmp_values[29] = _mm256_extract_epi8(b.values, 29);
+    if (mask & 0x40000000)
+      tmp_values[30] = _mm256_extract_epi8(b.values, 30);
+    if (mask & 0x80000000)
+      tmp_values[31] = _mm256_extract_epi8(b.values, 31);
+    return loadu(tmp_values);
+  }
+  // Run-time select driven by a mask vector: _mm256_blendv_epi8 picks the
+  // byte from `b` where the top bit of the matching mask byte is set (per
+  // the Intel intrinsics guide) and from `a` otherwise.
+  static Vectorized<T> blendv(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      const Vectorized<T>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  // Builds the ramp base, base + step, ..., base + 31 * step. Each lane
+  // expression is converted to T when passed to the constructor.
+  template <typename step_t>
+  static Vectorized<T> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step,
+        base + 16 * step,
+        base + 17 * step,
+        base + 18 * step,
+        base + 19 * step,
+        base + 20 * step,
+        base + 21 * step,
+        base + 22 * step,
+        base + 23 * step,
+        base + 24 * step,
+        base + 25 * step,
+        base + 26 * step,
+        base + 27 * step,
+        base + 28 * step,
+        base + 29 * step,
+        base + 30 * step,
+        base + 31 * step);
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`. Each blend mask has its low `count` bits
+  // set. Any `count` outside 0..31 (including the default) returns `b`.
+  static Vectorized<T> set(Vectorized<T> a, Vectorized<T> b, T count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+    }
+    return b;
+  }
+  // Loads a full 256-bit vector from `ptr` with the unaligned load intrinsic.
+  static Vectorized<T> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  // Loads only the first 8 elements (64 bits) from `ptr` into the low lanes.
+  static Vectorized<T> loadu_one_fourth(const void* ptr) {
+    // Fast path for loading exactly 8 elements.
+    // Note: this is not merged into loadu(const void* ptr, T count) as a
+    // fast path, because that overload gives every lane a defined value,
+    // whereas _mm256_castsi128_si256 leaves the upper 128 bits of the result
+    // undefined.
+    // NOTE(review): the original wording said the counted loadu "requires
+    // zero initialization"; the code below pads with 1, not 0 -- confirm
+    // which is intended.
+    // TODO<leslie> We can use _mm256_zextsi128_si256 in the future,
+    // since gcc 9.3 doesn't support it now.
+    __m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
+    return _mm256_castsi128_si256(input_128);
+  }
+  // Partial load: copies `count` elements from `ptr` into the low lanes;
+  // every lane that is not overwritten holds 1.
+  static Vectorized<T> loadu(const void* ptr, T count) {
+    __at_align__ T tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to one using "={1}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 1;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(T));
+    return loadu(tmp_values);
+  }
+  // Writes the first `count` lanes to `ptr`. A full-width store goes straight
+  // to memory, count == 8 uses a single 64-bit store, and any other positive
+  // count is staged through a local buffer. Nothing is written when `count`
+  // is zero or negative.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      if (count == 8) {
+        // Fast path for storing exactly 8 elements: one 64-bit store of the
+        // low lanes.
+        _mm_storel_epi64(
+            reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values));
+      } else {
+        __at_align__ T tmp_values[size()];
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+        std::memcpy(ptr, tmp_values, count * sizeof(T));
+      }
+    }
+  }
+  // Element access by index is deliberately disabled.
+  const T& operator[](int idx) const = delete;
+  T& operator[](int idx) = delete;
+  // Complex-number helpers for an integer vector: real() and conj() return
+  // the vector itself, imag() returns all zeros.
+  Vectorized<T> real() const {
+    return *this;
+  }
+  Vectorized<T> imag() const {
+    return _mm256_set1_epi8(0);
+  }
+  Vectorized<T> conj() const {
+    return *this;
+  }
+};
+
+// Trait flag: a dedicated Vectorized specialization exists for int8_t.
+template <>
+struct is_vec_specialized_for<int8_t> : std::true_type {};
+
+// AVX2 Vectorized specialization for signed 8-bit lanes. Construction,
+// load/store, blend and arange come from Vectorized8<int8_t>; this class
+// adds the sign-aware pieces: neg, abs and the comparison operators.
+template <>
+class Vectorized<int8_t> : public Vectorized8<int8_t> {
+ public:
+  using Vectorized8::Vectorized8;
+
+  // Lane-wise negation; declared here, defined later in the file.
+  Vectorized<int8_t> neg() const;
+
+  // Lane-wise absolute value via _mm256_abs_epi8.
+  Vectorized<int8_t> abs() const {
+    const __m256i magnitudes = _mm256_abs_epi8(values);
+    return magnitudes;
+  }
+
+  // Comparison operators. Each returns the per-lane mask produced by the
+  // signed byte compare intrinsics; the negated forms go through invert().
+  Vectorized<int8_t> operator==(const Vectorized<int8_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi8(values, other.values);
+    return same;
+  }
+  Vectorized<int8_t> operator!=(const Vectorized<int8_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi8(values, other.values);
+    return invert(same);
+  }
+  Vectorized<int8_t> operator<(const Vectorized<int8_t>& other) const {
+    const __m256i other_greater = _mm256_cmpgt_epi8(other.values, values);
+    return other_greater;
+  }
+  Vectorized<int8_t> operator<=(const Vectorized<int8_t>& other) const {
+    const __m256i self_greater = _mm256_cmpgt_epi8(values, other.values);
+    return invert(self_greater);
+  }
+  Vectorized<int8_t> operator>(const Vectorized<int8_t>& other) const {
+    // Same computation as `other < *this`.
+    const __m256i self_greater = _mm256_cmpgt_epi8(values, other.values);
+    return self_greater;
+  }
+  Vectorized<int8_t> operator>=(const Vectorized<int8_t>& other) const {
+    // Same computation as `other <= *this`.
+    const __m256i other_greater = _mm256_cmpgt_epi8(other.values, values);
+    return invert(other_greater);
+  }
+
+  // Named comparison helpers; only declared here, definitions are outside
+  // this class.
+  Vectorized<int8_t> eq(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> ne(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> gt(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> ge(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> lt(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
+};
+
+// Trait flag: a dedicated Vectorized specialization exists for uint8_t.
+template <>
+struct is_vec_specialized_for<uint8_t> : std::true_type {};
+
+// AVX2 Vectorized specialization for unsigned 8-bit lanes. Construction,
+// load/store, blend and arange come from Vectorized8<uint8_t>; this class
+// adds neg, abs and the unsigned comparison operators. The ordering
+// comparisons are built from _mm256_max_epu8 plus an equality test.
+template <>
+class Vectorized<uint8_t> : public Vectorized8<uint8_t> {
+ public:
+  using Vectorized8::Vectorized8;
+
+  // Lane-wise negation; declared here, defined later in the file.
+  Vectorized<uint8_t> neg() const;
+
+  // Unsigned values are their own absolute value.
+  Vectorized<uint8_t> abs() const {
+    return Vectorized<uint8_t>(*this);
+  }
+
+  Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi8(values, other.values);
+    return same;
+  }
+  Vectorized<uint8_t> operator!=(const Vectorized<uint8_t>& other) const {
+    const __m256i same = _mm256_cmpeq_epi8(values, other.values);
+    return invert(same);
+  }
+  Vectorized<uint8_t> operator<(const Vectorized<uint8_t>& other) const {
+    // this < other  <=>  max(this, other) != this
+    const __m256i larger = _mm256_max_epu8(values, other.values);
+    const __m256i self_is_max = _mm256_cmpeq_epi8(larger, values);
+    return invert(self_is_max);
+  }
+  Vectorized<uint8_t> operator<=(const Vectorized<uint8_t>& other) const {
+    // this <= other  <=>  max(this, other) == other
+    const __m256i larger = _mm256_max_epu8(values, other.values);
+    const __m256i other_is_max = _mm256_cmpeq_epi8(larger, other.values);
+    return other_is_max;
+  }
+  Vectorized<uint8_t> operator>(const Vectorized<uint8_t>& other) const {
+    // Same computation as `other < *this`: max(other, this) != other.
+    const __m256i larger = _mm256_max_epu8(other.values, values);
+    const __m256i other_is_max = _mm256_cmpeq_epi8(larger, other.values);
+    return invert(other_is_max);
+  }
+  Vectorized<uint8_t> operator>=(const Vectorized<uint8_t>& other) const {
+    // Same computation as `other <= *this`: max(other, this) == this.
+    const __m256i larger = _mm256_max_epu8(other.values, values);
+    const __m256i self_is_max = _mm256_cmpeq_epi8(larger, values);
+    return self_is_max;
+  }
+
+  // Named comparison helpers; only declared here, definitions are outside
+  // this class.
+  Vectorized<uint8_t> eq(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ne(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> gt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ge(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> lt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> le(const Vectorized<uint8_t>& other) const;
+};
+
+// Lane-wise 64-bit addition via _mm256_add_epi64.
+template <>
+inline Vectorized<int64_t> operator+(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi64(lhs, rhs);
+}
+
+// Lane-wise 32-bit addition via _mm256_add_epi32.
+template <>
+inline Vectorized<int32_t> operator+(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi32(lhs, rhs);
+}
+
+// Lane-wise 16-bit addition via _mm256_add_epi16.
+template <>
+inline Vectorized<int16_t> operator+(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi16(lhs, rhs);
+}
+
+// Lane-wise 8-bit addition via _mm256_add_epi8.
+template <>
+inline Vectorized<int8_t> operator+(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi8(lhs, rhs);
+}
+
+// Lane-wise 8-bit addition via _mm256_add_epi8 (the same intrinsic as the
+// signed specialization).
+template <>
+inline Vectorized<uint8_t> operator+(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi8(lhs, rhs);
+}
+
+// Lane-wise 64-bit subtraction (a - b) via _mm256_sub_epi64.
+template <>
+inline Vectorized<int64_t> operator-(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m256i minuend = a;
+  const __m256i subtrahend = b;
+  return _mm256_sub_epi64(minuend, subtrahend);
+}
+
+// Lane-wise 32-bit subtraction (a - b) via _mm256_sub_epi32.
+template <>
+inline Vectorized<int32_t> operator-(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i minuend = a;
+  const __m256i subtrahend = b;
+  return _mm256_sub_epi32(minuend, subtrahend);
+}
+
+// Lane-wise 16-bit subtraction (a - b) via _mm256_sub_epi16.
+template <>
+inline Vectorized<int16_t> operator-(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m256i minuend = a;
+  const __m256i subtrahend = b;
+  return _mm256_sub_epi16(minuend, subtrahend);
+}
+
+// Lane-wise 8-bit subtraction (a - b) via _mm256_sub_epi8.
+template <>
+inline Vectorized<int8_t> operator-(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m256i minuend = a;
+  const __m256i subtrahend = b;
+  return _mm256_sub_epi8(minuend, subtrahend);
+}
+
+// Lane-wise 8-bit subtraction (a - b) via _mm256_sub_epi8 (the same
+// intrinsic as the signed specialization).
+template <>
+inline Vectorized<uint8_t> operator-(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m256i minuend = a;
+  const __m256i subtrahend = b;
+  return _mm256_sub_epi8(minuend, subtrahend);
+}
+
+// Negation. The neg() members are defined at this point so that they can be
+// written in terms of operator-: each one computes 0 - *this lane-wise.
+inline Vectorized<int64_t> Vectorized<int64_t>::neg() const {
+  const Vectorized<int64_t> zero(0);
+  return zero - *this;
+}
+
+// Lane-wise negation, computed as 0 - *this.
+inline Vectorized<int32_t> Vectorized<int32_t>::neg() const {
+  const Vectorized<int32_t> zero(0);
+  return zero - *this;
+}
+
+// Lane-wise negation, computed as 0 - *this.
+inline Vectorized<int16_t> Vectorized<int16_t>::neg() const {
+  const Vectorized<int16_t> zero(0);
+  return zero - *this;
+}
+
+// Lane-wise negation, computed as 0 - *this.
+inline Vectorized<int8_t> Vectorized<int8_t>::neg() const {
+  const Vectorized<int8_t> zero(0);
+  return zero - *this;
+}
+
+// Lane-wise negation, computed as 0 - *this with 8-bit subtraction.
+inline Vectorized<uint8_t> Vectorized<uint8_t>::neg() const {
+  const Vectorized<uint8_t> zero(0);
+  return zero - *this;
+}
+
+// Emulates a binary 64-bit operation that AVX has no native instruction for:
+// the four lanes of each operand are extracted, `op` is applied to each pair
+// of scalars in lane order 0..3, and the four results are packed back into
+// a vector in the same lane order.
+template <typename op_t>
+Vectorized<int64_t> inline emulate(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b,
+    const op_t& op) {
+  int64_t lhs[4];
+  lhs[0] = _mm256_extract_epi64(a, 0);
+  lhs[1] = _mm256_extract_epi64(a, 1);
+  lhs[2] = _mm256_extract_epi64(a, 2);
+  lhs[3] = _mm256_extract_epi64(a, 3);
+
+  int64_t rhs[4];
+  rhs[0] = _mm256_extract_epi64(b, 0);
+  rhs[1] = _mm256_extract_epi64(b, 1);
+  rhs[2] = _mm256_extract_epi64(b, 2);
+  rhs[3] = _mm256_extract_epi64(b, 3);
+
+  int64_t out[4];
+  out[0] = op(lhs[0], rhs[0]);
+  out[1] = op(lhs[1], rhs[1]);
+  out[2] = op(lhs[2], rhs[2]);
+  out[3] = op(lhs[3], rhs[3]);
+
+  // _mm256_set_epi64x takes the highest lane first.
+  return _mm256_set_epi64x(out[3], out[2], out[1], out[0]);
+}
+
+// Three-operand variant of emulate: extracts the four 64-bit lanes of `a`,
+// `b` and `c`, applies `op` to each triple of scalars in lane order 0..3,
+// and packs the four results back into a vector in the same lane order.
+template <typename op_t>
+Vectorized<int64_t> inline emulate(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b,
+    const Vectorized<int64_t>& c,
+    const op_t& op) {
+  int64_t first[4];
+  first[0] = _mm256_extract_epi64(a, 0);
+  first[1] = _mm256_extract_epi64(a, 1);
+  first[2] = _mm256_extract_epi64(a, 2);
+  first[3] = _mm256_extract_epi64(a, 3);
+
+  int64_t second[4];
+  second[0] = _mm256_extract_epi64(b, 0);
+  second[1] = _mm256_extract_epi64(b, 1);
+  second[2] = _mm256_extract_epi64(b, 2);
+  second[3] = _mm256_extract_epi64(b, 3);
+
+  int64_t third[4];
+  third[0] = _mm256_extract_epi64(c, 0);
+  third[1] = _mm256_extract_epi64(c, 1);
+  third[2] = _mm256_extract_epi64(c, 2);
+  third[3] = _mm256_extract_epi64(c, 3);
+
+  int64_t out[4];
+  out[0] = op(first[0], second[0], third[0]);
+  out[1] = op(first[1], second[1], third[1]);
+  out[2] = op(first[2], second[2], third[2]);
+  out[3] = op(first[3], second[3], third[3]);
+
+  // _mm256_set_epi64x takes the highest lane first.
+  return _mm256_set_epi64x(out[3], out[2], out[1], out[0]);
+}
+
+// AVX2 has no intrinsic for int64_t multiply, so the product is computed
+// one lane at a time through emulate(). It could be implemented more
+// efficiently with epi32 instructions. This is also technically AVX
+// compatible, but then AVX code would be needed for add as well.
+// Note: undefined behavior such as (-lowest * -1) is intentionally ignored;
+// the scalar multiply is marked __ubsan_ignore_undefined__.
+template <>
+inline Vectorized<int64_t> operator*(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const auto scalar_mul =
+      [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ {
+        return a_point * b_point;
+      };
+  return emulate(a, b, scalar_mul);
+}
+
+// Lane-wise 32-bit multiply via _mm256_mullo_epi32, which keeps the low
+// 32 bits of each product.
+template <>
+inline Vectorized<int32_t> operator*(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_mullo_epi32(lhs, rhs);
+}
+
+// Lane-wise 16-bit multiply via _mm256_mullo_epi16, which keeps the low
+// 16 bits of each product.
+template <>
+inline Vectorized<int16_t> operator*(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_mullo_epi16(lhs, rhs);
+}
+
+// Scalar fallback for a lane-wise binary operation: spills both vectors to
+// arrays, applies `op` to each pair of elements in lane order, and reloads
+// the results as a vector.
+template <typename T, typename Op>
+Vectorized<T> inline int_elementwise_binary_256(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  constexpr auto lanes = Vectorized<T>::size();
+  T lhs[lanes];
+  T rhs[lanes];
+  a.store(lhs);
+  b.store(rhs);
+  // Results are written back into lhs, which is then reloaded.
+  for (int lane = 0; lane < lanes; ++lane) {
+    lhs[lane] = op(lhs[lane], rhs[lane]);
+  }
+  return Vectorized<T>::loadu(lhs);
+}
+
+// Lane-wise int8_t multiply. There is no 8-bit multiply instruction, so with
+// CPU_CAPABILITY_AVX2 each 16-bit slot is treated as two bytes: both bytes
+// are sign-extended to 16 bits, multiplied with _mm256_mullo_epi16, and the
+// low 8 bits of each product are recombined. Without CPU_CAPABILITY_AVX2 the
+// scalar fallback is used.
+template <>
+inline Vectorized<int8_t> operator*(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+#ifdef CPU_CAPABILITY_AVX2
+  const __m256i low_byte_mask = _mm256_set1_epi16(0x00FF);
+  // Even (low) bytes: shift up, then arithmetic shift down to sign-extend.
+  const __m256i lhs_even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8);
+  const __m256i rhs_even = _mm256_srai_epi16(_mm256_slli_epi16(b, 8), 8);
+  // Odd (high) bytes: arithmetic shift down sign-extends them in place.
+  const __m256i lhs_odd = _mm256_srai_epi16(a, 8);
+  const __m256i rhs_odd = _mm256_srai_epi16(b, 8);
+  const __m256i prod_even = _mm256_and_si256(
+      _mm256_mullo_epi16(lhs_even, rhs_even), low_byte_mask);
+  const __m256i prod_odd =
+      _mm256_slli_epi16(_mm256_mullo_epi16(lhs_odd, rhs_odd), 8);
+  return _mm256_or_si256(prod_odd, prod_even);
+#else
+  return int_elementwise_binary_256(a, b, std::multiplies<int8_t>());
+#endif
+}
+
+// Lane-wise uint8_t multiply. There is no 8-bit multiply instruction, so
+// with CPU_CAPABILITY_AVX2 each 16-bit slot is treated as two bytes: both
+// bytes are zero-extended to 16 bits, multiplied with _mm256_mullo_epi16,
+// and the low 8 bits of each product are recombined. Without
+// CPU_CAPABILITY_AVX2 the scalar fallback is used.
+template <>
+inline Vectorized<uint8_t> operator*(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+#ifdef CPU_CAPABILITY_AVX2
+  const __m256i low_byte_mask = _mm256_set1_epi16(0x00FF);
+  // Even (low) bytes: mask off the high byte.
+  const __m256i lhs_even = _mm256_and_si256(a, low_byte_mask);
+  const __m256i rhs_even = _mm256_and_si256(b, low_byte_mask);
+  // Odd (high) bytes: logical shift down zero-extends them in place.
+  const __m256i lhs_odd = _mm256_srli_epi16(a, 8);
+  const __m256i rhs_odd = _mm256_srli_epi16(b, 8);
+  const __m256i prod_even = _mm256_and_si256(
+      _mm256_mullo_epi16(lhs_even, rhs_even), low_byte_mask);
+  const __m256i prod_odd =
+      _mm256_slli_epi16(_mm256_mullo_epi16(lhs_odd, rhs_odd), 8);
+  return _mm256_or_si256(prod_odd, prod_even);
+#else
+  return int_elementwise_binary_256(a, b, std::multiplies<uint8_t>());
+#endif
+}
+
+// Lane-wise signed 64-bit minimum. With CPU_CAPABILITY_AVX2 it compares
+// a > b and takes `b` in exactly those lanes; otherwise it falls back to
+// std::min applied per lane through emulate().
+template <>
+inline Vectorized<int64_t> minimum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+#ifdef CPU_CAPABILITY_AVX2
+  const __m256i a_is_greater = _mm256_cmpgt_epi64(a, b);
+  return _mm256_blendv_epi8(a, b, a_is_greater);
+#else
+  const auto scalar_min = [](int64_t a_point, int64_t b_point) {
+    return std::min(a_point, b_point);
+  };
+  return emulate(a, b, scalar_min);
+#endif
+}
+
+// Lane-wise minimum, signed 32-bit lanes.
+template <>
+Vectorized<int32_t> inline minimum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i smaller = _mm256_min_epi32(a, b);
+  return smaller;
+}
+
+// Lane-wise minimum, signed 16-bit lanes.
+template <>
+Vectorized<int16_t> inline minimum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m256i smaller = _mm256_min_epi16(a, b);
+  return smaller;
+}
+
+// Lane-wise minimum, signed 8-bit lanes.
+template <>
+Vectorized<int8_t> inline minimum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m256i smaller = _mm256_min_epi8(a, b);
+  return smaller;
+}
+
+// Lane-wise minimum, unsigned 8-bit lanes.
+template <>
+Vectorized<uint8_t> inline minimum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m256i smaller = _mm256_min_epu8(a, b);
+  return smaller;
+}
+
+// Lane-wise maximum of two int64_t vectors.
+template <>
+Vectorized<int64_t> inline maximum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(
+      a, b, [](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); });
+#else
+  // Lanes where a > b are selected from a; all other lanes keep b.
+  const __m256i a_is_greater = _mm256_cmpgt_epi64(a, b);
+  return _mm256_blendv_epi8(b, a, a_is_greater);
+#endif
+}
+
+// Lane-wise maximum, signed 32-bit lanes.
+template <>
+Vectorized<int32_t> inline maximum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i larger = _mm256_max_epi32(a, b);
+  return larger;
+}
+
+// Lane-wise maximum, signed 16-bit lanes.
+template <>
+Vectorized<int16_t> inline maximum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m256i larger = _mm256_max_epi16(a, b);
+  return larger;
+}
+
+// Lane-wise maximum, signed 8-bit lanes.
+template <>
+Vectorized<int8_t> inline maximum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m256i larger = _mm256_max_epi8(a, b);
+  return larger;
+}
+
+// Lane-wise maximum, unsigned 8-bit lanes.
+template <>
+Vectorized<uint8_t> inline maximum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m256i larger = _mm256_max_epu8(a, b);
+  return larger;
+}
+
+// Lane-wise clamp of int64_t values: raise each lane to at least min_val,
+// then cap it at max_val.
+template <>
+Vectorized<int64_t> inline clamp(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min_val,
+    const Vectorized<int64_t>& max_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(
+      a, min_val, max_val, [](int64_t value, int64_t lo, int64_t hi) {
+        const int64_t floored = std::max(value, lo);
+        return std::min(hi, floored);
+      });
+#else
+  const auto floored = maximum(a, min_val);
+  return minimum(floored, max_val);
+#endif
+}
+
+// Lane-wise clamp, signed 32-bit lanes: max with min_val, then min with
+// max_val.
+template <>
+Vectorized<int32_t> inline clamp(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min_val,
+    const Vectorized<int32_t>& max_val) {
+  const __m256i floored = _mm256_max_epi32(a, min_val);
+  return _mm256_min_epi32(max_val, floored);
+}
+
+// Lane-wise clamp, signed 16-bit lanes.
+template <>
+Vectorized<int16_t> inline clamp(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min_val,
+    const Vectorized<int16_t>& max_val) {
+  const __m256i floored = _mm256_max_epi16(a, min_val);
+  return _mm256_min_epi16(max_val, floored);
+}
+
+// Lane-wise clamp, signed 8-bit lanes.
+template <>
+Vectorized<int8_t> inline clamp(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min_val,
+    const Vectorized<int8_t>& max_val) {
+  const __m256i floored = _mm256_max_epi8(a, min_val);
+  return _mm256_min_epi8(max_val, floored);
+}
+
+// Lane-wise clamp, unsigned 8-bit lanes.
+template <>
+Vectorized<uint8_t> inline clamp(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min_val,
+    const Vectorized<uint8_t>& max_val) {
+  const __m256i floored = _mm256_max_epu8(a, min_val);
+  return _mm256_min_epu8(max_val, floored);
+}
+
+// Caps every int64_t lane of a at the matching lane of max_val.
+template <>
+Vectorized<int64_t> inline clamp_max(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& max_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, max_val, [](int64_t value, int64_t hi) {
+    return std::min(hi, value);
+  });
+#else
+  const auto capped = minimum(max_val, a);
+  return capped;
+#endif
+}
+
+// Caps every signed 32-bit lane of a at the matching lane of max_val.
+template <>
+Vectorized<int32_t> inline clamp_max(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& max_val) {
+  const __m256i capped = _mm256_min_epi32(max_val, a);
+  return capped;
+}
+
+// Caps every signed 16-bit lane of a at the matching lane of max_val.
+template <>
+Vectorized<int16_t> inline clamp_max(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& max_val) {
+  const __m256i capped = _mm256_min_epi16(max_val, a);
+  return capped;
+}
+
+// Caps every signed 8-bit lane of a at the matching lane of max_val.
+template <>
+Vectorized<int8_t> inline clamp_max(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& max_val) {
+  const __m256i capped = _mm256_min_epi8(max_val, a);
+  return capped;
+}
+
+// Caps every unsigned 8-bit lane of a at the matching lane of max_val.
+template <>
+Vectorized<uint8_t> inline clamp_max(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& max_val) {
+  const __m256i capped = _mm256_min_epu8(max_val, a);
+  return capped;
+}
+
+// Raises every int64_t lane of a to at least the matching lane of min_val.
+template <>
+Vectorized<int64_t> inline clamp_min(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, min_val, [](int64_t value, int64_t lo) {
+    return std::max(lo, value);
+  });
+#else
+  const auto floored = maximum(min_val, a);
+  return floored;
+#endif
+}
+
+// Raises every signed 32-bit lane of a to at least the matching lane of
+// min_val.
+template <>
+Vectorized<int32_t> inline clamp_min(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min_val) {
+  const __m256i floored = _mm256_max_epi32(min_val, a);
+  return floored;
+}
+
+// Raises every signed 16-bit lane of a to at least the matching lane of
+// min_val.
+template <>
+Vectorized<int16_t> inline clamp_min(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min_val) {
+  const __m256i floored = _mm256_max_epi16(min_val, a);
+  return floored;
+}
+
+// Raises every signed 8-bit lane of a to at least the matching lane of
+// min_val.
+template <>
+Vectorized<int8_t> inline clamp_min(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min_val) {
+  const __m256i floored = _mm256_max_epi8(min_val, a);
+  return floored;
+}
+
+// Raises every unsigned 8-bit lane of a to at least the matching lane of
+// min_val.
+template <>
+Vectorized<uint8_t> inline clamp_min(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min_val) {
+  const __m256i floored = _mm256_max_epu8(min_val, a);
+  return floored;
+}
+
+// Overload for every T other than int8_t / uint8_t: forwards the pointer
+// and element count to Vectorized<int32_t>::loadu.
+template <typename T>
+std::enable_if_t<
+    !(std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>),
+    Vectorized<int32_t>> inline
+convert_to_int32(const T* ptr, int count = Vectorized<int32_t>::size()) {
+  const auto loaded = Vectorized<int32_t>::loadu(ptr, count);
+  return loaded;
+}
+
+// int8_t overload: sign-extends up to Vectorized<int32_t>::size() bytes
+// read from ptr into 32-bit lanes.
+template <typename T>
+std::enable_if_t<std::is_same_v<T, int8_t>, Vectorized<int32_t>> inline
+convert_to_int32(const int8_t* ptr, int count = Vectorized<int32_t>::size()) {
+  if (count != Vectorized<int32_t>::size()) {
+    // Partial load: go through the counted Vectorized<int8_t> loader, then
+    // widen the low 128 bits of what it returned.
+    auto partial = Vectorized<int8_t>::loadu(ptr, count);
+    return _mm256_cvtepi8_epi32(_mm256_castsi256_si128(partial));
+  }
+  // Full load: read exactly 64 bits (8 bytes) and widen them.
+  const __m128i packed =
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
+  return _mm256_cvtepi8_epi32(packed);
+}
+
+// uint8_t overload: zero-extends up to Vectorized<int32_t>::size() bytes
+// read from ptr into 32-bit lanes.
+template <typename T>
+std::enable_if_t<std::is_same_v<T, uint8_t>, Vectorized<int32_t>> inline
+convert_to_int32(const uint8_t* ptr, int count = Vectorized<int32_t>::size()) {
+  if (count != Vectorized<int32_t>::size()) {
+    // Partial load: go through the counted Vectorized<uint8_t> loader, then
+    // widen the low 128 bits of what it returned.
+    auto partial = Vectorized<uint8_t>::loadu(ptr, count);
+    return _mm256_cvtepu8_epi32(_mm256_castsi256_si128(partial));
+  }
+  // Full load: read exactly 64 bits (8 bytes) and widen them.
+  const __m128i packed =
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
+  return _mm256_cvtepu8_epi32(packed);
+}
+
+// Integer division is delegated to int_elementwise_binary_256 with a
+// std::divides functor for each element type.
+template <>
+Vectorized<int64_t> inline operator/(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<int64_t>{});
+}
+
+template <>
+Vectorized<int32_t> inline operator/(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<int32_t>{});
+}
+
+template <>
+Vectorized<int16_t> inline operator/(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<int16_t>{});
+}
+
+template <>
+Vectorized<int8_t> inline operator/(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<int8_t>{});
+}
+
+template <>
+Vectorized<uint8_t> inline operator/(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<uint8_t>{});
+}
+
+// Bitwise AND over the full 256-bit register; enabled only for Vectorized<T>
+// types derived from Vectorizedi.
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value,
+        int> = 0>
+inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m256i bits = _mm256_and_si256(a, b);
+  return bits;
+}
+
+// Bitwise OR over the full 256-bit register.
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value,
+        int> = 0>
+inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m256i bits = _mm256_or_si256(a, b);
+  return bits;
+}
+
+// Bitwise XOR over the full 256-bit register.
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value,
+        int> = 0>
+inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m256i bits = _mm256_xor_si256(a, b);
+  return bits;
+}
+
+// Bitwise NOT, implemented as XOR against an all-ones register.
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value,
+        int> = 0>
+inline Vectorized<T> operator~(const Vectorized<T>& a) {
+  const __m256i all_ones = _mm256_set1_epi32(-1);
+  return _mm256_xor_si256(a, all_ones);
+}
+
+// Comparison helpers for Vectorized<int64_t>.  Each one evaluates the
+// matching comparison operator and ANDs the result with 1 in every lane,
+// so only the lowest bit of each lane of the comparison result survives.
+inline Vectorized<int64_t> Vectorized<int64_t>::eq(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this == other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::ne(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this != other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::gt(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this > other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::ge(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this >= other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::lt(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this < other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::le(
+    const Vectorized<int64_t>& other) const {
+  const auto outcome = (*this <= other);
+  return outcome & Vectorized<int64_t>(1);
+}
+
+// Comparison helpers for Vectorized<int32_t>.  Each one evaluates the
+// matching comparison operator and ANDs the result with 1 in every lane,
+// so only the lowest bit of each lane of the comparison result survives.
+inline Vectorized<int32_t> Vectorized<int32_t>::eq(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this == other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::ne(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this != other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::gt(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this > other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::ge(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this >= other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::lt(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this < other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::le(
+    const Vectorized<int32_t>& other) const {
+  const auto outcome = (*this <= other);
+  return outcome & Vectorized<int32_t>(1);
+}
+
+// Comparison helpers for Vectorized<int16_t>.  Each one evaluates the
+// matching comparison operator and ANDs the result with 1 in every lane,
+// so only the lowest bit of each lane of the comparison result survives.
+inline Vectorized<int16_t> Vectorized<int16_t>::eq(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this == other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::ne(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this != other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::gt(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this > other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::ge(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this >= other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::lt(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this < other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::le(
+    const Vectorized<int16_t>& other) const {
+  const auto outcome = (*this <= other);
+  return outcome & Vectorized<int16_t>(1);
+}
+
+// Comparison helpers for Vectorized<int8_t>.  Each one evaluates the
+// matching comparison operator and ANDs the result with 1 in every lane,
+// so only the lowest bit of each lane of the comparison result survives.
+inline Vectorized<int8_t> Vectorized<int8_t>::eq(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this == other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::ne(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this != other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::gt(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this > other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::ge(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this >= other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::lt(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this < other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::le(
+    const Vectorized<int8_t>& other) const {
+  const auto outcome = (*this <= other);
+  return outcome & Vectorized<int8_t>(1);
+}
+
+// Comparison helpers for Vectorized<uint8_t>.  Each one evaluates the
+// matching comparison operator and ANDs the result with 1 in every lane,
+// so only the lowest bit of each lane of the comparison result survives.
+inline Vectorized<uint8_t> Vectorized<uint8_t>::eq(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this == other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ne(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this != other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::gt(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this > other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ge(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this >= other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::lt(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this < other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::le(
+    const Vectorized<uint8_t>& other) const {
+  const auto outcome = (*this <= other);
+  return outcome & Vectorized<uint8_t>(1);
+}
+
+// Shifts every int16_t element of `a` by the count held in the matching
+// element of `b`.  With left_shift == true this is a left shift
+// (_mm256_sllv_epi32); otherwise it is an arithmetic right shift
+// (_mm256_srav_epi32).  The direction is a template constant, so it is
+// selected with `if constexpr`.
+template <bool left_shift>
+Vectorized<int16_t> inline shift_256_16(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  // No vector instruction for shifting int16_t, so emulating it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 16-bit elements, and considering pairs of neighboring
+  // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  //
+  // Each row below describes one 32-bit lane (a pair of elements),
+  // listed from the most significant byte to the least significant one.
+  __m256i ctl_0_1 = _mm256_set_epi8(
+      29, 28, 0x80, 0x80,
+      25, 24, 0x80, 0x80,
+      21, 20, 0x80, 0x80,
+      17, 16, 0x80, 0x80,
+      13, 12, 0x80, 0x80,
+      9, 8, 0x80, 0x80,
+      5, 4, 0x80, 0x80,
+      1, 0, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(
+      0x80, 0x80, 31, 30,
+      0x80, 0x80, 27, 26,
+      0x80, 0x80, 23, 22,
+      0x80, 0x80, 19, 18,
+      0x80, 0x80, 15, 14,
+      0x80, 0x80, 11, 10,
+      0x80, 0x80, 7, 6,
+      0x80, 0x80, 3, 2);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 16-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFFFF);
+  __m256i keep_1 = _mm256_set1_epi32(0xFFFF0000);
+
+  // Take each 16-bit element with idx%2==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 16
+  // bits will be proper result of shifting original 16-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if constexpr (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  else
+    c0 = _mm256_srav_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
+
+  // Perform shifting the same way for input array elements with
+  // idx%2==1.
+  __m256i a1 = _mm256_and_si256(a, keep_1);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if constexpr (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  else
+    c1 = _mm256_srav_epi32(a1, b1);
+  c1 = _mm256_and_si256(c1, keep_1);
+
+  // Merge partial results into the final result.
+  __m256i c = _mm256_or_si256(c0, c1);
+
+  return c;
+}
+
+// Shifts every 8-bit element of `a` by the count held in the matching
+// element of `b`, for T = int8_t or uint8_t.  With left_shift == true this
+// is a left shift (_mm256_sllv_epi32).  Otherwise it is a right shift:
+// arithmetic (_mm256_srav_epi32) for int8_t and logical
+// (_mm256_srlv_epi32) for uint8_t.  Both the direction and the element
+// type are compile-time constants, so every branch is chosen with
+// `if constexpr`.
+template <
+    bool left_shift,
+    typename T,
+    typename std::enable_if_t<
+        std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+        int> = 0>
+Vectorized<T> inline shift_256_8(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  // No vector instruction for shifting int8_t/uint8_t, so emulating
+  // it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 8-bit elements, and considering quadruples of
+  // neighboring elements.  Specifically, a mask named "ctl_M_N" (M,N
+  // in [0,1,2,3], and M!=N) is set so that shuffle will move element
+  // with index M from input quadruple into element with index N in
+  // output quadruple, and other elements in output quadruple will be
+  // set to all 0s.
+  //
+  // Each row below describes one quadruple (one 32-bit lane), listed
+  // from the most significant byte to the least significant one.
+  __m256i ctl_0_3 = _mm256_set_epi8(
+      28, 0x80, 0x80, 0x80,
+      24, 0x80, 0x80, 0x80,
+      20, 0x80, 0x80, 0x80,
+      16, 0x80, 0x80, 0x80,
+      12, 0x80, 0x80, 0x80,
+      8, 0x80, 0x80, 0x80,
+      4, 0x80, 0x80, 0x80,
+      0, 0x80, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(
+      0x80, 0x80, 0x80, 29,
+      0x80, 0x80, 0x80, 25,
+      0x80, 0x80, 0x80, 21,
+      0x80, 0x80, 0x80, 17,
+      0x80, 0x80, 0x80, 13,
+      0x80, 0x80, 0x80, 9,
+      0x80, 0x80, 0x80, 5,
+      0x80, 0x80, 0x80, 1);
+  __m256i ctl_1_3 = _mm256_set_epi8(
+      29, 0x80, 0x80, 0x80,
+      25, 0x80, 0x80, 0x80,
+      21, 0x80, 0x80, 0x80,
+      17, 0x80, 0x80, 0x80,
+      13, 0x80, 0x80, 0x80,
+      9, 0x80, 0x80, 0x80,
+      5, 0x80, 0x80, 0x80,
+      1, 0x80, 0x80, 0x80);
+  __m256i ctl_2_0 = _mm256_set_epi8(
+      0x80, 0x80, 0x80, 30,
+      0x80, 0x80, 0x80, 26,
+      0x80, 0x80, 0x80, 22,
+      0x80, 0x80, 0x80, 18,
+      0x80, 0x80, 0x80, 14,
+      0x80, 0x80, 0x80, 10,
+      0x80, 0x80, 0x80, 6,
+      0x80, 0x80, 0x80, 2);
+  __m256i ctl_2_3 = _mm256_set_epi8(
+      30, 0x80, 0x80, 0x80,
+      26, 0x80, 0x80, 0x80,
+      22, 0x80, 0x80, 0x80,
+      18, 0x80, 0x80, 0x80,
+      14, 0x80, 0x80, 0x80,
+      10, 0x80, 0x80, 0x80,
+      6, 0x80, 0x80, 0x80,
+      2, 0x80, 0x80, 0x80);
+  __m256i ctl_3_0 = _mm256_set_epi8(
+      0x80, 0x80, 0x80, 31,
+      0x80, 0x80, 0x80, 27,
+      0x80, 0x80, 0x80, 23,
+      0x80, 0x80, 0x80, 19,
+      0x80, 0x80, 0x80, 15,
+      0x80, 0x80, 0x80, 11,
+      0x80, 0x80, 0x80, 7,
+      0x80, 0x80, 0x80, 3);
+  __m256i ctl_3_1 = _mm256_set_epi8(
+      0x80, 0x80, 31, 0x80,
+      0x80, 0x80, 27, 0x80,
+      0x80, 0x80, 23, 0x80,
+      0x80, 0x80, 19, 0x80,
+      0x80, 0x80, 15, 0x80,
+      0x80, 0x80, 11, 0x80,
+      0x80, 0x80, 7, 0x80,
+      0x80, 0x80, 3, 0x80);
+  __m256i ctl_3_2 = _mm256_set_epi8(
+      0x80, 31, 0x80, 0x80,
+      0x80, 27, 0x80, 0x80,
+      0x80, 23, 0x80, 0x80,
+      0x80, 19, 0x80, 0x80,
+      0x80, 15, 0x80, 0x80,
+      0x80, 11, 0x80, 0x80,
+      0x80, 7, 0x80, 0x80,
+      0x80, 3, 0x80, 0x80);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 8-bit elements, and considering them in quadruples of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1,2,3]) is set so that
+  // bitwise and will copy element with index M from input quadruple
+  // into element with the same index in output quadruple, while the
+  // other elements in output quadruple will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFF);
+  __m256i keep_3 = _mm256_set1_epi32(0xFF000000);
+
+  // Take each 8-bit element with idx%4==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 8
+  // bits will be proper result of shifting original 8-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%4!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if constexpr (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c0 = _mm256_srav_epi32(a0, b0);
+  else
+    c0 = _mm256_srlv_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==1.
+  __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if constexpr (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c1 = _mm256_srav_epi32(a1, b1);
+  else
+    c1 = _mm256_srlv_epi32(a1, b1);
+  c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==2.
+  __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
+  __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
+  __m256i c2;
+  if constexpr (left_shift)
+    c2 = _mm256_sllv_epi32(a2, b2);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c2 = _mm256_srav_epi32(a2, b2);
+  else
+    c2 = _mm256_srlv_epi32(a2, b2);
+  c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==3.
+  __m256i a3 = _mm256_and_si256(a, keep_3);
+  __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);
+  __m256i c3;
+  if constexpr (left_shift)
+    c3 = _mm256_sllv_epi32(a3, b3);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c3 = _mm256_srav_epi32(a3, b3);
+  else
+    c3 = _mm256_srlv_epi32(a3, b3);
+  c3 = _mm256_and_si256(c3, keep_3);
+
+  // Merge partial results into the final result.
+  __m256i c01 = _mm256_or_si256(c0, c1);
+  __m256i c23 = _mm256_or_si256(c2, c3);
+  __m256i c = _mm256_or_si256(c01, c23);
+
+  return c;
+}
+
+// Per-lane left shift, 64-bit lanes: a single variable-shift instruction.
+template <>
+Vectorized<int64_t> inline operator<<(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m256i shifted = _mm256_sllv_epi64(a, b);
+  return shifted;
+}
+
+// Per-lane left shift, 32-bit lanes: a single variable-shift instruction.
+template <>
+Vectorized<int32_t> inline operator<<(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i shifted = _mm256_sllv_epi32(a, b);
+  return shifted;
+}
+
+// Per-lane left shift, 16-bit lanes: delegated to shift_256_16.
+template <>
+Vectorized<int16_t> inline operator<<(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  constexpr bool shift_left = true;
+  return shift_256_16<shift_left>(a, b);
+}
+
+// Per-lane left shift, signed 8-bit lanes: delegated to shift_256_8.
+template <>
+Vectorized<int8_t> inline operator<<(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  constexpr bool shift_left = true;
+  return shift_256_8<shift_left>(a, b);
+}
+
+// Per-lane left shift, unsigned 8-bit lanes: delegated to shift_256_8.
+template <>
+Vectorized<uint8_t> inline operator<<(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  constexpr bool shift_left = true;
+  return shift_256_8<shift_left>(a, b);
+}
+
+// Arithmetic right shift of int64_t lanes by per-lane counts.
+template <>
+Vectorized<int64_t> inline operator>>(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  // No vector instruction for right arithmetic shifting int64_t, so emulating
+  // it instead.
+
+  // Clamp the shift values such that shift values < 0 and > 64 are changed to
+  // 64 which results in -1 for negative input and 0 for non-negative input.
+  const __m256i zeros = _mm256_set1_epi64x(0);
+  const __m256i full_width = _mm256_set1_epi64x(64);
+  const __m256i negative_count = _mm256_cmpgt_epi64(zeros, b);
+  const __m256i oversized_count = _mm256_cmpgt_epi64(b, full_width);
+  const __m256i out_of_range =
+      _mm256_or_si256(negative_count, oversized_count);
+  const __m256i count = _mm256_blendv_epi8(b, full_width, out_of_range);
+
+  // Shift the number logically to the right, thus filling the most
+  // significant bits with 0s.  Then, replace these bits with the sign
+  // bit.
+  const __m256i negative_input = _mm256_cmpgt_epi64(zeros, a);
+  const __m256i fill_shift = _mm256_sub_epi64(full_width, count);
+  const __m256i sign_fill = _mm256_sllv_epi64(negative_input, fill_shift);
+  const __m256i logical = _mm256_srlv_epi64(a, count);
+  return _mm256_or_si256(logical, sign_fill);
+}
+
+// Per-lane arithmetic right shift, 32-bit lanes: a single variable-shift
+// instruction.
+template <>
+Vectorized<int32_t> inline operator>>(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m256i shifted = _mm256_srav_epi32(a, b);
+  return shifted;
+}
+
+// Per-lane right shift, 16-bit lanes: delegated to shift_256_16.
+template <>
+Vectorized<int16_t> inline operator>>(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  constexpr bool shift_left = false;
+  return shift_256_16<shift_left>(a, b);
+}
+
+// Per-lane right shift, signed 8-bit lanes: delegated to shift_256_8.
+template <>
+Vectorized<int8_t> inline operator>>(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  constexpr bool shift_left = false;
+  return shift_256_8<shift_left>(a, b);
+}
+
+// Per-lane right shift, unsigned 8-bit lanes: delegated to shift_256_8.
+template <>
+Vectorized<uint8_t> inline operator>>(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  constexpr bool shift_left = false;
+  return shift_256_8<shift_left>(a, b);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h
new file mode 100644
index 0000000000000000000000000000000000000000..595e0c4946a461bb6cc446d202f2156ef4bfbdc9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_mask.h
@@ -0,0 +1,303 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_mask.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+// Masked load of float/int32 data when the mask spans twice as many vectors
+// as the destination (mask_n == 2 * dst_n). Every pair of mask vectors is
+// cast down to one int mask vector, which then drives one AVX2 masked load.
+template <typename T, int dst_n, typename mask_t, int mask_n>
+struct VecMaskLoad<
+    T,
+    dst_n,
+    mask_t,
+    mask_n,
+    typename std::enable_if_t<
+        (mask_n == dst_n * 2 && dst_n >= 1) &&
+            (std::is_same_v<T, float> || std::is_same_v<T, int32_t>),
+        void>> {
+  static inline VectorizedN<T, dst_n> apply(
+      const T* ptr,
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    VectorizedN<mask_t, 2> mask_pair;
+    VectorizedN<T, dst_n> loaded;
+    for (int vec_idx = 0; vec_idx < dst_n; vec_idx++) {
+      mask_pair[0] = vec_mask[2 * vec_idx];
+      mask_pair[1] = vec_mask[2 * vec_idx + 1];
+      // Go through a two-vector int64 mask, then narrow to one int vector.
+      auto wide_mask =
+          VecMask<mask_t, 2>(mask_pair).template cast<int64_t, 2>();
+      auto lane_mask = wide_mask.template cast<int, 1>()[0];
+      const T* chunk = ptr + vec_idx * Vectorized<T>::size();
+      if constexpr (std::is_same_v<T, float>) {
+        loaded[vec_idx] = Vectorized<T>(_mm256_maskload_ps(chunk, lane_mask));
+      } else {
+        loaded[vec_idx] =
+            Vectorized<T>(_mm256_maskload_epi32(chunk, lane_mask));
+      }
+    }
+    return loaded;
+  }
+};
+
+// Masked load of float/int32 data when mask and destination span the same
+// number of vectors: mask vector i gates the load of destination vector i.
+template <typename T, int dst_n, typename mask_t>
+struct VecMaskLoad<
+    T,
+    dst_n,
+    mask_t,
+    dst_n,
+    typename std::enable_if_t<
+        std::is_same_v<T, float> || std::is_same_v<T, int32_t>,
+        void>> {
+  static inline VectorizedN<T, dst_n> apply(
+      const T* ptr,
+      const VecMask<mask_t, dst_n>& vec_mask) {
+    VectorizedN<T, dst_n> loaded;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int vec_idx = 0; vec_idx < dst_n; vec_idx++) {
+      // Cast this mask vector to the int form taken by the AVX2 intrinsics.
+      auto single_mask = VecMask<mask_t, 1>(vec_mask[vec_idx]);
+      auto lane_mask = single_mask.template cast<int, 1>()[0];
+      const T* chunk = ptr + vec_idx * Vectorized<T>::size();
+      if constexpr (std::is_same_v<T, float>) {
+        loaded[vec_idx] = Vectorized<T>(_mm256_maskload_ps(chunk, lane_mask));
+      } else {
+        loaded[vec_idx] =
+            Vectorized<T>(_mm256_maskload_epi32(chunk, lane_mask));
+      }
+    }
+    return loaded;
+  }
+};
+
+// Masked load of two vectors of int64/double driven by one mask vector. The
+// mask is first cast to a two-vector int64 mask, one vector per load.
+template <typename T, typename mask_t>
+struct VecMaskLoad<
+    T,
+    2,
+    mask_t,
+    1,
+    typename std::enable_if_t<
+        std::is_same_v<T, int64_t> || std::is_same_v<T, double>>> {
+  static inline VectorizedN<T, 2> apply(
+      const T* ptr,
+      const VecMask<mask_t, 1>& vec_mask) {
+    auto wide_mask = vec_mask.template cast<int64_t, 2>();
+    auto loaded = at::vec::VectorizedN<T, 2>();
+    // The second vector starts one full vector of T past `ptr`.
+    const T* upper_ptr = ptr + at::vec::Vectorized<T>::size();
+    if constexpr (std::is_same_v<T, double>) {
+      loaded[0] = _mm256_maskload_pd(ptr, wide_mask[0]);
+      loaded[1] = _mm256_maskload_pd(upper_ptr, wide_mask[1]);
+    } else {
+      loaded[0] = _mm256_maskload_epi64(
+          reinterpret_cast<const long long*>(ptr), wide_mask[0]);
+      loaded[1] = _mm256_maskload_epi64(
+          reinterpret_cast<const long long*>(upper_ptr), wide_mask[1]);
+    }
+    return loaded;
+  }
+};
+
+// TODO: add specialization of VecMaskLoad for bfloat16/half and int8/uint8
+
+// Reinterprets an int mask as a float mask, vector by vector; the bits are
+// left untouched (_mm256_castsi256_ps).
+template <int N>
+struct VecMaskCast<float, N, int, N> {
+  static inline VecMask<float, N> apply(const VecMask<int, N>& vec_mask) {
+    VectorizedN<float, N> reinterpreted;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int vec_idx = 0; vec_idx < N; vec_idx++) {
+      reinterpreted[vec_idx] = _mm256_castsi256_ps(vec_mask[vec_idx]);
+    }
+    return reinterpreted;
+  }
+};
+
+// Reinterprets a float mask as an int mask, vector by vector; the bits are
+// left untouched (_mm256_castps_si256).
+template <int N>
+struct VecMaskCast<int, N, float, N> {
+  static inline VecMask<int, N> apply(const VecMask<float, N>& vec_mask) {
+    VectorizedN<int, N> reinterpreted;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int vec_idx = 0; vec_idx < N; vec_idx++) {
+      reinterpreted[vec_idx] = _mm256_castps_si256(vec_mask[vec_idx]);
+    }
+    return reinterpreted;
+  }
+};
+
+// Reinterprets a double mask as an int64 mask, vector by vector; the bits are
+// left untouched (_mm256_castpd_si256).
+template <int N>
+struct VecMaskCast<int64_t, N, double, N> {
+  static inline VecMask<int64_t, N> apply(const VecMask<double, N>& vec_mask) {
+    VectorizedN<int64_t, N> reinterpreted;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int vec_idx = 0; vec_idx < N; vec_idx++) {
+      reinterpreted[vec_idx] = _mm256_castpd_si256(vec_mask[vec_idx]);
+    }
+    return reinterpreted;
+  }
+};
+
+// Reinterprets an int64 mask as a double mask, vector by vector; the bits are
+// left untouched (_mm256_castsi256_pd).
+template <int N>
+struct VecMaskCast<double, N, int64_t, N> {
+  static inline VecMask<double, N> apply(const VecMask<int64_t, N>& vec_mask) {
+    VectorizedN<double, N> reinterpreted;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int vec_idx = 0; vec_idx < N; vec_idx++) {
+      reinterpreted[vec_idx] = _mm256_castsi256_pd(vec_mask[vec_idx]);
+    }
+    return reinterpreted;
+  }
+};
+
+// Widens a float/int mask into an int64 mask that spans twice as many
+// vectors (dst_n == 2 * mask_n). The mask is first viewed as int, then each
+// int vector is converted into two int64 vectors.
+template <int dst_n, typename mask_t, int mask_n>
+struct VecMaskCast<
+    int64_t,
+    dst_n,
+    mask_t,
+    mask_n,
+    typename std::enable_if_t<
+        (dst_n == 2 * mask_n) &&
+            (std::is_same_v<mask_t, float> || std::is_same_v<mask_t, int>),
+        void>> {
+  static inline VecMask<int64_t, dst_n> apply(
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    auto narrow_mask = vec_mask.template cast<int, mask_n>();
+    VectorizedN<int64_t, dst_n> widened;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int src_idx = 0; src_idx < mask_n; src_idx++) {
+      auto pair = convert<int64_t, 2, int, 1>(
+          VectorizedN<int, 1>(narrow_mask[src_idx]));
+      // Source vector k fills destination vectors 2k and 2k + 1.
+      widened[2 * src_idx] = pair[0];
+      widened[2 * src_idx + 1] = pair[1];
+    }
+    return VecMask<int64_t, dst_n>(widened);
+  }
+};
+
+// Narrows an int64 mask into a float/int mask that spans half as many
+// vectors (mask_n == 2 * dst_n). Each pair of int64 vectors is converted into
+// one int vector, and the int mask is finally cast to the requested dst_t.
+template <typename dst_t, int dst_n, int mask_n>
+struct VecMaskCast<
+    dst_t,
+    dst_n,
+    int64_t,
+    mask_n,
+    typename std::enable_if_t<
+        (mask_n == 2 * dst_n) &&
+            (std::is_same_v<dst_t, float> || std::is_same_v<dst_t, int>),
+        void>> {
+  static inline VecMask<dst_t, dst_n> apply(
+      const VecMask<int64_t, mask_n>& vec_mask) {
+    VectorizedN<int64_t, 2> pair;
+    VectorizedN<int, dst_n> narrowed;
+    for (int dst_idx = 0; dst_idx < dst_n; dst_idx++) {
+      // Destination vector k is built from source vectors 2k and 2k + 1.
+      pair[0] = vec_mask[2 * dst_idx];
+      pair[1] = vec_mask[2 * dst_idx + 1];
+      narrowed[dst_idx] = convert<int, 1, int64_t, 2>(pair);
+    }
+    auto int_mask = VecMask<int, dst_n>(narrowed);
+    return int_mask.template cast<dst_t, dst_n>();
+  }
+};
+
+// float (1 vector) -> double (2 vectors): widen to an int64 mask first, then
+// reinterpret that mask as double.
+template <>
+struct VecMaskCast<double, 2, float, 1> {
+  static inline VecMask<double, 2> apply(const VecMask<float, 1>& vec_mask) {
+    return VecMaskCast<double, 2, int64_t, 2>::apply(
+        VecMaskCast<int64_t, 2, float, 1>::apply(vec_mask));
+  }
+};
+// double (2 vectors) -> float (1 vector): reinterpret as an int64 mask first,
+// then narrow that mask to float.
+template <>
+struct VecMaskCast<float, 1, double, 2> {
+  static inline VecMask<float, 1> apply(const VecMask<double, 2>& vec_mask) {
+    return VecMaskCast<float, 1, int64_t, 2>::apply(
+        VecMaskCast<int64_t, 2, double, 2>::apply(vec_mask));
+  }
+};
+
+// True when no bit of the mask vector is set.
+template <>
+inline bool VecMask<int, 1>::all_zero() const {
+  const __m256i bits = mask_[0];
+  return _mm256_testz_si256(bits, bits);
+}
+
+// True when the sign bit of 32-bit lane `i` is set.
+template <>
+inline bool VecMask<int, 1>::is_masked(int i) const {
+  // movemask gathers the sign bit of each 32-bit lane into bit `lane`.
+  const int sign_bits = _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0]));
+  return sign_bits & (1 << i);
+}
+
+// True when the sign bit of every one of the eight 32-bit lanes is set.
+template <>
+inline bool VecMask<int, 1>::all_masked() const {
+  const __m256 as_float = _mm256_castsi256_ps(mask_[0]);
+  const int sign_bits = _mm256_movemask_ps(as_float);
+  return sign_bits == 0xff;
+}
+
+// Mask predicates for int64 masks spanning N vectors of four 64-bit lanes.
+template <int N>
+struct VecMaskCheck<int64_t, N> {
+  // True when no bit is set in any of the N vectors.
+  static inline bool all_zero(const VectorizedN<int64_t, N>& vec_mask) {
+    for (int vec_idx = 0; vec_idx < N; ++vec_idx) {
+      const bool vec_is_zero =
+          _mm256_testz_si256(vec_mask[vec_idx], vec_mask[vec_idx]) > 0;
+      if (!vec_is_zero) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // True when the sign bit of lane `i` is set, with lanes numbered across
+  // the N vectors (four per vector). Returns false when `i` lies past the
+  // last lane.
+  static inline bool is_masked(const VectorizedN<int64_t, N>& vec_mask, int i) {
+    for (int vec_idx = 0; vec_idx < N; ++vec_idx) {
+      const int first_lane = vec_idx * 4;
+      if (i >= first_lane + 4) {
+        continue;
+      }
+      const int sign_bits =
+          _mm256_movemask_pd(_mm256_castsi256_pd(vec_mask[vec_idx]));
+      return sign_bits & (1 << (i - first_lane));
+    }
+    return false;
+  }
+
+  // True when the sign bit of every lane in every vector is set.
+  static inline bool all_masked(const VectorizedN<int64_t, N>& vec_mask) {
+    for (int vec_idx = 0; vec_idx < N; ++vec_idx) {
+      const int sign_bits =
+          _mm256_movemask_pd(_mm256_castsi256_pd(vec_mask[vec_idx]));
+      if (sign_bits != 0x0f) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+// Defines VecMask<T, N>::method by casting the mask to VecMask<int, 1> and
+// forwarding the call to the VecMask<int, 1> specialization of that method.
+#define VEC_MASK_METHOD_WITH_CAST_TO_INT(                   \
+    T, N, return_type, method, args_def, args)              \
+  template <>                                               \
+  inline return_type VecMask<T, N>::method args_def const { \
+    return cast<int, 1>().method args;                      \
+  }
+
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i))
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i))
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ())
+
+// The name here must match the #define above so that the helper macro does
+// not leak into translation units that include this header.
+#undef VEC_MASK_METHOD_WITH_CAST_TO_INT
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e77d78528b5d6a069347064e8dc21cbf6151682
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h
@@ -0,0 +1,1429 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+#include <c10/util/irange.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+#include <cmath>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2)
+
+// Common base of the quantized Vectorized<> specializations. It holds one
+// 256-bit integer register, declared with 64-byte alignment through the
+// compiler-specific attribute.
+#ifdef _MSC_VER
+__declspec(align(64)) struct Vectorizedqi {
+ protected:
+  __m256i vals;
+#else
+struct Vectorizedqi {
+ protected:
+  __m256i vals __attribute__((aligned(64)));
+#endif
+
+ public:
+  // All lanes start at zero.
+  Vectorizedqi() : vals(_mm256_setzero_si256()) {}
+  // Wraps an existing register value.
+  Vectorizedqi(__m256i v) : vals(v) {}
+  // Exposes the raw register so instances can be handed to intrinsics.
+  operator __m256i() const {
+    return vals;
+  }
+};
+
+// Packs the 16-bit lanes of `first` and `second` into 8-bit lanes with
+// saturation, then clamps every 8-bit lane to [min_val, max_val].
+template <typename T>
+__m256i pack_saturate_and_clamp(
+    __m256i first,
+    __m256i second,
+    T min_val,
+    T max_val);
+
+// int32_t: present for linkage only; calling it always fails the check.
+template <>
+inline __m256i pack_saturate_and_clamp<int32_t>(
+    __m256i /*first*/,
+    __m256i /*second*/,
+    int32_t /*min_val*/,
+    int32_t /*max_val*/) {
+  TORCH_CHECK(false, "pack_saturate_and_clamp<int32_t> is not supported");
+}
+
+// int8_t: signed saturating pack followed by a signed clamp.
+template <>
+inline __m256i pack_saturate_and_clamp<int8_t>(
+    __m256i first,
+    __m256i second,
+    int8_t min_val,
+    int8_t max_val) {
+  const __m256i packed = _mm256_packs_epi16(first, second);
+  const __m256i capped = _mm256_min_epi8(packed, _mm256_set1_epi8(max_val));
+  return _mm256_max_epi8(_mm256_set1_epi8(min_val), capped);
+}
+
+// uint8_t: unsigned saturating pack followed by an unsigned clamp.
+template <>
+inline __m256i pack_saturate_and_clamp<uint8_t>(
+    __m256i first,
+    __m256i second,
+    uint8_t min_val,
+    uint8_t max_val) {
+  const __m256i packed = _mm256_packus_epi16(first, second);
+  const __m256i capped = _mm256_min_epu8(packed, _mm256_set1_epi8(max_val));
+  return _mm256_max_epu8(_mm256_set1_epi8(min_val), capped);
+}
+
+// Converts the first Vectorized<float>::size() (8) int8/uint8 elements of
+// `src` to float. Only the low 8 bytes of the register are read; the rest of
+// `src` is ignored.
+template <typename T>
+typename std::enable_if_t<
+    std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+    at::vec::Vectorized<
+        float>> inline convert_int8_to_float(at::vec::Vectorized<T> src) {
+  const __m128i low_bytes = _mm256_castsi256_si128(src);
+  // Widen 8 x (u)int8 to 8 x int32 (zero-extend for uint8, sign-extend for
+  // int8), then convert the int32 lanes to float.
+  if constexpr (std::is_same_v<T, uint8_t>) {
+    return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(low_bytes));
+  } else {
+    return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(low_bytes));
+  }
+}
+
+// Converts the 8 floats of `src` to 8-bit integers of type T.
+template <typename T>
+at::vec::Vectorized<T> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src);
+
+// int8_t: truncate to int32, then narrow twice with signed saturation.
+template <>
+at::vec::Vectorized<int8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
+  constexpr auto min_val = std::numeric_limits<int8_t>::min();
+  constexpr auto max_val = std::numeric_limits<int8_t>::max();
+
+  // float32 -> int32, truncating toward zero.
+  const __m256i as_int32 = _mm256_cvttps_epi32(src);
+
+  // int32 -> int16 with signed saturation; the input is used for both halves.
+  const __m256i as_int16 = _mm256_packs_epi32(as_int32, as_int32);
+
+  // int16 -> int8 with signed saturation, clamped to the int8 range.
+  const __m256i as_int8 =
+      pack_saturate_and_clamp<int8_t>(as_int16, as_int16, min_val, max_val);
+
+  // Reorder the 32-bit groups across the two 128-bit halves.
+  const __m256i lane_order =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  return _mm256_permutevar8x32_epi32(as_int8, lane_order);
+}
+
+// uint8_t: clamp to the int32 range, truncate to int32, then keep the lowest
+// byte of every 32-bit lane. The 8 result bytes end up in the low 64 bits.
+template <>
+at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
+  // The bounds are deliberately those of int32_t (not uint8_t) to ensure
+  // correct clamping behavior.
+  constexpr auto min_val = std::numeric_limits<int32_t>::min();
+  constexpr auto max_val = std::numeric_limits<int32_t>::max();
+  const __m256 lower_bound = _mm256_set1_ps(float(min_val));
+  const __m256 upper_bound = _mm256_set1_ps(float(max_val));
+  const __m256 clamped =
+      _mm256_min_ps(_mm256_max_ps(src, lower_bound), upper_bound);
+  const __m256i as_int32 = _mm256_cvttps_epi32(clamped);
+
+  // Gather byte 0 of each 32-bit lane into the first 4 bytes of each half.
+  const __m128i pick_low_bytes = _mm_setr_epi8(
+      0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  const __m128i lower_half =
+      _mm_shuffle_epi8(_mm256_castsi256_si128(as_int32), pick_low_bytes);
+  const __m128i upper_half =
+      _mm_shuffle_epi8(_mm256_extractf128_si256(as_int32, 1), pick_low_bytes);
+
+  // Interleave the two 4-byte groups so they sit next to each other.
+  const __m128i combined = _mm_unpacklo_epi32(lower_half, upper_half);
+  return _mm256_castsi128_si256(combined);
+}
+
+// Quantizes `len` floats from `src` into `dst`: every value is multiplied by
+// `inverse_scale`, rounded to an integer, offset by `zero_point` and clamped
+// to the range of T.
+//
+// Three paths are used, from widest to narrowest: 32 elements per iteration,
+// 8 elements per iteration, then a scalar tail. The vector paths read `src`
+// with _mm256_load_ps, so `src` must be 32-byte aligned.
+template <typename T>
+__FORCE_INLINE void QuantizeAvx2(
+    const float* src,
+    T* dst,
+    int len,
+    float inverse_scale,
+    int64_t zero_point) {
+  constexpr int VLEN = 8;
+  constexpr auto min_val = std::numeric_limits<T>::min();
+  constexpr auto max_val = std::numeric_limits<T>::max();
+  const __m256i min_v = _mm256_set1_epi32(min_val);
+  const __m256i max_v = _mm256_set1_epi32(max_val);
+  // This is the largest int32 value < int32_max exactly representable in float
+  constexpr int32_t int32_float_max_val =
+      std::numeric_limits<int32_t>::max() - 127;
+  int i = 0;
+  // Loop-invariant broadcasts, built once instead of once per iteration.
+  const __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale);
+  const __m256 int32_float_max_v = _mm256_set1_ps(int32_float_max_val);
+  const __m256i zero_point_v = _mm256_set1_epi32(zero_point);
+  // clang-format off
+  static const __m256i shuffle_mask_v = _mm256_set_epi8(
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00);
+  // clang-format on
+  __m256i permute_mask_v =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  __m256i permute_mask_l8_v =
+      _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
+  const int len_aligned = len / (VLEN * 4) * (VLEN * 4);
+  const int len_aligned_l8 = len / VLEN * VLEN;
+
+  // Main path: four vectors (x, y, z, w) of 8 floats per iteration.
+  for (; i < len_aligned; i += 4 * VLEN) {
+    // x
+    __m256 x_vals = _mm256_load_ps(src + i);
+    __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v);
+    // If the floating point value is greater than int32_max,
+    // _mm256_cvtps_epi32 converts them to -ve. Clip at int32_float_max_val to
+    // avoid this.
+    x_transformed_v = _mm256_min_ps(x_transformed_v, int32_float_max_v);
+    // y
+    __m256 y_vals = _mm256_load_ps(src + i + VLEN);
+    __m256 y_transformed_v = _mm256_mul_ps(y_vals, inverse_scale_v);
+    y_transformed_v = _mm256_min_ps(y_transformed_v, int32_float_max_v);
+    // z
+    __m256 z_vals = _mm256_load_ps(src + i + 2 * VLEN);
+    __m256 z_transformed_v = _mm256_mul_ps(z_vals, inverse_scale_v);
+    z_transformed_v = _mm256_min_ps(z_transformed_v, int32_float_max_v);
+    // w
+    __m256 w_vals = _mm256_load_ps(src + i + 3 * VLEN);
+    __m256 w_transformed_v = _mm256_mul_ps(w_vals, inverse_scale_v);
+    w_transformed_v = _mm256_min_ps(w_transformed_v, int32_float_max_v);
+
+    __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v);
+    __m256i y_rounded_v = _mm256_cvtps_epi32(y_transformed_v);
+    __m256i z_rounded_v = _mm256_cvtps_epi32(z_transformed_v);
+    __m256i w_rounded_v = _mm256_cvtps_epi32(w_transformed_v);
+
+    // add zero point
+    x_rounded_v = _mm256_add_epi32(x_rounded_v, zero_point_v);
+    y_rounded_v = _mm256_add_epi32(y_rounded_v, zero_point_v);
+    z_rounded_v = _mm256_add_epi32(z_rounded_v, zero_point_v);
+    w_rounded_v = _mm256_add_epi32(w_rounded_v, zero_point_v);
+
+    // int32 -> int16 with signed saturation, then int16 -> T with clamping.
+    __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v);
+    __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v);
+    __m256i xyzw_clamped_v =
+        pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);
+
+    // The pack instructions work per 128-bit half; restore element order.
+    xyzw_clamped_v =
+        _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), xyzw_clamped_v);
+  }
+
+  // Additional 8-lane AVX2 version to take advantage when len is smaller
+  // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM)
+  for (; i < len_aligned_l8; i += VLEN) {
+    __m256 x_vals = _mm256_load_ps(src + i);
+    __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v);
+    x_transformed_v = _mm256_min_ps(x_transformed_v, int32_float_max_v);
+    __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v);
+    x_rounded_v = _mm256_add_epi32(x_rounded_v, zero_point_v);
+    __m256i x_clipped_v =
+        _mm256_max_epi32(min_v, _mm256_min_epi32(max_v, x_rounded_v));
+
+    // Keep the low byte of each 32-bit lane and gather the 8 bytes into the
+    // low 64 bits before storing them.
+    x_clipped_v = _mm256_shuffle_epi8(x_clipped_v, shuffle_mask_v);
+    x_clipped_v = _mm256_permutevar8x32_epi32(x_clipped_v, permute_mask_l8_v);
+    _mm_storel_epi64(
+        reinterpret_cast<__m128i*>(dst + i),
+        _mm256_castsi256_si128(x_clipped_v));
+  }
+
+  // Scalar tail for the remaining (fewer than 8) elements.
+  for (; i < len; ++i) {
+    float transformed = src[i] * inverse_scale;
+
+    // Not exactly the same behavior as the vectorized code.
+    // The vectorized code above always rounds to even in halfway cases
+    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+    // does the same only when the current rounding mode is FE_TONEAREST.
+    // However, in practice, this should not be a problem because most cases
+    // use the default rounding mode FE_TONEAREST.
+    // Note that we cannot implement the same behavior as the vectorized code
+    // using std::round because it does rounding away from zero in halfway
+    // cases.
+    transformed = zero_point + std::nearbyint(transformed);
+    float clipped =
+        std::min(std::max(transformed, float(min_val)), float(max_val));
+    dst[i] = clipped;
+  }
+}
+
+template <> // Sets the is_vec_specialized_for trait to true for c10::qint32.
+struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};
+
+// AVX2 vector of c10::qint32 values, stored as 32-bit integer lanes in the
+// __m256i register inherited from Vectorizedqi.
+template <>
+struct Vectorized<c10::qint32> : public Vectorizedqi {
+  using size_type = int;
+  // Number of qint32 elements per vector (same as Vectorized<int>).
+  static constexpr size_type kSize = Vectorized<int>::size();
+  static constexpr size_type size() {
+    return kSize;
+  }
+
+  // Number of Vectorized<float> needed to hold one dequantized vector.
+  static constexpr int kFloatNumVecs = kSize / Vectorized<float>::size();
+  static constexpr int float_num_vecs() {
+    return kFloatNumVecs;
+  }
+
+  static constexpr int int_num_vecs() {
+    return 1;
+  }
+
+  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
+  using value_type = c10::qint32::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+  Vectorized() {}
+
+  Vectorized(__m256i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::qint32& val) {
+    value_type uw = val.val_;
+    vals = _mm256_set1_epi32(uw);
+  }
+
+  // Writes the first `count` elements to `ptr`. A full-width unaligned store
+  // is used when `count` equals size(); otherwise only `count` elements are
+  // copied.
+  void store(void* ptr, int count = size()) const {
+    if (count != size()) {
+      std::memcpy(ptr, &vals, count * sizeof(value_type));
+    } else {
+      _mm256_storeu_si256((__m256i*)ptr, vals);
+    }
+  }
+
+  // Loads a full vector from (possibly unaligned) memory.
+  static Vectorized<c10::qint32> loadu(const void* ptr) {
+    return Vectorized<c10::qint32>(ptr);
+  }
+
+  // Loads the first `count` elements from `ptr`; the remaining lanes are zero.
+  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return _mm256_loadu_si256((const __m256i*)tmp_values);
+  }
+
+  // Dequantizes as fmadd(scale, float(vals), scale_zp_premul); the
+  // zero_point argument is unused in this overload.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> /*zero_point*/,
+      Vectorized<float> scale_zp_premul) const {
+    __m256 float_vals = _mm256_cvtepi32_ps(vals);
+    return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
+  }
+
+  // Dequantizes as (float(vals) - zero_point) * scale.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    __m256 float_vals = _mm256_cvtepi32_ps(vals);
+    return {(Vectorized<float>(float_vals) - zero_point) * scale};
+  }
+
+  // Quantizes one float vector through at::native::quantize_vec; the
+  // inverse_scale argument is unused here.
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    Vectorized<c10::qint32> retval;
+    auto rhs_data = (__m256)rhs[0];
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale,
+        zero_point,
+        (float*)&rhs_data,
+        (c10::qint32*)&retval.vals,
+        size());
+    return retval;
+  }
+
+  // Per-lane signed maximum.
+  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
+    return _mm256_max_epi32(vals, b.vals);
+  }
+
+  // Per-lane signed minimum.
+  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
+    return _mm256_min_epi32(vals, b.vals);
+  }
+
+  // ReLU in the quantized domain: max(vals, zero_point).
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // ReLU6 in the quantized domain: min(max(vals, zero_point), q_six).
+  // Const-qualified like relu(), since it does not modify the vector.
+  Vectorized<c10::qint32> relu6(
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) const {
+    return _mm256_min_epi32(
+        _mm256_max_epi32(vals, zero_point.vals), q_six.vals);
+  }
+
+  // Per-lane 32-bit subtraction vals - b, returned as a single vector.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    return {_mm256_sub_epi32(vals, b)};
+  }
+
+  // Computes round(float(inp[0]) * multiplier) + zero_point per lane.
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m256 multiplier_v = _mm256_set1_ps(multiplier);
+    __m256i zero_point_v = _mm256_set1_epi32(zero_point);
+
+    __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v);
+    __m256i rounded = _mm256_cvtps_epi32(scaled);
+    return _mm256_add_epi32(rounded, zero_point_v);
+  }
+
+ private:
+  // Load from memory constructor
+  Vectorized(const void* ptr) {
+    vals = _mm256_loadu_si256((const __m256i*)ptr);
+  }
+};
+
+// Free-function form of the per-lane maximum; forwards to the member.
+template <>
+Vectorized<c10::qint32> inline maximum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> larger = a.maximum(b);
+  return larger;
+}
+
+// Per-lane 32-bit multiplication, keeping the low 32 bits of each product.
+template <>
+Vectorized<c10::qint32> inline operator*(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_mullo_epi32(lhs, rhs);
+}
+
+// Per-lane 32-bit addition.
+template <>
+Vectorized<c10::qint32> inline operator+(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  const __m256i lhs = a;
+  const __m256i rhs = b;
+  return _mm256_add_epi32(lhs, rhs);
+}
+
+/*
+ * Requantize four int32 vectors to one vector of int8/uint8: every lane is
+ * scaled by `multiplier`, rounded, offset by `zp`, and narrowed to T with
+ * saturation.
+ */
+template <typename T>
+__m256i RequantizeAvx2(
+    const std::array<Vectorized<c10::qint32>, 4>& inp,
+    __m256 multiplier,
+    __m256i zp) {
+  static_assert(
+      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+      "Only int8_t/uint8_t are supported");
+  constexpr auto min_val = std::numeric_limits<T>::min();
+  constexpr auto max_val = std::numeric_limits<T>::max();
+
+  /* int32 -> float, scale, round back to int32, then add the zero point */
+  const __m256i x_v = _mm256_add_epi32(
+      _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier)),
+      zp);
+  const __m256i y_v = _mm256_add_epi32(
+      _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(inp[1]), multiplier)),
+      zp);
+  const __m256i z_v = _mm256_add_epi32(
+      _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(inp[2]), multiplier)),
+      zp);
+  const __m256i w_v = _mm256_add_epi32(
+      _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(inp[3]), multiplier)),
+      zp);
+
+  /* int32 -> int16 with signed saturation */
+  const __m256i xy_packed_v = _mm256_packs_epi32(x_v, y_v);
+  const __m256i zw_packed_v = _mm256_packs_epi32(z_v, w_v);
+
+  /* int16 -> T with saturation and clamping to [min_val, max_val] */
+  const __m256i packed_v =
+      pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);
+
+  /*
+   * The packed result is laid out as
+   *   x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7
+   * so a final permute is applied.
+   */
+  const __m256i lane_order =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  return _mm256_permutevar8x32_epi32(packed_v, lane_order);
+}
+
+template <> // Sets the is_vec_specialized_for trait to true for c10::qint8.
+struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};
+
+template <>
+struct Vectorized<c10::qint8> : public Vectorizedqi {
+  static constexpr int kSize = VECTOR_WIDTH;
+  static constexpr int size() {
+    return kSize;
+  }
+
+  static constexpr int kFloatNumVecs = kSize / Vectorized<float>::size();
+  static constexpr int float_num_vecs() {
+    return kFloatNumVecs;
+  }
+
+  static constexpr int kIntNumVecs = kSize / Vectorized<int>::size();
+  static constexpr int int_num_vecs() {
+    return kIntNumVecs;
+  }
+
+  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
+  using value_type = c10::qint8::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+
+  Vectorized() {}
+  Vectorized(__m256i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::qint8& val) {
+    value_type uw = val.val_;
+    vals = _mm256_set1_epi8(uw);
+  }
+
+  // This is needed because the compiler emits awful code for the default
+  // constructor for moving the enum
+  // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
+  C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+  C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+#endif
+  Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) {}
+  C10_CLANG_DIAGNOSTIC_POP()
+
+  // Writes the first `count` elements to `ptr`. A full-width unaligned store
+  // is used when `count` equals size(); otherwise only `count` elements are
+  // copied.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_si256((__m256i*)ptr, vals);
+      return;
+    }
+    memcpy(ptr, &vals, count * sizeof(value_type));
+  }
+
+  // Builds a vector from `ptr` through the pointer constructor.
+  static Vectorized<c10::qint8> loadu(const void* ptr) {
+    Vectorized<c10::qint8> loaded(ptr);
+    return loaded;
+  }
+
+  // Loads the first `count` elements from `ptr`; the remaining lanes are zero.
+  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type staging[size()];
+    // Zero the staging buffer so lanes past `count` have a defined value (see
+    // https://github.com/pytorch/pytorch/issues/32502). A loop is used
+    // instead of "={0}" because gcc would compile the latter to two
+    // instructions while a loop would be compiled to one instruction.
+    for (int lane = 0; lane < size(); ++lane) {
+      staging[lane] = 0;
+    }
+    const auto* source = reinterpret_cast<const value_type*>(ptr);
+    std::memcpy(staging, source, count * sizeof(value_type));
+    return _mm256_loadu_si256((const __m256i*)staging);
+  }
+
+ private:
+  // Sign-extends the low 8 bytes of `epi8_vals` to eight 32-bit lanes.
+  __m256i cvtepi8_epi32(__m128i epi8_vals) const {
+    const __m256i widened = _mm256_cvtepi8_epi32(epi8_vals);
+    return widened;
+  }
+
+ public:
+  // Dequantizes the 32 int8 lanes into four float vectors, each computed as
+  // fmadd(scale, float(q), scale_neg_zp_premul). The zero_point argument is
+  // unused in this overload.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> /*zero_point*/,
+      Vectorized<float> scale_neg_zp_premul) const {
+    // Each 64-bit chunk holds 8 int8 values: widen them to int32, then float.
+    const __m256 chunk0 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 0))));
+    const __m256 chunk1 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 1))));
+    const __m256 chunk2 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 2))));
+    const __m256 chunk3 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 3))));
+
+    return {
+        vec::fmadd(scale, Vectorized<float>(chunk0), scale_neg_zp_premul),
+        vec::fmadd(scale, Vectorized<float>(chunk1), scale_neg_zp_premul),
+        vec::fmadd(scale, Vectorized<float>(chunk2), scale_neg_zp_premul),
+        vec::fmadd(scale, Vectorized<float>(chunk3), scale_neg_zp_premul)};
+  }
+
+  // Dequantizes the 32 int8 lanes into four float vectors, each computed as
+  // (float(q) - zero_point) * scale.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    // Each 64-bit chunk holds 8 int8 values: widen them to int32, then float.
+    const __m256 chunk0 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 0))));
+    const __m256 chunk1 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 1))));
+    const __m256 chunk2 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 2))));
+    const __m256 chunk3 = _mm256_cvtepi32_ps(
+        cvtepi8_epi32(_mm_set1_epi64x(_mm256_extract_epi64(vals, 3))));
+
+    return {
+        (Vectorized<float>(chunk0) - zero_point) * scale,
+        (Vectorized<float>(chunk1) - zero_point) * scale,
+        (Vectorized<float>(chunk2) - zero_point) * scale,
+        (Vectorized<float>(chunk3) - zero_point) * scale};
+  }
+
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float /*scale*/,
+      int32_t zero_point,
+      float inverse_scale) {
+    // Views the array of float vectors as 32 contiguous floats and quantizes
+    // them into a stack buffer; only inverse_scale is consumed, never scale.
+    int8_t packed[32];
+    QuantizeAvx2<value_type>((float*)rhs.data(), packed, 32, inverse_scale, zero_point);
+    return Vectorized<c10::qint8>::loadu(packed);
+  }
+
+  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
+    return Vectorized<c10::qint8>(_mm256_max_epi8(vals, b.vals)); // lane-wise signed max
+  }
+
+  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
+    return Vectorized<c10::qint8>(_mm256_min_epi8(vals, b.vals)); // lane-wise signed min
+  }
+
+  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
+    return this->maximum(zero_point); // lanes below zero_point are raised to it
+  }
+
+  Vectorized<c10::qint8> relu6( // clamps every signed lane to [zero_point, q_six]
+      Vectorized<c10::qint8> zero_point,
+      Vectorized<c10::qint8> q_six) const { // const, like relu(): reads vals only
+    return _mm256_min_epi8(_mm256_max_epi8(vals, zero_point.vals), q_six.vals);
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    // Returns this - b computed in int32: both operands are sign-extended from
+    // int8 first, giving four qint32 vectors (one per 64-bit quarter).
+    const __m128i lhs_q0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    const __m128i lhs_q1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    const __m128i lhs_q2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    const __m128i lhs_q3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+    const __m128i rhs_q0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
+    const __m128i rhs_q1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
+    const __m128i rhs_q2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
+    const __m128i rhs_q3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));
+
+    const __m256i lhs0 = cvtepi8_epi32(lhs_q0);
+    const __m256i lhs1 = cvtepi8_epi32(lhs_q1);
+    const __m256i lhs2 = cvtepi8_epi32(lhs_q2);
+    const __m256i lhs3 = cvtepi8_epi32(lhs_q3);
+    const __m256i rhs0 = cvtepi8_epi32(rhs_q0);
+    const __m256i rhs1 = cvtepi8_epi32(rhs_q1);
+    const __m256i rhs2 = cvtepi8_epi32(rhs_q2);
+    const __m256i rhs3 = cvtepi8_epi32(rhs_q3);
+
+    const __m256i diff0 = _mm256_sub_epi32(lhs0, rhs0);
+    const __m256i diff1 = _mm256_sub_epi32(lhs1, rhs1);
+    const __m256i diff2 = _mm256_sub_epi32(lhs2, rhs2);
+    const __m256i diff3 = _mm256_sub_epi32(lhs3, rhs3);
+
+    return {
+        Vectorized<c10::qint32>(diff0),
+        Vectorized<c10::qint32>(diff1),
+        Vectorized<c10::qint32>(diff2),
+        Vectorized<c10::qint32>(diff3)};
+  }
+
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m256 scale_bcast = _mm256_set1_ps(multiplier); // same multiplier in all lanes
+    __m256i zp_bcast = _mm256_set1_epi32(zero_point); // same zero point in all lanes
+    return RequantizeAvx2<value_type>(inp, scale_bcast, zp_bcast);
+  }
+
+ private:
+  // Load-from-memory constructor: one unaligned 256-bit load starting at ptr.
+  Vectorized(const void* ptr) {
+    vals = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+};
+
+template <>
+inline Vectorized<c10::qint8> maximum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.maximum(b); // forwards to the member lane-wise max
+}
+
+template <>
+struct is_vec_specialized_for<c10::quint8> : std::true_type {}; // same type as bool_constant<true>
+
+// AVX2 specialization for unsigned 8-bit quantized values: a single 256-bit
+// register (`vals`, inherited from Vectorizedqi) holds kSize 8-bit lanes.
+// dequantize() and widening_subtract() fan out into kFloatNumVecs float
+// vectors / kIntNumVecs qint32 vectors, one per 64-bit quarter of `vals`.
+template <>
+struct Vectorized<c10::quint8> : public Vectorizedqi {
+  static constexpr int kSize = VECTOR_WIDTH;
+  static constexpr int size() { return kSize; }
+
+  static constexpr int kFloatNumVecs = kSize / Vectorized<float>::size();
+  static constexpr int float_num_vecs() { return kFloatNumVecs; }
+
+  static constexpr int kIntNumVecs = kSize / Vectorized<int>::size();
+  static constexpr int int_num_vecs() { return kIntNumVecs; }
+
+  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
+  using value_type = c10::quint8::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+  Vectorized() {}
+
+  Vectorized(__m256i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::quint8& val) {
+    value_type uw = val.val_;
+    vals = _mm256_set1_epi8(uw);
+  }
+
+  // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
+  C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+  C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+#endif
+  Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) {}
+  C10_CLANG_DIAGNOSTIC_POP()
+
+  // Writes the first `count` lanes to ptr. A full-width store is a single
+  // unaligned 256-bit store; otherwise count * sizeof(value_type) bytes are copied.
+  void store(void* ptr, int count = size()) const {
+    if (count != size()) {
+      std::memcpy(ptr, &vals, count * sizeof(value_type));
+    } else {
+      _mm256_storeu_si256((__m256i*)ptr, vals);
+    }
+  }
+
+  static Vectorized<c10::quint8> loadu(const void* ptr) {
+    return Vectorized<c10::quint8>(ptr); // full-width unaligned load
+  }
+
+  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy( // no bounds check: count must not exceed size()
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return _mm256_loadu_si256((const __m256i*)tmp_values);
+  }
+
+ private:
+  __m256i cvtepu8_epi32(__m128i epu8_vals) const {
+    return _mm256_cvtepu8_epi32(epu8_vals); // zero-extends the low 8 bytes to 8 x int32
+  }
+
+ public:
+  float_vec_return_type dequantize( // fmadd(scale, float(q), scale_zp_premul) per lane
+      Vectorized<float> scale,
+      Vectorized<float> /*zero_point*/,
+      Vectorized<float> scale_zp_premul) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 =
+        vec::fmadd(scale, Vectorized<float>(float_val0), scale_zp_premul);
+    auto val1 =
+        vec::fmadd(scale, Vectorized<float>(float_val1), scale_zp_premul);
+    auto val2 =
+        vec::fmadd(scale, Vectorized<float>(float_val2), scale_zp_premul);
+    auto val3 =
+        vec::fmadd(scale, Vectorized<float>(float_val3), scale_zp_premul);
+    return {val0, val1, val2, val3};
+  }
+
+  float_vec_return_type dequantize( // (float(q) - zero_point) * scale per lane
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 = (Vectorized<float>(float_val0) - zero_point) * scale;
+    auto val1 = (Vectorized<float>(float_val1) - zero_point) * scale;
+    auto val2 = (Vectorized<float>(float_val2) - zero_point) * scale;
+    auto val3 = (Vectorized<float>(float_val3) - zero_point) * scale;
+    return {val0, val1, val2, val3};
+  }
+
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float /*scale*/,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto* rhs_data = (float*)rhs.data(); // rhs viewed as size() contiguous floats
+    value_type quantized_values[size()]; // sized from the class, not a literal 32
+    QuantizeAvx2<value_type>(
+        rhs_data, quantized_values, size(), inverse_scale, zero_point);
+    return Vectorized<c10::quint8>::loadu(quantized_values);
+  }
+
+  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
+    return _mm256_max_epu8(vals, b.vals); // lane-wise unsigned max
+  }
+
+  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
+    return _mm256_min_epu8(vals, b.vals); // lane-wise unsigned min
+  }
+
+  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::quint8> relu6( // clamps every lane to [zero_point, q_six]
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) const { // const, like relu(): reads vals only
+    return _mm256_min_epu8(_mm256_max_epu8(vals, zero_point.vals), q_six.vals);
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256i int32_val0 = cvtepu8_epi32(int_val0);
+    __m256i int32_val1 = cvtepu8_epi32(int_val1);
+    __m256i int32_val2 = cvtepu8_epi32(int_val2);
+    __m256i int32_val3 = cvtepu8_epi32(int_val3);
+
+    __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
+    __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
+    __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
+    __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));
+
+    __m256i int32_b0 = cvtepu8_epi32(int_b0);
+    __m256i int32_b1 = cvtepu8_epi32(int_b1);
+    __m256i int32_b2 = cvtepu8_epi32(int_b2);
+    __m256i int32_b3 = cvtepu8_epi32(int_b3);
+
+    __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0); // int32 difference, may be negative
+    __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1);
+    __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2);
+    __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3);
+    return {
+        Vectorized<c10::qint32>(res_0),
+        Vectorized<c10::qint32>(res_1),
+        Vectorized<c10::qint32>(res_2),
+        Vectorized<c10::qint32>(res_3)};
+  }
+
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m256 multiplier_v = _mm256_set1_ps(multiplier); // broadcast to all lanes
+    __m256i zero_point_v = _mm256_set1_epi32(zero_point); // broadcast to all lanes
+    return RequantizeAvx2<value_type>(inp, multiplier_v, zero_point_v);
+  }
+
+ private:
+  // Load from memory constructor
+  Vectorized(const void* ptr) {
+    vals = _mm256_loadu_si256((const __m256i*)ptr);
+  }
+};
+
+template <>
+inline Vectorized<c10::quint8> maximum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.maximum(b); // forwards to the member lane-wise max
+}
+
+#elif !defined(CPU_CAPABILITY_SVE256)
+
+// NOTE: These are low-performance implementations that we fall back on
+// if we are not building with AVX2. This may not be an issue, because
+// currently for quantization we assume the user has at least AVX512
+// installed, so these can simply act as a reference implementation.
+//
+// If in the future we relax this requirement (AVX2+), we should probably
+// revisit these implementations
+
+template <
+    typename T,
+    typename float_vec_return_type_,
+    typename int_vec_return_type_,
+    int size_>
+struct VectorizedQuantizedConverter {
+  // Shared scalar base of the fallback quantized vectors: size_ lanes of
+  // T::underlying live in a std::array and are handled one element at a time.
+  using value_type = typename T::underlying;
+  using float_vec_return_type = float_vec_return_type_;
+  using int_vec_return_type = int_vec_return_type_;
+
+  std::array<value_type, size_> vals;
+
+  static constexpr int size() {
+    return size_;
+  }
+
+  static constexpr int float_num_vecs() {
+    return size_ / Vectorized<float>::size();
+  }
+
+  static constexpr int int_num_vecs() {
+    return size_ / Vectorized<int>::size();
+  }
+
+  VectorizedQuantizedConverter(T val) { // broadcast: every lane gets val.val_
+    vals.fill(val.val_);
+  }
+
+  VectorizedQuantizedConverter(const void* ptr) { // load: copies size() elements
+    memcpy(vals.data(), ptr, sizeof(value_type) * size());
+  }
+
+  void store(void* ptr, int count = size()) const { // writes the first count lanes
+    memcpy(ptr, vals.data(), count * sizeof(value_type));
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> /*scale_zp_premul*/) const {
+    constexpr auto lanes = Vectorized<float>::size();
+    float_vec_return_type result;
+    for (const auto vec_idx : c10::irange(float_num_vecs())) {
+      float lane_buf[lanes];
+      for (const auto lane : c10::irange(lanes)) {
+        const T quantized(vals[lanes * vec_idx + lane]);
+        lane_buf[lane] = at::native::dequantize_val<T>(
+            scale[lane], zero_point[lane], quantized);
+      }
+      result[vec_idx] = Vectorized<float>(lane_buf);
+    }
+    return result;
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    Vectorized<float> unused_premul; // ignored by the three-argument overload
+    return dequantize(scale, zero_point, unused_premul);
+  }
+
+ protected:
+  VectorizedQuantizedConverter() {}
+};
+
+template <> // scalar fallback used when AVX2 is unavailable: lanes are a plain std::array
+struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
+                                     c10::qint32,
+                                     std::array<Vectorized<float>, 1>,
+                                     std::array<Vectorized<c10::qint32>, 1>,
+                                     Vectorized<int>::size()> {
+  using VectorizedQuantizedConverter::VectorizedQuantizedConverter;
+
+  static Vectorized<c10::qint32> loadu(const void* ptr) {
+    return Vectorized<c10::qint32>(ptr); // copies size() elements from ptr
+  }
+
+  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy( // no bounds check: count must not exceed size()
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return Vectorized<c10::qint32>(tmp_values);
+  }
+
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) { // flatten rhs into one float buffer
+      rhs[i].store(&float_vals[i * Vectorized<float>::size()]);
+    }
+
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint32*)qvals.data(),
+        float_vals.size());
+
+    return Vectorized<c10::qint32>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
+    Vectorized<c10::qint32> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
+    Vectorized<c10::qint32> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::qint32> relu6( // clamps every lane to [zero_point, q_six]
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) const { // const, like relu(): reads vals only
+    Vectorized<c10::qint32> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    int_vec_return_type retval; // one output vector: lanes are already 32-bit
+    for (const auto i : c10::irange(size())) {
+      retval[0].vals[i] = vals[i] - b.vals[i];
+    }
+    return retval;
+  }
+
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized<c10::qint32> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = // nearbyint(inp * multiplier) + zero_point, no clamping here
+          std::nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
+          zero_point;
+    }
+    return retval;
+  }
+};
+
+template <>
+inline Vectorized<c10::qint32> maximum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.maximum(b); // forwards to the member lane-wise max
+}
+
+template <>
+inline Vectorized<c10::qint32> operator*(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> product; // lane-wise a * b on the underlying integers
+  for (const auto lane : c10::irange(Vectorized<c10::qint32>::size())) {
+    product.vals[lane] = a.vals[lane] * b.vals[lane];
+  }
+  return product;
+}
+
+template <>
+inline Vectorized<c10::qint32> operator+(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> sum; // lane-wise a + b on the underlying integers
+  for (const auto lane : c10::irange(Vectorized<c10::qint32>::size())) {
+    sum.vals[lane] = a.vals[lane] + b.vals[lane];
+  }
+  return sum;
+}
+
+template <>
+struct is_vec_specialized_for<c10::qint8> : std::true_type {}; // same type as bool_constant<true>
+
+template <> // scalar fallback used when AVX2 is unavailable: lanes are a plain std::array
+struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
+                                    c10::qint8,
+                                    std::array<Vectorized<float>, 4>,
+                                    std::array<Vectorized<c10::qint32>, 4>,
+                                    4 * Vectorized<float>::size()> {
+  using VectorizedQuantizedConverter::VectorizedQuantizedConverter;
+
+  static Vectorized<c10::qint8> loadu(const void* ptr) {
+    return Vectorized<c10::qint8>(ptr); // copies size() elements from ptr
+  }
+
+  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy( // no bounds check: count must not exceed size()
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return Vectorized<c10::qint8>(tmp_values);
+  }
+
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) { // flatten rhs into one float buffer
+      rhs[i].store(&float_vals[i * Vectorized<float>::size()]);
+    }
+
+    at::native::quantize_vec<c10::qint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint8*)qvals.data(),
+        float_vals.size());
+
+    return Vectorized<c10::qint8>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
+    Vectorized<c10::qint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
+    Vectorized<c10::qint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::qint8> relu6( // clamps every lane to [zero_point, q_six]
+      Vectorized<c10::qint8> zero_point,
+      Vectorized<c10::qint8> q_six) const { // const, like relu(): reads vals only
+    Vectorized<c10::qint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] = // subtract in int32 so the 8-bit difference cannot wrap
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::qint8> retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] = // saturate to the value_type range
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+template <>
+inline Vectorized<c10::qint8> maximum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.maximum(b); // forwards to the member lane-wise max
+}
+
+template <>
+struct is_vec_specialized_for<c10::quint8> : std::true_type {}; // same type as bool_constant<true>
+
+template <> // scalar fallback used when AVX2 is unavailable: lanes are a plain std::array
+struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
+                                     c10::quint8,
+                                     std::array<Vectorized<float>, 4>,
+                                     std::array<Vectorized<c10::qint32>, 4>,
+                                     4 * Vectorized<float>::size()> {
+  using VectorizedQuantizedConverter::VectorizedQuantizedConverter;
+
+  static Vectorized<c10::quint8> loadu(const void* ptr) {
+    return Vectorized<c10::quint8>(ptr); // copies size() elements from ptr
+  }
+
+  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy( // no bounds check: count must not exceed size()
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return Vectorized<c10::quint8>(tmp_values);
+  }
+
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) { // flatten rhs into one float buffer
+      rhs[i].store(&float_vals[i * Vectorized<float>::size()]);
+    }
+
+    at::native::quantize_vec<c10::quint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::quint8*)qvals.data(),
+        float_vals.size());
+
+    return Vectorized<c10::quint8>::loadu(qvals.data());
+  }
+
+  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
+    Vectorized<c10::quint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
+    Vectorized<c10::quint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized<c10::quint8> relu6( // clamps every lane to [zero_point, q_six]
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) const { // const, like relu(): reads vals only
+    Vectorized<c10::quint8> retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min<value_type>(
+          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] = // subtract in int32 so the result can go negative
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::quint8> retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] = // saturate to the value_type range
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+template <>
+inline Vectorized<c10::quint8> maximum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.maximum(b); // forwards to the member lane-wise max
+}
+
+#endif // if defined(CPU_CAPABILITY_AVX2)
+
+#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
+std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
+    at::vec::Vectorized<int8_t> src) {
+  // Only the low eight int8 lanes of src are converted: they are widened to
+  // int16, split into two int32x4 halves, and each half is converted to float.
+  const auto widened16 = vmovl_s8(vget_low_s8(src));
+  const auto lo32 = vmovl_s16(vget_low_s16(widened16));
+  const auto hi32 = vmovl_s16(vget_high_s16(widened16));
+
+  const Vectorized<float> first(vcvtq_f32_s32(lo32));
+  const Vectorized<float> second(vcvtq_f32_s32(hi32));
+  return std::make_pair(first, second);
+}
+
+std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
+    at::vec::Vectorized<uint8_t> src) {
+  // Unsigned variant: low eight uint8 lanes -> uint16 -> two uint32x4 -> float.
+  const auto widened16 = vmovl_u8(vget_low_u8(src));
+  const auto lo32 = vmovl_u16(vget_low_u16(widened16));
+  const auto hi32 = vmovl_u16(vget_high_u16(widened16));
+
+  const Vectorized<float> first(vcvtq_f32_u32(lo32));
+  const Vectorized<float> second(vcvtq_f32_u32(hi32));
+  return std::make_pair(first, second);
+}
+
+Vectorized<float> inline convert_int8_half_register_to_float(
+    at::vec::Vectorized<int8_t> src) {
+  // Converts only the lowest four int8 lanes of src: widen to int16, keep
+  // the low int16x4 half, widen that to int32x4 and convert to float.
+  const auto widened16 = vmovl_s8(vget_low_s8(src));
+  const auto lo32 = vmovl_s16(vget_low_s16(widened16));
+
+  return Vectorized<float>(vcvtq_f32_s32(lo32));
+}
+
+Vectorized<float> inline convert_int8_half_register_to_float(
+    at::vec::Vectorized<uint8_t> src) {
+  // Unsigned variant: only the lowest four uint8 lanes of src are converted.
+  const auto widened16 = vmovl_u8(vget_low_u8(src));
+  const auto lo32 = vmovl_u16(vget_low_u16(widened16));
+
+  return Vectorized<float>(vcvtq_f32_u32(lo32));
+}
+
+#endif
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2cba8d412f2b1f8c5ba60d77d9a42c1ed0639b0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h
@@ -0,0 +1,80 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
+    const Vectorized<BFloat16>& a) {
+  // Spill the bf16 lanes, widen them with convert(), reload as two float vectors.
+  constexpr int64_t lanes = Vectorized<BFloat16>::size();
+  __at_align__ BFloat16 bf16_buf[lanes];
+  __at_align__ float f32_buf[lanes];
+  a.store(bf16_buf);
+  convert(bf16_buf, f32_buf, lanes);
+  const float* upper = f32_buf + Vectorized<float>::size();
+  return std::make_tuple(Vectorized<float>::loadu(f32_buf), Vectorized<float>::loadu(upper));
+}
+
+inline Vectorized<BFloat16> convert_float_bfloat16(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Store a then b back to back, narrow with convert(), reload as one bf16 vector.
+  constexpr int64_t lanes = Vectorized<BFloat16>::size();
+  __at_align__ BFloat16 bf16_buf[lanes];
+  __at_align__ float f32_buf[lanes];
+  a.store(f32_buf);
+  b.store(f32_buf + Vectorized<float>::size());
+  convert(f32_buf, bf16_buf, lanes); return Vectorized<BFloat16>::loadu(bf16_buf);
+}
+
+inline void load_fp32_from_bf16(
+    const c10::BFloat16* data,
+    Vectorized<float>& out) {
+  // Widens one float vector's worth of bf16 elements through a stack buffer.
+  constexpr auto lanes = Vectorized<float>::size();
+  __at_align__ float widened[lanes];
+  for (int idx = 0; idx < lanes; ++idx) { widened[idx] = data[idx]; }
+  out = Vectorized<float>::loadu(widened);
+}
+
+inline void load_fp32_from_bf16(
+    const c10::BFloat16* data,
+    Vectorized<float>& out1,
+    Vectorized<float>& out2) {
+  // out1 receives the first vector's worth of elements, out2 the following one.
+  load_fp32_from_bf16(data, out1);
+  load_fp32_from_bf16(data + Vectorized<float>::size(), out2);
+}
+
+inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
+  // Widens one float vector's worth of fp16 elements through a stack buffer.
+  constexpr auto lanes = Vectorized<float>::size();
+  __at_align__ float widened[lanes];
+  for (int idx = 0; idx < lanes; ++idx) { widened[idx] = data[idx]; }
+  out = Vectorized<float>::loadu(widened);
+}
+
+inline void load_fp32_from_fp16(
+    const c10::Half* data,
+    Vectorized<float>& out1,
+    Vectorized<float>& out2) {
+  // out1 receives the first vector's worth of elements, out2 the following one.
+  load_fp32_from_fp16(data, out1);
+  load_fp32_from_fp16(data + Vectorized<float>::size(), out2);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..849f75c2854a361c936288792495f3b6ae0af801
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h
@@ -0,0 +1,255 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+// Note: header order is important here
+#include <ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h>
+
+#include <ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h>
+
+#include <ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h>
+#include <ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h>
+
+namespace at {
+namespace vec {
+
+inline namespace CPU_CAPABILITY {
+
+DEFINE_CLAMP_FUNCS(c10::quint8)
+DEFINE_CLAMP_FUNCS(c10::qint8)
+DEFINE_CLAMP_FUNCS(c10::qint32)
+DEFINE_CLAMP_FUNCS(int16_t)
+DEFINE_CLAMP_FUNCS(int32_t)
+DEFINE_CLAMP_FUNCS(int64_t)
+DEFINE_CLAMP_FUNCS(float)
+DEFINE_CLAMP_FUNCS(double)
+
+// fmadd for double vectors: a * b + c, computed with vec_madd on each of the
+// two underlying VSX registers (vec0 / vec1).
+template <>
+Vectorized<double> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  auto lo = vec_madd(a.vec0(), b.vec0(), c.vec0());
+  auto hi = vec_madd(a.vec1(), b.vec1(), c.vec1());
+  return Vectorized<double>{lo, hi};
+}
+
+// fmadd for int64_t vectors: plain lane-wise a * b + c on each underlying
+// register (no dedicated multiply-add intrinsic is used here).
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b,
+    const Vectorized<int64_t>& c) {
+  auto lo = a.vec0() * b.vec0() + c.vec0();
+  auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int64_t>{lo, hi};
+}
+// fmadd for int32_t vectors: plain lane-wise a * b + c on each underlying
+// register.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b,
+    const Vectorized<int32_t>& c) {
+  auto lo = a.vec0() * b.vec0() + c.vec0();
+  auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int32_t>{lo, hi};
+}
+// fmadd for int16_t vectors: plain lane-wise a * b + c on each underlying
+// register.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b,
+    const Vectorized<int16_t>& c) {
+  auto lo = a.vec0() * b.vec0() + c.vec0();
+  auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int16_t>{lo, hi};
+}
+
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t)
+
+// Convert each double lane to a signed 64-bit integer lane via vec_signed.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+convert_to_int_of_same_size<double>(const Vectorized<double>& src) {
+  auto lo = vec_signed(src.vec0());
+  auto hi = vec_signed(src.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Convert each float lane to a signed 32-bit integer lane via vec_signed.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+convert_to_int_of_same_size<float>(const Vectorized<float>& src) {
+  auto lo = vec_signed(src.vec0());
+  auto hi = vec_signed(src.vec1());
+  return Vectorized<int32_t>{lo, hi};
+}
+
+// Convert `n` int32_t values at `src` to float at `dst`.
+// Full Vectorized<float>-sized chunks go through VSX (two 16-byte registers
+// per chunk); whatever is left over is converted with scalar casts.
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  // int32_t and float have the same size, so input and output advance in
+  // lock-step by one Vectorized<float> worth of elements.
+  const int64_t lanes = Vectorized<float>::size();
+  int64_t pos = 0;
+  for (; pos <= n - lanes; pos += lanes) {
+    const vint32* in = reinterpret_cast<const vint32*>(src + pos);
+    float* out = dst + pos;
+    // Both loads are issued before either store, as in the reference order.
+    vint32 lo_ints = vec_vsx_ld(offset0, in);
+    vint32 hi_ints = vec_vsx_ld(offset16, in);
+    vfloat32 lo_floats = vec_float(lo_ints);
+    vfloat32 hi_floats = vec_float(hi_ints);
+    vec_vsx_st(lo_floats, offset0, out);
+    vec_vsx_st(hi_floats, offset16, out);
+  }
+
+  // Scalar tail for the remaining (< lanes) elements.
+  while (pos < n) {
+    dst[pos] = static_cast<float>(src[pos]);
+    ++pos;
+  }
+}
+
+// Convert `n` int64_t values at `src` to double at `dst`.
+// Full Vectorized<double>-sized chunks go through VSX (two 16-byte registers
+// per chunk); whatever is left over is converted with scalar casts.
+template <>
+inline void convert(const int64_t* src, double* dst, int64_t n) {
+  const int64_t lanes = Vectorized<double>::size();
+  int64_t pos = 0;
+  for (; pos <= n - lanes; pos += lanes) {
+    const vint64* in = reinterpret_cast<const vint64*>(src + pos);
+    double* out = dst + pos;
+    // Both loads are issued before either store, as in the reference order.
+    vint64 lo_ints = vec_vsx_ld(offset0, in);
+    vint64 hi_ints = vec_vsx_ld(offset16, in);
+    vfloat64 lo_doubles = vec_double(lo_ints);
+    vfloat64 hi_doubles = vec_double(hi_ints);
+    vec_vsx_st(lo_doubles, offset0, out);
+    vec_vsx_st(hi_doubles, offset16, out);
+  }
+  // Scalar tail for the remaining (< lanes) elements.
+  while (pos < n) {
+    dst[pos] = static_cast<double>(src[pos]);
+    ++pos;
+  }
+}
+// Generic implementation to fix compiler error
+// TO-DO : Add optimized version for ppc64
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_half_float(
+    const Vectorized<Half>& a) {
+  constexpr int64_t K = Vectorized<Half>::size();
+  __at_align__ float arr[K];
+  __at_align__ Half arr2[K];
+  a.store(arr2);
+  convert(arr2, arr, K);
+  return std::make_tuple(
+      Vectorized<float>::loadu(arr),
+      Vectorized<float>::loadu(arr + Vectorized<float>::size()));
+}
+
+// Narrow two Vectorized<float> into one Vectorized<Half>: `a` supplies the
+// leading Vectorized<float>::size() values and `b` the values that follow.
+// Uses the generic `convert` through aligned staging buffers.
+inline Vectorized<Half> convert_float_half(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  constexpr int64_t K = Vectorized<Half>::size();
+  __at_align__ float arr[K];
+  __at_align__ Half arr2[K];
+  a.store(arr);
+  b.store(arr + Vectorized<float>::size());
+  convert(arr, arr2, K);
+  return Vectorized<Half>::loadu(arr2);
+}
+
+// Interleave two double vectors: lane i of `a` is paired with lane i of `b`.
+// The first returned vector holds the pairs built from vec0 of the inputs,
+// the second the pairs built from vec1.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a      = {a0, a1, a2, a3}
+  //   b      = {b0, b1, b2, b3}
+
+  // Selector 0 pairs the first element of each operand, selector 3 pairs the
+  // second element of each operand.
+  vfloat64 ab00 = vec_xxpermdi(a.vec0(), b.vec0(), 0);
+  vfloat64 ab11 = vec_xxpermdi(a.vec0(), b.vec0(), 3);
+  vfloat64 ab2_00 = vec_xxpermdi(a.vec1(), b.vec1(), 0);
+  vfloat64 ab2_11 = vec_xxpermdi(a.vec1(), b.vec1(), 3);
+  //   return {a0, b0, a1, b1}
+  //          {a2, b2, a3, b3}
+  return std::make_pair(
+      Vectorized<double>{ab00, ab11}, Vectorized<double>{ab2_00, ab2_11});
+}
+
+// Inverse of interleave2<double>: splits two vectors of interleaved (a, b)
+// pairs back into one vector of all a-values and one of all b-values.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+  // Selector 0 gathers the first element of each register (the a-values).
+  vfloat64 aa01 = vec_xxpermdi(a.vec0(), a.vec1(), 0);
+  vfloat64 aa23 = vec_xxpermdi(b.vec0(), b.vec1(), 0);
+
+  // Selector 3 gathers the second element of each register (the b-values).
+  vfloat64 bb_01 = vec_xxpermdi(a.vec0(), a.vec1(), 3);
+  vfloat64 bb_23 = vec_xxpermdi(b.vec0(), b.vec1(), 3);
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3}
+  //          {b0, b1, b2, b3}
+  return std::make_pair(
+      Vectorized<double>{aa01, aa23}, Vectorized<double>{bb_01, bb_23});
+}
+
+// Interleave two float vectors: lane i of `a` is paired with lane i of `b`.
+// The first returned vector holds the pairs built from vec0 of the inputs,
+// the second the pairs built from vec1.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs (",," marks the boundary between vec0 and vec1):
+  //   a = {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3,, b4, b5, b6, b7}
+
+  // vec_mergeh interleaves the first two elements of each operand,
+  // vec_mergel the last two.
+  vfloat32 ab0011 = vec_mergeh(a.vec0(), b.vec0());
+  vfloat32 ab2233 = vec_mergel(a.vec0(), b.vec0());
+
+  vfloat32 ab2_0011 = vec_mergeh(a.vec1(), b.vec1());
+  vfloat32 ab2_2233 = vec_mergel(a.vec1(), b.vec1());
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5,, a6, b6, a7, b7}
+
+  return std::make_pair(
+      Vectorized<float>{ab0011, ab2233}, Vectorized<float>{ab2_0011, ab2_2233});
+}
+
+// Inverse of interleave2<float>: splits two vectors of interleaved (a, b)
+// pairs back into one vector of all a-values and one of all b-values, using
+// two rounds of merge-high / merge-low per input vector.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs (",," marks the boundary between vec0 and vec1):
+  //   a = {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5,, a6, b6, a7, b7}
+
+  // First round on `a`: {a0,a2,b0,b2} {a1,a3,b1,b3}
+  vfloat32 a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1());
+  vfloat32 a1a3b1b3 = vec_mergel(a.vec0(), a.vec1());
+
+  // Second round on `a`: {a0,a1,a2,a3} and {b0,b1,b2,b3}
+  vfloat32 aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3);
+  vfloat32 bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3);
+
+  // Same two rounds on `b`, yielding {a4,a5,a6,a7} and {b4,b5,b6,b7}.
+  vfloat32 a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1());
+  vfloat32 a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1());
+
+  vfloat32 aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2);
+  vfloat32 bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2);
+
+  // This could be done with vec_perm, too.
+  // swap lanes:
+  //   return {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3,, b4, b5, b6, b7}
+
+  return std::make_pair(
+      Vectorized<float>{aa0123, aa0123_2}, Vectorized<float>{bb0123, bb0123_2});
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cc03ca753ae4817b50565c03c732ba3b763a973
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
@@ -0,0 +1,684 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+using ComplexDbl = c10::complex<double>;
+
+template <>
+struct is_vec_specialized_for<ComplexDbl> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<ComplexDbl> {
+  union {
+    struct {
+      vfloat64 _vec0;
+      vfloat64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = ComplexDbl;
+  using vec_internal_type = vfloat64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 2;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2)
+      : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+
+  Vectorized(ComplexDbl val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    _vec0 = vfloat64{real_value, imag_value};
+    _vec1 = vfloat64{real_value, imag_value};
+  }
+  Vectorized(ComplexDbl val1, ComplexDbl val2) {
+    _vec0 = vfloat64{val1.real(), val1.imag()};
+    _vec1 = vfloat64{val2.real(), val2.imag()};
+  }
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template <int64_t mask>
+  static std::
+      enable_if_t<blendChoiceComplexDbl(mask) == 0, Vectorized<ComplexDbl>>
+          C10_ALWAYS_INLINE blend(
+              const Vectorized<ComplexDbl>& a,
+              const Vectorized<ComplexDbl>& b) {
+    return a;
+  }
+
+  template <int64_t mask>
+  static std::
+      enable_if_t<blendChoiceComplexDbl(mask) == 1, Vectorized<ComplexDbl>>
+          C10_ALWAYS_INLINE blend(
+              const Vectorized<ComplexDbl>& a,
+              const Vectorized<ComplexDbl>& b) {
+    return b;
+  }
+
+  template <int64_t mask>
+  static std::
+      enable_if_t<blendChoiceComplexDbl(mask) == 2, Vectorized<ComplexDbl>>
+          C10_ALWAYS_INLINE blend(
+              const Vectorized<ComplexDbl>& a,
+              const Vectorized<ComplexDbl>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <int64_t mask>
+  static std::
+      enable_if_t<blendChoiceComplexDbl(mask) == 3, Vectorized<ComplexDbl>>
+          C10_ALWAYS_INLINE blend(
+              const Vectorized<ComplexDbl>& a,
+              const Vectorized<ComplexDbl>& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template <int64_t mask>
+  static Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+  el_blend(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+    const vbool64 mask_1st = VsxDblMask1(mask);
+    const vbool64 mask_2nd = VsxDblMask2(mask);
+    return {
+        (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Per-complex-element select between `a` and `b` driven by `mask`.
+  // Only the first (real) lane of each mask element is consulted: it is
+  // splatted across both lanes of the element so the real and imaginary parts
+  // are always selected together. vec_sel takes bits from `b` where the mask
+  // bit is set and from `a` otherwise.
+  static Vectorized<ComplexDbl> blendv(
+      const Vectorized<ComplexDbl>& a,
+      const Vectorized<ComplexDbl>& b,
+      const Vectorized<ComplexDbl>& mask) {
+    // convert std::complex<V> index mask to V index mask: xy -> xxyy
+    auto mask_complex = Vectorized<ComplexDbl>(
+        vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0));
+    return {
+        vec_sel(a._vec0, b._vec0, mask_complex._vecb0),
+        vec_sel(a._vec1, b._vec1, mask_complex._vecb1)};
+  }
+
+  static Vectorized<ComplexDbl> C10_ALWAYS_INLINE elwise_blendv(
+      const Vectorized<ComplexDbl>& a,
+      const Vectorized<ComplexDbl>& b,
+      const Vectorized<ComplexDbl>& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  template <typename step_t>
+  static Vectorized<ComplexDbl> arange(
+      ComplexDbl base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<ComplexDbl>(base, base + step);
+  }
+  // Select between `a` and `b` based on `count`:
+  //   count == 0 -> a, count == 1 -> blend<1>(a, b), any other value -> b.
+  static Vectorized<ComplexDbl> set(
+      const Vectorized<ComplexDbl>& a,
+      const Vectorized<ComplexDbl>& b,
+      int64_t count = size()) {
+    if (count == 0) {
+      return a;
+    }
+    if (count == 1) {
+      return blend<1>(a, b);
+    }
+    return b;
+  }
+
+  // Unaligned load of up to `count` complex elements from `ptr`.
+  //   count == size(): both registers are loaded directly from `ptr`.
+  //   otherwise: at most size() elements are copied into a zero-initialized
+  //   aligned temporary, so lanes beyond `count` read as zero.
+  // A non-positive `count` copies nothing and yields an all-zero vector; this
+  // mirrors the `count > 0` guard in store() and keeps a negative count from
+  // being converted into a huge memcpy length.
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const double*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const double*>(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    if (count > 0) {
+      std::memcpy(
+          tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    }
+
+    return {
+        vec_vsx_ld(offset0, reinterpret_cast<const double*>(tmp_values)),
+        vec_vsx_ld(offset16, reinterpret_cast<const double*>(tmp_values))};
+  }
+  // Unaligned store of up to `count` complex elements to `ptr`.
+  //   count == size(): both registers are written directly to `ptr`.
+  //   any other positive count: the vector is staged in an aligned temporary
+  //   and at most size() elements are memcpy'd out.
+  //   count <= 0: nothing is written.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<double*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<double*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<double*>(tmp_values));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<double*>(tmp_values));
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  const ComplexDbl& operator[](int idx) const = delete;
+  ComplexDbl& operator[](int idx) = delete;
+
+  // Apply the scalar function `f` (taking its argument by value) to every
+  // complex element and return the results as a new vector.
+  Vectorized<ComplexDbl> map(ComplexDbl (*const f)(ComplexDbl)) const {
+    __at_align__ ComplexDbl lanes[size()];
+    store(lanes);
+    for (auto& lane : lanes) {
+      lane = f(lane);
+    }
+    return loadu(lanes);
+  }
+
+  // Apply the scalar function `f` (taking its argument by const reference) to
+  // every complex element and return the results as a new vector.
+  Vectorized<ComplexDbl> map(ComplexDbl (*const f)(const ComplexDbl&)) const {
+    __at_align__ ComplexDbl lanes[size()];
+    store(lanes);
+    for (auto& lane : lanes) {
+      lane = f(lane);
+    }
+    return loadu(lanes);
+  }
+
+  Vectorized<ComplexDbl> el_swapped() const {
+    vfloat64 v0 = vec_xxpermdi(_vec0, _vec0, 2);
+    vfloat64 v1 = vec_xxpermdi(_vec1, _vec1, 2);
+    return {v0, v1};
+  }
+
+  Vectorized<ComplexDbl> el_madd(
+      const Vectorized<ComplexDbl>& multiplier,
+      const Vectorized<ComplexDbl>& val) const {
+    return {
+        vec_madd(_vec0, multiplier._vec0, val._vec0),
+        vec_madd(_vec1, multiplier._vec1, val._vec1)};
+  }
+
+  Vectorized<ComplexDbl> el_mergeo() const {
+    vfloat64 v0 = vec_splat(_vec0, 1);
+    vfloat64 v1 = vec_splat(_vec1, 1);
+    return {v0, v1};
+  }
+
+  Vectorized<ComplexDbl> el_mergee() const {
+    vfloat64 v0 = vec_splat(_vec0, 0);
+    vfloat64 v1 = vec_splat(_vec1, 0);
+    return {v0, v1};
+  }
+
+  static Vectorized<ComplexDbl> el_mergee(
+      const Vectorized<ComplexDbl>& first,
+      const Vectorized<ComplexDbl>& second) {
+    return {
+        vec_mergeh(first._vec0, second._vec0),
+        vec_mergeh(first._vec1, second._vec1)};
+  }
+
+  static Vectorized<ComplexDbl> el_mergeo(
+      const Vectorized<ComplexDbl>& first,
+      const Vectorized<ComplexDbl>& second) {
+    return {
+        vec_mergel(first._vec0, second._vec0),
+        vec_mergel(first._vec1, second._vec1)};
+  }
+
+  // Squared magnitude of each complex element, written to BOTH lanes of that
+  // element: {re^2 + im^2, re^2 + im^2}.
+  Vectorized<ComplexDbl> abs_2_() const {
+    auto a = (*this).elwise_mult(*this); // {re^2, im^2}
+    auto permuted = a.el_swapped(); // {im^2, re^2}
+    a = a + permuted;
+    return a;
+  }
+
+  // Magnitude of each complex element via Sleef hypot, written to both lanes
+  // of that element (abs() masks it down to the real lane).
+  Vectorized<ComplexDbl> abs_() const {
+    auto vi = el_mergeo(); // imaginary part splatted to both lanes
+    auto vr = el_mergee(); // real part splatted to both lanes
+    return {
+        Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0),
+        Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)};
+  }
+
+  Vectorized<ComplexDbl> abs() const {
+    return abs_() & vd_real_mask;
+  }
+
+  // Phase angle of each complex element, stored in the real lane.
+  // angle = atan2(imag, real), evaluated with scalar std::atan2 per element.
+  // The imaginary lane is explicitly zeroed so the result never carries
+  // indeterminate values (previously it was left uninitialized and relied on
+  // angle() masking it away).
+  Vectorized<ComplexDbl> angle_() const {
+    return {
+        vfloat64{std::atan2(_vec0[1], _vec0[0]), 0.0},
+        vfloat64{std::atan2(_vec1[1], _vec1[0]), 0.0}};
+  }
+
+  Vectorized<ComplexDbl> angle() const {
+    return angle_() & vd_real_mask;
+  }
+
+  Vectorized<ComplexDbl> real_() const {
+    return *this & vd_real_mask;
+  }
+  Vectorized<ComplexDbl> real() const {
+    return *this & vd_real_mask;
+  }
+  Vectorized<ComplexDbl> imag_() const {
+    return *this & vd_imag_mask;
+  }
+  Vectorized<ComplexDbl> imag() const {
+    return imag_().el_swapped();
+  }
+
+  Vectorized<ComplexDbl> conj_() const {
+    return *this ^ vd_isign_mask;
+  }
+  Vectorized<ComplexDbl> conj() const {
+    return *this ^ vd_isign_mask;
+  }
+
+  // Natural logarithm of each complex element, computed per element with the
+  // scalar std::log through map().
+  Vectorized<ComplexDbl> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+
+  Vectorized<ComplexDbl> log2() const {
+    // log2eB_inv
+    auto ret = log();
+    return ret.elwise_mult(vd_log2e_inv);
+  }
+  Vectorized<ComplexDbl> log10() const {
+    auto ret = log();
+    return ret.elwise_mult(vd_log10e_inv);
+  }
+
+  Vectorized<ComplexDbl> log1p() const {
+    return map(std::log1p);
+  }
+
+  // Complex arcsine via the logarithmic identity below, for z = a + bi stored
+  // as {a, b}. The lane annotations assume vd_isign_mask flips the sign of
+  // the imaginary lane and el_blend<0x0A> takes the imaginary lanes from its
+  // second argument -- NOTE(review): confirm against vsx_helpers.h.
+  Vectorized<ComplexDbl> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    auto conj = conj_(); // {a, -b}
+    auto b_a = conj.el_swapped(); // {-b, a} == i*z
+    auto ab = conj.elwise_mult(b_a); // {-ab, -ab}
+    auto im = ab + ab; // {-2ab, -2ab}
+    auto val_2 = (*this).elwise_mult(*this); // {a^2, b^2}
+    auto val_2_swapped = val_2.el_swapped(); // {b^2, a^2}
+    auto re = horizontal_sub(val_2, val_2_swapped); // {a^2 - b^2, b^2 - a^2}
+    re = Vectorized<ComplexDbl>(vd_one) - re; // real lane: 1 - (a^2 - b^2)
+    auto root = el_blend<0x0A>(re, im).sqrt(); // sqrt(1 - z^2)
+    auto ln = (b_a + root).log(); // ln(iz + sqrt(1 - z^2))
+    return ln.el_swapped().conj(); // multiply by -i: {x, y} -> {y, -x}
+  }
+
+  Vectorized<ComplexDbl> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized(vd_pi_2) - asin();
+  }
+
+  Vectorized<ComplexDbl> atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized(vd_imag_one);
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln * vd_imag_half; // i/2*ln()
+  }
+  Vectorized<ComplexDbl> atanh() const {
+    return map(std::atanh);
+  }
+
+  Vectorized<ComplexDbl> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<ComplexDbl> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<ComplexDbl> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<ComplexDbl> cosh() const {
+    return map(std::cosh);
+  }
+
+  Vectorized<ComplexDbl> tan() const {
+    return map(std::tan);
+  }
+  Vectorized<ComplexDbl> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<ComplexDbl> ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized<ComplexDbl> floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized<ComplexDbl> neg() const {
+    auto z = Vectorized<ComplexDbl>(vd_zero);
+    return z - *this;
+  }
+  Vectorized<ComplexDbl> round() const {
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+
+  Vectorized<ComplexDbl> trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized<ComplexDbl> elwise_sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+
+  Vectorized<ComplexDbl> sqrt() const {
+    return map(std::sqrt);
+  }
+
+  // Reciprocal of each complex element: conj(z) / |z|^2.
+  Vectorized<ComplexDbl> reciprocal() const {
+    // 1 / (c + di) = (c - di) / (c^2 + d^2)
+    // re =  c / abs_2()
+    // im = -d / abs_2()
+    auto c_d = *this ^ vd_isign_mask; // c       -d
+    auto abs = abs_2_(); // |z|^2 in both lanes
+    return c_d.elwise_div(abs);
+  }
+
+  Vectorized<ComplexDbl> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  // Horizontal add modelled on x86 _mm_hadd_*: for each complex element the
+  // real and imaginary lanes are summed, and the sums from `first` and
+  // `second` are interleaved:
+  //   {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1}
+  // Inputs are only read, so they are taken by const reference (this also
+  // allows temporaries to be passed).
+  static Vectorized<ComplexDbl> horizontal_add(
+      const Vectorized<ComplexDbl>& first,
+      const Vectorized<ComplexDbl>& second) {
+    // el_mergee gathers the real lanes of first/second, el_mergeo the
+    // imaginary lanes; adding them gives the per-element re + im sums.
+    return el_mergee(first, second) + el_mergeo(first, second);
+  }
+
+  // Horizontal subtract: for each complex element computes re - im, and
+  // interleaves the differences from `first` and `second`:
+  //   {f_re0 - f_im0, s_re0 - s_im0, f_re1 - f_im1, s_re1 - s_im1}
+  // Inputs are only read, so they are taken by const reference (this also
+  // allows temporaries to be passed).
+  static Vectorized<ComplexDbl> horizontal_sub(
+      const Vectorized<ComplexDbl>& first,
+      const Vectorized<ComplexDbl>& second) {
+    // Simulated with 6 instructions in total: swap the lanes of each input so
+    // that an element-wise subtract yields the horizontal differences.
+    auto first_perm = first.el_swapped(); // 2 perms
+    auto second_perm = second.el_swapped(); // 2 perms
+    // {re - im, im - re} for each element
+    auto first_ret = first - first_perm; // 2 subs
+    auto second_ret = second - second_perm; // 2 subs
+    // Keep only the even (real) lanes, which hold re - im.
+    return el_mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  // Complex multiplication of corresponding elements.
+  // The active (#if 1) branch computes
+  //   this * splat(re(b)) + swap(this) * (splat(im(b)) ^ vd_rsign_mask)
+  // which yields {ac - bd, bc + ad} provided vd_rsign_mask flips the sign of
+  // the real lane -- NOTE(review): confirm against vsx_helpers.h.
+  // The #else branch is a disabled port of the x86 horizontal-sub approach.
+  Vectorized<ComplexDbl> inline operator*(
+      const Vectorized<ComplexDbl>& b) const {
+    //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+#if 1
+    // this is more vsx friendly than simulating horizontal from x86
+    auto vi = b.el_mergeo(); // {d, d}
+    auto vr = b.el_mergee(); // {c, c}
+    vi = vi ^ vd_rsign_mask; // presumably {-d, d}
+    auto ret = elwise_mult(vr); // {ac, bc}
+    auto vx_swapped = el_swapped(); // {b, a}
+    ret = vx_swapped.elwise_mult(vi) + ret;
+#else
+    auto ac_bd = elwise_mult(b);
+    auto d_c = b.el_swapped();
+    d_c = d_c ^ vd_isign_mask;
+    auto ad_bc = elwise_mult(d_c);
+    auto ret = horizontal_sub(ac_bd, ad_bc);
+#endif
+    return ret;
+  }
+
+  // Complex division of corresponding elements.
+  // Implemented by spilling both operands to aligned arrays and dividing with
+  // the scalar c10::complex<double> operator/. The commented-out block below
+  // is a disabled, scaled vectorized variant kept for reference.
+  Vectorized<ComplexDbl> inline operator/(
+      const Vectorized<ComplexDbl>& b) const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2()
+    // im = (bc - ad)/abs_2()
+    // auto fabs_cd =  Vectorized{
+    //    vec_andc(b._vec0, vd_sign_mask),
+    //    vec_andc(b._vec1, vd_sign_mask)};       // |c|            |d|
+    // auto fabs_dc =  fabs_cd.el_swapped();     // |d|            |c|
+    // auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    // auto a2 = elwise_div(scale);              // a/sc           b/sc
+    // auto b2 = b.elwise_div(scale);            // c/sc           d/sc
+    // auto acbd2 = a2.elwise_mult(b2);          // ac/sc^2        bd/sc^2
+    // auto dc2 = b2.el_swapped();               // d/sc           c/sc
+    // dc2 = dc2 ^ vd_rsign_mask;                // -d/sc          c/sc
+    // auto adbc2 = a2.elwise_mult(dc2);         // -ad/sc^2       bc/sc^2
+    // auto ret = horizontal_add(acbd2, adbc2);  // (ac+bd)/sc^2   (bc-ad)/sc^2
+    // auto denom2 = b2.abs_2_();                // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    // ret = ret.elwise_div(denom2);
+    // return ret;
+
+    __at_align__ c10::complex<double>
+        tmp1[Vectorized<c10::complex<double>>::size()];
+    __at_align__ c10::complex<double>
+        tmp2[Vectorized<c10::complex<double>>::size()];
+    __at_align__ c10::complex<double>
+        out[Vectorized<c10::complex<double>>::size()];
+    this->store(tmp1);
+    b.store(tmp2);
+
+    // Scalar division, one complex element at a time.
+    for (const auto i : c10::irange(Vectorized<c10::complex<double>>::size())) {
+      out[i] = tmp1[i] / tmp2[i];
+    }
+    return loadu(out);
+  }
+
+  Vectorized<ComplexDbl> exp() const {
+    return map(std::exp);
+  }
+  Vectorized<ComplexDbl> exp2() const {
+    return map(exp2_impl);
+  }
+  Vectorized<ComplexDbl> expm1() const {
+    return map(std::expm1);
+  }
+
+  // Raise each complex element to the corresponding element of `exp`, using
+  // the scalar std::pow on values spilled to aligned arrays.
+  Vectorized<ComplexDbl> pow(const Vectorized<ComplexDbl>& exp) const {
+    __at_align__ ComplexDbl bases[size()];
+    __at_align__ ComplexDbl exponents[size()];
+    store(bases);
+    exp.store(exponents);
+    for (int lane = 0; lane < size(); ++lane) {
+      bases[lane] = std::pow(bases[lane], exponents[lane]);
+    }
+    return loadu(bases);
+  }
+
+  Vectorized<ComplexDbl> sgn() const {
+    return map(at::native::sgn_impl);
+  }
+
+  Vectorized<ComplexDbl> operator<(const Vectorized<ComplexDbl>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<ComplexDbl> operator<=(const Vectorized<ComplexDbl>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<ComplexDbl> operator>(const Vectorized<ComplexDbl>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<ComplexDbl> operator>=(const Vectorized<ComplexDbl>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  // Complex equality per element. The lane-wise masks for the real and
+  // imaginary parts are combined into the real lane and then ANDed with
+  // vd_one (presumably so a match reads as 1.0 rather than all-ones bits --
+  // confirm against vsx_helpers.h).
+  Vectorized<ComplexDbl> eq(const Vectorized<ComplexDbl>& other) const {
+    auto eq = (*this == other); // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers
+    // are equal
+    return (eq.real() & eq.imag()) & vd_one;
+  }
+  // Complex inequality per element. The lane-wise masks for the real and
+  // imaginary parts are OR-combined into the real lane and then ANDed with
+  // vd_one (presumably so a mismatch reads as 1.0 rather than all-ones bits
+  // -- confirm against vsx_helpers.h).
+  Vectorized<ComplexDbl> ne(const Vectorized<ComplexDbl>& other) const {
+    auto ne = (*this != other); // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex
+    // numbers are not equal
+    return (ne.real() | ne.imag()) & vd_one;
+  }
+
+  DEFINE_MEMBER_OP(operator==, ComplexDbl, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, ComplexDbl, vec_cmpne)
+
+  DEFINE_MEMBER_OP(operator+, ComplexDbl, vec_add)
+  DEFINE_MEMBER_OP(operator-, ComplexDbl, vec_sub)
+  DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and)
+  DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or)
+  DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor)
+  // elementwise helpers
+  DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul)
+  DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div)
+  DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt)
+  DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge)
+  DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt)
+  DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max)
+};
+
+// Element-wise maximum of two complex vectors, ordered by squared magnitude:
+// picks `b` where |a|^2 < |b|^2 and `a` otherwise.
+// Note: the NaN propagation done in the AVX reference (commented out after
+// the return) is not applied here.
+template <>
+Vectorized<ComplexDbl> inline maximum(
+    const Vectorized<ComplexDbl>& a,
+    const Vectorized<ComplexDbl>& b) {
+  // abs_2_() fills both lanes of an element with the same value, so the
+  // comparison mask selects real and imaginary parts together.
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
+  // auto max = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_lt(abs_b);
+  auto max = Vectorized<ComplexDbl>::elwise_blendv(a, b, mask);
+
+  return max;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(max, isnan);
+}
+
+// Element-wise minimum of two complex vectors, ordered by squared magnitude:
+// picks `b` where |a|^2 > |b|^2 and `a` otherwise.
+// Note: the NaN propagation done in the AVX reference (commented out after
+// the return) is not applied here.
+template <>
+Vectorized<ComplexDbl> inline minimum(
+    const Vectorized<ComplexDbl>& a,
+    const Vectorized<ComplexDbl>& b) {
+  // abs_2_() fills both lanes of an element with the same value, so the
+  // comparison mask selects real and imaginary parts together.
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
+  // auto min = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_gt(abs_b);
+  auto min = Vectorized<ComplexDbl>::elwise_blendv(a, b, mask);
+  return min;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(min, isnan);
+}
+
+// Lane-wise sum of two complex-double vectors (real and imaginary lanes are
+// added independently).
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator+(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  auto lo = vec_add(a.vec0(), b.vec0());
+  auto hi = vec_add(a.vec1(), b.vec1());
+  return Vectorized<ComplexDbl>{lo, hi};
+}
+
+// Lane-wise difference of two complex-double vectors (real and imaginary
+// lanes are subtracted independently).
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator-(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  auto lo = vec_sub(a.vec0(), b.vec0());
+  auto hi = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<ComplexDbl>{lo, hi};
+}
+
+// Bitwise AND of the raw lanes of two complex-double vectors.
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator&(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  auto lo = vec_and(a.vec0(), b.vec0());
+  auto hi = vec_and(a.vec1(), b.vec1());
+  return Vectorized<ComplexDbl>{lo, hi};
+}
+
+// Bitwise OR of the raw lanes of two complex-double vectors.
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator|(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  auto lo = vec_or(a.vec0(), b.vec0());
+  auto hi = vec_or(a.vec1(), b.vec1());
+  return Vectorized<ComplexDbl>{lo, hi};
+}
+
+// Bitwise XOR of the raw lanes of two complex-double vectors.
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator^(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  auto lo = vec_xor(a.vec0(), b.vec0());
+  auto hi = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<ComplexDbl>{lo, hi};
+}
+
+// Complex multiplication of corresponding elements:
+//   (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
+// The real and imaginary parts of each operand are first splatted across both
+// lanes, the four partial products are formed lane-wise, and the results are
+// re-packed as {real, imag} per element.
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator*(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  // Splat the components: *_re holds the real part in both lanes, *_im the
+  // imaginary part in both lanes.
+  auto lhs_re = a.el_mergee();
+  auto lhs_im = a.el_mergeo();
+  auto rhs_re = b.el_mergee();
+  auto rhs_im = b.el_mergeo();
+
+  // Real part: ac - bd
+  auto re_re = lhs_re.elwise_mult(rhs_re);
+  auto im_im = lhs_im.elwise_mult(rhs_im);
+  auto real_part = re_re - im_im;
+
+  // Imaginary part: ad + bc
+  auto re_im = lhs_re.elwise_mult(rhs_im);
+  auto im_re = lhs_im.elwise_mult(rhs_re);
+  auto imag_part = re_im + im_re;
+
+  // Re-pack: take the first lane of each to form [r, i] per element.
+  __vector double packed0 = vec_mergeh(real_part.vec0(), imag_part.vec0());
+  __vector double packed1 = vec_mergeh(real_part.vec1(), imag_part.vec1());
+  return Vectorized<ComplexDbl>{packed0, packed1};
+}
+
+// Complex division of corresponding elements. Both operands are spilled to
+// aligned arrays and divided with the scalar c10::complex<double> operator/;
+// no vectorized arithmetic is used here.
+template <>
+Vectorized<ComplexDbl> C10_ALWAYS_INLINE
+operator/(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+  // re + im*i = (a + bi)  / (c + di)
+  // re = (ac + bd)/abs_2()
+  // im = (bc - ad)/abs_2()
+  // The formulas above are for reference only; the scalar division below
+  // does the actual work.
+  __at_align__ c10::complex<double>
+      tmp1[Vectorized<c10::complex<double>>::size()];
+  __at_align__ c10::complex<double>
+      tmp2[Vectorized<c10::complex<double>>::size()];
+  __at_align__ c10::complex<double>
+      out[Vectorized<c10::complex<double>>::size()];
+  a.store(tmp1);
+  b.store(tmp2);
+  for (const auto i : c10::irange(Vectorized<c10::complex<double>>::size())) {
+    out[i] = tmp1[i] / tmp2[i];
+  }
+  return Vectorized<ComplexDbl>::loadu(out);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebeab3693c288277f434948d6e9a805e5b188cf0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
@@ -0,0 +1,776 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+
+#pragma once
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+using ComplexFlt = c10::complex<float>;
+
+// Trait flag: marks ComplexFlt as having a dedicated Vectorized<>
+// specialization (the class defined below in this header).
+template <>
+struct is_vec_specialized_for<ComplexFlt> : std::bool_constant<true> {};
+
+// VSX specialization of Vectorized for c10::complex<float>.
+//
+// Four complex values are kept in two vfloat32 registers as interleaved
+// {re, im, re, im} floats (see the four-value constructor). Methods prefixed
+// with `el_` / `elwise_` operate on the individual floats; the others operate
+// on whole complex numbers. Several constants and helpers used here
+// (real_mask, imag_mask, isign_mask, rsign_mask, swap_mask, offset0,
+// offset16, blendChoiceComplex, VsxComplexMask1/2, VsxMask1/2, ...) are not
+// defined in this header -- presumably they come from vsx_helpers.h.
+template <>
+class Vectorized<ComplexFlt> {
+ private:
+  // The same storage is viewed either as float vectors or as boolean mask
+  // vectors, so comparison results can be stored without conversion.
+  union {
+    struct {
+      vfloat32 _vec0;
+      vfloat32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = ComplexFlt;
+  using vec_internal_type = vfloat32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+
+  // Number of complex<float> elements held by one Vectorized.
+  static constexpr size_type size() {
+    return 4;
+  }
+  // Leaves the registers uninitialized.
+  Vectorized() {}
+
+  // Raw-register constructors: the one-argument forms copy the same register
+  // into both halves, the two-argument forms set each half separately.
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2)
+      : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+
+  // Broadcasts one complex value to all four lanes.
+  Vectorized(ComplexFlt val) {
+    const float real_value = val.real();
+    const float imag_value = val.imag();
+    _vec0 = vfloat32{real_value, imag_value, real_value, imag_value};
+    // Both halves hold the same pattern, so build it only once.
+    _vec1 = _vec0;
+  }
+
+  // Builds a vector from four complex values, in lane order.
+  Vectorized(
+      ComplexFlt val1,
+      ComplexFlt val2,
+      ComplexFlt val3,
+      ComplexFlt val4) {
+    _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()};
+    _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()};
+  }
+
+  // Read-only access to the two underlying registers.
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  // Compile-time blend of `a` and `b`. Exactly one overload is enabled per
+  // mask, chosen by blendChoiceComplex(mask) (defined outside this header):
+  //   0 -> a                      1 -> b
+  //   2 -> {b.vec0, a.vec1}       3 -> {a.vec0, b.vec1}
+  //   4 -> {sel(vec0), a.vec1}    5 -> {sel(vec0), b.vec1}
+  //   6 -> {a.vec0, sel(vec1)}    7 -> {b.vec0, sel(vec1)}
+  //   8 -> {sel(vec0), sel(vec1)}
+  // where sel(...) is a vec_sel between a and b using a mask produced by
+  // VsxComplexMask1/VsxComplexMask2.
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 0, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    return a;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 1, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    return b;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 2, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 3, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 4, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 5, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 6, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    // generated masks
+    return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 7, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    // generated masks
+    return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<blendChoiceComplex(mask) == 8, Vectorized<ComplexFlt>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Compile-time blend at float granularity: the masks come from
+  // VsxMask1/VsxMask2 instead of the complex-lane variants.
+  template <int64_t mask>
+  static Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+  el_blend(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+    const vbool32 mask_1st = VsxMask1(mask);
+    const vbool32 mask_2nd = VsxMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Runtime blend: picks bits from `b` where the expanded mask is set and
+  // from `a` elsewhere. The mask is first expanded with vec_mergeh, which
+  // duplicates the leading floats of each mask register.
+  static Vectorized<ComplexFlt> blendv(
+      const Vectorized<ComplexFlt>& a,
+      const Vectorized<ComplexFlt>& b,
+      const Vectorized<ComplexFlt>& mask) {
+    // convert std::complex<V> index mask to V index mask: xy -> xxyy
+    auto mask_complex = Vectorized<ComplexFlt>(
+        vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1));
+    return {
+        vec_sel(
+            a._vec0, b._vec0, reinterpret_cast<vbool32>(mask_complex._vec0)),
+        vec_sel(
+            a._vec1, b._vec1, reinterpret_cast<vbool32>(mask_complex._vec1)),
+    };
+  }
+
+  // Runtime blend that uses `mask` as-is, one mask float per data float.
+  static Vectorized<ComplexFlt> elwise_blendv(
+      const Vectorized<ComplexFlt>& a,
+      const Vectorized<ComplexFlt>& b,
+      const Vectorized<ComplexFlt>& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, reinterpret_cast<vbool32>(mask._vec0)),
+        vec_sel(a._vec1, b._vec1, reinterpret_cast<vbool32>(mask._vec1)),
+    };
+  }
+
+  // Returns {base, base + step, base + 2*step, base + 3*step}.
+  template <typename step_t>
+  static Vectorized<ComplexFlt> arange(
+      ComplexFlt base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<ComplexFlt>(
+        base,
+        base + step,
+        base + ComplexFlt(2) * step,
+        base + ComplexFlt(3) * step);
+  }
+  // Combines `a` and `b` according to `count`: 0 returns `a`, 1..3 dispatch
+  // to blend<1>/<3>/<7>, and any other value returns `b`.
+  // NOTE(review): presumably the blends take the first `count` complex lanes
+  // from `b`; the mask decoding is defined outside this header.
+  static Vectorized<ComplexFlt> set(
+      const Vectorized<ComplexFlt>& a,
+      const Vectorized<ComplexFlt>& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+
+  // Loads up to size() complex values from (possibly unaligned) `ptr`.
+  // When `count` differs from size(), only min(count, size()) elements are
+  // read from `ptr` and the remaining lanes are zero. A non-positive `count`
+  // reads nothing and yields an all-zero vector.
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const float*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const float*>(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    // Guard the copy: a negative count would otherwise be converted to a
+    // huge unsigned length, and a zero count must not touch `ptr` at all.
+    const int n = std::min(count, size());
+    if (n > 0) {
+      std::memcpy(tmp_values, ptr, n * sizeof(value_type));
+    }
+
+    return {
+        vec_vsx_ld(offset0, reinterpret_cast<const float*>(tmp_values)),
+        vec_vsx_ld(offset16, reinterpret_cast<const float*>(tmp_values))};
+  }
+
+  // Stores the first min(count, size()) complex values to `ptr`; writes
+  // nothing when `count` is not positive.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<float*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<float*>(ptr));
+    } else if (count > 0) {
+      // Partial store: spill everything to scratch, then copy the prefix.
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<float*>(tmp_values));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<float*>(tmp_values));
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  // Element access by index is intentionally unavailable.
+  const ComplexFlt& operator[](int idx) const = delete;
+  ComplexFlt& operator[](int idx) = delete;
+
+  // Applies scalar function `f` to every complex lane (store, loop, reload).
+  Vectorized<ComplexFlt> map(ComplexFlt (*const f)(ComplexFlt)) const {
+    __at_align__ ComplexFlt tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  // Same as above for functions taking their argument by const reference.
+  Vectorized<ComplexFlt> map(ComplexFlt (*const f)(const ComplexFlt&)) const {
+    __at_align__ ComplexFlt tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  static Vectorized<ComplexFlt> horizontal_add(
+      Vectorized<ComplexFlt>& first,
+      Vectorized<ComplexFlt>& second) {
+    // Operates on individual floats, see _mm_hadd_ps
+    // {f0+f1, s0+s1, f2+f3, s2+s3, ...}
+    // i.e. it sums the re and im of each value and interleaves first and
+    // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...}
+    return el_mergee(first, second) + el_mergeo(first, second);
+  }
+
+  // Float-wise horizontal subtraction: each operand has its swapped copy
+  // subtracted from it, then the even floats of the two results are merged.
+  static Vectorized<ComplexFlt> horizontal_sub_permD8(
+      Vectorized<ComplexFlt>& first,
+      Vectorized<ComplexFlt>& second) {
+    // we will simulate it differently with 6 instructions total
+    // lets permute second so that we can add it getting horizontal sums
+    auto first_perm = first.el_swapped(); // 2perm
+    auto second_perm = second.el_swapped(); // 2perm
+    // sum
+    auto first_ret = first - first_perm; // 2sub
+    auto second_ret = second - second_perm; // 2 sub
+    // now lets choose evens
+    return el_mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  // Squared magnitude re^2 + im^2, written to both floats of each lane.
+  Vectorized<ComplexFlt> abs_2_() const {
+    auto a = (*this).elwise_mult(*this);
+    auto permuted = a.el_swapped();
+    a = a + permuted;
+    return a.el_mergee();
+  }
+
+  // Magnitude via SLEEF hypot; the result is present in every float of the
+  // lane (abs() below masks it down to the real slot).
+  Vectorized<ComplexFlt> abs_() const {
+    auto vi = el_mergeo();
+    auto vr = el_mergee();
+    return {
+        Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0),
+        Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)};
+  }
+
+  // Magnitude as a complex number: abs_() masked with real_mask.
+  Vectorized<ComplexFlt> abs() const {
+    return abs_() & real_mask;
+  }
+
+  // Keep only the bits selected by real_mask.
+  Vectorized<ComplexFlt> real_() const {
+    return *this & real_mask;
+  }
+  Vectorized<ComplexFlt> real() const {
+    return *this & real_mask;
+  }
+  // Keep only the bits selected by imag_mask (values stay in place).
+  Vectorized<ComplexFlt> imag_() const {
+    return *this & imag_mask;
+  }
+  // Like imag_(), followed by a word rotation of each register.
+  // NOTE(review): presumably this moves the imaginary parts into the real
+  // slots -- confirm against vec_sldw semantics on the target endianness.
+  Vectorized<ComplexFlt> imag() const {
+    // we can use swap_mask or sldwi
+    auto ret = imag_();
+    return {
+        vec_sldw(ret._vec0, ret._vec0, 3), vec_sldw(ret._vec1, ret._vec1, 3)};
+  }
+
+  // XOR with isign_mask (expected to flip the sign of the imaginary parts).
+  Vectorized<ComplexFlt> conj_() const {
+    return *this ^ isign_mask;
+  }
+  Vectorized<ComplexFlt> conj() const {
+    return *this ^ isign_mask;
+  }
+
+  Vectorized<ComplexFlt> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+
+  // log() scaled float-wise by the log2e_inv constant.
+  Vectorized<ComplexFlt> log2() const {
+    // log2eB_inv
+    auto ret = log();
+    return ret.elwise_mult(log2e_inv);
+  }
+  // log() scaled float-wise by the log10e_inv constant.
+  Vectorized<ComplexFlt> log10() const {
+    auto ret = log();
+    return ret.elwise_mult(log10e_inv);
+  }
+
+  Vectorized<ComplexFlt> log1p() const {
+    return map(std::log1p);
+  }
+
+  // Permutes each register with swap_mask (used throughout this class to
+  // exchange the re and im floats of every lane).
+  Vectorized<ComplexFlt> el_swapped() const {
+    vfloat32 v0 = vec_perm(_vec0, _vec0, swap_mask);
+    vfloat32 v1 = vec_perm(_vec1, _vec1, swap_mask);
+    return {v0, v1};
+  }
+
+  // Merges the even floats of each register with themselves, i.e. duplicates
+  // floats 0 and 2 into {f0, f0, f2, f2}.
+  Vectorized<ComplexFlt> el_mergee() const {
+    // as mergee phased in , we can use vec_perm with mask
+    return {vec_mergee(_vecb0, _vecb0), vec_mergee(_vecb1, _vecb1)};
+  }
+
+  // Merges the odd floats of each register with themselves, i.e. duplicates
+  // floats 1 and 3 into {f1, f1, f3, f3}.
+  Vectorized<ComplexFlt> el_mergeo() const {
+    // as mergeo phased in , we can use vec_perm with mask
+    return {vec_mergeo(_vecb0, _vecb0), vec_mergeo(_vecb1, _vecb1)};
+  }
+
+  // Float-wise fused multiply-add: (*this) * multiplier + val.
+  Vectorized<ComplexFlt> el_madd(
+      const Vectorized<ComplexFlt>& multiplier,
+      const Vectorized<ComplexFlt>& val) const {
+    return {
+        vec_madd(_vec0, multiplier._vec0, val._vec0),
+        vec_madd(_vec1, multiplier._vec1, val._vec1)};
+  }
+
+  // Interleaves the even floats of `first` and `second`.
+  static Vectorized<ComplexFlt> el_mergee(
+      const Vectorized<ComplexFlt>& first,
+      const Vectorized<ComplexFlt>& second) {
+    return {
+        vec_mergee(first._vecb0, second._vecb0),
+        vec_mergee(first._vecb1, second._vecb1)};
+  }
+
+  // Interleaves the odd floats of `first` and `second`.
+  static Vectorized<ComplexFlt> el_mergeo(
+      const Vectorized<ComplexFlt>& first,
+      const Vectorized<ComplexFlt>& second) {
+    return {
+        vec_mergeo(first._vecb0, second._vecb0),
+        vec_mergeo(first._vecb1, second._vecb1)};
+  }
+
+  // Phase angle atan2(im, re) of every lane, written to the real slot; the
+  // imaginary slot is zero.
+  Vectorized<ComplexFlt> angle_() const {
+    // angle = atan2(b, a)
+    // auto b_a = _mm256_permute_ps(values, 0xB1); // b        a
+    // return Sleef_atan2f8_u10(values, b_a); // 90-angle angle
+    // Start from zero so the odd floats, which the loop never writes, are
+    // not returned with indeterminate contents.
+    auto ret = Vectorized<ComplexFlt>(zero);
+    for (int i = 0; i < 4; i += 2) {
+      ret._vec0[i] = std::atan2(_vec0[i + 1], _vec0[i]);
+      ret._vec1[i] = std::atan2(_vec1[i + 1], _vec1[i]);
+    }
+    return ret;
+  }
+
+  Vectorized<ComplexFlt> angle() const {
+    return angle_() & real_mask;
+  }
+
+  // The following transcendental functions fall back to the scalar
+  // complex implementation, applied per lane through map().
+  Vectorized<ComplexFlt> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<ComplexFlt> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<ComplexFlt> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<ComplexFlt> cosh() const {
+    return map(std::cosh);
+  }
+  // Rounding operations act on the real and imaginary floats independently.
+  Vectorized<ComplexFlt> ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized<ComplexFlt> floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  // Negation computed as 0 - x.
+  Vectorized<ComplexFlt> neg() const {
+    auto z = Vectorized<ComplexFlt>(zero);
+    return z - *this;
+  }
+  Vectorized<ComplexFlt> round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+  Vectorized<ComplexFlt> tan() const {
+    return map(std::tan);
+  }
+  Vectorized<ComplexFlt> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<ComplexFlt> trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  // Square root of each float independently (not a complex square root).
+  Vectorized<ComplexFlt> elwise_sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+
+  // Complex square root, per lane, through the scalar implementation.
+  Vectorized<ComplexFlt> sqrt() const {
+    return map(std::sqrt);
+  }
+
+  // Complex reciprocal 1 / (c + di) = conj(c + di) / |c + di|^2.
+  Vectorized<ComplexFlt> reciprocal() const {
+    // re + im*i = (a + bi)  / (c + di) with a = 1, b = 0
+    // re = (ac + bd)/abs_2() = c/abs_2()
+    // im = (bc - ad)/abs_2() = -d/abs_2()
+    auto c_d = *this ^ isign_mask; // c       -d
+    auto abs = abs_2_();
+    return c_d.elwise_div(abs);
+  }
+
+  Vectorized<ComplexFlt> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  // Lane-wise complex power (*this)^exp using scalar std::pow.
+  Vectorized<ComplexFlt> pow(const Vectorized<ComplexFlt>& exp) const {
+    __at_align__ ComplexFlt x_tmp[size()];
+    __at_align__ ComplexFlt y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+
+  Vectorized<ComplexFlt> atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized(imag_one);
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln * imag_half; // i/2*ln()
+  }
+  Vectorized<ComplexFlt> atanh() const {
+    return map(std::atanh);
+  }
+
+  Vectorized<ComplexFlt> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized(pi_2) - asin();
+  }
+
+  // Complex multiplication. Only the `#if 1` branch is compiled; the other
+  // branch is kept as the x86-style horizontal formulation.
+  Vectorized<ComplexFlt> inline operator*(
+      const Vectorized<ComplexFlt>& b) const {
+    //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+
+#if 1
+    // this is more vsx friendly than simulating horizontal from x86
+
+    auto vi = b.el_mergeo();
+    auto vr = b.el_mergee();
+    vi = vi ^ rsign_mask;
+    auto ret = elwise_mult(vr);
+    auto vx_swapped = el_swapped();
+    ret = vx_swapped.elwise_mult(vi) + ret;
+    return ret;
+
+#else
+
+    auto ac_bd = elwise_mult(b);
+    auto d_c = b.el_swapped();
+    d_c = d_c ^ isign_mask;
+    auto ad_bc = elwise_mult(d_c);
+    auto ret = horizontal_sub_permD8(ac_bd, ad_bc);
+    return ret;
+#endif
+  }
+
+  // Complex division. The compiled (`#if 1`) branch divides lane by lane
+  // with the scalar c10::complex<float> operator/; the other branch is a
+  // scaled vector formulation that is currently disabled.
+  Vectorized<ComplexFlt> inline operator/(
+      const Vectorized<ComplexFlt>& b) const {
+#if 1
+    __at_align__ c10::complex<float>
+        tmp1[Vectorized<c10::complex<float>>::size()];
+    __at_align__ c10::complex<float>
+        tmp2[Vectorized<c10::complex<float>>::size()];
+    __at_align__ c10::complex<float>
+        out[Vectorized<c10::complex<float>>::size()];
+    this->store(tmp1);
+    b.store(tmp2);
+
+    for (const auto i : c10::irange(Vectorized<c10::complex<float>>::size())) {
+      out[i] = tmp1[i] / tmp2[i];
+    }
+    return loadu(out);
+#else
+    auto fabs_cd = Vectorized{
+        vec_andc(b._vec0, sign_mask), vec_andc(b._vec1, sign_mask)}; // |c| |d|
+    auto fabs_dc = fabs_cd.el_swapped(); // |d|            |c|
+    auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    auto a2 = elwise_div(scale); // a/sc           b/sc
+    auto b2 = b.elwise_div(scale); // c/sc           d/sc
+    auto acbd2 = a2.elwise_mult(b2); // ac/sc^2        bd/sc^2
+    auto dc2 = b2.el_swapped(); // d/sc           c/sc
+    dc2 = dc2 ^ rsign_mask; // -d/sc          c/sc
+    auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2       bc/sc^2
+    auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2   (bc-ad)/sc^2
+    auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    ret = ret.elwise_div(denom2);
+    return ret;
+#endif
+  }
+
+  // Complex arcsine. The compiled branch evaluates the logarithmic formula
+  // below with vector helpers; the disabled branch is the scalar fallback.
+  Vectorized<ComplexFlt> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+
+#if 1
+    auto conj = conj_();
+    auto b_a = conj.el_swapped();
+    auto ab = conj.elwise_mult(b_a);
+    auto im = ab + ab;
+    auto val_2 = (*this).elwise_mult(*this);
+    auto val_2_swapped = val_2.el_swapped();
+    auto re = horizontal_sub_permD8(val_2, val_2_swapped);
+    re = Vectorized<ComplexFlt>(one) - re;
+    auto root = el_blend<0xAA>(re, im).sqrt();
+    auto ln = (b_a + root).log();
+    return ln.el_swapped().conj();
+#else
+    return map(std::asin);
+#endif
+  }
+
+  Vectorized<ComplexFlt> exp() const {
+    return map(std::exp);
+  }
+  Vectorized<ComplexFlt> exp2() const {
+    return map(exp2_impl);
+  }
+  Vectorized<ComplexFlt> expm1() const {
+    return map(std::expm1);
+  }
+
+  // Complex equality as a numeric result: the float-wise comparison masks
+  // are combined and then ANDed with the `one` constant.
+  Vectorized<ComplexFlt> eq(const Vectorized<ComplexFlt>& other) const {
+    auto eq = (*this == other); // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers
+    // are equal
+    return (eq.real() & eq.imag()) & one;
+  }
+  // Complex inequality, the counterpart of eq().
+  Vectorized<ComplexFlt> ne(const Vectorized<ComplexFlt>& other) const {
+    auto ne = (*this != other); // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex
+    // numbers are not equal
+    return (ne.real() | ne.imag()) & one;
+  }
+
+  Vectorized<ComplexFlt> sgn() const {
+    return map(at::native::sgn_impl);
+  }
+
+  // Ordering comparisons are undefined for complex numbers; each of these
+  // always fails the TORCH_CHECK.
+  Vectorized<ComplexFlt> operator<(const Vectorized<ComplexFlt>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<ComplexFlt> operator<=(const Vectorized<ComplexFlt>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<ComplexFlt> operator>(const Vectorized<ComplexFlt>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<ComplexFlt> operator>=(const Vectorized<ComplexFlt>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  // Float-wise comparisons (real and imaginary parts compared separately).
+  DEFINE_MEMBER_OP(operator==, ComplexFlt, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, ComplexFlt, vec_cmpne)
+
+  // Float-wise arithmetic and bitwise operators.
+  DEFINE_MEMBER_OP(operator+, ComplexFlt, vec_add)
+  DEFINE_MEMBER_OP(operator-, ComplexFlt, vec_sub)
+  DEFINE_MEMBER_OP(operator&, ComplexFlt, vec_and)
+  DEFINE_MEMBER_OP(operator|, ComplexFlt, vec_or)
+  DEFINE_MEMBER_OP(operator^, ComplexFlt, vec_xor)
+  // elementwise helpers
+  DEFINE_MEMBER_OP(elwise_mult, ComplexFlt, vec_mul)
+  DEFINE_MEMBER_OP(elwise_div, ComplexFlt, vec_div)
+  DEFINE_MEMBER_OP(elwise_gt, ComplexFlt, vec_cmpgt)
+  DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge)
+  DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt)
+  DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max)
+};
+
+template <>
+Vectorized<ComplexFlt> inline maximum(
+    const Vectorized<ComplexFlt>& a,
+    const Vectorized<ComplexFlt>& b) {
+  // Lane-wise maximum by squared magnitude: wherever |a|^2 < |b|^2 the
+  // result takes b, otherwise it keeps a.
+  //
+  // The x86 implementation additionally ORs in an "unordered" compare so
+  // that NaN inputs propagate (all-ones is a NaN):
+  //   auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  //   return _mm256_or_ps(max, isnan);
+  // No such step is performed here.
+  const auto sq_mag_a = a.abs_2_();
+  const auto sq_mag_b = b.abs_2_();
+  const auto take_b = sq_mag_a.elwise_lt(sq_mag_b);
+  return Vectorized<ComplexFlt>::elwise_blendv(a, b, take_b);
+}
+
+template <>
+Vectorized<ComplexFlt> inline minimum(
+    const Vectorized<ComplexFlt>& a,
+    const Vectorized<ComplexFlt>& b) {
+  // Lane-wise minimum by squared magnitude: wherever |a|^2 > |b|^2 the
+  // result takes b, otherwise it keeps a.
+  //
+  // The x86 implementation additionally ORs in an "unordered" compare so
+  // that NaN inputs propagate (all-ones is a NaN):
+  //   auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  //   return _mm256_or_ps(min, isnan);
+  // No such step is performed here.
+  const auto sq_mag_a = a.abs_2_();
+  const auto sq_mag_b = b.abs_2_();
+  const auto take_b = sq_mag_a.elwise_gt(sq_mag_b);
+  return Vectorized<ComplexFlt>::elwise_blendv(a, b, take_b);
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator+(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Complex addition is plain float-wise addition of both registers.
+  const auto sum0 = vec_add(a.vec0(), b.vec0());
+  const auto sum1 = vec_add(a.vec1(), b.vec1());
+  return Vectorized<ComplexFlt>{sum0, sum1};
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator-(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Complex subtraction is plain float-wise subtraction of both registers.
+  const auto diff0 = vec_sub(a.vec0(), b.vec0());
+  const auto diff1 = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<ComplexFlt>{diff0, diff1};
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator&(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Bitwise AND of the raw register contents, one register at a time.
+  const auto and0 = vec_and(a.vec0(), b.vec0());
+  const auto and1 = vec_and(a.vec1(), b.vec1());
+  return Vectorized<ComplexFlt>{and0, and1};
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator|(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Bitwise OR of the raw register contents, one register at a time.
+  const auto or0 = vec_or(a.vec0(), b.vec0());
+  const auto or1 = vec_or(a.vec1(), b.vec1());
+  return Vectorized<ComplexFlt>{or0, or1};
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator^(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Bitwise XOR of the raw register contents, one register at a time.
+  const auto xor0 = vec_xor(a.vec0(), b.vec0());
+  const auto xor1 = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<ComplexFlt>{xor0, xor1};
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator*(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Complex product: (a + ib) * (c + id) = (ac - bd) + i(ad + bc).
+  //
+  // el_mergee()/el_mergeo() duplicate the even/odd floats of each register,
+  // so every intermediate below carries its value in both floats of a lane.
+  const auto re_a = a.el_mergee(); // a a
+  const auto im_a = a.el_mergeo(); // b b
+  const auto re_b = b.el_mergee(); // c c
+  const auto im_b = b.el_mergeo(); // d d
+
+  // XOR with rsign_mask before multiplying so that the subtraction in the
+  // real part can be expressed as an addition.
+  // NOTE(review): this relies on rsign_mask flipping the sign of the even
+  // floats, which are the ones picked by the permute below -- the mask is
+  // defined outside this header.
+  const auto im_b_signed = im_b ^ rsign_mask;
+
+  const auto prod_ac = re_a.elwise_mult(re_b);
+  const auto prod_bd = im_a.elwise_mult(im_b_signed);
+  const auto prod_ad = re_a.elwise_mult(im_b);
+  const auto prod_bc = im_a.elwise_mult(re_b);
+
+  const auto re_part = prod_ac + prod_bd;
+  const auto im_part = prod_ad + prod_bc;
+
+  // Byte selector for vec_perm: indices 0-15 address the first operand and
+  // 16-31 the second. It picks floats 0 and 2 of each operand, alternating
+  // first/second, which re-interleaves the results as {re, im, re, im}.
+  __vector unsigned char interleave = {
+      0,
+      1,
+      2,
+      3, // first operand, float 0
+      16,
+      17,
+      18,
+      19, // second operand, float 0
+      8,
+      9,
+      10,
+      11, // first operand, float 2
+      24,
+      25,
+      26,
+      27}; // second operand, float 2
+
+  __vector float packed0 = vec_perm(re_part.vec0(), im_part.vec0(), interleave);
+  __vector float packed1 = vec_perm(re_part.vec1(), im_part.vec1(), interleave);
+  return Vectorized<ComplexFlt>(packed0, packed1);
+}
+
+template <>
+Vectorized<ComplexFlt> C10_ALWAYS_INLINE
+operator/(const Vectorized<ComplexFlt>& a, const Vectorized<ComplexFlt>& b) {
+  // Complex division a / b, evaluated lane by lane with the scalar
+  // c10::complex<float> operator/: both operands are spilled to aligned
+  // scratch arrays, divided element by element, and the quotients are
+  // reloaded into a vector.
+  constexpr auto kLanes = Vectorized<ComplexFlt>::size();
+  __at_align__ ComplexFlt numer[kLanes];
+  __at_align__ ComplexFlt denom[kLanes];
+  __at_align__ ComplexFlt quot[kLanes];
+  a.store(numer);
+  b.store(denom);
+  for (const auto lane : c10::irange(kLanes)) {
+    quot[lane] = numer[lane] / denom[lane];
+  }
+  return Vectorized<ComplexFlt>::loadu(quot);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..63a9e5e2f1ad1328a85db5e0228b81dfd41ab215
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
@@ -0,0 +1,520 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#include <sleef.h>
+
+namespace at {
+namespace vec {
+
+inline namespace CPU_CAPABILITY {
+
+template <>
+struct is_vec_specialized_for<double> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<double> {
+ private:
+  union {
+    struct {
+      vfloat64 _vec0;
+      vfloat64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = double;
+  using vec_internal_type = vfloat64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2)
+      : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(double scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      double scalar1,
+      double scalar2,
+      double scalar3,
+      double scalar4)
+      : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  int zero_mask() const {
+    auto cmp = (*this == vd_zero);
+    return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) |
+        (cmp._vecb1[1] & 8);
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 0, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return a;
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 1, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return b;
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 2, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 3, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 4, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    const vbool64 mask_1st = VsxDblMask1(mask);
+    return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 5, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    const vbool64 mask_1st = VsxDblMask1(mask);
+    return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 6, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    const vbool64 mask_2nd = VsxDblMask2(mask);
+    // generated masks
+    return {a._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 7, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    const vbool64 mask_2nd = VsxDblMask2(mask);
+    // generated masks
+    return {b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoiceDbl(mask) == 8, Vectorized<double>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    const vbool64 mask_1st = VsxDblMask1(mask);
+    const vbool64 mask_2nd = VsxDblMask2(mask);
+    return {
+        (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  static Vectorized<double> C10_ALWAYS_INLINE blendv(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      const Vectorized<double>& mask) {
+    // the mask used here returned by comparison of vec256
+
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  template <typename step_t>
+  static Vectorized<double> arange(
+      double base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<double>(
+        base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  static Vectorized<double> C10_ALWAYS_INLINE
+  set(const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+
+    return b;
+  }
+  // Loads up to size() doubles from (possibly unaligned) `ptr`.
+  // When `count` differs from size(), only min(count, size()) elements are
+  // read from `ptr` and the remaining lanes are zero. A non-positive `count`
+  // reads nothing and yields an all-zero vector.
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    // Guard the copy: a negative count would otherwise be converted to a
+    // huge unsigned length, and a zero count must not touch `ptr` at all.
+    const int n = std::min(count, size());
+    if (n > 0) {
+      std::memcpy(tmp_values, ptr, n * sizeof(value_type));
+    }
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  // Stores min(count, size()) leading lanes to `ptr`; no-op when count <= 0.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type staged[size()];
+      vec_vsx_st(_vec0, offset0, staged);
+      vec_vsx_st(_vec1, offset16, staged);
+      std::memcpy(ptr, staged, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const double& operator[](int idx) const = delete; // lane indexing is disabled
+  double& operator[](int idx) = delete; // (both const and mutable forms)
+  // Applies scalar `f` to every lane: low half first, then high half.
+  Vectorized<double> map(double (*const f)(double)) const {
+    constexpr int kHalf = size() / 2;
+    Vectorized<double> out;
+    for (int lane = 0; lane < kHalf; ++lane)
+      out._vec0[lane] = f(_vec0[lane]);
+    for (int lane = 0; lane < kHalf; ++lane)
+      out._vec1[lane] = f(_vec1[lane]);
+    return out;
+  }
+
+  // Applies binary `f` to matching lanes of *this and `other` (low half first).
+  Vectorized<double> mapbi(
+      double (*const f)(double, double),
+      const Vectorized<double>& other) const {
+    constexpr int kHalf = size() / 2;
+    Vectorized<double> out;
+    for (int lane = 0; lane < kHalf; ++lane)
+      out._vec0[lane] = f(_vec0[lane], other._vec0[lane]);
+    for (int lane = 0; lane < kHalf; ++lane)
+      out._vec1[lane] = f(_vec1[lane], other._vec1[lane]);
+    return out;
+  }
+  Vectorized<double> C10_ALWAYS_INLINE abs() const { // lane-wise |x|
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+  // The Sleef_*d2* wrappers below call the routine once per half.
+  Vectorized<double> C10_ALWAYS_INLINE acos() const {
+    return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE acosh() const {
+    return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE asin() const {
+    return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE asinh() const {
+    return {Sleef_asinhd2_u10(_vec0), Sleef_asinhd2_u10(_vec1)};
+  }
+  Vectorized<double> atan() const {
+    return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)};
+  }
+  Vectorized<double> atanh() const {
+    return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)};
+  }
+  Vectorized<double> atan2(const Vectorized<double>& b) const {
+    return { // *this is passed as the first argument, b as the second
+        Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)};
+  }
+  Vectorized<double> copysign(const Vectorized<double>& sign) const {
+    return { // *this is passed first, `sign` second
+        Sleef_copysignd2(_vec0, sign._vec0),
+        Sleef_copysignd2(_vec1, sign._vec1)};
+  }
+  Vectorized<double> erf() const {
+    return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)};
+  }
+  Vectorized<double> erfc() const { // note: uses the _u15 routine, not _u10
+    return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE exp() const {
+    return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE exp2() const {
+    return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)};
+  }
+  Vectorized<double> expm1() const {
+    return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
+    return exp(); // no separate reduced-accuracy path: same as exp()
+  }
+  Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
+    return exp(); // no separate fast path: same as exp()
+  }
+
+  Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
+    return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
+  } // NOTE: annotated __ubsan_ignore_undefined__; the reason is not visible here
+  // erfinv is evaluated lane by lane through the scalar calc_erfinv.
+  Vectorized<double> erfinv() const {
+    return map(calc_erfinv);
+  }
+
+  Vectorized<double> angle() const {
+    // pi for lanes below zero, 0 otherwise; NaN lanes keep their own value.
+    const Vectorized<double> zeros(0);
+    const Vectorized<double> pis(c10::pi<double>);
+    const auto by_sign = blendv(zeros, pis, *this < zeros);
+    return blendv(by_sign, *this, isnan());
+  }
+  Vectorized<double> real() const { // real-valued lanes: returns the value itself
+    return *this;
+  }
+  Vectorized<double> imag() const { // always all-zero
+    return Vectorized<double>{0};
+  }
+  Vectorized<double> conj() const { // returns the value unchanged
+    return *this;
+  }
+
+  Vectorized<double> C10_ALWAYS_INLINE log() const { // SLEEF call per half
+    return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE log10() const {
+    return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE log1p() const {
+    return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE log2() const {
+    return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE ceil() const { // vector intrinsic
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE cos() const {
+    return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE cosh() const {
+    return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE floor() const { // vector intrinsic
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE neg() const { // vector intrinsic
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE round() const { // implemented with vec_rint
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE sin() const {
+    return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE sinh() const {
+    return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE tan() const {
+    return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE tanh() const {
+    return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE trunc() const { // vector intrinsic
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized<double> C10_ALWAYS_INLINE frac() const { // x - trunc(x)
+    return *this - trunc();
+  }
+
+  Vectorized<double> C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE reciprocal() const {
+    return {
+        vec_div(vd_one, _vec0), // full division; vec_re() only gives an estimate
+        vec_div(vd_one, _vec1)};
+  }
+  Vectorized<double> C10_ALWAYS_INLINE rsqrt() const { // reciprocal of sqrt(x)
+    return sqrt().reciprocal();
+  }
+
+  Vectorized<double> C10_ALWAYS_INLINE pow(const Vectorized<double>& b) const {
+    return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)};
+  } // *this is passed as the first argument, b as the second
+  Vectorized<double> C10_ALWAYS_INLINE fmod(const Vectorized<double>& b) const {
+    return {Sleef_fmodd2(_vec0, b._vec0), Sleef_fmodd2(_vec1, b._vec1)};
+  }
+
+  Vectorized<double> hypot(const Vectorized<double>& b) const {
+    return { // note: the _u05 routine is used here
+        Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)};
+  }
+
+  Vectorized<double> nextafter(const Vectorized<double>& b) const {
+    return { // *this is passed first, b second
+        Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)};
+  }
+
+  Vectorized<double> igamma(const Vectorized<double>& x) const {
+    return mapbi(calc_igamma, x); // scalar calc_igamma(this_lane, x_lane)
+  }
+
+  Vectorized<double> igammac(const Vectorized<double>& x) const {
+    return mapbi(calc_igammac, x); // scalar calc_igammac(this_lane, x_lane)
+  }
+
+  Vectorized<double> i0() const {
+    return map(calc_i0); // scalar calc_i0 applied lane by lane
+  }
+
+  Vectorized<double> i0e() const {
+    return map(calc_i0e); // scalar calc_i0e applied lane by lane
+  }
+
+  Vectorized<double> digamma() const {
+    return map(calc_digamma); // scalar calc_digamma applied lane by lane
+  }
+
+  // Bitwise NOT of every lane (vec_nor of the value with itself).
+  Vectorized<double> _nor() const {
+    return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
+  }
+  // Inverts the mask of (x == x): set exactly where a lane differs from itself.
+  Vectorized<double> isnan() const {
+    const auto self = *this;
+    return (self == self)._nor();
+  }
+  // True as soon as any lane (low half first) is NaN or infinite.
+  bool has_inf_nan() const {
+    constexpr int kHalf = size() / 2;
+    for (int lane = 0; lane < kHalf; ++lane) {
+      if (_isnan(_vec0[lane]) || _isinf(_vec0[lane]))
+        return true;
+    }
+    for (int lane = 0; lane < kHalf; ++lane) {
+      if (_isnan(_vec1[lane]) || _isinf(_vec1[lane]))
+        return true;
+    }
+    return false;
+  }
+
+  DEFINE_MEMBER_OP(operator==, double, vec_cmpeq) // comparisons built on vec_cmp*
+  DEFINE_MEMBER_OP(operator!=, double, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, double, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, double, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, double, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, double, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq) // named forms; presumably the
+  DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne) // mask is AND-ed with 1.0 --
+  DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt) // TODO confirm against the
+  DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple) // macro definition
+  DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, double, vec_add) // arithmetic
+  DEFINE_MEMBER_OP(operator-, double, vec_sub)
+  DEFINE_MEMBER_OP(operator*, double, vec_mul)
+  DEFINE_MEMBER_OP(operator/, double, vec_div)
+  DEFINE_MEMBER_OP(maximum, double, vec_max_nan2) // min/max via the *_nan2 helpers
+  DEFINE_MEMBER_OP(minimum, double, vec_min_nan2)
+  DEFINE_MEMBER_OP(operator&, double, vec_and) // bitwise
+  DEFINE_MEMBER_OP(operator|, double, vec_or)
+  DEFINE_MEMBER_OP(operator^, double, vec_xor)
+  DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd) // three-operand op on vec_madd
+};
+// Free-function form of Vectorized<double>::maximum (built on vec_max_nan2).
+template <>
+Vectorized<double> inline maximum(
+    const Vectorized<double>& a, const Vectorized<double>& b) {
+  return a.maximum(b);
+}
+
+// Free-function form of Vectorized<double>::minimum (built on vec_min_nan2).
+template <>
+Vectorized<double> inline minimum(
+    const Vectorized<double>& a, const Vectorized<double>& b) {
+  return a.minimum(b);
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Lane-wise sum, computed separately on each half.
+  return {vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Lane-wise difference a - b, computed separately on each half.
+  return {vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Lane-wise product, computed separately on each half.
+  return {vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Lane-wise quotient a / b, computed separately on each half.
+  return {vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Bitwise AND of the two operands, half by half.
+  return {vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Bitwise OR of the two operands, half by half.
+  return {vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<double> C10_ALWAYS_INLINE
+operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // Bitwise XOR of the two operands, half by half.
+  return {vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..f26ea32fe0b1e8d2ab91149b28b002ceadfa1f3a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
@@ -0,0 +1,553 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <sleef.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+
+inline namespace CPU_CAPABILITY {
+
+template <> // trait set to true for float
+struct is_vec_specialized_for<float> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<float> { // VSX specialization: 8 floats in two 4-lane halves
+ private:
+  union { // one storage, viewed as float lanes or as comparison masks
+    struct {
+      vfloat32 _vec0;
+      vfloat32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = float;
+  using vec_internal_type = vfloat32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {} // lanes are left uninitialized
+  // One-argument constructors put the same value into both halves.
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2)
+      : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(float scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      float scalar1,
+      float scalar2,
+      float scalar3,
+      float scalar4,
+      float scalar5,
+      float scalar6,
+      float scalar7,
+      float scalar8)
+      : _vec0{vfloat32{scalar1, scalar2, scalar3, scalar4}},
+        _vec1{vfloat32{scalar5, scalar6, scalar7, scalar8}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+  // blend<mask>: blendChoice(mask) selects one of the overloads below.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 0, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return a; // choice 0: nothing is taken from b
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 1, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return b; // choice 1: everything is taken from b
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 2, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return {b._vec0, a._vec1}; // choice 2: low half from b, high half from a
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 3, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return {a._vec0, b._vec1}; // choice 3: low half from a, high half from b
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 4, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    const vbool32 mask_1st = VsxMask1(mask); // choice 4: low half per lane
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 5, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    const vbool32 mask_1st = VsxMask1(mask); // choice 5: low per lane, high = b
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 6, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    const vbool32 mask_2nd = VsxMask2(mask);
+    // choice 6: low half stays a, high half is selected per lane
+    return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 7, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    const vbool32 mask_2nd = VsxMask2(mask);
+    // choice 7: low half comes from b, high half is selected per lane
+    return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice(mask) == 8, Vectorized<float>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    const vbool32 mask_1st = VsxMask1(mask); // choice 8: both halves per lane
+    const vbool32 mask_2nd = VsxMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  static Vectorized<float> C10_ALWAYS_INLINE blendv(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      const Vectorized<float>& mask) {
+    // The mask is expected to be the result of a Vectorized comparison;
+    // under that assumption it can be passed to vec_sel as-is.
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+
+  template <typename step_t>
+  static Vectorized<float> arange(
+      float base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<float>( // lanes hold base + i * step for i = 0..7
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized<float> set(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      size_t count = size()) {
+    switch (count) { // blend mask has the low `count` bits set
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    // any other count (including the default size()) selects b entirely
+    return b;
+  }
+  // Loads min(count, size()) leading elements; lanes not loaded are zero.
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    __at_align__ value_type tmp_values[size()] = {};
+    if (count > 0) // a negative count would wrap into a huge memcpy length
+      std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) { // partial store; count <= 0 writes nothing
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  const float& operator[](int idx) const = delete; // lane indexing is disabled
+  float& operator[](int idx) = delete;
+  // Applies scalar `f` to every lane: low half first, then high half.
+  Vectorized<float> map(float (*const f)(float)) const {
+    Vectorized<float> ret;
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec0[i] = f(_vec0[i]);
+    }
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec1[i] = f(_vec1[i]);
+    }
+    return ret;
+  }
+  // Applies binary `f` to matching lanes of *this and `other`.
+  Vectorized<float> mapbi(
+      float (*const f)(float, float),
+      const Vectorized<float>& other) const {
+    Vectorized<float> ret;
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec0[i] = f(_vec0[i], other._vec0[i]);
+    }
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec1[i] = f(_vec1[i], other._vec1[i]);
+    }
+    return ret;
+  }
+  // Bitwise NOT of every lane (vec_nor of the value with itself).
+  Vectorized<float> _nor() const {
+    return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
+  }
+  // Inverts the mask of (x == x): set exactly where a lane differs from itself.
+  Vectorized<float> isnan() const {
+    auto x = *this;
+    auto ret = (x == x);
+    return ret._nor();
+  }
+  // True as soon as any lane (low half first) is NaN or infinite.
+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size() / 2)) {
+      if (_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size() / 2)) {
+      if (_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  int zero_mask() const {
+    // Returns an integer mask in which each zero lane is represented by a
+    // 1-bit and every other lane by a 0-bit.
+    // AVX counterpart: _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ)
+    auto cmp = (*this == zero);
+    // ... followed by _mm256_movemask_ps(cmp), emulated below with vbpermq.
+    // NOTE(review): bit layout relies on mask_zero_bits; shifts not re-derived.
+    vuint64 result0 = vec_vbpermq((vuint8)cmp._vecb0, mask_zero_bits);
+    vuint64 result1 = vec_vbpermq((vuint8)cmp._vecb1, mask_zero_bits);
+    return (result0[1] >> 12 | (result1[1] >> 8));
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+  // The Sleef_*f4* wrappers below call the routine once per half.
+  Vectorized<float> C10_ALWAYS_INLINE acos() const {
+    return {Sleef_acosf4_u10(_vec0), Sleef_acosf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE acosh() const {
+    return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE asin() const {
+    return {Sleef_asinf4_u10(_vec0), Sleef_asinf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE asinh() const {
+    return {Sleef_asinhf4_u10(_vec0), Sleef_asinhf4_u10(_vec1)};
+  }
+  Vectorized<float> atan() const {
+    return {Sleef_atanf4_u10(_vec0), Sleef_atanf4_u10(_vec1)};
+  }
+  Vectorized<float> atanh() const {
+    return {Sleef_atanhf4_u10(_vec0), Sleef_atanhf4_u10(_vec1)};
+  }
+  Vectorized<float> atan2(const Vectorized<float>& b) const {
+    return { // *this is passed as the first argument, b as the second
+        Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)};
+  }
+  Vectorized<float> copysign(const Vectorized<float>& sign) const {
+    return { // *this is passed first, `sign` second
+        Sleef_copysignf4(_vec0, sign._vec0),
+        Sleef_copysignf4(_vec1, sign._vec1)};
+  }
+  Vectorized<float> lgamma() const {
+    return {Sleef_lgammaf4_u10(_vec0), Sleef_lgammaf4_u10(_vec1)};
+  }
+  Vectorized<float> erf() const {
+    return {Sleef_erff4_u10(_vec0), Sleef_erff4_u10(_vec1)};
+  }
+
+  Vectorized<float> erfc() const { // note: uses the _u15 routine, not _u10
+    return {Sleef_erfcf4_u15(_vec0), Sleef_erfcf4_u15(_vec1)};
+  }
+  // erfinv is evaluated lane by lane through the scalar calc_erfinv.
+  Vectorized<float> erfinv() const {
+    return map(calc_erfinv);
+  }
+  // pi for lanes below zero, 0 otherwise; NaN lanes keep their own value.
+  Vectorized<float> angle() const {
+    auto tmp = blendv(
+        Vectorized<float>(0),
+        Vectorized<float>(c10::pi<float>),
+        *this < Vectorized<float>(0));
+    return blendv(tmp, *this, isnan());
+  }
+  Vectorized<float> real() const { // real-valued lanes: returns the value itself
+    return *this;
+  }
+  Vectorized<float> imag() const { // always all-zero
+    return Vectorized<float>{0};
+  }
+  Vectorized<float> conj() const { // returns the value unchanged
+    return *this;
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE exp() const {
+    return {Sleef_expf4_u10(_vec0), Sleef_expf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE exp2() const {
+    return {Sleef_exp2f4_u10(_vec0), Sleef_exp2f4_u10(_vec1)};
+  }
+  Vectorized<float> expm1() const {
+    return {Sleef_expm1f4_u10(_vec0), Sleef_expm1f4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
+    return exp(); // no separate reduced-accuracy path: same as exp()
+  }
+  Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
+    return exp(); // no separate fast path: same as exp()
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE log() const {
+    return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE log10() const {
+    return {Sleef_log10f4_u10(_vec0), Sleef_log10f4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE log1p() const {
+    return {Sleef_log1pf4_u10(_vec0), Sleef_log1pf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE log2() const {
+    return {Sleef_log2f4_u10(_vec0), Sleef_log2f4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE cos() const {
+    return {Sleef_cosf4_u10(_vec0), Sleef_cosf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE cosh() const {
+    return {Sleef_coshf4_u10(_vec0), Sleef_coshf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+  // NOTE(review): uses vec_round; confirm the tie-breaking rule is the one wanted.
+  Vectorized<float> C10_ALWAYS_INLINE round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE sin() const {
+    return {Sleef_sinf4_u10(_vec0), Sleef_sinf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE sinh() const {
+    return {Sleef_sinhf4_u10(_vec0), Sleef_sinhf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE tan() const {
+    return {Sleef_tanf4_u10(_vec0), Sleef_tanf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE tanh() const {
+    return {Sleef_tanhf4_u10(_vec0), Sleef_tanhf4_u10(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE frac() const { // x - trunc(x)
+    return *this - trunc();
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized<float> C10_ALWAYS_INLINE reciprocal() const {
+    return Vectorized<float>(one) / (*this); // full division by each lane
+  }
+  Vectorized<float> C10_ALWAYS_INLINE rsqrt() const { // reciprocal of sqrt(x)
+    return sqrt().reciprocal();
+  }
+
+  Vectorized<float> C10_ALWAYS_INLINE pow(const Vectorized<float>& exp) const {
+    return { // *this is passed as the first argument, `exp` as the second
+        Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)};
+  }
+
+  Vectorized<float> fmod(const Vectorized<float>& b) const {
+    return {Sleef_fmodf4(_vec0, b._vec0), Sleef_fmodf4(_vec1, b._vec1)};
+  }
+
+  Vectorized<float> hypot(const Vectorized<float>& b) const {
+    return { // note: the _u05 routine is used here
+        Sleef_hypotf4_u05(_vec0, b._vec0), Sleef_hypotf4_u05(_vec1, b._vec1)};
+  }
+
+  Vectorized<float> nextafter(const Vectorized<float>& b) const {
+    return { // *this is passed first, b second
+        Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)};
+  }
+  // The functions below fall back to scalar calc_* routines, lane by lane.
+  Vectorized<float> igamma(const Vectorized<float>& x) const {
+    return mapbi(calc_igamma, x);
+  }
+
+  Vectorized<float> igammac(const Vectorized<float>& x) const {
+    return mapbi(calc_igammac, x);
+  }
+
+  Vectorized<float> i0() const {
+    return map(calc_i0);
+  }
+
+  Vectorized<float> i0e() const {
+    return map(calc_i0e);
+  }
+
+  Vectorized<float> digamma() const {
+    return map(calc_digamma);
+  }
+  // Member operators generated by the DEFINE_MEMBER_* helper macros.
+  DEFINE_MEMBER_OP(operator==, float, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, float, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, float, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, float, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, float, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, float, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, float, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, float, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, float, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, float, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, float, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, float, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, float, vec_add)
+  DEFINE_MEMBER_OP(operator-, float, vec_sub)
+  DEFINE_MEMBER_OP(operator*, float, vec_mul)
+  DEFINE_MEMBER_OP(operator/, float, vec_div)
+  DEFINE_MEMBER_OP(maximum, float, vec_max_nan2)
+  DEFINE_MEMBER_OP(minimum, float, vec_min_nan2)
+  DEFINE_MEMBER_OP(operator&, float, vec_and)
+  DEFINE_MEMBER_OP(operator|, float, vec_or)
+  DEFINE_MEMBER_OP(operator^, float, vec_xor)
+  DEFINE_MEMBER_TERNARY_OP(madd, float, vec_madd)
+};
+
+// Free-function form of Vectorized<float>::maximum (built on vec_max_nan2).
+template <>
+Vectorized<float> inline maximum(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return a.maximum(b);
+}
+
+// Free-function form of Vectorized<float>::minimum (built on vec_min_nan2).
+template <>
+Vectorized<float> inline minimum(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  return a.minimum(b);
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Lane-wise sum, computed separately on each half.
+  return {vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Lane-wise difference a - b, computed separately on each half.
+  return {vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Lane-wise product, computed separately on each half.
+  return {vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Lane-wise quotient a / b, computed separately on each half.
+  return {vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Bitwise AND of the two operands, half by half.
+  return {vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Bitwise OR of the two operands, half by half.
+  return {vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE
+operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // Bitwise XOR of the two operands, half by half.
+  return {vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..5150ccf3a2cd6df9c05e1f2b1184912ebd9ad7fd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
@@ -0,0 +1,422 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <> // trait reports true for int16_t (specialized Vectorized follows)
+struct is_vec_specialized_for<int16_t> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<int16_t> { // 16 int16_t lanes stored as two vint16 halves
+ private:
+  union {
+    struct {
+      vint16 _vec0; // lanes 0-7
+      vint16 _vec1; // lanes 8-15
+    };
+    struct {
+      vbool16 _vecb0; // mask view of _vec0
+      vbool16 _vecb1; // mask view of _vec1
+    };
+    // The mask view lets comparison results be passed to vec_sel (see blendv).
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int16_t;
+  using vec_internal_type = vint16;
+  using vec_internal_mask_type = vbool16;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized() {} // storage is left uninitialized
+  C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(int16_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  // Per-lane constructor: scalar1-8 fill _vec0, scalar9-16 fill _vec1.
+  C10_ALWAYS_INLINE Vectorized(
+      int16_t scalar1,
+      int16_t scalar2,
+      int16_t scalar3,
+      int16_t scalar4,
+      int16_t scalar5,
+      int16_t scalar6,
+      int16_t scalar7,
+      int16_t scalar8,
+      int16_t scalar9,
+      int16_t scalar10,
+      int16_t scalar11,
+      int16_t scalar12,
+      int16_t scalar13,
+      int16_t scalar14,
+      int16_t scalar15,
+      int16_t scalar16)
+      : _vec0{vint16{
+            scalar1,
+            scalar2,
+            scalar3,
+            scalar4,
+            scalar5,
+            scalar6,
+            scalar7,
+            scalar8}},
+        _vec1{vint16{
+            scalar9,
+            scalar10,
+            scalar11,
+            scalar12,
+            scalar13,
+            scalar14,
+            scalar15,
+            scalar16}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+  // blend<mask>: bit i of mask set => lane i comes from b, otherwise from a.
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 0, Vectorized<int16_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    return a;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<(mask & 65535) == 65535, Vectorized<int16_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    return b;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 255, Vectorized<int16_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<(mask > 0 && mask < 255), Vectorized<int16_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    constexpr int16_t g0 = (mask & 1) * 0xffff;
+    constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff;
+    const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7};
+    // Only lanes 0-7 are selected here; lanes 8-15 always come from a.
+    return {(vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)),
+      Vectorized<int16_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    constexpr int16_t mask2 = (mask & 65535) >> 8; // selector bits, lanes 8-15
+    constexpr int16_t g0_2 = (mask2 & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask2 & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask2 & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask2 & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask2 & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask2 & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask2 & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask2 & 128) >> 7) * 0xffff;
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // Lanes 0-7 all come from b; mask_2nd picks per lane among lanes 8-15.
+    return {b._vec0, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)),
+      Vectorized<int16_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    constexpr int16_t mask2 = (mask & 65535) >> 8; // selector bits, lanes 8-15
+    constexpr int16_t g0_2 = (mask2 & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask2 & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask2 & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask2 & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask2 & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask2 & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask2 & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask2 & 128) >> 7) * 0xffff;
+
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // Lanes 0-7 all come from a; mask_2nd picks per lane among lanes 8-15.
+    return {a._vec0, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) &&
+       ((mask & 255) != 255)),
+      Vectorized<int16_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+    constexpr int16_t g0 = (mask & 1) * 0xffff;
+    constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff;
+    constexpr int16_t mask2 = (mask & 65535) >> 8; // selector bits, lanes 8-15
+    constexpr int16_t g0_2 = (mask2 & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask2 & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask2 & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask2 & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask2 & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask2 & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask2 & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask2 & 128) >> 7) * 0xffff;
+
+    const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7};
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // Both halves are mixed: mask_1st covers lanes 0-7, mask_2nd lanes 8-15.
+    return {
+        (vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st),
+        (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
+
+  static Vectorized<int16_t> C10_ALWAYS_INLINE blendv(
+      const Vectorized<int16_t>& a,
+      const Vectorized<int16_t>& b,
+      const Vectorized<int16_t>& mask) {
+    // the mask used here returned by comparison of vec256
+    // assuming this we can use the same mask directly with vec_sel
+    // warning intel style mask will not work properly
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  // arange: lane i holds base + i * step, converted to int16_t.
+  template <typename step_t>
+  static Vectorized<int16_t> arange(
+      int16_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int16_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  static Vectorized<int16_t> set(
+      const Vectorized<int16_t>& a,
+      const Vectorized<int16_t>& b,
+      size_t count = size()) { // first `count` lanes from b, the rest from a
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b; // count >= 16: every lane from b
+  }
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    // Partial load: stage through a zero-filled aligned buffer.
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) { // partial store via an aligned scratch buffer
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const int16_t& operator[](int idx) const = delete;
+  int16_t& operator[](int idx) = delete;
+  // angle(): c10::pi<int16_t> in lanes that compare < 0, 0 elsewhere.
+  Vectorized<int16_t> angle() const {
+    return blendv(
+        Vectorized<int16_t>(0),
+        Vectorized<int16_t>(c10::pi<int16_t>),
+        *this < Vectorized<int16_t>(0));
+  }
+  Vectorized<int16_t> real() const {
+    return *this;
+  }
+  Vectorized<int16_t> imag() const {
+    return Vectorized<int16_t>{0};
+  }
+  Vectorized<int16_t> conj() const {
+    return *this;
+  }
+
+  Vectorized<int16_t> C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized<int16_t> C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+  // The members below come from DEFINE_* macros that are not defined here.
+  DEFINE_MEMBER_UNARY_OP(operator~, int16_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int16_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int16_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int16_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int16_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int16_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int16_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int16_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int16_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int16_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int16_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int16_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int16_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int16_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int16_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int16_t, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int16_t, /)
+  DEFINE_MEMBER_OP(maximum, int16_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int16_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int16_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int16_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int16_t, vec_xor)
+};
+
+// Free-function form of the element-wise Vectorized<int16_t>::maximum.
+template <>
+Vectorized<int16_t> inline
+maximum(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return a.maximum(b);
+}
+
+// Free-function form of the element-wise Vectorized<int16_t>::minimum.
+template <>
+Vectorized<int16_t> inline
+minimum(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return a.minimum(b);
+}
+
+DEFINE_SHIFT_FUNCS(int16_t) // macro defined elsewhere; presumably emits the shift operators - verify
+
+// Lane-wise sum: each half is added with vec_add.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator+(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
+}
+
+// Lane-wise difference: each half is subtracted with vec_sub.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator-(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
+}
+
+// Lane-wise product: each half is multiplied with vec_mul.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator*(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
+}
+
+template <> // lane-wise quotient using the `/` operator on the vector halves
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator/(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {a.vec0() / b.vec0(), a.vec1() / b.vec1()};
+}
+
+// Bitwise AND, applied to each half with vec_and.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator&(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
+}
+
+// Bitwise OR, applied to each half with vec_or.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator|(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
+}
+
+// Bitwise XOR, applied to each half with vec_xor.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE
+operator^(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return {vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..baa0a95a9bd194a8a4f7cc3a1518a77d12bd8e58
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
@@ -0,0 +1,352 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <> // trait reports true for int32_t (specialized Vectorized follows)
+struct is_vec_specialized_for<int32_t> : std::bool_constant<true> {};
+
+template <>
+class Vectorized<int32_t> { // 8 int32_t lanes stored as two vint32 halves
+ private:
+  union {
+    struct {
+      vint32 _vec0; // lanes 0-3
+      vint32 _vec1; // lanes 4-7
+    };
+    struct {
+      vbool32 _vecb0; // mask view of _vec0
+      vbool32 _vecb1; // mask view of _vec1
+    };
+    // The mask view lets comparison results be passed to vec_sel (see blendv).
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int32_t;
+  using vec_internal_type = vint32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {} // storage is left uninitialized
+  C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(int32_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized( // per-lane: scalar1-4 -> _vec0, 5-8 -> _vec1
+      int32_t scalar1,
+      int32_t scalar2,
+      int32_t scalar3,
+      int32_t scalar4,
+      int32_t scalar5,
+      int32_t scalar6,
+      int32_t scalar7,
+      int32_t scalar8)
+      : _vec0{vint32{scalar1, scalar2, scalar3, scalar4}},
+        _vec1{vint32{scalar5, scalar6, scalar7, scalar8}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+  // blend<mask>: bit i of mask set => lane i comes from b, otherwise from a.
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 0, Vectorized<int32_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    return a;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<(mask & 255) == 255, Vectorized<int32_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    return b;
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 15, Vectorized<int32_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<(mask > 0 && mask < 15), Vectorized<int32_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    constexpr uint32_t g0 = (mask & 1) * 0xffffffff;
+    constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+    const vbool32 mask_1st = (vbool32){g0, g1, g2, g3};
+    // Only lanes 0-3 are selected here; lanes 4-7 always come from a.
+    return {(vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), a._vec1};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)),
+      Vectorized<int32_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // Lanes 0-3 all come from b; mask_2nd picks per lane among lanes 4-7.
+    return {b._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)),
+      Vectorized<int32_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // Lanes 0-3 all come from a; mask_2nd picks per lane among lanes 4-7.
+    return {a._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) &&
+       ((mask & 15) != 15)),
+      Vectorized<int32_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+    constexpr uint32_t g0 = (mask & 1) * 0xffffffff;
+    constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_1st = (vbool32){g0, g1, g2, g3};
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // Both halves are mixed: mask_1st covers lanes 0-3, mask_2nd lanes 4-7.
+    return {
+        (vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st),
+        (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  static Vectorized<int32_t> C10_ALWAYS_INLINE blendv(
+      const Vectorized<int32_t>& a,
+      const Vectorized<int32_t>& b,
+      const Vectorized<int32_t>& mask) {
+    // the mask used here returned by comparison of vec256
+    // assuming this we can use the same mask directly with vec_sel
+    // warning intel style mask will not work properly
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  // arange: lane i holds base + i * step, converted to int32_t.
+  template <typename step_t>
+  static Vectorized<int32_t> arange(
+      int32_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int32_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized<int32_t> set(
+      const Vectorized<int32_t>& a,
+      const Vectorized<int32_t>& b,
+      size_t count = size()) { // first `count` lanes from b, the rest from a
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+
+    return b; // count >= 8: every lane from b
+  }
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    // Partial load: stage through a zero-filled aligned buffer.
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) { // partial store via an aligned scratch buffer
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const int32_t& operator[](int idx) const = delete;
+  int32_t& operator[](int idx) = delete;
+  // angle(): c10::pi<int32_t> in lanes that compare < 0, 0 elsewhere.
+  Vectorized<int32_t> angle() const {
+    return blendv(
+        Vectorized<int32_t>(0),
+        Vectorized<int32_t>(c10::pi<int32_t>),
+        *this < Vectorized<int32_t>(0));
+  }
+  Vectorized<int32_t> real() const {
+    return *this;
+  }
+  Vectorized<int32_t> imag() const {
+    return Vectorized<int32_t>{0};
+  }
+  Vectorized<int32_t> conj() const {
+    return *this;
+  }
+
+  Vectorized<int32_t> C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized<int32_t> C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+  // The members below come from DEFINE_* macros that are not defined here.
+  DEFINE_MEMBER_UNARY_OP(operator~, int32_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int32_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int32_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int32_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int32_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int32_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int32_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int32_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int32_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int32_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int32_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int32_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int32_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int32_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int32_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int32_t, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int32_t, /)
+  DEFINE_MEMBER_OP(maximum, int32_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int32_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int32_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int32_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int32_t, vec_xor)
+};
+
+// Free-function form of the element-wise Vectorized<int32_t>::maximum.
+template <>
+Vectorized<int32_t> inline
+maximum(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return a.maximum(b);
+}
+
+// Free-function form of the element-wise Vectorized<int32_t>::minimum.
+template <>
+Vectorized<int32_t> inline
+minimum(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return a.minimum(b);
+}
+
+DEFINE_SHIFT_FUNCS(int32_t) // macro defined elsewhere; presumably emits the shift operators - verify
+
+// Lane-wise sum: each half is added with vec_add.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator+(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
+}
+
+// Lane-wise difference: each half is subtracted with vec_sub.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator-(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
+}
+
+// Lane-wise product: each half is multiplied with vec_mul.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator*(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
+}
+
+template <> // lane-wise quotient using the `/` operator on the vector halves
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator/(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {a.vec0() / b.vec0(), a.vec1() / b.vec1()};
+}
+
+// Bitwise AND, applied to each half with vec_and.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator&(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
+}
+
+// Bitwise OR, applied to each half with vec_or.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator|(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
+}
+
+// Bitwise XOR, applied to each half with vec_xor.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+operator^(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return {vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3012293b3c7b0c10855f86f6c747b50e4ee1a17
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
@@ -0,0 +1,306 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+// Trait flag: reports (as std::bool_constant<true>) that int64_t has a
+// dedicated Vectorized specialization in this CPU_CAPABILITY namespace.
+template <>
+struct is_vec_specialized_for<int64_t> : std::bool_constant<true> {};
+
+// Vectorized<int64_t> for VSX: four int64 lanes stored as two vint64 halves.
+// _vec0 holds lanes 0-1 and _vec1 holds lanes 2-3 (see the four-scalar
+// constructor below).
+template <>
+class Vectorized<int64_t> {
+ private:
+  // The same storage can be read either as integer vectors (_vec0/_vec1) or
+  // as boolean mask vectors (_vecb0/_vecb1). The union is marked may_alias
+  // because both views are used on the same bytes.
+  union {
+    struct {
+      vint64 _vec0;
+      vint64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int64_t;
+  using vec_internal_type = vint64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  using ElementType = signed long long;
+  // Number of int64 lanes in one Vectorized<int64_t>.
+  static constexpr size_type size() {
+    return 4;
+  }
+  // Leaves the storage uninitialized.
+  Vectorized() {}
+  // Single-vector constructors duplicate the argument into both halves.
+  C10_ALWAYS_INLINE Vectorized(vint64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  // Two-vector constructors set the low and high halves independently.
+  C10_ALWAYS_INLINE Vectorized(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+  // Broadcasts one scalar to all four lanes.
+  C10_ALWAYS_INLINE Vectorized(int64_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  // Builds the vector from four explicit lane values, in lane order.
+  C10_ALWAYS_INLINE Vectorized(
+      int64_t scalar1,
+      int64_t scalar2,
+      int64_t scalar3,
+      int64_t scalar4)
+      : _vec0{vint64{scalar1, scalar2}}, _vec1{vint64{scalar3, scalar4}} {}
+
+  // Read-only access to the low (lanes 0-1) and high (lanes 2-3) halves.
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  // blend<mask>(a, b): compile-time lane select. Bit i of `mask` (i = 0..3)
+  // picks lane i from `b` when set and from `a` when clear. The overloads
+  // below partition the mask values via enable_if so that exactly one is
+  // viable for any given mask; only the low four bits affect the result.
+
+  // mask == 0: nothing is taken from b.
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 0, Vectorized<int64_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    return a;
+  }
+
+  // mask == 3: lanes 0-1 from b, lanes 2-3 from a (whole-half swap).
+  template <uint64_t mask>
+  static std::enable_if_t<mask == 3, Vectorized<int64_t>> C10_ALWAYS_INLINE
+  blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  // Low four bits all set: every lane comes from b.
+  template <uint64_t mask>
+  static std::enable_if_t<(mask & 15) == 15, Vectorized<int64_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    return b;
+  }
+
+  // mask is 1 or 2: a single lane of the low half comes from b; the high
+  // half is a's.
+  template <uint64_t mask>
+  static std::enable_if_t<(mask > 0 && mask < 3), Vectorized<int64_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    // Expand each mask bit into an all-ones / all-zeros 64-bit lane mask.
+    constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+    constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+    const vbool64 mask_1st = (vbool64){g0, g1};
+    return {(vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), a._vec1};
+  }
+
+  // mask > 3 with bits 0-1 clear: the low half is a's; only the high half is
+  // selected per lane from bits 2-3.
+  template <uint64_t mask>
+  static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vectorized<int64_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff;
+    constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff;
+
+    const vbool64 mask_2nd = (vbool64){g0_2, g1_2};
+    return {a._vec0, (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)};
+  }
+
+  // Remaining masks (> 3, some of bits 0-1 set, low four bits not all set):
+  // both halves are selected per lane.
+  template <uint64_t mask>
+  static std::enable_if_t<
+      (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15,
+      Vectorized<int64_t>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+    constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+    constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+    constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff;
+    constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff;
+
+    const vbool64 mask_1st = (vbool64){g0, g1};
+    const vbool64 mask_2nd = (vbool64){g0_2, g1_2};
+    return {
+        (vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st),
+        (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)};
+  }
+
+  // Run-time select: vec_sel is applied to each half with `mask` read through
+  // its boolean view (_vecb0/_vecb1).
+  static Vectorized<int64_t> C10_ALWAYS_INLINE blendv(
+      const Vectorized<int64_t>& a,
+      const Vectorized<int64_t>& b,
+      const Vectorized<int64_t>& mask) {
+    // the mask used here returned by comparison of vec256
+
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  // Returns {base, base + step, base + 2 * step, base + 3 * step}. The lane
+  // expressions are evaluated in the common type of int64_t and step_t and
+  // then converted to int64_t by the four-scalar constructor.
+  template <typename step_t>
+  static Vectorized<int64_t> arange(
+      int64_t base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int64_t>(
+        base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  // Returns a vector whose first `count` lanes come from b and the rest from
+  // a. Any count outside 0..3 (including the default size()) returns b.
+  static Vectorized<int64_t> C10_ALWAYS_INLINE
+  set(const Vectorized<int64_t>& a,
+      const Vectorized<int64_t>& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+
+    return b;
+  }
+  // Loads up to size() int64 values from `ptr` via unaligned VSX loads.
+  // With count == size() the memory is read directly; otherwise
+  // min(count, size()) elements are copied into a zero-initialized temporary
+  // first, so the remaining lanes are 0.
+  // NOTE(review): a negative count is not rejected here and would reach
+  // memcpy as a huge size - callers presumably never pass one; confirm.
+  static Vectorized<value_type> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      static_assert(sizeof(double) == sizeof(value_type));
+      const double* dptr = reinterpret_cast<const double*>(ptr);
+      // offset0 / offset16 are defined outside this file (presumably byte
+      // offsets selecting the first and second 16-byte half).
+      return {// treat it as double load
+              (vint64)vec_vsx_ld(offset0, dptr),
+              (vint64)vec_vsx_ld(offset16, dptr)};
+    }
+
+    __at_align__ double tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {
+        (vint64)vec_vsx_ld(offset0, tmp_values),
+        (vint64)vec_vsx_ld(offset16, tmp_values)};
+  }
+  // Stores the first `count` lanes to `ptr`. A full store writes both halves
+  // directly; a partial store (0 < count, count != size()) goes through a
+  // temporary and copies min(count, size()) elements. count <= 0 writes
+  // nothing.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      double* dptr = reinterpret_cast<double*>(ptr);
+      vec_vsx_st((vfloat64)_vec0, offset0, dptr);
+      vec_vsx_st((vfloat64)_vec1, offset16, dptr);
+    } else if (count > 0) {
+      __at_align__ double tmp_values[size()];
+      vec_vsx_st((vfloat64)_vec0, offset0, tmp_values);
+      vec_vsx_st((vfloat64)_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  // Per-lane indexing is intentionally unavailable.
+  const int64_t& operator[](int idx) const = delete;
+  int64_t& operator[](int idx) = delete;
+
+  // Selects between 0 and c10::pi<int64_t> per lane using the result of
+  // `*this < 0` as the blendv mask (pi where the comparison mask is set,
+  // 0 elsewhere).
+  Vectorized<int64_t> angle() const {
+    return blendv(
+        Vectorized<int64_t>(0),
+        Vectorized<int64_t>(c10::pi<int64_t>),
+        *this < Vectorized<int64_t>(0));
+  }
+  // Complex-number style accessors for a real type: real() and conj() return
+  // the vector unchanged, imag() returns all zeros.
+  Vectorized<int64_t> real() const {
+    return *this;
+  }
+  Vectorized<int64_t> imag() const {
+    return Vectorized<int64_t>{0};
+  }
+  Vectorized<int64_t> conj() const {
+    return *this;
+  }
+
+  // Lane-wise absolute value (vec_abs on each half).
+  Vectorized<int64_t> C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  // Lane-wise negation (vec_neg on each half).
+  Vectorized<int64_t> C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  // Member operators generated by macros that are defined outside this file
+  // (presumably in vsx_helpers.h); each pairs a member name with the VSX
+  // builtin it applies. NOTE(review): the *_AND_ONE variants (eq, ne, ...)
+  // presumably produce 0/1 values rather than all-ones masks - confirm
+  // against the macro definitions.
+  DEFINE_MEMBER_UNARY_OP(operator~, int64_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int64_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int64_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int64_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int64_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int64_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int64_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int64_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int64_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int64_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int64_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int64_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int64_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int64_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int64_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int64_t, vec_mul)
+  DEFINE_MEMBER_OP(operator/, int64_t, vec_div)
+  DEFINE_MEMBER_OP(maximum, int64_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int64_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int64_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int64_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int64_t, vec_xor)
+};
+
+// Free-function maximum for int64 vectors; delegates to the member
+// Vectorized<int64_t>::maximum.
+template <>
+inline Vectorized<int64_t> maximum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return a.maximum(b);
+}
+
+// Free-function minimum for int64 vectors; delegates to the member
+// Vectorized<int64_t>::minimum.
+template <>
+inline Vectorized<int64_t> minimum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return a.minimum(b);
+}
+
+// Macro defined outside this file; presumably emits the shift-operator
+// specializations for Vectorized<int64_t> - confirm against its definition.
+DEFINE_SHIFT_FUNCS(int64_t)
+
+// Lane-wise sum of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator+(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_add(a.vec0(), b.vec0());
+  const auto hi = vec_add(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Lane-wise difference (a - b) of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator-(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_sub(a.vec0(), b.vec0());
+  const auto hi = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Lane-wise product of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator*(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_mul(a.vec0(), b.vec0());
+  const auto hi = vec_mul(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Lane-wise quotient (a / b) of two int64 vectors via vec_div on each half.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator/(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_div(a.vec0(), b.vec0());
+  const auto hi = vec_div(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Bitwise AND of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator&(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_and(a.vec0(), b.vec0());
+  const auto hi = vec_and(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Bitwise OR of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator|(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_or(a.vec0(), b.vec0());
+  const auto hi = vec_or(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// Bitwise XOR of two int64 vectors, one half at a time.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+operator^(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const auto lo = vec_xor(a.vec0(), b.vec0());
+  const auto hi = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<int64_t>{lo, hi};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..f02be95efa692b75a8ba7349492d58177b66a978
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_mask_vsx.h
@@ -0,0 +1,74 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_mask.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_VSX)
+
+// float mask -> int mask: each of the N sub-vectors is copied and then
+// reinterpreted as Vectorized<int>; no per-element conversion is performed.
+template <int N>
+struct VecMaskCast<int, N, float, N> {
+  static inline VecMask<int, N> apply(const VecMask<float, N>& vec_mask) {
+    VectorizedN<int, N> as_int;
+
+    for (int idx = 0; idx < N; ++idx) {
+      // Take a local copy so the reinterpret_cast refers to a named object.
+      auto chunk = vec_mask[idx];
+      as_int[idx] = reinterpret_cast<const Vectorized<int>&>(chunk);
+    }
+    return VecMask<int, N>(as_int);
+  }
+};
+
+// int mask -> float mask: each of the N sub-vectors is copied and then
+// reinterpreted as Vectorized<float>; no per-element conversion is performed.
+template <int N>
+struct VecMaskCast<float, N, int, N> {
+  static inline VecMask<float, N> apply(const VecMask<int, N>& vec_mask) {
+    VectorizedN<float, N> as_float;
+
+    for (int idx = 0; idx < N; ++idx) {
+      // Take a local copy so the reinterpret_cast refers to a named object.
+      auto chunk = vec_mask[idx];
+      as_float[idx] = reinterpret_cast<const Vectorized<float>&>(chunk);
+    }
+    return VecMask<float, N>(as_float);
+  }
+};
+
+// float/int mask -> int64 mask. Enabled only when the destination has twice
+// as many sub-vectors as the source (dst_n == 2 * mask_n). The source is
+// first cast to an int mask; every int sub-vector is then converted into two
+// int64 sub-vectors, written to consecutive output slots.
+template <int dst_n, typename mask_t, int mask_n>
+struct VecMaskCast<
+    int64_t,
+    dst_n,
+    mask_t,
+    mask_n,
+    typename std::enable_if_t<
+        (dst_n == 2 * mask_n) &&
+        (std::is_same_v<mask_t, float> || std::is_same_v<mask_t, int>)>> {
+  static inline VecMask<int64_t, dst_n> apply(
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    VectorizedN<int64_t, dst_n> widened;
+
+    auto as_int_mask = vec_mask.template cast<int, mask_n>();
+
+    for (int src = 0; src < mask_n; ++src) {
+      // Wrap the single int sub-vector so it can be passed to convert<>.
+      VectorizedN<int, 1> single;
+      single[0] = as_int_mask[src];
+
+      auto pair = convert<int64_t, 2, int, 1>(single);
+
+      widened[2 * src] = pair[0];
+      widened[2 * src + 1] = pair[1];
+    }
+    return VecMask<int64_t, dst_n>(widened);
+  }
+};
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..692607d4d5254353f74d43ce88404cb96d9d770b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
@@ -0,0 +1,306 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/qint32.h>
+#include <array>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+// Trait flag: reports (as std::bool_constant<true>) that c10::qint32 has a
+// dedicated Vectorized specialization in this CPU_CAPABILITY namespace.
+template <>
+struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};
+// Vectorized<c10::qint32> for VSX: eight quantized 32-bit lanes stored as two
+// vint32 halves. It converts to and from exactly one Vectorized<float>
+// (float_num_vecs() == 1).
+template <>
+struct Vectorized<c10::qint32> {
+ private:
+  // The same storage can be read either as integer vectors (_vec0/_vec1) or
+  // as boolean mask vectors (_vecb0/_vecb1).
+  union {
+    struct {
+      vint32 _vec0;
+      vint32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  // Leaves the storage uninitialized.
+  Vectorized() {}
+
+  using size_type = int;
+  // Number of qint32 lanes.
+  static constexpr size_type size() {
+    return 8;
+  }
+
+  // Number of Vectorized<float> produced by dequantize() / consumed by
+  // quantize().
+  static constexpr size_t float_num_vecs() {
+    return 1;
+  }
+  // Number of Vectorized<qint32> produced by widening_subtract() / consumed
+  // by requantize_from_int().
+  static constexpr int int_num_vecs() {
+    return 1;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 1>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
+  using value_type = c10::qint32::underlying;
+  using vec_internal_type = vint32;
+  using vec_internal_mask_type = vbool32;
+  // Single-vector constructors duplicate the argument into both halves.
+  C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  // Two-vector constructors set the low and high halves independently.
+  C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2)
+      : _vecb0{v1}, _vecb1{v2} {}
+
+  // Broadcasts the raw integer value of `val` to all lanes.
+  Vectorized(const c10::qint32& val)
+      : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {}
+
+  // Loads up to size() values from `ptr` via unaligned VSX loads. With
+  // count == size() the memory is read directly; otherwise
+  // min(count, size()) elements are copied into a zero-initialized temporary
+  // first, so the remaining lanes are 0.
+  // NOTE(review): a negative count is not rejected here and would reach
+  // memcpy as a huge size - callers presumably never pass one; confirm.
+  static Vectorized<c10::qint32> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  // Stores the first `count` lanes to `ptr`. A full store writes both halves
+  // directly; a partial store goes through a temporary and copies
+  // min(count, size()) elements. count <= 0 writes nothing.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  // Read-only access to the low and high halves.
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  // Dequantizes to float as scale * (float(x) - zero_point), lane by lane.
+  // `scale_zp_premul` is accepted for interface compatibility but is not
+  // used by this implementation.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    vfloat32 float_vals0 = vec_float(_vec0);
+    vfloat32 float_vals1 = vec_float(_vec1);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 zero_point_vec0 = zero_point.vec0();
+    vfloat32 zero_point_vec1 = zero_point.vec1();
+
+    vfloat32 vec_sub_zero_point_0 = vec_sub(float_vals0, zero_point_vec0);
+    vfloat32 vec_sub_zero_point_1 = vec_sub(float_vals1, zero_point_vec1);
+    Vectorized<float> vf0 = {
+        vec_mul(scale_vec0, vec_sub_zero_point_0),
+        vec_mul(scale_vec1, vec_sub_zero_point_1)};
+    return {vf0};
+  }
+
+  // Dequantizes to float as (float(x) - zero_point) * scale, lane by lane.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    vfloat32 float_vals0 = vec_float(_vec0);
+    vfloat32 float_vals1 = vec_float(_vec1);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 zero_point0 = zero_point.vec0();
+    vfloat32 zero_point1 = zero_point.vec1();
+    return {Vectorized<float>{
+        (float_vals0 - zero_point0) * scale_vec0,
+        (float_vals1 - zero_point1) * scale_vec1}};
+  }
+
+  // Quantizes rhs[0]: each float lane is multiplied by `inverse_scale`,
+  // rounded with vec_rint, offset by `zero_point`, converted to int32 and
+  // clamped to the value_type range. `scale` is accepted for interface
+  // compatibility but is not used by this implementation.
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    const vint32 vmin = vec_splats(std::numeric_limits<value_type>::min());
+    const vint32 vmax = vec_splats(std::numeric_limits<value_type>::max());
+    vfloat32 inverse_scale_v = vec_splats(inverse_scale);
+    vfloat32 vec_zero_point = vec_splats((float)(zero_point));
+    Vectorized<float> vf0 = rhs[0];
+
+    vfloat32 vecf0 = vf0.vec0();
+    vfloat32 vecf1 = vf0.vec1();
+    vecf0 = vec_mul(vecf0, inverse_scale_v);
+    vecf1 = vec_mul(vecf1, inverse_scale_v);
+    // The zero point is added in the float domain, before the int conversion.
+    vecf0 = vec_add(vec_rint(vecf0), vec_zero_point);
+    vecf1 = vec_add(vec_rint(vecf1), vec_zero_point);
+    vint32 veci0 = vec_signed(vecf0);
+    vint32 veci1 = vec_signed(vecf1);
+
+    veci0 = vec_max(veci0, vmin);
+    veci1 = vec_max(veci1, vmin);
+    veci0 = vec_min(veci0, vmax);
+    veci1 = vec_min(veci1, vmax);
+
+    return {veci0, veci1};
+  }
+
+  // Lane-wise max(x, zero_point).
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)};
+  }
+
+  // Lane-wise min(max(x, zero_point), q_six).
+  Vectorized<c10::qint32> relu6(
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) const {
+    vint32 max0 = vec_max(_vec0, zero_point._vec0);
+    vint32 max1 = vec_max(_vec1, zero_point._vec1);
+    return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)};
+  }
+
+  // Returns {*this - b}; the result keeps the 32-bit element type, so there
+  // is a single output vector.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    return {*this - b};
+  }
+
+  // Requantizes inp[0]: each lane is converted to float, multiplied by
+  // `multiplier`, rounded with vec_rint, converted back to int32, offset by
+  // `zero_point` (integer add) and clamped to the value_type range.
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    const vint32 vmin = vec_splats(std::numeric_limits<value_type>::min());
+    const vint32 vmax = vec_splats(std::numeric_limits<value_type>::max());
+    vfloat32 vec_mult = vec_splats(multiplier);
+    vint32 vec_zero_point = vec_splats(zero_point);
+    Vectorized<c10::qint32> vi = inp[0];
+    vfloat32 vecf0 = vec_float(vi.vec0());
+    vfloat32 vecf1 = vec_float(vi.vec1());
+
+    vecf0 = vec_mul(vecf0, vec_mult);
+    vecf1 = vec_mul(vecf1, vec_mult);
+
+    vecf0 = vec_rint(vecf0);
+    vecf1 = vec_rint(vecf1);
+
+    vint32 veci0 = vec_add(vec_signed(vecf0), vec_zero_point);
+    vint32 veci1 = vec_add(vec_signed(vecf1), vec_zero_point);
+
+    veci0 = vec_max(veci0, vmin);
+    veci1 = vec_max(veci1, vmin);
+    veci0 = vec_min(veci0, vmax);
+    veci1 = vec_min(veci1, vmax);
+
+    return {veci0, veci1};
+  }
+
+  // Member operators generated by macros that are defined outside this file
+  // (presumably in vsx_helpers.h); each pairs a member name with the VSX
+  // builtin (or, for operator/, the plain operator) it applies.
+  DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::qint32, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::qint32, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::qint32, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::qint32, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::qint32, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::qint32, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint32, /)
+  DEFINE_MEMBER_OP(maximum, c10::qint32, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::qint32, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::qint32, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::qint32, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::qint32, vec_xor)
+};
+
+// Free-function maximum for qint32 vectors; delegates to the member
+// Vectorized<c10::qint32>::maximum.
+template <>
+inline Vectorized<c10::qint32> maximum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+// Free-function minimum for qint32 vectors; delegates to the member
+// Vectorized<c10::qint32>::minimum.
+template <>
+inline Vectorized<c10::qint32> minimum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.minimum(b);
+}
+
+// Lane-wise sum of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator+(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_add(a.vec0(), b.vec0());
+  const auto hi = vec_add(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Lane-wise difference (a - b) of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator-(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_sub(a.vec0(), b.vec0());
+  const auto hi = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Lane-wise product of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator*(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_mul(a.vec0(), b.vec0());
+  const auto hi = vec_mul(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Lane-wise quotient (a / b) of two qint32 vectors, using the built-in
+// vector division operator on each half.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator/(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = a.vec0() / b.vec0();
+  const auto hi = a.vec1() / b.vec1();
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Bitwise AND of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator&(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_and(a.vec0(), b.vec0());
+  const auto hi = vec_and(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Bitwise OR of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator|(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_or(a.vec0(), b.vec0());
+  const auto hi = vec_or(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+// Bitwise XOR of two qint32 vectors, one half at a time.
+template <>
+Vectorized<c10::qint32> C10_ALWAYS_INLINE
+operator^(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  const auto lo = vec_xor(a.vec0(), b.vec0());
+  const auto hi = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<c10::qint32>{lo, hi};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fb5b62c5c0d898bd0fba05898123b7fa53bed5e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
@@ -0,0 +1,517 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/qint8.h>
+#include <array>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+// Trait flag: reports (as std::bool_constant<true>) that c10::qint8 has a
+// dedicated Vectorized specialization in this CPU_CAPABILITY namespace.
+template <>
+struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};
+template <>
+struct Vectorized<c10::qint8> {
+ private:
+  union {
+    struct {
+      vint8 _vec0;
+      vint8 _vec1;
+    };
+    struct {
+      vbool8 _vecb0;
+      vbool8 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  Vectorized() {}
+  using size_type = int;
+  static constexpr size_type size() {
+    return 32;
+  }
+
+  static constexpr size_t float_num_vecs() {
+    return 4;
+  }
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = typename c10::qint8::underlying;
+  using vec_internal_type = vint8;
+  using vec_internal_mask_type = vbool8;
+  // Broadcast constructor
+  C10_ALWAYS_INLINE Vectorized(const c10::qint8& val)
+      : _vec0{vec_splats(val.val_)}, _vec1{vec_splats(val.val_)} {}
+
+  C10_ALWAYS_INLINE Vectorized(const Vectorized<c10::qint8>& other)
+      : _vec0{other._vec0}, _vec1(other._vec1) {}
+
+  C10_ALWAYS_INLINE Vectorized(vint8 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint8 v1, vint8 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  static C10_ALWAYS_INLINE Vectorized<c10::qint8> loadu(
+      const void* ptr,
+      int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const vint8*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const vint8*>(ptr))};
+    }
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+ public:
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    vint16 vecshi0 = vec_unpackh(_vec0);
+    vint16 vecshi1 = vec_unpackl(_vec0);
+
+    vint16 vecshi2 = vec_unpackh(_vec1);
+    vint16 vecshi3 = vec_unpackl(_vec1);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+
+    vfloat32 vecf0_0 = vec_float(veci0);
+    vfloat32 vecf1_0 = vec_float(veci1);
+
+    vfloat32 vecf0_1 = vec_float(veci2);
+    vfloat32 vecf1_1 = vec_float(veci3);
+
+    vfloat32 vecf0_2 = vec_float(veci4);
+    vfloat32 vecf1_2 = vec_float(veci5);
+
+    vfloat32 vecf0_3 = vec_float(veci6);
+    vfloat32 vecf1_3 = vec_float(veci7);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+
+    vfloat32 zero_point_vec0 = zero_point.vec0();
+    vfloat32 zero_point_vec1 = zero_point.vec1();
+
+    vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0);
+    vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1);
+    Vectorized<float> vf0_zp = {
+        vec_mul(scale_vec0, vec_substract_src_zp0_0),
+        vec_mul(scale_vec1, vec_substract_src_zp1_0)};
+
+    vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0);
+    vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1);
+    Vectorized<float> vf1_zp = {
+        vec_mul(scale_vec0, vec_substract_src_zp0_1),
+        vec_mul(scale_vec1, vec_substract_src_zp1_1)};
+
+    vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0);
+    vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1);
+    Vectorized<float> vf2_zp = {
+        vec_mul(scale_vec0, vec_substract_src_zp0_2),
+        vec_mul(scale_vec1, vec_substract_src_zp1_2)};
+
+    vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0);
+    vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1);
+    Vectorized<float> vf3_zp = {
+        vec_mul(scale_vec0, vec_substract_src_zp0_3),
+        vec_mul(scale_vec1, vec_substract_src_zp1_3)};
+
+    return {vf0_zp, vf1_zp, vf2_zp, vf3_zp};
+  }
+
+  // Dequantizes all 32 stored int8 values to float.
+  //
+  // Every value is widened to int32, converted to float and mapped to
+  // (value - zero_point) * scale. The result is four Vectorized<float>; the
+  // first half of each output uses scale.vec0() / zero_point.vec0() and the
+  // second half uses scale.vec1() / zero_point.vec1().
+  float_vec_return_type C10_ALWAYS_INLINE
+  dequantize(Vectorized<float> scale, Vectorized<float> zero_point) const {
+    const vfloat32 scale0 = scale.vec0();
+    const vfloat32 scale1 = scale.vec1();
+    const vfloat32 zp0 = zero_point.vec0();
+    const vfloat32 zp1 = zero_point.vec1();
+
+    // Takes eight int16 lanes, widens them to two groups of four int32
+    // lanes, converts to float and applies zero point and scale.
+    auto dequantize_eight = [&](vint16 lanes) {
+      const vfloat32 first = vec_float(vec_unpackh(lanes));
+      const vfloat32 second = vec_float(vec_unpackl(lanes));
+      return Vectorized<float>{
+          (first - zp0) * scale0, (second - zp1) * scale1};
+    };
+
+    return {
+        dequantize_eight(vec_unpackh(_vec0)),
+        dequantize_eight(vec_unpackl(_vec0)),
+        dequantize_eight(vec_unpackh(_vec1)),
+        dequantize_eight(vec_unpackl(_vec1))};
+  }
+
+  // Quantizes four Vectorized<float> (32 floats) into 32 int8 values.
+  //
+  // Each float is multiplied by inverse_scale, rounded with vec_rint, offset
+  // by zero_point and converted to int32; the results are then narrowed
+  // int32 -> int16 -> int8 with vec_packs, which already clamps, so no
+  // explicit min/max is applied. The `scale` argument is not read here.
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    const vfloat32 inv_scale = vec_splats(inverse_scale);
+    const vfloat32 zp = vec_splats((float)zero_point);
+
+    // rint(x * inverse_scale) + zero_point for four float lanes, as int32.
+    auto to_int32 = [&](vfloat32 lanes) -> vint32 {
+      return vec_signed(vec_add(vec_rint(vec_mul(lanes, inv_scale)), zp));
+    };
+    // Quantizes one Vectorized<float> and narrows its eight lanes to int16.
+    auto to_int16 = [&](Vectorized<float> v) -> vint16 {
+      return vec_packs(to_int32(v.vec0()), to_int32(v.vec1()));
+    };
+
+    vint8 first = vec_packs(to_int16(rhs[0]), to_int16(rhs[1]));
+    vint8 second = vec_packs(to_int16(rhs[2]), to_int16(rhs[3]));
+    return {first, second};
+  }
+
+  // Element-wise max(*this, zero_point) on the stored int8 values.
+  Vectorized<c10::qint8> C10_ALWAYS_INLINE
+  relu(Vectorized<c10::qint8> zero_point) const {
+    vint8 first = vec_max(_vec0, zero_point._vec0);
+    vint8 second = vec_max(_vec1, zero_point._vec1);
+    return {first, second};
+  }
+
+  // Element-wise min(max(*this, zero_point), q_six) on the stored int8
+  // values: the lower bound is applied first, then the upper bound.
+  Vectorized<c10::qint8> C10_ALWAYS_INLINE
+  relu6(Vectorized<c10::qint8> zero_point, Vectorized<c10::qint8> q_six) const {
+    return {
+        vec_min(vec_max(_vec0, zero_point._vec0), q_six._vec0),
+        vec_min(vec_max(_vec1, zero_point._vec1), q_six._vec1)};
+  }
+
+  // Computes *this - b element-wise in 32-bit precision.
+  //
+  // Both operands are widened int8 -> int16 -> int32 before subtracting; the
+  // 32 differences are returned as four Vectorized<c10::qint32>.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    // Widens eight int16 lanes of each operand to int32 and subtracts them.
+    auto subtract_eight = [](vint16 lhs, vint16 rhs) {
+      return Vectorized<c10::qint32>(
+          vec_unpackh(lhs) - vec_unpackh(rhs),
+          vec_unpackl(lhs) - vec_unpackl(rhs));
+    };
+
+    return {
+        subtract_eight(vec_unpackh(_vec0), vec_unpackh(b._vec0)),
+        subtract_eight(vec_unpackl(_vec0), vec_unpackl(b._vec0)),
+        subtract_eight(vec_unpackh(_vec1), vec_unpackh(b._vec1)),
+        subtract_eight(vec_unpackl(_vec1), vec_unpackl(b._vec1))};
+  }
+
+  // Requantizes four Vectorized<c10::qint32> (32 values) to int8.
+  //
+  // Each int32 is converted to float, multiplied by `multiplier`, rounded
+  // with vec_rint and converted back to int32; `zero_point` is then added in
+  // integer arithmetic and the result is narrowed int32 -> int16 -> int8
+  // with vec_packs.
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    const vfloat32 scale_by = vec_splats(multiplier);
+    const vint32 offset = vec_splats(zero_point);
+
+    // rint(float(x) * multiplier) + zero_point for four int32 lanes.
+    auto rescale = [&](vint32 lanes) -> vint32 {
+      const vfloat32 rounded = vec_rint(vec_mul(vec_float(lanes), scale_by));
+      return vec_add(vec_signed(rounded), offset);
+    };
+    // Rescales one Vectorized<c10::qint32> and narrows it to eight int16.
+    auto narrow = [&](Vectorized<c10::qint32> v) -> vint16 {
+      return vec_packs(rescale(v.vec0()), rescale(v.vec1()));
+    };
+
+    vint8 first = vec_packs(narrow(inp[0]), narrow(inp[1]));
+    vint8 second = vec_packs(narrow(inp[2]), narrow(inp[3]));
+    return {first, second};
+  }
+
+  // Member comparison, arithmetic, min/max and bitwise operations generated
+  // by helper macros that are defined outside this chunk. Each entry names
+  // the member, the element type and the VSX intrinsic it is built from;
+  // operator/ goes through DEFINE_MEMBER_EMULATE_BINARY_OP with the plain
+  // `/` operator instead of an intrinsic.
+  // NOTE(review): presumably each macro applies its operation to both
+  // 16-byte halves -- confirm against the macro definitions.
+  DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::qint8, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::qint8, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::qint8, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::qint8, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::qint8, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::qint8, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint8, /)
+  DEFINE_MEMBER_OP(maximum, c10::qint8, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::qint8, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::qint8, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::qint8, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::qint8, vec_xor)
+};
+
+// Free-function maximum for Vectorized<c10::qint8>; forwards to the member
+// a.maximum(b).
+template <>
+Vectorized<c10::qint8> inline maximum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+// Free-function minimum for Vectorized<c10::qint8>; forwards to the member
+// a.minimum(b).
+template <>
+Vectorized<c10::qint8> inline minimum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.minimum(b);
+}
+
+// Element-wise addition of the stored values: vec_add on each 16-byte half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator+(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_add(a.vec0(), b.vec0());
+  const auto second = vec_add(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Element-wise subtraction of the stored values: vec_sub on each 16-byte
+// half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator-(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_sub(a.vec0(), b.vec0());
+  const auto second = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Element-wise multiplication of the stored values: vec_mul on each 16-byte
+// half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator*(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_mul(a.vec0(), b.vec0());
+  const auto second = vec_mul(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Element-wise division of the stored values, using the vector `/` operator
+// on each 16-byte half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator/(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = a.vec0() / b.vec0();
+  const auto second = a.vec1() / b.vec1();
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Bitwise AND of the stored values: vec_and on each 16-byte half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator&(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_and(a.vec0(), b.vec0());
+  const auto second = vec_and(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Bitwise OR of the stored values: vec_or on each 16-byte half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator|(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_or(a.vec0(), b.vec0());
+  const auto second = vec_or(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+// Bitwise XOR of the stored values: vec_xor on each 16-byte half.
+template <>
+Vectorized<c10::qint8> C10_ALWAYS_INLINE
+operator^(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  const auto first = vec_xor(a.vec0(), b.vec0());
+  const auto second = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<c10::qint8>{first, second};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..9da6dec9db5e0314d3b70f8b4f0e5d919f02490d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h
@@ -0,0 +1,538 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+#include <c10/util/irange.h>
+#include <c10/util/quint8.h>
+#include <array>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+// Trait specialization that evaluates to true for c10::quint8, i.e. marks
+// the element type as having the dedicated Vectorized<> specialization
+// defined below in this header.
+template <>
+struct is_vec_specialized_for<c10::quint8> : std::bool_constant<true> {};
+
+// 0x00FF in every 16-bit lane. Vectorized<c10::quint8> unpacks its unsigned
+// bytes through the signed unpack intrinsics and then ANDs with this mask to
+// turn the sign-extended result back into the unsigned value.
+const vint16 mask_unsigned = vec_splats((short int)0xFF);
+// Vectorized<> specialization for c10::quint8: 32 unsigned 8-bit quantized
+// values held in two 16-byte VSX vectors.
+template <>
+struct Vectorized<c10::quint8> {
+ private:
+  // Storage: the same two vectors viewed either as unsigned bytes
+  // (_vec0 / _vec1) or as byte masks (_vecb0 / _vecb1).
+  union {
+    struct {
+      vuint8 _vec0;
+      vuint8 _vec1;
+    };
+    struct {
+      vbool8 _vecb0;
+      vbool8 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+  // Widens the eight bytes selected by vec_unpackh to 16-bit lanes. The
+  // unpack is done on the signed reinterpretation of the bytes, so the mask
+  // clears the sign-extension bits and leaves the unsigned value.
+  static C10_ALWAYS_INLINE vint16 widen_high(vuint8 bytes) {
+    return vec_and(vec_unpackh((vint8)bytes), mask_unsigned);
+  }
+
+  // Same as widen_high, for the eight bytes selected by vec_unpackl.
+  static C10_ALWAYS_INLINE vint16 widen_low(vuint8 bytes) {
+    return vec_and(vec_unpackl((vint8)bytes), mask_unsigned);
+  }
+
+ public:
+  // Default construction leaves both vectors uninitialized.
+  Vectorized() {}
+  using size_type = int;
+  // Number of quint8 elements held by one Vectorized.
+  static constexpr size_type size() {
+    return 32;
+  }
+
+  // Number of Vectorized<float> in float_vec_return_type.
+  static constexpr size_t float_num_vecs() {
+    return 4;
+  }
+  // Number of Vectorized<c10::qint32> in int_vec_return_type.
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = typename c10::quint8::underlying;
+  using vec_internal_type = vuint8;
+  using vec_internal_mask_type = vbool8;
+  // Broadcast constructor
+  C10_ALWAYS_INLINE Vectorized(const c10::quint8& val)
+      : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {}
+
+  C10_ALWAYS_INLINE Vectorized(const Vectorized<c10::quint8>& other)
+      : _vec0{other._vec0}, _vec1(other._vec1) {}
+
+  // Single-vector constructors duplicate the argument into both halves; the
+  // two-vector constructors take the first and second half separately.
+  C10_ALWAYS_INLINE Vectorized(vuint8 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vuint8 v1, vuint8 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  // Read-only access to the first / second 16-byte half.
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  // Loads up to size() elements from ptr.
+  //
+  // With count == size() both halves are read straight from ptr. Otherwise
+  // min(count, size()) elements are copied into a zero-initialized aligned
+  // buffer first, so the remaining lanes are 0. A count <= 0 returns all
+  // zeros without reading ptr, mirroring the count > 0 guard in store().
+  static C10_ALWAYS_INLINE Vectorized<c10::quint8> loadu(
+      const void* ptr,
+      int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    __at_align__ value_type tmp_values[size()] = {};
+    // A negative count would otherwise be converted to a huge size_t by the
+    // multiplication below and overrun tmp_values.
+    if (count > 0) {
+      std::memcpy(
+          tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    }
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+
+  // Stores up to size() elements to ptr.
+  //
+  // With count == size() both halves are written straight to ptr. For any
+  // other positive count the vectors are spilled to an aligned buffer and
+  // only min(count, size()) elements are copied out. A count <= 0 writes
+  // nothing.
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+ public:
+  // Dequantizes all 32 values to float as (value - zero_point) * scale.
+  //
+  // scale_zp_premul is accepted for interface compatibility but is not used
+  // by this implementation; the computation is the same as the two-argument
+  // overload, which this forwards to.
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    return dequantize(scale, zero_point);
+  }
+
+  // Dequantizes all 32 values to float.
+  //
+  // Each byte is zero-extended to int32, converted to float and mapped to
+  // (value - zero_point) * scale. The result is four Vectorized<float>; the
+  // first half of each output uses scale.vec0() / zero_point.vec0() and the
+  // second half uses scale.vec1() / zero_point.vec1().
+  float_vec_return_type C10_ALWAYS_INLINE
+  dequantize(Vectorized<float> scale, Vectorized<float> zero_point) const {
+    const vfloat32 scale0 = scale.vec0();
+    const vfloat32 scale1 = scale.vec1();
+    const vfloat32 zp0 = zero_point.vec0();
+    const vfloat32 zp1 = zero_point.vec1();
+
+    // Takes eight zero-extended 16-bit lanes, widens them to two groups of
+    // four int32 lanes, converts to float and applies zero point and scale.
+    auto dequantize_eight = [&](vint16 lanes) {
+      const vfloat32 first = vec_float(vec_unpackh(lanes));
+      const vfloat32 second = vec_float(vec_unpackl(lanes));
+      return Vectorized<float>{
+          (first - zp0) * scale0, (second - zp1) * scale1};
+    };
+
+    return {
+        dequantize_eight(widen_high(_vec0)),
+        dequantize_eight(widen_low(_vec0)),
+        dequantize_eight(widen_high(_vec1)),
+        dequantize_eight(widen_low(_vec1))};
+  }
+
+  // Quantizes four Vectorized<float> (32 floats) into 32 quint8 values.
+  //
+  // Each float is multiplied by inverse_scale, rounded with vec_rint, offset
+  // by zero_point and converted to int32. The results are narrowed to int16
+  // with vec_packs and then to unsigned bytes with vec_packsu; both packs
+  // saturate, so no explicit min/max is applied. The `scale` argument is
+  // not read here.
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    const vfloat32 vec_inverse = vec_splats(inverse_scale);
+    const vfloat32 vec_zero_point = vec_splats((float)zero_point);
+
+    // rint(x * inverse_scale) + zero_point for four float lanes, as int32.
+    auto to_int32 = [&](vfloat32 lanes) -> vint32 {
+      return vec_signed(
+          vec_add(vec_rint(vec_mul(lanes, vec_inverse)), vec_zero_point));
+    };
+    // Quantizes one Vectorized<float> and narrows its eight lanes to int16.
+    auto to_int16 = [&](Vectorized<float> v) -> vint16 {
+      return vec_packs(to_int32(v.vec0()), to_int32(v.vec1()));
+    };
+
+    vuint8 vec0 = vec_packsu(to_int16(rhs[0]), to_int16(rhs[1]));
+    vuint8 vec1 = vec_packsu(to_int16(rhs[2]), to_int16(rhs[3]));
+    return {vec0, vec1};
+  }
+
+  // Element-wise max(*this, zero_point) on the stored bytes.
+  Vectorized<c10::quint8> C10_ALWAYS_INLINE
+  relu(Vectorized<c10::quint8> zero_point) const {
+    return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)};
+  }
+
+  // Element-wise min(max(*this, zero_point), q_six) on the stored bytes.
+  Vectorized<c10::quint8> C10_ALWAYS_INLINE relu6(
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) const {
+    vuint8 max0 = vec_max(_vec0, zero_point._vec0);
+    vuint8 max1 = vec_max(_vec1, zero_point._vec1);
+    return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)};
+  }
+
+  // Computes *this - b element-wise in 32-bit precision.
+  //
+  // Both operands are zero-extended to int32 before subtracting; the 32
+  // differences are returned as four Vectorized<c10::qint32>.
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    // Widens eight 16-bit lanes of each operand to int32 and subtracts them.
+    auto subtract_eight = [](vint16 lhs, vint16 rhs) {
+      return Vectorized<c10::qint32>(
+          vec_unpackh(lhs) - vec_unpackh(rhs),
+          vec_unpackl(lhs) - vec_unpackl(rhs));
+    };
+
+    return {
+        subtract_eight(widen_high(_vec0), widen_high(b._vec0)),
+        subtract_eight(widen_low(_vec0), widen_low(b._vec0)),
+        subtract_eight(widen_high(_vec1), widen_high(b._vec1)),
+        subtract_eight(widen_low(_vec1), widen_low(b._vec1))};
+  }
+
+  // Requantizes four Vectorized<c10::qint32> (32 values) to quint8.
+  //
+  // Each int32 is converted to float, multiplied by `multiplier`, rounded
+  // with vec_rint and converted back to int32; `zero_point` is then added in
+  // integer arithmetic and the result is narrowed to int16 with vec_packs
+  // and to unsigned bytes with vec_packsu.
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    const vfloat32 vec_multiplier = vec_splats(multiplier);
+    const vint32 vec_zero_point = vec_splats(zero_point);
+
+    // rint(float(x) * multiplier) + zero_point for four int32 lanes.
+    auto rescale = [&](vint32 lanes) -> vint32 {
+      const vfloat32 rounded =
+          vec_rint(vec_mul(vec_float(lanes), vec_multiplier));
+      return vec_add(vec_signed(rounded), vec_zero_point);
+    };
+    // Rescales one Vectorized<c10::qint32> and narrows it to eight int16.
+    auto narrow = [&](Vectorized<c10::qint32> v) -> vint16 {
+      return vec_packs(rescale(v.vec0()), rescale(v.vec1()));
+    };
+
+    vuint8 vec0 = vec_packsu(narrow(inp[0]), narrow(inp[1]));
+    vuint8 vec1 = vec_packsu(narrow(inp[2]), narrow(inp[3]));
+    return {vec0, vec1};
+  }
+
+  // Member comparison, arithmetic, min/max and bitwise operations generated
+  // by helper macros defined outside this header; each entry names the
+  // member, the element type and the VSX intrinsic (or, for operator/, the
+  // plain `/` operator) it is built from.
+  DEFINE_MEMBER_OP(operator==, c10::quint8, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::quint8, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::quint8, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::quint8, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::quint8, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::quint8, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::quint8, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::quint8, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::quint8, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::quint8, /)
+  DEFINE_MEMBER_OP(maximum, c10::quint8, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::quint8, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::quint8, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::quint8, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::quint8, vec_xor)
+};
+
+// Free-function maximum for Vectorized<c10::quint8>; forwards to the member
+// a.maximum(b).
+template <>
+Vectorized<c10::quint8> inline maximum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+// Free-function minimum for Vectorized<c10::quint8>; forwards to the member
+// a.minimum(b).
+template <>
+Vectorized<c10::quint8> inline minimum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.minimum(b);
+}
+
+// Element-wise addition of the stored bytes: vec_add on each 16-byte half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator+(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_add(a.vec0(), b.vec0());
+  const auto second = vec_add(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Element-wise subtraction of the stored bytes: vec_sub on each 16-byte
+// half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator-(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_sub(a.vec0(), b.vec0());
+  const auto second = vec_sub(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Element-wise multiplication of the stored bytes: vec_mul on each 16-byte
+// half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator*(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_mul(a.vec0(), b.vec0());
+  const auto second = vec_mul(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Element-wise division of the stored bytes, using the vector `/` operator
+// on each 16-byte half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator/(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = a.vec0() / b.vec0();
+  const auto second = a.vec1() / b.vec1();
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Bitwise AND of the stored bytes: vec_and on each 16-byte half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator&(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_and(a.vec0(), b.vec0());
+  const auto second = vec_and(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Bitwise OR of the stored bytes: vec_or on each 16-byte half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator|(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_or(a.vec0(), b.vec0());
+  const auto second = vec_or(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+// Bitwise XOR of the stored bytes: vec_xor on each 16-byte half.
+template <>
+Vectorized<c10::quint8> C10_ALWAYS_INLINE
+operator^(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  const auto first = vec_xor(a.vec0(), b.vec0());
+  const auto second = vec_xor(a.vec1(), b.vec1());
+  return Vectorized<c10::quint8>{first, second};
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..a25216bd5db17b5a732f7bdb3ebd4047eef1e24f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
@@ -0,0 +1,581 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <cstdint>
+
+#if defined(__clang__) // clang: __vector __bool masks plus vector_size(16) data vectors
+typedef __vector __bool char vbool8;
+typedef __vector __bool short vbool16;
+typedef __vector __bool int vbool32;
+typedef __vector __bool long long vbool64;
+using vint8 = __attribute__((vector_size(16))) signed char;
+using vint16 = __attribute__((vector_size(16))) signed short;
+using vint32 = __attribute__((vector_size(16))) signed int;
+using vint64 = __attribute__((vector_size(16))) signed long long;
+using vuint8 = __attribute__((vector_size(16))) unsigned char;
+using vuint16 = __attribute__((vector_size(16))) unsigned short;
+using vuint32 = __attribute__((vector_size(16))) unsigned int;
+using vuint64 = __attribute__((vector_size(16))) unsigned long long;
+using vfloat32 = __attribute__((vector_size(16))) float;
+using vfloat64 = __attribute__((vector_size(16))) double;
+#else // every other compiler: the same alias names spelled with altivec attributes
+using vbool8 =
+    __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char;
+using vbool16 =
+    __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short;
+using vbool32 =
+    __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int;
+using vbool64 = __attribute__((altivec(vector__)))
+__attribute__((altivec(bool__))) long long;
+using vint8 = __attribute__((altivec(vector__))) signed char;
+using vint16 = __attribute__((altivec(vector__))) signed short;
+using vint32 = __attribute__((altivec(vector__))) signed int;
+using vint64 = __attribute__((altivec(vector__))) signed long long;
+using vuint8 = __attribute__((altivec(vector__))) unsigned char;
+using vuint16 = __attribute__((altivec(vector__))) unsigned short;
+using vuint32 = __attribute__((altivec(vector__))) unsigned int;
+using vuint64 = __attribute__((altivec(vector__))) unsigned long long;
+using vfloat32 = __attribute__((altivec(vector__))) float;
+using vfloat64 = __attribute__((altivec(vector__))) double;
+#endif // __clang__
+
+inline vuint8 make_vuint(vint8 signed_lanes) { // same bits, unsigned lane type
+  return reinterpret_cast<vuint8>(signed_lanes);
+}
+inline vuint16 make_vuint(vint16 signed_lanes) {
+  return reinterpret_cast<vuint16>(signed_lanes);
+}
+inline vuint32 make_vuint(vint32 signed_lanes) {
+  return reinterpret_cast<vuint32>(signed_lanes);
+}
+inline vuint64 make_vuint(vint64 signed_lanes) {
+  return reinterpret_cast<vuint64>(signed_lanes);
+}
+
+#if !defined(vec_float) // fallback, compiled only when no vec_float macro exists
+C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) { // int32 lanes -> float lanes
+  vfloat32 vec_out;
+  __asm__("xvcvsxwsp %x0,%x1" : "=wf"(vec_out) : "wa"(vec_in)); // VSX: convert signed word to single precision
+  return vec_out;
+}
+#endif
+
+#if !defined(vec_signed) // fallbacks, compiled only when no vec_signed macro exists
+C10_ALWAYS_INLINE vint32 vec_signed(const vfloat32& vec_in) { // float lanes -> int32 lanes
+  vint32 vec_out;
+  __asm__("xvcvspsxws %x0,%x1" : "=wa"(vec_out) : "wf"(vec_in)); // VSX: convert single precision to signed word
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vint64 vec_signed(const vfloat64& vec_in) { // double lanes -> int64 lanes
+  vint64 vec_out;
+  __asm__("xvcvdpsxds %x0,%x1" : "=wa"(vec_out) : "wd"(vec_in)); // VSX: convert double precision to signed doubleword
+  return vec_out;
+}
+#endif
+
+#if !defined(vec_neg) // fallbacks, compiled only when no vec_neg macro exists
+C10_ALWAYS_INLINE vfloat32 vec_neg(const vfloat32& vec_in) {
+  vfloat32 vec_out;
+  __asm__("xvnegsp %x0,%x1" : "=wf"(vec_out) : "wf"(vec_in)); // VSX negate, single precision
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vfloat64 vec_neg(const vfloat64& vec_in) {
+  vfloat64 vec_out;
+  __asm__("xvnegdp %x0,%x1" : "=wd"(vec_out) : "wd"(vec_in)); // VSX negate, double precision
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) {
+  vint16 vint0 = {0, 0, 0, 0, 0, 0, 0, 0};
+  return vec_vsubuhm(vint0, vec_in); // negation written as 0 - x on 16-bit lanes
+}
+
+C10_ALWAYS_INLINE vint32 vec_neg(const vint32& vec_in) {
+  vint32 vint0 = {0, 0, 0, 0};
+  return vec_vsubuwm(vint0, vec_in); // negation written as 0 - x on 32-bit lanes
+}
+
+C10_ALWAYS_INLINE vint64 vec_neg(const vint64& vec_in) {
+  return -vec_in; // plain unary minus on the vector type
+}
+#endif
+
+#if !defined(vec_sldw) // fallback, compiled only when no vec_sldw macro exists
+template <unsigned int C> // C is the word count, passed to the asm as an immediate
+C10_ALWAYS_INLINE vfloat32
+vec_sldw_aux(const vfloat32& vec_in0, const vfloat32& vec_in1) {
+  vfloat32 vec_out;
+  __asm("xxsldwi %x0, %x1, %x2, %3 " // VSX shift left double by word immediate
+        : "=wa"(vec_out)
+        : "wa"(vec_in0), "wa"(vec_in1), "I"(C)); // "I": C must be a compile-time constant
+  return vec_out;
+}
+
+#define vec_sldw(a, b, c) vec_sldw_aux<c>(a, b) // c moves from argument to template parameter
+#endif
+
+#define vec_not(a) vec_nor(a, a) // bitwise NOT written as NOR of a value with itself
+#if defined(__clang__) && !defined(vec_splats) // clang-only int64_t splat shim
+C10_ALWAYS_INLINE vint64 vec_splats(const int64_t& a) { // must not call itself
+  return (vint64)vec_splats(static_cast<long long>(a)); // long long picks the altivec overload
+}
+#endif
+// Generic vec_min_nan / vec_max_nan simply forward to vec_min / vec_max; only the float/double specializations below return a when an operand is NaN.
+template <class T>
+C10_ALWAYS_INLINE T vec_min_nan(const T& a, const T& b) {
+  return vec_min(a, b);
+}
+template <class T>
+C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) {
+  return vec_max(a, b);
+}
+
+// float specialization of vec_min_nan (taken from Eigen)
+template <>
+C10_ALWAYS_INLINE vfloat32
+vec_min_nan<vfloat32>(const vfloat32& a, const vfloat32& b) {
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE
+  // regarding NaN: a lane holding NaN in either operand compares false, so a is kept
+  vfloat32 ret;
+  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" // ret = (a >= b); then take b where ret is set, else a
+          : "=&wa"(ret) // early-clobber: ret is written before a and b are read for the last time
+          : "wa"(a), "wa"(b));
+  return ret;
+}
+// float specialization of vec_max_nan (taken from Eigen)
+template <>
+C10_ALWAYS_INLINE vfloat32
+vec_max_nan<vfloat32>(const vfloat32& a, const vfloat32& b) {
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE
+  // regarding NaN: a lane holding NaN in either operand compares false, so a is kept
+  vfloat32 ret;
+  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" // ret = (b > a); then take b where ret is set, else a
+          : "=&wa"(ret) // early-clobber: ret is written before a and b are read for the last time
+          : "wa"(a), "wa"(b));
+  return ret;
+}
+
+template <> // double specialization of vec_min_nan
+C10_ALWAYS_INLINE vfloat64
+vec_min_nan<vfloat64>(const vfloat64& a, const vfloat64& b) {
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE
+  // regarding NaN: a lane holding NaN in either operand compares false, so a is kept
+  vfloat64 ret;
+  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" // ret = (a >= b); then take b where ret is set, else a
+          : "=&wa"(ret) // early-clobber: ret is written before a and b are read for the last time
+          : "wa"(a), "wa"(b));
+  return ret;
+}
+template <> // double specialization of vec_max_nan
+C10_ALWAYS_INLINE vfloat64
+vec_max_nan<vfloat64>(const vfloat64& a, const vfloat64& b) {
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE
+  // regarding NaN: a lane holding NaN in either operand compares false, so a is kept
+  vfloat64 ret;
+  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" // ret = (b > a); then take b where ret is set, else a
+          : "=&wa"(ret) // early-clobber: ret is written before a and b are read for the last time
+          : "wa"(a), "wa"(b));
+  return ret;
+}
+
+// Defines name(a, b) = func(a, b), except that a lane where a or b is NaN (x != x) returns that NaN operand (b wins when both are NaN)
+#define C10_VSX_VEC_NAN_PROPAG(name, type, btype, func)       \
+  C10_ALWAYS_INLINE type name(const type& a, const type& b) { \
+    type tmp = func(a, b);                                    \
+    btype nan_a = vec_cmpne(a, a);                            \
+    btype nan_b = vec_cmpne(b, b);                            \
+    tmp = vec_sel(tmp, a, nan_a);                             \
+    return vec_sel(tmp, b, nan_b);                            \
+  }
+
+C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat32, vbool32, vec_min) // float min
+C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat32, vbool32, vec_max) // float max
+C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat64, vbool64, vec_min) // double min
+C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) // double max
+
+#undef C10_VSX_VEC_NAN_PROPAG // the generator macro is not needed past this point
+
+#define DEFINE_MEMBER_UNARY_OP(op, op_type, func)         \
+  Vectorized<op_type> C10_ALWAYS_INLINE op() const {      \
+    return Vectorized<op_type>{func(_vec0), func(_vec1)}; \
+  } // generates member op(): func applied to _vec0 and to _vec1
+
+#define DEFINE_MEMBER_OP(op, op_type, func)                                  \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) \
+      const {                                                                \
+    return Vectorized<op_type>{                                              \
+        func(_vec0, other._vec0), func(_vec1, other._vec1)};                 \
+  } // generates member op(other): func applied pairwise to the _vec0 and _vec1 members
+
+#define DEFINE_MEMBER_BITWISE_OP(op, op_type, func)                          \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) \
+      const {                                                                \
+    return Vectorized<op_type>{                                              \
+        func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)};             \
+  } // like DEFINE_MEMBER_OP but reads the _vecb0 / _vecb1 members instead of _vec0 / _vec1
+
+#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func)                       \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(                               \
+      const Vectorized<op_type>& b, const Vectorized<op_type>& c) const { \
+    return Vectorized<op_type>{                                           \
+        func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)};    \
+  } // generates member op(b, c): func(this, b, c) evaluated on each half
+
+#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op)          \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& b) \
+      const {                                                            \
+    Vectorized<op_type>::vec_internal_type ret_0;                        \
+    Vectorized<op_type>::vec_internal_type ret_1;                        \
+    for (int i = 0; i < Vectorized<op_type>::size() / 2; i++) {          \
+      ret_0[i] = _vec0[i] binary_op b._vec0[i];                          \
+      ret_1[i] = _vec1[i] binary_op b._vec1[i];                          \
+    }                                                                    \
+    return Vectorized<op_type>{ret_0, ret_1};                            \
+  } // generates member op(b) as a scalar loop: binary_op applied lane by lane, size()/2 lanes per half
+
+#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func)                          \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) \
+      const {                                                                \
+    using vvtype = Vectorized<op_type>::vec_internal_type;                   \
+    const vvtype v_one = vec_splats(static_cast<op_type>(1.0));              \
+    vvtype ret0 = (vvtype)func(_vec0, other._vec0);                          \
+    vvtype ret1 = (vvtype)func(_vec1, other._vec1);                          \
+    return Vectorized<op_type>{vec_and(ret0, v_one), vec_and(ret1, v_one)};  \
+  } // generates member op(other): the result of func is cast to the data vector type and ANDed with a splat of 1
+
+#define DEFINE_CLAMP_FUNCS(operand_type)                                       \
+  template <>                                                                  \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp(                            \
+      const Vectorized<operand_type>& a,                                       \
+      const Vectorized<operand_type>& min,                                     \
+      const Vectorized<operand_type>& max) {                                   \
+    return Vectorized<operand_type>{                                           \
+        vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()),            \
+        vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())};           \
+  }                                                                            \
+  template <>                                                                  \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_min(                        \
+      const Vectorized<operand_type>& a,                                       \
+      const Vectorized<operand_type>& min) {                                   \
+    return Vectorized<operand_type>{                                           \
+        vec_max_nan(a.vec0(), min.vec0()), vec_max_nan(a.vec1(), min.vec1())}; \
+  }                                                                            \
+  template <>                                                                  \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_max(                        \
+      const Vectorized<operand_type>& a,                                       \
+      const Vectorized<operand_type>& max) {                                   \
+    return Vectorized<operand_type>{                                           \
+        vec_min_nan(a.vec0(), max.vec0()), vec_min_nan(a.vec1(), max.vec1())}; \
+  } // generates clamp / clamp_min / clamp_max specializations built on vec_max_nan and vec_min_nan; clamp applies the lower bound first
+
+#define DEFINE_REINTERPRET_CAST_FUNCS(                                 \
+    first_type, cast_type, cast_inner_vector_type)                     \
+  template <>                                                          \
+  C10_ALWAYS_INLINE Vectorized<cast_type> cast<cast_type, first_type>( \
+      const Vectorized<first_type>& src) {                             \
+    return Vectorized<cast_type>{                                      \
+        (cast_inner_vector_type)src.vec0(),                            \
+        (cast_inner_vector_type)src.vec1()};                           \
+  } // generates cast<cast_type, first_type>: each half is C-style cast to cast_inner_vector_type
+
+#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type)      \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32)  \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64)  \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32)  \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16) // one cast from first_type per target: double, float, int64_t, int32_t, int16_t
+
+// Classifies a blend mask so that blend can be emulated with cheaper code.
+constexpr int blendChoice(
+    uint32_t mask,
+    uint32_t half1 = 0xF,
+    uint32_t half2 = 0xF0) {
+  // half1 / half2 are the mask bits that belong to _vec0 / _vec1.
+  // Return codes, written as (source of _vec0, source of _vec1), where `*`
+  // means that half takes lanes from both a and b:
+  //   0 (a, a)   1 (b, b)   2 (b, a)   3 (a, b)
+  //   4 (*, a)   5 (*, b)   6 (a, *)   7 (b, *)   8 (*, *)
+  const uint32_t all_bits = half1 | half2;
+  const uint32_t m = mask & all_bits; // bits outside both halves are ignored
+  const uint32_t in_half1 = m & half1;
+  const uint32_t in_half2 = m & half2;
+  if (m == 0) {
+    return 0;
+  }
+  if (m == all_bits) {
+    return 1;
+  }
+  if (m == half1) {
+    return 2;
+  }
+  if (m == half2) {
+    return 3;
+  }
+  if (m < half1) { // m != 0 is already known here
+    return 4;
+  }
+  if (in_half2 == half2) {
+    return 5;
+  }
+  const bool above_half1 = m > half1;
+  if (above_half1 && in_half1 == 0) return 6;
+  return (above_half1 && in_half1 == half1) ? 7 : 8;
+}
+
+// Double-lane variant of blendChoice: mask bits 0x3 form half1, 0xC form half2.
+constexpr int blendChoiceDbl(uint32_t mask) {
+  constexpr uint32_t kHalf1Bits = 0x3, kHalf2Bits = 0xC;
+  return blendChoice(mask, kHalf1Bits, kHalf2Bits);
+}
+
+constexpr vbool32 VsxMask1(uint32_t mask) { // bit i (i < 4) -> all-ones lane i
+  const uint32_t lane0 = (mask & 1u) ? 0xffffffff : 0;
+  const uint32_t lane1 = (mask & 2u) ? 0xffffffff : 0;
+  const uint32_t lane2 = (mask & 4u) ? 0xffffffff : 0;
+  const uint32_t lane3 = (mask & 8u) ? 0xffffffff : 0;
+  return (vbool32){lane0, lane1, lane2, lane3};
+}
+
+constexpr vbool32 VsxMask2(uint32_t mask) {
+  // Bits 4..7 are shifted down to 0..3 and expanded by VsxMask1.
+  return VsxMask1((mask >> 4) & 0xF);
+}
+
+constexpr vbool64 VsxDblMask1(uint32_t mask) { // bit i (i < 2) -> all-ones lane i
+  const uint64_t lane0 = (mask & 1u) ? 0xffffffffffffffff : 0;
+  const uint64_t lane1 = (mask & 2u) ? 0xffffffffffffffff : 0;
+  return (vbool64){lane0, lane1};
+}
+
+constexpr vbool64 VsxDblMask2(uint32_t mask) {
+  // Bits 2..3 are shifted down to 0..1 and expanded by VsxDblMask1.
+  return VsxDblMask1((mask >> 2) & 0x3);
+}
+
+constexpr int maskForComplex(uint32_t mask) {
+  // Expands the low four bits of `mask`: bit i of the input becomes the
+  // bit pair (2i, 2i + 1) of the result; higher input bits are dropped.
+  constexpr int kInputBits = 4;
+  constexpr int kPair = 3; // two adjacent set bits
+  int expanded = 0;
+  for (int bit = 0; bit < kInputBits; ++bit) {
+    if ((mask >> bit) & 1u) {
+      expanded |= kPair << (2 * bit);
+    }
+  }
+  return expanded;
+}
+
+constexpr int maskForComplexDbl(uint32_t mask) {
+  // Only the low two bits of `mask` are read; bit i of the input becomes
+  // the bit pair (2i, 2i + 1) of the result.
+  const bool first = (mask & 1u) != 0;
+  const bool second = (mask & 2u) != 0;
+  const int low_pair = first ? 3 : 0;
+  const int high_pair = second ? (3 << 2) : 0;
+  return low_pair | high_pair;
+}
+
+constexpr int blendChoiceComplex(uint32_t mask) {
+  return blendChoice(maskForComplex(mask)); // expand each mask bit to a bit pair, then classify with the default halves
+}
+
+constexpr int blendChoiceComplexDbl(uint32_t mask) {
+  return blendChoiceDbl(maskForComplexDbl(mask)); // same idea using the two-bit expansion and the 0x3 / 0xC halves
+}
+
+constexpr vbool32 VsxComplexMask1(uint32_t mask) {
+  return VsxMask1(maskForComplex(mask)); // VsxMask1 reads only bits 0..3 of the expanded mask, i.e. input bits 0 and 1
+}
+
+constexpr vbool32 VsxComplexMask2(uint32_t mask) {
+  // Input bits 2..3 are moved down to 0..1, expanded to pairs, then turned into lanes.
+  return VsxMask1(maskForComplex((mask >> 2) & 0x3));
+}
+
+constexpr vbool64 VsxComplexDblMask1(uint32_t mask) {
+  return VsxDblMask1(mask); // NOTE(review): no maskForComplexDbl expansion here, so callers presumably pass an already expanded mask - confirm
+}
+
+constexpr vbool64 VsxComplexDblMask2(uint32_t mask) {
+  // Bits 2..3 are shifted down to 0..1 and expanded by VsxDblMask1.
+  return VsxDblMask1((mask >> 2) & 0x3);
+}
+
+// constants
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+//
+constexpr int offset0 = 0;
+constexpr int offset16 = 16;
+
+// #Constants
+const vuint8 mask_zero_bits = vuint8{
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    128,
+    96,
+    64,
+    32,
+    0};
+
+const vuint8 swap_mask = // byte indices that exchange each adjacent pair of 4-byte groups
+    vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+
+const vint32 v0x7f = vec_splats(0x7f);
+const vint32 vi_0 = vec_splats((int)(0));
+const vint32 vi_1 = vec_splats((int)1);
+const vint32 vi_2 = vec_splats((int)2);
+const vint32 vi_4 = vec_splats((int)4);
+const vint32 vi_inv1 = vec_splats((int)~1); // every bit set except bit 0
+const vuint32 vu_29 = vec_splats(29u);
+const vuint32 vu_23 = vec_splats(23u);
+
+const vbool32 inv_mant_mask = (vbool32)vec_splats((unsigned int)~0xff800000); // 0x007fffff in every lane
+const vbool32 sign_mask = (vbool32)vec_splats((int)0x80000000); // top bit of every 32-bit lane
+const vbool32 real_mask = vbool32{0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}; // all bits of lanes 0 and 2
+const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; // all bits of lanes 1 and 3
+const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; // top bit of lanes 1 and 3
+const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; // top bit of lanes 0 and 2
+
+const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; // top bit of both 64-bit lanes
+const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; // all bits of lane 1
+const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; // all bits of lane 0
+const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; // top bit of lane 1
+const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0}; // top bit of lane 0
+
+const vfloat32 zero = vec_splats(0.f);
+const vfloat32 half = vec_splats(0.5f);
+const vfloat32 one = vec_splats(1.f);
+const vfloat32 two = vec_splats(2.0f);
+const vfloat32 _4div_pi = vec_splats(1.27323954473516f);
+const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u);
+// -inf must be built from its bit pattern; a braced list converts 0xff800000u by value (about 4.29e9f).
+const vfloat32 v_minus_inf = (vfloat32)vec_splats(0xff800000u);
+const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff);
+const vfloat32 log10e_inv = vec_splats(0.43429448190325176f);
+const vfloat32 log2e_inv = vec_splats(1.4426950408889634f);
+const vfloat32 log2eB_inv = vec_splats(1.442695036924675f);
+const vfloat32 cephes_SQRTHF = vec_splats(0.707106781186547524f);
+const vfloat32 coscof_p0 = vec_splats(2.443315711809948E-005f);
+const vfloat32 coscof_p1 = vec_splats(-1.388731625493765E-003f);
+const vfloat32 coscof_p2 = vec_splats(4.166664568298827E-002f);
+const vfloat32 exp_hi = vec_splats(104.f);
+const vfloat32 exp_lo = vec_splats(-104.f);
+const vfloat32 exp_p0 = vec_splats(0.000198527617612853646278381f);
+const vfloat32 exp_p1 = vec_splats((0.00139304355252534151077271f));
+const vfloat32 exp_p2 = vec_splats(0.00833336077630519866943359f);
+const vfloat32 exp_p3 = vec_splats(0.0416664853692054748535156f);
+const vfloat32 exp_p4 = vec_splats(0.166666671633720397949219f);
+const vfloat32 exp_p5 = vec_splats(0.5f);
+const vfloat32 log_p0 = vec_splats(7.0376836292E-2f);
+const vfloat32 log_p1 = vec_splats(-1.1514610310E-1f);
+const vfloat32 log_p2 = vec_splats(1.1676998740E-1f);
+const vfloat32 log_p3 = vec_splats(-1.2420140846E-1f);
+const vfloat32 log_p4 = vec_splats(+1.4249322787E-1f);
+const vfloat32 log_p5 = vec_splats(-1.6668057665E-1f);
+const vfloat32 log_p6 = vec_splats(+2.0000714765E-1f);
+const vfloat32 log_p7 = vec_splats(-2.4999993993E-1f);
+const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f);
+const vfloat32 log_q1 = vec_splats(-2.12194440e-4f);
+const vfloat32 log_q2 = vec_splats(0.693359375f);
+const vfloat32 max_logf = vec_splats(88.02969187150841f);
+const vfloat32 max_numf =
+    vec_splats(1.7014117331926442990585209174225846272e38f);
+const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u);
+const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u);
+const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f);
+const vfloat32 minus_cephes_dp2 = vec_splats(-2.4187564849853515625e-4f);
+const vfloat32 minus_cephes_dp3 = vec_splats(-3.77489497744594108e-8f);
+const vfloat32 negln2f_hi = vec_splats(-0.693145751953125f);
+const vfloat32 negln2f_lo = vec_splats(-1.428606765330187045e-06f);
+const vfloat32 p0 = vec_splats(2.03721912945E-4f);
+const vfloat32 p1 = vec_splats(8.33028376239E-3f);
+const vfloat32 p2 = vec_splats(1.66667160211E-1f);
+const vfloat32 sincof_p0 = vec_splats(-1.9515295891E-4f);
+const vfloat32 sincof_p1 = vec_splats(8.3321608736E-3f);
+const vfloat32 sincof_p2 = vec_splats(-1.6666654611E-1f);
+const vfloat32 tanh_0p625 = vec_splats(0.625f);
+const vfloat32 tanh_half_max = vec_splats(44.014845935754205f);
+const vfloat32 tanh_p0 = vec_splats(-5.70498872745E-3f);
+const vfloat32 tanh_p1 = vec_splats(2.06390887954E-2f);
+const vfloat32 tanh_p2 = vec_splats(-5.37397155531E-2f);
+const vfloat32 tanh_p3 = vec_splats(1.33314422036E-1f);
+const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f);
+const vfloat32 vcheck = vec_splats((float)(1LL << 24));
+const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f}; // 1 in lanes 1 and 3, 0 elsewhere
+const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f}; // 0.5 in lanes 1 and 3, 0 elsewhere
+const vfloat32 sqrt2_2 = vfloat32{ // the same value in all four lanes
+    0.70710676908493042f,
+    0.70710676908493042,
+    0.70710676908493042,
+    0.70710676908493042};
+const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0}; // pi/2 in lanes 0 and 2, 0 elsewhere
+const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f};
+const vfloat64 vd_one = vec_splats(1.0);
+const vfloat64 vd_zero = vec_splats(0.0);
+const vfloat64 vd_log10e_inv = vec_splats(0.43429448190325176);
+const vfloat64 vd_log2e_inv = vec_splats(1.4426950408889634);
+const vfloat64 vd_imag_one = vfloat64{0.0, 1.0}; // 1 in lane 1 only
+const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; // 0.5 in lane 1 only
+const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
+const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; // pi/2 in lane 0 only
+
+template <typename T>
+Vectorized<T> VsxShiftRightArith(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using Vec = Vectorized<T>;
+  const Vec limit(sizeof(T) * CHAR_BIT - std::is_signed_v<T>);
+  const auto bad = (b < Vec(0)) | (b >= limit); // negative or >= limit
+  const auto count = Vec::blendv(b, limit, bad); // such counts become limit
+  const auto lo = vec_sra(a.vec0(), make_vuint(count.vec0()));
+  return Vec{lo, vec_sra(a.vec1(), make_vuint(count.vec1()))};
+}
+
+template <typename T>
+Vectorized<T> VsxShiftLeftArith(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using Vec = Vectorized<T>;
+  const Vec bit_width(sizeof(T) * CHAR_BIT);
+  const Vec shifted( // raw shift; lanes with an out-of-range count are zeroed below
+      vec_sl(a.vec0(), make_vuint(b.vec0())),
+      vec_sl(a.vec1(), make_vuint(b.vec1())));
+  return Vec::blendv(shifted, Vec(0), (b < Vec(0)) | (b >= bit_width));
+}
+
+#define DEFINE_SHIFT_FUNCS(operand_type)                                      \
+  template <>                                                                 \
+  Vectorized<operand_type> C10_ALWAYS_INLINE operator>>(                      \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
+    return VsxShiftRightArith(a, b);                                          \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<operand_type> C10_ALWAYS_INLINE operator<<(                      \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
+    return VsxShiftLeftArith(a, b);                                           \
+  } // generates operator>> and operator<< specializations that forward to the two Vsx shift helpers
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c48ae8c5732d8276a45ac698dedf87f27678d582
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h
@@ -0,0 +1,2978 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#if defined(__clang__)
+#include <sleef.h>
+#elif defined(__GNUC__) || defined(__GNUG__)
+#include <sleef.h>
+#include <vecintrin.h>
+#endif
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+
+namespace at {
+namespace vec {
+
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <typename T>
+constexpr bool is_zarch_implemented() {
+  return std::disjunction_v< // true only for the element types listed below
+      std::is_same<T, float>, std::is_same<T, double>,
+      std::is_same<T, int8_t>, std::is_same<T, uint8_t>,
+      std::is_same<T, uint16_t>, std::is_same<T, int16_t>,
+      std::is_same<T, int32_t>, std::is_same<T, int64_t>>;
+}
+
+template <typename T>
+constexpr bool is_zarch_implemented_quant() {
+  return std::disjunction_v< // true only for the three c10 quantized types
+      std::is_same<T, c10::qint32>, std::is_same<T, c10::qint8>,
+      std::is_same<T, c10::quint8>>;
+}
+
+template <typename T>
+constexpr bool is_zarch_implemented_complex() { // true only for c10::complex<float> and c10::complex<double>
+  return std::is_same_v<T, c10::complex<float>> ||
+      std::is_same_v<T, c10::complex<double>>;
+}
+
+constexpr int offset0 = 0;
+constexpr int offset16 = 16;
+
+template <int N> // N is an element size in bytes; see the specializations for 8, 4, 2, 1
+struct VecBinaryType {
+  using type __attribute__((vector_size(16))) = uintmax_t; // fallback for every other N
+};
+
+template <>
+struct VecBinaryType<8> {
+  using type = __attribute__((vector_size(16))) unsigned long long; // 2 lanes of 8 bytes
+};
+
+template <>
+struct VecBinaryType<4> {
+  using type = __attribute__((vector_size(16))) unsigned int; // 16-byte vector of unsigned int
+};
+
+template <>
+struct VecBinaryType<2> {
+  using type = __attribute__((vector_size(16))) unsigned short; // 16-byte vector of unsigned short
+};
+
+template <>
+struct VecBinaryType<1> {
+  using type = __attribute__((vector_size(16))) unsigned char; // 16 lanes of 1 byte
+};
+
+template <typename T> // bundles the 16-byte vector types associated with element type T
+struct VecInnerType {
+  using Type __attribute__((vector_size(16))) = T; // data vector of T
+  using BinaryType = typename VecBinaryType<sizeof(T)>::type; // unsigned vector with the same element size
+  using ElementType = T;
+  static constexpr int size = 16 / sizeof(T); // lanes per 16-byte vector
+};
+
+// define for int64_t properly for load
+template <>
+struct VecInnerType<int64_t> {
+  using Type = __attribute__((vector_size(16))) signed long long; // spelled as signed long long rather than int64_t
+  using ElementType = signed long long;
+  using BinaryType = typename VecBinaryType<sizeof(signed long long)>::type;
+  static constexpr int size = 16 / sizeof(signed long long);
+};
+
+template <typename T>
+using ZSimdVect = typename VecInnerType<T>::Type; // shorthand for the data vector
+template <typename T>
+using ZSimdVectBinary = typename VecInnerType<T>::BinaryType; // shorthand for the unsigned vector
+template <typename T>
+using ZSimdVectElement = typename VecInnerType<T>::ElementType; // shorthand for the element type
+
+constexpr int blendChoiceInner(
+    const uint64_t mask,
+    const uint64_t half1 = 0xF,
+    const uint64_t half2 = 0xF0) {
+  // half1 / half2 are the mask bits that belong to _vec0 / _vec1.
+  // Return codes, written as (source of _vec0, source of _vec1), where `*`
+  // means that half takes lanes from both a and b:
+  //   0 (a, a)   1 (b, b)   2 (b, a)   3 (a, b)
+  //   4 (*, a)   5 (*, b)   6 (a, *)   7 (b, *)   8 (*, *)
+  const uint64_t covered = half1 | half2;
+  const uint64_t picked = mask & covered; // bits outside both halves are ignored
+  const uint64_t picked_lo = picked & half1;
+  const uint64_t picked_hi = picked & half2;
+  if (picked == 0) {
+    return 0;
+  }
+  if (picked == covered) {
+    return 1;
+  }
+  if (picked == half1) {
+    return 2;
+  }
+  if (picked == half2) {
+    return 3;
+  }
+  if (picked < half1) { // picked != 0 is already known here
+    return 4;
+  }
+  if (picked_hi == half2) {
+    return 5;
+  }
+  const bool beyond_half1 = picked > half1;
+  if (beyond_half1 && picked_lo == 0) return 6;
+  return (beyond_half1 && picked_lo == half1) ? 7 : 8;
+}
+
+// blendChoice<Z>: blendChoiceInner with the half masks chosen per Z (specialized for 1, 2, 4, 8)
+template <int Z>
+constexpr int blendChoice(const uint64_t mask) {
+  static_assert(Z < 1 || Z > 8, "not implemented"); // rejects unspecialized Z in 1..8; other Z use the defaults below
+  return blendChoiceInner(mask);
+}
+
+template <>
+constexpr int blendChoice<1>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0x0000FFFF, 0xFFFF0000); // 16 mask bits per half
+}
+
+template <>
+constexpr int blendChoice<2>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0x00FF, 0xFF00); // 8 mask bits per half
+}
+
+template <>
+constexpr int blendChoice<4>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0xF, 0xF0); // 4 mask bits per half
+}
+
+template <>
+constexpr int blendChoice<8>(const uint64_t mask) {
+  // only mask bits 0..3 are considered: two per half
+  return blendChoiceInner(mask, 0x3, 0xC);
+}
+
+template <int N> // primary GetMask1: ignores mask and returns a value-initialized vector
+constexpr auto GetMask1(const uint64_t mask) {
+  return typename VecBinaryType<N>::type{};
+}
+
+template <int N> // primary GetMask2: ignores mask and returns a value-initialized vector
+constexpr auto GetMask2(const uint64_t mask) {
+  return typename VecBinaryType<N>::type{};
+}
+
+template <> // 1-byte elements: mask bit i (i < 16) becomes byte lane i, 0xFF when set and 0 otherwise
+constexpr auto GetMask1<1>(const uint64_t mask) {
+  constexpr uint8_t t = (int)0xFF;
+  uint8_t g0 = (mask & 1) * t;
+  uint8_t g1 = ((mask & 2) >> 1) * t;
+  uint8_t g2 = ((mask & 4) >> 2) * t;
+  uint8_t g3 = ((mask & 8) >> 3) * t;
+  uint8_t g4 = ((mask & 16) >> 4) * t;
+  uint8_t g5 = ((mask & 32) >> 5) * t;
+  uint8_t g6 = ((mask & 64) >> 6) * t;
+  uint8_t g7 = ((mask & 128) >> 7) * t;
+  uint8_t g8 = ((mask & 256) >> 8) * t;
+  uint8_t g9 = ((mask & 512) >> 9) * t;
+  uint8_t g10 = ((mask & 1024) >> 10) * t;
+  uint8_t g11 = ((mask & 2048) >> 11) * t;
+  uint8_t g12 = ((mask & 4096) >> 12) * t;
+  uint8_t g13 = ((mask & 8192) >> 13) * t;
+  uint8_t g14 = ((mask & 16384) >> 14) * t;
+  uint8_t g15 = ((mask & 32768) >> 15) * t;
+  return (typename VecBinaryType<1>::type){
+      g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15};
+}
+
+template <> // second half for 1-byte elements
+constexpr auto GetMask2<1>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFFFFFFFF) >> 16; // bits 16..31 moved down to 0..15
+  return GetMask1<1>(mask2);
+}
+
+template <> // 2-byte elements: mask bit i (i < 8) becomes lane i, 0xFFFF when set and 0 otherwise
+constexpr auto GetMask1<2>(const uint64_t mask) {
+  constexpr uint16_t t = (int)0xFFFF;
+  uint16_t g0 = (mask & 1) * t;
+  uint16_t g1 = ((mask & 2) >> 1) * t;
+  uint16_t g2 = ((mask & 4) >> 2) * t;
+  uint16_t g3 = ((mask & 8) >> 3) * t;
+  uint16_t g4 = ((mask & 16) >> 4) * t;
+  uint16_t g5 = ((mask & 32) >> 5) * t;
+  uint16_t g6 = ((mask & 64) >> 6) * t;
+  uint16_t g7 = ((mask & 128) >> 7) * t;
+  return (typename VecBinaryType<2>::type){g0, g1, g2, g3, g4, g5, g6, g7};
+}
+
+template <> // second half for 2-byte elements
+constexpr auto GetMask2<2>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFFFF) >> 8; // bits 8..15 moved down to 0..7
+  return GetMask1<2>(mask2);
+}
+
+template <> // 4-byte elements: mask bit i (i < 4) becomes lane i, all ones when set and 0 otherwise
+constexpr auto GetMask1<4>(const uint64_t mask) {
+  uint32_t g0 = (mask & 1) * 0xffffffff;
+  uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+  uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+  uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+  return (typename VecBinaryType<4>::type){g0, g1, g2, g3};
+}
+
+template <> // second half for 4-byte elements
+constexpr auto GetMask2<4>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFF) >> 4; // bits 4..7 moved down to 0..3
+  return GetMask1<4>(mask2);
+}
+
+template <> // 8-byte elements: mask bit i (i < 2) becomes lane i, all ones when set and 0 otherwise
+constexpr auto GetMask1<8>(const uint64_t mask) {
+  uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+  uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+  return (typename VecBinaryType<8>::type){g0, g1};
+}
+
+template <> // second half for 8-byte elements
+constexpr auto GetMask2<8>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xF) >> 2; // bits 2..3 moved down to 0..1
+  return GetMask1<8>(mask2);
+}
+
+// maskForComplex<Z>: widen each used bit of `mask` into two adjacent bits.
+// Any Z without a specialization always yields 0.
+template <int Z>
+constexpr int maskForComplex(uint32_t mask) {
+  return 0;
+}
+
+// Z == 8: the low four bits are used, bit i -> bits (2i, 2i + 1).
+template <>
+constexpr int maskForComplex<8>(uint32_t mask) {
+  constexpr int kInputBits = 4;
+  constexpr int kPair = 3; // two adjacent set bits
+  int expanded = 0;
+  for (int bit = 0; bit < kInputBits; ++bit) {
+    if ((mask >> bit) & 1u) {
+      expanded |= kPair << (2 * bit);
+    }
+  }
+  return expanded;
+}
+
+// Z == 16: only the low two bits are used, bit i -> bits (2i, 2i + 1).
+template <>
+constexpr int maskForComplex<16>(uint32_t mask) {
+  const bool first = (mask & 1u) != 0;
+  const bool second = (mask & 2u) != 0;
+  const int low_pair = first ? 3 : 0;
+  const int high_pair = second ? (3 << 2) : 0;
+  return low_pair | high_pair;
+}
+
+template <typename T = c10::complex<float>> // default and any unspecialized T
+constexpr int blend_choice() {
+  return 0xAA; // binary 10101010: bits 1, 3, 5 and 7 set
+}
+
+template <>
+constexpr int blend_choice<c10::complex<double>>() {
+  return 0x0A; // binary 1010: bits 1 and 3 set
+}
+
+constexpr int64_t allbitset(int16_t x) {
+  // (1 << x) - 1: a value whose x lowest bits are set.
+  return (int64_t{1} << x) - int64_t{1};
+}
+
+namespace { /* unnamed namespace */
+
+ZSimdVect<float> vec_mergee(ZSimdVect<float> x, ZSimdVect<float> y) { // result lanes: x[0], y[0], x[2], y[2]
+  constexpr ZSimdVectBinary<uint8_t> mergee_mask{
+      0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27}; // byte indices: 0..15 address x, 16..31 address y
+  return vec_perm(x, y, mergee_mask);
+}
+
+ZSimdVect<double> vec_mergee(ZSimdVect<double> x, ZSimdVect<double> y) {
+  return vec_mergeh(x, y); // double overload forwards to vec_mergeh
+}
+
+ZSimdVect<float> vec_mergeo(ZSimdVect<float> x, ZSimdVect<float> y) { // result lanes: x[1], y[1], x[3], y[3]
+  constexpr ZSimdVectBinary<uint8_t> mergeo_mask{
+      4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}; // byte indices: 0..15 address x, 16..31 address y
+  return vec_perm(x, y, mergeo_mask);
+}
+
+ZSimdVect<double> vec_mergeo(ZSimdVect<double> x, ZSimdVect<double> y) {
+  return vec_mergel(x, y); // double overload forwards to vec_mergel
+}
+
+} /* unnamed namespace */
+
+// Selector vector handed to vec_bperm_u128 by Vectorized<T>::zero_mask().
+// Generic version: the last four selectors are 96, 64, 32 and 0, i.e. bit
+// positions spaced 32 bits apart (one per 4-byte lane); the other twelve
+// selectors are 128.
+// NOTE(review): 128 is out of range for a 128-bit source, so those entries
+// presumably contribute a zero bit - confirm against the vec_bperm_u128 docs.
+template <typename T>
+constexpr auto GetBpermZeroMask() {
+  return ZSimdVectBinary<uint8_t>{
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      96,
+      64,
+      32,
+      0};
+}
+
+// double specialization of the vec_bperm_u128 selector vector: only the last
+// two selectors are live (64 and 0, i.e. bit positions spaced 64 bits apart,
+// one per 8-byte lane); the other fourteen selectors are 128.
+// NOTE(review): 128 presumably selects no source bit - confirm against the
+// vec_bperm_u128 docs.
+template <>
+constexpr auto GetBpermZeroMask<double>() {
+  return ZSimdVectBinary<uint8_t>{
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      64,
+      0};
+}
+
+// Byte permute pattern that exchanges neighbouring 4-byte lanes: bytes 4-7
+// come first, then 0-3, then 12-15, then 8-11. Used by
+// Vectorized<float>::swapped() with the same vector as both permute inputs.
+constexpr auto GetSwapMaskFloat() {
+  return ZSimdVectBinary<uint8_t>{
+      4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+}
+
+// Reports `true` for every element type T for which is_zarch_implemented<T>()
+// holds, i.e. the same condition that enables the Vectorized<T> partial
+// specialization in this file.
+template <typename T>
+struct is_vec_specialized_for<T, std::enable_if_t<is_zarch_implemented<T>()>>
+    : std::bool_constant<true> {};
+
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
+ public:
+  using value_type = T;
+  using vtype = ZSimdVect<T>;
+  using vmaskType = ZSimdVectBinary<T>;
+  using size_type = int;
+  // because of gcc inconsistency for int64_t we are obliged to use this, not
+  // value_type
+  using ElementType = ZSimdVectElement<T>;
+  using vinner_data = std::pair<vtype, vtype>;
+
+ private:
+  vtype _vec0;
+  vtype _vec1;
+
+ public:
+  // Number of ElementType lanes held by one Vectorized<T>, counting both
+  // halves (_vec0 and _vec1) together.
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(ElementType);
+  }
+  Vectorized() {}
+
+  C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(const vinner_data& v)
+      : _vec0{v.first}, _vec1{v.second} {}
+  C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(T s)
+      : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {}
+
+  // Generic loader, used when the pointee type U is not ElementType.
+  // Copies up to `count` ElementType-sized units from `ptr` into a
+  // zero-initialised aligned staging buffer and loads both halves from it, so
+  // lanes at index >= count read as zero.
+  //
+  // `count` values <= 0 load nothing (all lanes zero); values above size()
+  // are treated as size().
+  template <typename U, typename DUMMY = void>
+  struct LoaduHelper {
+    static Vectorized<T> C10_ALWAYS_INLINE
+    loadu(const U* ptr, int count = size()) {
+      __at_align__ ElementType tmp_values[size()] = {};
+      // Guard against non-positive counts: a negative value would be
+      // converted to a huge size_t by the multiplication below and overrun
+      // the staging buffer. This mirrors the `count > 0` check in StoreHelper.
+      if (count > 0) {
+        std::memcpy(
+            tmp_values, ptr, std::min(count, size()) * sizeof(ElementType));
+      }
+
+      return {
+          vec_xl(offset0, &(tmp_values[0])),
+          vec_xl(offset16, &(tmp_values[0]))};
+    }
+  };
+
+  // Loader for pointers that already have the lane type.
+  //  * count >= size(): both halves are loaded straight from `ptr`.
+  //  * 0 < count < size(): the first `count` elements are copied into a
+  //    zero-initialised aligned staging buffer, so the remaining lanes read
+  //    as zero.
+  //  * count <= 0: nothing is read from `ptr`; all lanes are zero.
+  template <typename DUMMY>
+  struct LoaduHelper<ElementType, DUMMY> {
+    static Vectorized<T> C10_ALWAYS_INLINE
+    loadu(const ElementType* ptr, int count = size()) {
+      // Oversized counts used to fall into the staging path and copy size()
+      // elements anyway; loading directly gives the same lanes without the
+      // extra copy.
+      if (count >= size()) {
+        return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)};
+      }
+
+      __at_align__ ElementType tmp_values[size()] = {};
+      // A negative count would turn into a huge size_t length for memcpy and
+      // overrun tmp_values, so only positive counts copy anything.
+      if (count > 0) {
+        std::memcpy(tmp_values, ptr, count * sizeof(ElementType));
+      }
+
+      return {
+          vec_xl(offset0, &(tmp_values[0])),
+          vec_xl(offset16, &(tmp_values[0]))};
+    }
+  };
+
+  template <typename U>
+  static Vectorized<T> C10_ALWAYS_INLINE
+  loadu(const U* ptr, int count = size()) {
+    return LoaduHelper<U>::loadu(ptr, count);
+  }
+
+  // Partial load of the first 8 bytes at `ptr`, expressed as
+  // 8 / sizeof(ElementType) elements and forwarded to loadu(). The remaining
+  // lanes come from loadu()'s zero-initialised staging buffer.
+  template <typename U>
+  static Vectorized<T> C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) {
+    // load only first 8 bytes
+    // only intended to be used with uint8_t
+    return loadu(ptr, 8 / sizeof(ElementType));
+  }
+
+  // Generic store, used when the destination pointee type U is not
+  // ElementType. Both halves are spilled to an aligned staging buffer and the
+  // first min(count, size()) elements are copied to `ptr`. Non-positive
+  // counts write nothing.
+  template <typename U, typename DUMMY = void>
+  struct StoreHelper {
+    static void C10_ALWAYS_INLINE
+    store(const Vectorized<T>& vec, U* ptr, int count = size()) {
+      if (count <= 0) {
+        return;
+      }
+      __at_align__ ElementType staging[size()];
+      vec_xst(vec._vec0, offset0, &(staging[0]));
+      vec_xst(vec._vec1, offset16, &(staging[0]));
+      const int n_copy = std::min(count, size());
+      std::memcpy(ptr, staging, n_copy * sizeof(ElementType));
+    }
+  };
+
+  // Store for destinations that already have the lane type. A full-width
+  // store (count == size()) writes both halves straight to `ptr`; any other
+  // positive count goes through an aligned staging buffer and copies
+  // min(count, size()) elements. Non-positive counts write nothing.
+  template <typename DUMMY>
+  struct StoreHelper<ElementType, DUMMY> {
+    static void C10_ALWAYS_INLINE
+    store(const Vectorized<T>& vec, ElementType* ptr, int count = size()) {
+      if (count == size()) {
+        vec_xst(vec._vec0, offset0, ptr);
+        vec_xst(vec._vec1, offset16, ptr);
+        return;
+      }
+      if (count <= 0) {
+        return;
+      }
+      __at_align__ ElementType staging[size()];
+      vec_xst(vec._vec0, offset0, &(staging[0]));
+      vec_xst(vec._vec1, offset16, &(staging[0]));
+      const int n_copy = std::min(count, size());
+      std::memcpy(ptr, staging, n_copy * sizeof(ElementType));
+    }
+  };
+
+  template <typename U>
+  void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
+    return StoreHelper<U>::store(*this, ptr, count);
+  }
+
+  C10_ALWAYS_INLINE const vtype& vec0() const {
+    return _vec0;
+  }
+
+  C10_ALWAYS_INLINE const vtype& vec1() const {
+    return _vec1;
+  }
+
+  C10_ALWAYS_INLINE vinner_data data() const {
+    return std::make_pair<>(_vec0, _vec1);
+  }
+
+  C10_ALWAYS_INLINE operator vinner_data() const {
+    return data();
+  }
+
+  C10_ALWAYS_INLINE const vmaskType vecb0() const {
+    return (vmaskType)_vec0;
+  }
+  C10_ALWAYS_INLINE const vmaskType vecb1() const {
+    return (vmaskType)_vec1;
+  }
+
+  // Run-time blend: applies vec_sel(a, b, m) to each half, where m is the raw
+  // bit pattern of the corresponding half of `mask` (vecb0()/vecb1()).
+  // vec_sel takes a bit from b where the mask bit is set and from a otherwise,
+  // so all-ones lanes in `mask` pick b and all-zero lanes pick a.
+  static Vectorized<T> C10_ALWAYS_INLINE blendv(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      const Vectorized<T>& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, mask.vecb0()),
+        vec_sel(a._vec1, b._vec1, mask.vecb1())};
+  }
+
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 8), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
+      : _vec0{s1, s2}, _vec1{s3, s4} {}
+
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 4), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8)
+      : _vec0{s1, s2, s3, s4}, _vec1{s5, s6, s7, s8} {}
+
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 2), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(
+      T s1,
+      T s2,
+      T s3,
+      T s4,
+      T s5,
+      T s6,
+      T s7,
+      T s8,
+      T s9,
+      T s10,
+      T s11,
+      T s12,
+      T s13,
+      T s14,
+      T s15,
+      T s16)
+      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8},
+        _vec1{s9, s10, s11, s12, s13, s14, s15, s16} {}
+
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 1), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(
+      T s1,
+      T s2,
+      T s3,
+      T s4,
+      T s5,
+      T s6,
+      T s7,
+      T s8,
+      T s9,
+      T s10,
+      T s11,
+      T s12,
+      T s13,
+      T s14,
+      T s15,
+      T s16,
+      T s17,
+      T s18,
+      T s19,
+      T s20,
+      T s21,
+      T s22,
+      T s23,
+      T s24,
+      T s25,
+      T s26,
+      T s27,
+      T s28,
+      T s29,
+      T s30,
+      T s31,
+      T s32)
+      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16},
+        _vec1{
+            s17,
+            s18,
+            s19,
+            s20,
+            s21,
+            s22,
+            s23,
+            s24,
+            s25,
+            s26,
+            s27,
+            s28,
+            s29,
+            s30,
+            s31,
+            s32} {}
+
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 8, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 4, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 2, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 1, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step,
+        base + 16 * step,
+        base + 17 * step,
+        base + 18 * step,
+        base + 19 * step,
+        base + 20 * step,
+        base + 21 * step,
+        base + 22 * step,
+        base + 23 * step,
+        base + 24 * step,
+        base + 25 * step,
+        base + 26 * step,
+        base + 27 * step,
+        base + 28 * step,
+        base + 29 * step,
+        base + 30 * step,
+        base + 31 * step);
+  }
+
+  // blend section
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 0: the result is `a` unchanged.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 0, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    return a;
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 1: the result is `b` unchanged.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 1, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    return b;
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 2: first half from b, second half from a.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 2, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 3: first half from a, second half from b.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 3, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 4: the first half is selected lane by lane
+  // through the GetMask1 lane mask, the second half is taken from a.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 4, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    const vmaskType mask_1st = GetMask1<sizeof(T)>(mask);
+    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 5: the first half is selected lane by lane
+  // through the GetMask1 lane mask, the second half is taken from b.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 5, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    const vmaskType mask_1st = GetMask1<sizeof(T)>(mask);
+    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 6: the first half is taken from a, the
+  // second half is selected lane by lane through the GetMask2 lane mask.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 6, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    const vmaskType mask_2nd = GetMask2<sizeof(T)>(mask);
+    // mask_2nd is derived from the upper selector bits of `mask`
+    return {a._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 7: the first half is taken from b, the
+  // second half is selected lane by lane through the GetMask2 lane mask.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 7, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    const vmaskType mask_2nd = GetMask2<sizeof(T)>(mask);
+    // mask_2nd is derived from the upper selector bits of `mask`
+    return {b._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Compile-time blend, overload enabled when
+  // blendChoice<sizeof(T)>(mask) == 8: both halves are selected lane by lane,
+  // the first through the GetMask1 lane mask and the second through GetMask2.
+  template <int64_t mask>
+  static std::enable_if_t<blendChoice<sizeof(T)>(mask) == 8, Vectorized<T>>
+      C10_ALWAYS_INLINE blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    const vmaskType mask_1st = GetMask1<sizeof(T)>(mask);
+    const vmaskType mask_2nd = GetMask2<sizeof(T)>(mask);
+    return {
+        (vtype)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  // Terminal case of the compile-time search used by set(): once the candidate
+  // Z has reached the bound C no partial blend matched, so the whole of `b` is
+  // returned.
+  template <int16_t Z, int16_t C>
+  static inline std::enable_if_t<(Z >= C), Vectorized<T>> set_inner(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count) {
+    return b;
+  }
+
+  // Recursive case of the compile-time search used by set(): turns the
+  // run-time `count` into the compile-time blend mask allbitset(Z) (the Z
+  // lowest bits set) by testing count against Z and otherwise trying Z + 1.
+  template <int16_t Z, int16_t C>
+  static inline std::enable_if_t<(Z < C), Vectorized<T>> set_inner(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count) {
+    if (count == Z)
+      return blend<allbitset(Z)>(a, b);
+    else
+      return set_inner<Z + 1, C>(a, b, count);
+  }
+
+  // Blends `a` and `b` under the mask allbitset(count), i.e. with the `count`
+  // lowest selector bits set. count == 0 returns `a`; any count >= size()
+  // returns `b` (see the terminal set_inner overload).
+  // NOTE(review): presumably this means "first count lanes from b, rest from
+  // a" - confirm against blendChoice / GetMask1.
+  static Vectorized<T> set(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count = size()) {
+    if (count == 0)
+      return a;
+    return set_inner<1, size()>(a, b, count);
+  }
+
+  const ElementType& operator[](int idx) const = delete;
+  ElementType& operator[](int idx) = delete;
+
+  // Bitwise complement of both halves, computed as NOR of each half's raw bit
+  // pattern with itself.
+  Vectorized<T> _not() const {
+    return {(vtype)vec_nor(vecb0(), vecb0()), (vtype)vec_nor(vecb1(), vecb1())};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE eq(const Vectorized<T>& other) const {
+    return (*this == other) & Vectorized<T>((T)1.0);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE ne(const Vectorized<T>& other) const {
+    return (*this != other) & Vectorized<T>((T)1.0);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE gt(const Vectorized<T>& other) const {
+    return (*this > other) & Vectorized<T>((T)1.0);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE ge(const Vectorized<T>& other) const {
+    return (*this >= other) & Vectorized<T>((T)1.0);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE lt(const Vectorized<T>& other) const {
+    return (*this < other) & Vectorized<T>((T)1.0);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE le(const Vectorized<T>& other) const {
+    return (*this <= other) & Vectorized<T>((T)1.0);
+  }
+
+  template <typename U = T, std::enable_if_t<!std::is_unsigned_v<U>, int> = 0>
+  Vectorized<U> C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  template <typename U = T, std::enable_if_t<std::is_unsigned_v<U>, int> = 0>
+  Vectorized<U> C10_ALWAYS_INLINE abs() const {
+    return {_vec0, _vec1};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE neg() const {
+    return {-_vec0, -_vec1};
+  }
+
+  // Lane mask of NaNs: a lane is flagged exactly when it does not compare
+  // equal to itself, so the self-comparison mask is inverted bitwise.
+  Vectorized<T> isnan() const {
+    return (*this == *this)._not();
+  }
+
+  // Returns true as soon as any lane is NaN or infinite. The first half is
+  // scanned completely before the second half.
+  bool has_inf_nan() const {
+    const size_type lanes_per_half = size() / 2;
+    for (size_type lane = 0; lane < lanes_per_half; ++lane) {
+      if (_isnan(_vec0[lane]) || _isinf(_vec0[lane])) {
+        return true;
+      }
+    }
+    for (size_type lane = 0; lane < lanes_per_half; ++lane) {
+      if (_isnan(_vec1[lane]) || _isinf(_vec1[lane])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Argument (phase) of real values, floating-point overload: pi for lanes
+  // below zero, 0 for the other ordered lanes, and the input itself for lanes
+  // that isnan() flags.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<U> angle() const {
+    // 0 by default, pi where the lane is negative
+    auto tmp = blendv(
+        Vectorized<U>(0), Vectorized<U>(c10::pi<U>), *this < Vectorized<U>(0));
+    // lanes flagged by isnan() keep their original value
+    return blendv(tmp, *this, isnan());
+  }
+
+  // Argument (phase) of real values, non-floating-point overload: lanes below
+  // zero get c10::pi<U>, all other lanes get 0. No NaN handling is done here.
+  template <
+      typename U = T,
+      std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
+  Vectorized<U> angle() const {
+    return blendv(
+        Vectorized<U>(0), Vectorized<U>(c10::pi<U>), *this < Vectorized<U>(0));
+  }
+
+  Vectorized<T> real() const {
+    return *this;
+  }
+  Vectorized<T> imag() const {
+    return Vectorized<T>{0};
+  }
+  Vectorized<T> conj() const {
+    return *this;
+  }
+
+  // Returns an int bitmask describing which lanes compare equal to zero
+  // (floating-point element types only).
+  // Each half of the comparison result is reduced with vec_bperm_u128 using
+  // the GetBpermZeroMask<U>() selectors; element 0 of each reduction is read
+  // and the second half's bits are shifted left by size() / 2 before being
+  // OR-ed with the first half's.
+  // NOTE(review): presumably bit i of the result corresponds to lane i -
+  // confirm against the vec_bperm_u128 bit-ordering rules.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  int zero_mask() const {
+    auto cmp = (*this == Vectorized<U>(0));
+    constexpr auto mask_zero_bits = GetBpermZeroMask<U>();
+    ZSimdVectBinary<uint64_t> result0 =
+        vec_bperm_u128((ZSimdVectBinary<uint8_t>)cmp.vecb0(), mask_zero_bits);
+    ZSimdVectBinary<uint64_t> result1 =
+        vec_bperm_u128((ZSimdVectBinary<uint8_t>)cmp.vecb1(), mask_zero_bits);
+    return (result0[0] | (result1[0] << (size() / 2)));
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE rint() const {
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE frac() const {
+    return *this - trunc();
+  }
+
+  Vectorized<T> C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE reciprocal() const {
+    return Vectorized<T>((T)1) / (*this);
+  }
+  Vectorized<T> C10_ALWAYS_INLINE rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  // Applies the scalar function `f` to each of the eight float lanes, first
+  // half before second half, and packs the results into a new vector.
+  // The braced initializer keeps the calls in left-to-right order.
+  template <typename U = T, std::enable_if_t<std::is_same_v<U, float>, int> = 0>
+  inline Vectorized<T> mapOrdinary(float (*const f)(float)) const {
+    return Vectorized<T>{
+        f(_vec0[0]),
+        f(_vec0[1]),
+        f(_vec0[2]),
+        f(_vec0[3]),
+        f(_vec1[0]),
+        f(_vec1[1]),
+        f(_vec1[2]),
+        f(_vec1[3])};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, double>, int> = 0>
+  inline Vectorized<T> mapOrdinary(double (*const f)(double)) const {
+    return Vectorized<T>(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1]));
+  }
+
+  // Applies the binary scalar function `f` lane by lane to this vector and
+  // `b` (eight float lanes, first half before second half) and packs the
+  // results into a new vector. The braced initializer keeps the calls in
+  // left-to-right order.
+  template <typename U = T, std::enable_if_t<std::is_same_v<U, float>, int> = 0>
+  inline Vectorized<T> mapOrdinary(
+      float (*const f)(float, float),
+      const Vectorized<T>& b) const {
+    return Vectorized<T>{
+        f(_vec0[0], b._vec0[0]),
+        f(_vec0[1], b._vec0[1]),
+        f(_vec0[2], b._vec0[2]),
+        f(_vec0[3], b._vec0[3]),
+        f(_vec1[0], b._vec1[0]),
+        f(_vec1[1], b._vec1[1]),
+        f(_vec1[2], b._vec1[2]),
+        f(_vec1[3], b._vec1[3])};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, double>, int> = 0>
+  inline Vectorized<T> mapOrdinary(
+      double (*const f)(double, double),
+      const Vectorized<T>& b) const {
+    return Vectorized<T>(
+        f(_vec0[0], b._vec0[0]),
+        f(_vec0[1], b._vec0[1]),
+        f(_vec1[0], b._vec1[0]),
+        f(_vec1[1], b._vec1[1]));
+  }
+
+  // Unary vector-function dispatch, float overload: applies the float
+  // callable `f` to each half. The double callable `d` is accepted only so
+  // that callers can pass both variants; it is not used here.
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, float>, int> = 0>
+  inline Vectorized<T> mapSleef(FloatOp f, DoubleOp d) const {
+    vtype a0 = f(_vec0);
+    vtype a1 = f(_vec1);
+    return Vectorized<T>{a0, a1};
+  }
+
+  // Unary vector-function dispatch, double overload: applies the double
+  // callable `d` to each half. The float callable `f` is not used here.
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, double>, int> = 0>
+  inline Vectorized<T> mapSleef(FloatOp f, DoubleOp d) const {
+    return Vectorized<T>(d(_vec0), d(_vec1));
+  }
+
+  // Binary vector-function dispatch, float overload: applies the float
+  // callable `f` to the matching halves of this vector and `b`. The double
+  // callable `d` is not used here.
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, float>, int> = 0>
+  inline Vectorized<T> mapSleef(FloatOp f, DoubleOp d, const Vectorized<T>& b)
+      const {
+    vtype a0 = f(_vec0, b._vec0);
+    vtype a1 = f(_vec1, b._vec1);
+    return Vectorized<T>{a0, a1};
+  }
+
+  // Binary vector-function dispatch, double overload: applies the double
+  // callable `d` to the matching halves of this vector and `b`. The float
+  // callable `f` is not used here.
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, double>, int> = 0>
+  inline Vectorized<T> mapSleef(FloatOp f, DoubleOp d, const Vectorized<T>& b)
+      const {
+    return Vectorized<T>(d(_vec0, b._vec0), d(_vec1, b._vec1));
+  }
+
+  Vectorized<T> acos() const {
+    return mapSleef(Sleef_acosf4_u10, Sleef_acosd2_u10);
+  }
+  Vectorized<T> asin() const {
+    return mapSleef(Sleef_asinf4_u10, Sleef_asind2_u10);
+  }
+  Vectorized<T> atan() const {
+    return mapSleef(Sleef_atanf4_u10, Sleef_atand2_u10);
+  }
+  Vectorized<T> atanh() const {
+    return mapSleef(Sleef_atanhf4_u10, Sleef_atanhd2_u10);
+  }
+
+  Vectorized<T> erf() const {
+    return mapSleef(Sleef_erff4_u10, Sleef_erfd2_u10);
+  }
+  Vectorized<T> erfc() const {
+    return mapSleef(Sleef_erfcf4_u15, Sleef_erfcd2_u15);
+  }
+
+  Vectorized<T> exp() const {
+    return mapSleef(Sleef_expf4_u10, Sleef_expd2_u10);
+  }
+  Vectorized<T> exp2() const {
+    return mapSleef(Sleef_exp2f4_u10, Sleef_exp2d2_u10);
+  }
+  Vectorized<T> expm1() const {
+    return mapSleef(Sleef_expm1f4_u10, Sleef_expm1d2_u10);
+  }
+  // No dedicated implementation here: forwards to exp().
+  Vectorized<T> exp_u20() const {
+    return exp();
+  }
+  // No dedicated implementation here: forwards to exp().
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }
+
+  Vectorized<T> log() const {
+    return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
+  }
+  Vectorized<T> log2() const {
+    return mapSleef(Sleef_log2f4_u10, Sleef_log2d2_u10);
+  }
+  Vectorized<T> log10() const {
+    return mapSleef(Sleef_log10f4_u10, Sleef_log10d2_u10);
+  }
+  Vectorized<T> log1p() const {
+    return mapSleef(Sleef_log1pf4_u10, Sleef_log1pd2_u10);
+  }
+
+  Vectorized<T> sin() const {
+    return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10);
+  }
+  Vectorized<T> sinh() const {
+    return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10);
+  }
+  Vectorized<T> cos() const {
+    return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10);
+  }
+  Vectorized<T> cosh() const {
+    return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10);
+  }
+
+  Vectorized<T> tan() const {
+    return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10);
+  }
+  Vectorized<T> tanh() const {
+    return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10);
+  }
+
+  Vectorized<T> lgamma() const {
+    return mapSleef(Sleef_lgammaf4_u10, Sleef_lgammad2_u10);
+  }
+
+  Vectorized<T> atan2(const Vectorized<T>& b) const {
+    return mapSleef(Sleef_atan2f4_u10, Sleef_atan2d2_u10, b);
+  }
+  Vectorized<T> copysign(const Vectorized<T>& sign) const {
+    return mapSleef(Sleef_copysignf4, Sleef_copysignd2, sign);
+  }
+  Vectorized<T> fmod(const Vectorized<T>& q) const {
+    return mapSleef(Sleef_fmodf4, Sleef_fmodd2, q);
+  }
+
+  Vectorized<T> hypot(const Vectorized<T>& b) const {
+    return mapSleef(Sleef_hypotf4_u05, Sleef_hypotd2_u05, b);
+  }
+
+  Vectorized<T> pow(const Vectorized<T>& b) const {
+    return mapSleef(Sleef_powf4_u10, Sleef_powd2_u10, b);
+  }
+
+  Vectorized<T> nextafter(const Vectorized<T>& b) const {
+    return mapSleef(Sleef_nextafterf4, Sleef_nextafterd2, b);
+  }
+
+  Vectorized<T> erfinv() const {
+    return mapOrdinary(calc_erfinv);
+  }
+
+  Vectorized<T> digamma() const {
+    return mapOrdinary(calc_digamma);
+  }
+
+  Vectorized<T> igamma(const Vectorized<T>& x) const {
+    return mapOrdinary(calc_igamma, x);
+  }
+
+  Vectorized<T> igammac(const Vectorized<T>& x) const {
+    return mapOrdinary(calc_igammac, x);
+  }
+
+  Vectorized<T> i0() const {
+    return mapOrdinary(calc_i0);
+  }
+
+  Vectorized<T> i0e() const {
+    return mapOrdinary(calc_i0e);
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> minimum(const Vectorized<T>& other) const {
+    return {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
+  }
+
+  /* Lane-wise minimum for floating-point types. Lanes where either operand is
+   * NaN take that operand's value; when both are NaN, `other` wins because it
+   * is blended last. */
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> minimum(const Vectorized<T>& other) const {
+    const Vectorized<T> lane_min = {
+        vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
+    const Vectorized<T> with_self_nan = blendv(lane_min, *this, isnan());
+    return blendv(with_self_nan, other, other.isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> maximum(const Vectorized<T>& other) const {
+    return {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
+  }
+
+  /* Lane-wise maximum for floating-point types. Lanes where either operand is
+   * NaN take that operand's value; when both are NaN, `other` wins because it
+   * is blended last. */
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> maximum(const Vectorized<T>& other) const {
+    const Vectorized<T> lane_max = {
+        vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
+    const Vectorized<T> with_self_nan = blendv(lane_max, *this, isnan());
+    return blendv(with_self_nan, other, other.isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> clamp_min(const Vectorized<T>& min) const {
+    return {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
+  }
+
+  /* Lower clamp for floating-point types: lane-wise max against `min`, except
+   * that lanes where this vector is NaN keep their NaN. */
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> clamp_min(const Vectorized<T>& min) const {
+    const Vectorized<T> clamped = {
+        vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
+    return blendv(clamped, *this, isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> clamp_max(const Vectorized<T>& max) const {
+    return {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
+  }
+
+  /* Upper clamp for floating-point types: lane-wise min against `max`, except
+   * that lanes where this vector is NaN keep their NaN. */
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> clamp_max(const Vectorized<T>& max) const {
+    const Vectorized<T> clamped = {
+        vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
+    return blendv(clamped, *this, isnan());
+  }
+
+  // float overload: exchanges neighbouring lanes within each half, i.e.
+  // {v0, v1, v2, v3} becomes {v1, v0, v3, v2}, by permuting each half with
+  // itself using the GetSwapMaskFloat() byte pattern.
+  template <typename U = T, std::enable_if_t<std::is_same_v<U, float>, int> = 0>
+  Vectorized<T> swapped() const {
+    auto swap_mask = GetSwapMaskFloat();
+    vtype v0 = vec_perm(_vec0, _vec0, swap_mask);
+    vtype v1 = vec_perm(_vec1, _vec1, swap_mask);
+    return {v0, v1};
+  }
+
+  // double overload: exchanges the two lanes of each half, i.e. {v0, v1}
+  // becomes {v1, v0}, by rebuilding each half from its elements.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, double>, int> = 0>
+  Vectorized<T> swapped() const {
+    vtype v0 = {_vec0[1], _vec0[0]};
+    vtype v1 = {_vec1[1], _vec1[0]};
+    return {v0, v1};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  static Vectorized<T> mergee(Vectorized<T>& first, Vectorized<T>& second) {
+    return {
+        vec_mergee(first._vec0, second._vec0),
+        vec_mergee(first._vec1, second._vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  static Vectorized<T> mergeo(Vectorized<T>& first, Vectorized<T>& second) {
+    return {
+        vec_mergeo(first._vec0, second._vec0),
+        vec_mergeo(first._vec1, second._vec1)};
+  }
+
+  // Pairwise (horizontal) sums of two vectors. After adding each input to its
+  // lane-swapped copy, every lane pair holds its sum in both lanes; mergee()
+  // then interleaves the even lanes of the two results, so the output
+  // alternates sums from `first` and sums from `second`.
+  static Vectorized<T> horizontal_add_perm(
+      Vectorized<T>& first,
+      Vectorized<T>& second) {
+    // emulated with 6 instructions in total:
+    // swap neighbouring lanes of each input so that a plain addition yields
+    // the horizontal sums
+    auto first_perm = first.swapped(); // 2perm
+    auto second_perm = second.swapped(); // 2perm
+    // sum
+    auto first_ret = first + first_perm; // 2add
+    auto second_ret = second + second_perm; // 2 add
+    // now keep the even lanes
+    return mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  // Pairwise (horizontal) differences of two vectors. Each input has its
+  // lane-swapped copy subtracted from it, so every even lane holds
+  // "even lane minus odd lane"; mergee() then interleaves the even lanes of
+  // the two results, so the output alternates differences from `first` and
+  // differences from `second`.
+  static Vectorized<T> horizontal_sub_perm(
+      Vectorized<T>& first,
+      Vectorized<T>& second) {
+    // emulated with 6 instructions in total:
+    // swap neighbouring lanes of each input so that a plain subtraction
+    // yields the horizontal differences
+    auto first_perm = first.swapped(); // 2perm
+    auto second_perm = second.swapped(); // 2perm
+    // difference
+    auto first_ret = first - first_perm; // 2sub
+    auto second_ret = second - second_perm; // 2 sub
+    // now keep the even lanes
+    return mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> mergee() const {
+    return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
+  Vectorized<T> mergeo() const {
+    return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)};
+  }
+
+  // uint8_t only: widens the first eight lanes of the first half to int32_t
+  // and returns them as a Vectorized<int32_t>. The remaining lanes of this
+  // vector are ignored.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, uint8_t>, int> = 0>
+  Vectorized<int32_t> to_vec_float_helper() const {
+    return Vectorized<int32_t>{
+        static_cast<int32_t>(_vec0[0]),
+        static_cast<int32_t>(_vec0[1]),
+        static_cast<int32_t>(_vec0[2]),
+        static_cast<int32_t>(_vec0[3]),
+        static_cast<int32_t>(_vec0[4]),
+        static_cast<int32_t>(_vec0[5]),
+        static_cast<int32_t>(_vec0[6]),
+        static_cast<int32_t>(_vec0[7])};
+  }
+
+  // int32_t only: narrows the eight int32 lanes (four from each half) to
+  // uint8_t with static_cast - no saturation is applied here - and places
+  // them in the first eight lanes of the result. The other 24 lanes are zero.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same_v<U, int32_t>, int> = 0>
+  Vectorized<uint8_t> to_vec_uint8_helper() const {
+    // narrowing step of a conversion that ends in uint8_t; the source lanes
+    // at this point are int32_t
+    uint8_t values[8] = {
+        static_cast<uint8_t>(_vec0[0]),
+        static_cast<uint8_t>(_vec0[1]),
+        static_cast<uint8_t>(_vec0[2]),
+        static_cast<uint8_t>(_vec0[3]),
+        static_cast<uint8_t>(_vec1[0]),
+        static_cast<uint8_t>(_vec1[1]),
+        static_cast<uint8_t>(_vec1[2]),
+        static_cast<uint8_t>(_vec1[3]),
+    };
+
+    return Vectorized<uint8_t>{
+        values[0], values[1], values[2], values[3], values[4], values[5],
+        values[6], values[7], 0,         0,         0,         0,
+        0,         0,         0,         0,         0,         0,
+        0,         0,         0,         0,         0,         0,
+        0,         0,         0,         0,         0,         0,
+        0,         0,
+    };
+  }
+};
+// Per-type operators for Vectorized<typex>; each one acts on vec0() and vec1() independently.
+#define ZVECTOR_OPERATORS(typex)                                        \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator+(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{a.vec0() + b.vec0(), a.vec1() + b.vec1()}; \
+  }                                                                     \
+  /* operator-: subtracts b's halves from a's halves. */                \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator-(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{a.vec0() - b.vec0(), a.vec1() - b.vec1()}; \
+  }                                                                     \
+  /* operator*: multiplies corresponding halves. */                     \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator*(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{a.vec0() * b.vec0(), a.vec1() * b.vec1()}; \
+  }                                                                     \
+  /* operator/: divides a's halves by b's halves. */                    \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator/(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; \
+  }                                                                     \
+  /* operator&: AND of the vecb0()/vecb1() views, cast to vtype. */     \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator&(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        (Vectorized<typex>::vtype)(a.vecb0() & b.vecb0()),              \
+        (Vectorized<typex>::vtype)(a.vecb1() & b.vecb1())};             \
+  }                                                                     \
+  /* operator|: OR of the vecb0()/vecb1() views, cast to vtype. */      \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator|(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        (Vectorized<typex>::vtype)(a.vecb0() | b.vecb0()),              \
+        (Vectorized<typex>::vtype)(a.vecb1() | b.vecb1())};             \
+  }                                                                     \
+  /* operator^: XOR of the vecb0()/vecb1() views, cast to vtype. */     \
+  template <>                                                           \
+  Vectorized<typex> C10_ALWAYS_INLINE operator^(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        (Vectorized<typex>::vtype)(a.vecb0() ^ b.vecb0()),              \
+        (Vectorized<typex>::vtype)(a.vecb1() ^ b.vecb1())};             \
+  }                                                                     \
+  /* operator==: vec_cmpeq on each half (plain overload). */            \
+  Vectorized<typex> C10_ALWAYS_INLINE operator==(                       \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())};  \
+  }                                                                     \
+  /* operator!=: vec_cmpeq on each half, inverted by _not(). */         \
+  Vectorized<typex> C10_ALWAYS_INLINE operator!=(                       \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())}   \
+        ._not();                                                        \
+  }                                                                     \
+  /* operator>: vec_cmpgt on each half. */                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmpgt(a.vec0(), b.vec0()), vec_cmpgt(a.vec1(), b.vec1())};  \
+  }                                                                     \
+  /* operator>=: vec_cmpge on each half. */                             \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>=(                       \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmpge(a.vec0(), b.vec0()), vec_cmpge(a.vec1(), b.vec1())};  \
+  }                                                                     \
+  /* operator<: vec_cmplt on each half. */                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<(                        \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmplt(a.vec0(), b.vec0()), vec_cmplt(a.vec1(), b.vec1())};  \
+  }                                                                     \
+  /* operator<=: vec_cmple on each half. */                             \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<=(                       \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {         \
+    return Vectorized<typex>{                                           \
+        vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())};  \
+  }
+// Instantiate the operators for each listed scalar element type.
+ZVECTOR_OPERATORS(float)
+ZVECTOR_OPERATORS(double)
+ZVECTOR_OPERATORS(int8_t)
+ZVECTOR_OPERATORS(uint8_t)
+ZVECTOR_OPERATORS(uint16_t)
+ZVECTOR_OPERATORS(int16_t)
+ZVECTOR_OPERATORS(int32_t)
+ZVECTOR_OPERATORS(int64_t)
+// The macro name is redefined further down, so this definition is dropped first.
+#undef ZVECTOR_OPERATORS
+// Integer-only operators: shifts run element by element through scratch arrays; ~ maps to _not().
+#define ZVECTOR_OPERATORS(typex)                                          \
+  template <>                                                             \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<<(                         \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {           \
+    constexpr Vectorized<typex>::ElementType max_shift =                  \
+        sizeof(Vectorized<typex>::ElementType) * CHAR_BIT;                \
+  /* Scratch arrays for the element-by-element fallback. */               \
+    Vectorized<typex>::ElementType a_array[Vectorized<typex>::size()];    \
+    Vectorized<typex>::ElementType b_array[Vectorized<typex>::size()];    \
+    Vectorized<typex>::ElementType c_array[Vectorized<typex>::size()];    \
+  /* Spill both operands to memory. */                                    \
+    a.store(a_array);                                                     \
+    b.store(b_array);                                                     \
+  /* Out-of-range counts give 0; others shift the unsigned value. */      \
+    for (int i = 0; i != Vectorized<typex>::size(); i++) {                \
+      typex shift = b_array[i];                                           \
+      if ((static_cast<std::make_signed_t<typex>>(shift) < 0) ||          \
+          (shift >= max_shift)) {                                         \
+        c_array[i] = 0;                                                   \
+      } else {                                                            \
+        c_array[i] = static_cast<std::make_unsigned_t<typex>>(a_array[i]) \
+            << shift;                                                     \
+      }                                                                   \
+    }                                                                     \
+  /* Reload the per-element results as a vector. */                       \
+    return Vectorized<typex>::loadu(c_array);                             \
+  }                                                                       \
+  /* operator>>: out-of-range counts are clamped to max_shift. */         \
+  template <>                                                             \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>>(                         \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {           \
+    /* right shift value to retain sign bit for signed and no bits for    \
+     * unsigned */                                                        \
+    constexpr Vectorized<typex>::ElementType max_shift =                  \
+        sizeof(typex) * CHAR_BIT - std::is_signed_v<typex>;               \
+  /* Scratch arrays for the element-by-element fallback. */               \
+    Vectorized<typex>::ElementType a_array[Vectorized<typex>::size()];    \
+    Vectorized<typex>::ElementType b_array[Vectorized<typex>::size()];    \
+    Vectorized<typex>::ElementType c_array[Vectorized<typex>::size()];    \
+  /* Spill both operands to memory. */                                    \
+    a.store(a_array);                                                     \
+    b.store(b_array);                                                     \
+                                                                          \
+    for (int i = 0; i != Vectorized<typex>::size(); i++) {                \
+      typex shift = b_array[i];                                           \
+      if ((static_cast<std::make_signed_t<typex>>(shift) < 0) ||          \
+          (shift >= max_shift)) {                                         \
+        c_array[i] = a_array[i] >> max_shift;                             \
+      } else {                                                            \
+        c_array[i] = a_array[i] >> shift;                                 \
+      }                                                                   \
+    }                                                                     \
+  /* Reload the per-element results as a vector. */                       \
+    return Vectorized<typex>::loadu(c_array);                             \
+  }                                                                       \
+  /* operator~: forwards to _not(). */                                    \
+  template <>                                                             \
+  inline Vectorized<typex> operator~(const Vectorized<typex>& a) {        \
+    return a._not();                                                      \
+  }
+// Shifts and ~ are instantiated for the integer element types only.
+ZVECTOR_OPERATORS(int8_t)
+ZVECTOR_OPERATORS(uint8_t)
+ZVECTOR_OPERATORS(uint16_t)
+ZVECTOR_OPERATORS(int16_t)
+ZVECTOR_OPERATORS(int32_t)
+ZVECTOR_OPERATORS(int64_t)
+
+#undef ZVECTOR_OPERATORS
+
+// Specializes the free functions maximum() and minimum() for
+// Vectorized<typex>; both forward to the member function of the same name.
+#define DEFINE_MAXMIN_FUNCS(typex)                                    \
+  template <>                                                         \
+  inline Vectorized<typex> maximum(                                   \
+      const Vectorized<typex>& lhs, const Vectorized<typex>& rhs) {   \
+    return lhs.maximum(rhs);                                          \
+  }                                                                   \
+  template <>                                                         \
+  inline Vectorized<typex> minimum(                                   \
+      const Vectorized<typex>& lhs, const Vectorized<typex>& rhs) {   \
+    return lhs.minimum(rhs);                                          \
+  }
+
+// Specializes maximum/minimum (through DEFINE_MAXMIN_FUNCS) together with the
+// free functions clamp_min, clamp_max and clamp for Vectorized<typex>.
+#define DEFINE_CLAMP_MAXMIN_FUNCS(typex)                              \
+  DEFINE_MAXMIN_FUNCS(typex)                                          \
+  template <>                                                         \
+  C10_ALWAYS_INLINE Vectorized<typex> clamp_min(                      \
+      const Vectorized<typex>& a, const Vectorized<typex>& min) {     \
+    return a.clamp_min(min);                                          \
+  }                                                                   \
+  template <>                                                         \
+  C10_ALWAYS_INLINE Vectorized<typex> clamp_max(                      \
+      const Vectorized<typex>& a, const Vectorized<typex>& max) {     \
+    return a.clamp_max(max);                                          \
+  }                                                                   \
+  template <>                                                         \
+  C10_ALWAYS_INLINE Vectorized<typex> clamp(                          \
+      const Vectorized<typex>& a,                                     \
+      const Vectorized<typex>& min,                                   \
+      const Vectorized<typex>& max) {                                 \
+    /* Lower bound first, then the upper bound on that result. */     \
+    const Vectorized<typex> raised = clamp_min(a, min);               \
+    return clamp_max(raised, max);                                    \
+  }
+
+// Instantiations for the element types that get the clamp helpers here.
+DEFINE_CLAMP_MAXMIN_FUNCS(int8_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(uint8_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int16_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int32_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int64_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(float)
+DEFINE_CLAMP_MAXMIN_FUNCS(double)
+
+namespace { /* unnamed namespace */
+
+// vec_int_flt / vec_flt_int convert between four-element int and float
+// vectors. When vec_float is not defined or __ARCH__ < 13 they are emulated
+// one element at a time; otherwise they alias vec_float / vec_signed.
+#if !defined(vec_float) || __ARCH__ < 13
+#warning \
+    "float->int and int->float conversion is simulated. compile for z15 for improved performance"
+inline ZSimdVect<float> vec_int_flt(const ZSimdVect<int> x) {
+  return ZSimdVect<float>{
+      static_cast<float>(x[0]),
+      static_cast<float>(x[1]),
+      static_cast<float>(x[2]),
+      static_cast<float>(x[3])};
+}
+inline ZSimdVect<int> vec_flt_int(const ZSimdVect<float> x) {
+  return ZSimdVect<int>{
+      static_cast<int>(x[0]),
+      static_cast<int>(x[1]),
+      static_cast<int>(x[2]),
+      static_cast<int>(x[3])};
+}
+#else
+#define vec_int_flt vec_float
+#define vec_flt_int vec_signed
+#endif
+
+// int32_t -> float, applied to each half of the vector.
+Vectorized<float> zvec_convert_to_float(const Vectorized<int32_t>& x) {
+  const auto lo = vec_int_flt(x.vec0());
+  const auto hi = vec_int_flt(x.vec1());
+  return {lo, hi};
+}
+
+// float -> int32_t, applied to each half of the vector.
+Vectorized<int32_t> zvec_convert_to_int(const Vectorized<float>& x) {
+  const auto lo = vec_flt_int(x.vec0());
+  const auto hi = vec_flt_int(x.vec1());
+  return {lo, hi};
+}
+
+// int64_t -> double through vec_double, applied to each half.
+Vectorized<double> zvec_convert_to_float(const Vectorized<int64_t>& x) {
+  const auto lo = vec_double(x.vec0());
+  const auto hi = vec_double(x.vec1());
+  return {lo, hi};
+}
+
+// double -> int64_t through vec_signed, applied to each half.
+Vectorized<int64_t> zvec_convert_to_int(const Vectorized<double>& x) {
+  const auto lo = vec_signed(x.vec0());
+  const auto hi = vec_signed(x.vec1());
+  return {lo, hi};
+}
+
+} /* unnamed namespace */
+
+// Casts both halves of a Vectorized<T> to the vtype of Vectorized<V> and
+// wraps the results in a Vectorized<V>.
+template <typename T, typename V>
+Vectorized<V> cast_zvector(const Vectorized<T>& x) {
+  using target_vtype = typename Vectorized<V>::vtype;
+  const auto lo = (target_vtype)x.vec0();
+  const auto hi = (target_vtype)x.vec1();
+  return Vectorized<V>{lo, hi};
+}
+
+// fmadd specialization for float: each half goes through the s390 builtin
+// __builtin_s390_vfmasb with the operands in (a, b, c) order.
+template <>
+Vectorized<float> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const auto lo = __builtin_s390_vfmasb(a.vec0(), b.vec0(), c.vec0());
+  const auto hi = __builtin_s390_vfmasb(a.vec1(), b.vec1(), c.vec1());
+  return Vectorized<float>{lo, hi};
+}
+// fmadd specialization for double: each half goes through the s390 builtin
+// __builtin_s390_vfmadb with the operands in (a, b, c) order.
+template <>
+Vectorized<double> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  const auto lo = __builtin_s390_vfmadb(a.vec0(), b.vec0(), c.vec0());
+  const auto hi = __builtin_s390_vfmadb(a.vec1(), b.vec1(), c.vec1());
+  return Vectorized<double>{lo, hi};
+}
+// fmadd specialization for int16_t: a * b + c evaluated on each half with
+// the ordinary vector operators.
+template <>
+Vectorized<int16_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b,
+    const Vectorized<int16_t>& c) {
+  const auto lo = a.vec0() * b.vec0() + c.vec0();
+  const auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int16_t>{lo, hi};
+}
+// fmadd specialization for int32_t: a * b + c evaluated on each half with
+// the ordinary vector operators.
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b,
+    const Vectorized<int32_t>& c) {
+  const auto lo = a.vec0() * b.vec0() + c.vec0();
+  const auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int32_t>{lo, hi};
+}
+// fmadd specialization for int64_t: a * b + c evaluated on each half with
+// the ordinary vector operators.
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b,
+    const Vectorized<int64_t>& c) {
+  const auto lo = a.vec0() * b.vec0() + c.vec0();
+  const auto hi = a.vec1() * b.vec1() + c.vec1();
+  return Vectorized<int64_t>{lo, hi};
+}
+
+// double -> int64_t conversion of a whole vector; forwards to
+// zvec_convert_to_int.
+template <>
+C10_ALWAYS_INLINE Vectorized<int64_t> convert_to_int_of_same_size<double>(
+    const Vectorized<double>& src) {
+  Vectorized<int64_t> as_int = zvec_convert_to_int(src);
+  return as_int;
+}
+
+// float -> int32_t conversion of a whole vector; forwards to
+// zvec_convert_to_int.
+template <>
+C10_ALWAYS_INLINE Vectorized<int32_t> convert_to_int_of_same_size<float>(
+    const Vectorized<float>& src) {
+  Vectorized<int32_t> as_int = zvec_convert_to_int(src);
+  return as_int;
+}
+
+// Converts n int32_t values at src to float values at dst. Whole vectors are
+// converted with zvec_convert_to_float; the leftover elements are converted
+// one at a time with static_cast.
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  // int32_t and float have the same size, so both vectors hold kStep items.
+  constexpr int64_t kStep = Vectorized<float>::size();
+  int64_t pos = 0;
+
+  while (pos <= n - kStep) {
+    const auto ints = Vectorized<int32_t>::loadu(src + pos);
+    auto floats = zvec_convert_to_float(ints);
+    floats.store(dst + pos);
+    pos += kStep;
+  }
+
+  // Scalar tail for the elements that do not fill a whole vector.
+  while (pos < n) {
+    dst[pos] = static_cast<float>(src[pos]);
+    ++pos;
+  }
+}
+
+// Converts n int64_t values at src to double values at dst. Whole vectors
+// are converted with zvec_convert_to_float; the leftover elements are
+// converted one at a time with static_cast.
+template <>
+inline void convert(const int64_t* src, double* dst, int64_t n) {
+  constexpr int64_t kStep = Vectorized<double>::size();
+  int64_t pos = 0;
+
+  while (pos <= n - kStep) {
+    const auto ints = Vectorized<int64_t>::loadu(src + pos);
+    auto doubles = zvec_convert_to_float(ints);
+    doubles.store(dst + pos);
+    pos += kStep;
+  }
+
+  // Scalar tail for the elements that do not fill a whole vector.
+  while (pos < n) {
+    dst[pos] = static_cast<double>(src[pos]);
+    ++pos;
+  }
+}
+
+// Defines the cast<Cst, Fst> specialization: the halves of a Vectorized<Fst>
+// are cast to the vtype of Vectorized<Cst> by cast_zvector.
+#define DEFINE_REINTERPRET_CAST_FUNCS(Fst, Cst)     \
+  template <>                                       \
+  C10_ALWAYS_INLINE Vectorized<Cst> cast<Cst, Fst>( \
+      const Vectorized<Fst>& src) {                 \
+    return cast_zvector<Fst, Cst>(src);             \
+  }
+
+// Emits the casts from Fst to each of double, float, int64_t, int32_t and
+// int16_t.
+#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(Fst) \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, double)      \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, float)       \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int64_t)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int32_t)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int16_t)
+
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t)
+
+// Both helper macros exist only for the instantiations above. The outer one
+// expands to the inner one, so leaving it defined would leak a macro that can
+// no longer expand correctly; remove it together with the inner macro.
+#undef DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS
+#undef DEFINE_REINTERPRET_CAST_FUNCS
+
+// Maps an element type to the default result element type of unpack().
+// Types without a specialization map to themselves.
+template <typename T>
+struct unpack_type { using type = T; };
+
+// int8_t and uint8_t both map to int16_t; int16_t maps to int32_t.
+template <> struct unpack_type<int8_t> { using type = int16_t; };
+template <> struct unpack_type<uint8_t> { using type = int16_t; };
+template <> struct unpack_type<int16_t> { using type = int32_t; };
+
+// Maps an element type to the default result element type of pack().
+// Types without a specialization map to themselves.
+template <typename T>
+struct pack_type { using type = T; };
+
+// int16_t maps to int8_t; int32_t maps to int16_t.
+template <> struct pack_type<int16_t> { using type = int8_t; };
+template <> struct pack_type<int32_t> { using type = int16_t; };
+
+namespace { /* unnamed namespace */
+
+// Splits x into two Vectorized<V>. vec_unpackh and vec_unpackl are applied
+// to each half: the first result holds both unpacked parts of vec0(), the
+// second holds both unpacked parts of vec1().
+template <typename T, typename V = typename unpack_type<T>::type>
+std::pair<Vectorized<V>, Vectorized<V>> unpack(const Vectorized<T>& x) {
+  const Vectorized<V> from_vec0{vec_unpackh(x.vec0()), vec_unpackl(x.vec0())};
+  const Vectorized<V> from_vec1{vec_unpackh(x.vec1()), vec_unpackl(x.vec1())};
+  return {from_vec0, from_vec1};
+}
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
+// uint8_t input: the unpacked parts are held as the uint16_t vector type and
+// only cast to int16_t at the end. A 0xFF mask that used to follow the
+// unpack step is disabled, so the unpacked values are passed on unchanged.
+template <>
+std::pair<Vectorized<int16_t>, Vectorized<int16_t>> unpack<uint8_t, int16_t>(
+    const Vectorized<uint8_t>& x) {
+  using wide_vtype = typename Vectorized<uint16_t>::vtype;
+  const wide_vtype high0 = vec_unpackh(x.vec0());
+  const wide_vtype low0 = vec_unpackl(x.vec0());
+  const wide_vtype high1 = vec_unpackh(x.vec1());
+  const wide_vtype low1 = vec_unpackl(x.vec1());
+  const Vectorized<uint16_t> from_vec0{high0, low0};
+  const Vectorized<uint16_t> from_vec1{high1, low1};
+  return {
+      cast_zvector<uint16_t, int16_t>(from_vec0),
+      cast_zvector<uint16_t, int16_t>(from_vec1)};
+}
+C10_DIAGNOSTIC_POP()
+
+// Narrows two Vectorized<T> into one Vectorized<V>: vec_packs combines the
+// two halves of each input into one half of the result.
+template <typename T, typename V = typename pack_type<T>::type>
+Vectorized<V> pack(const Vectorized<T>& first, const Vectorized<T>& second) {
+  return Vectorized<V>{
+      vec_packs(first.vec0(), first.vec1()),
+      vec_packs(second.vec0(), second.vec1())};
+}
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
+// int16_t -> uint8_t uses vec_packsu instead of vec_packs.
+template <>
+Vectorized<uint8_t> pack(
+    const Vectorized<int16_t>& first,
+    const Vectorized<int16_t>& second) {
+  return Vectorized<uint8_t>{
+      vec_packsu(first.vec0(), first.vec1()),
+      vec_packsu(second.vec0(), second.vec1())};
+}
+C10_DIAGNOSTIC_POP()
+
+} /* unnamed namespace */
+
+//////////////////////////////////QUANT///////////////////////////////////////////
+// Marks every type accepted by is_zarch_implemented_quant<T>() as having a
+// Vectorized specialization.
+template <typename T>
+struct is_vec_specialized_for<
+    T,
+    std::enable_if_t<is_zarch_implemented_quant<T>()>> : std::true_type {};
+// Vectorized for quantized types (is_zarch_implemented_quant<T>()); wraps a Vectorized<T::underlying>.
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented_quant<T>()>> {
+ public:
+  using value_type = typename T::underlying;
+  using vtype = ZSimdVect<value_type>;
+  using vmaskType = ZSimdVectBinary<value_type>;
+  using vinner_type = Vectorized<value_type>;
+  using size_type = int;
+  // Element count of one vector: VECTOR_WIDTH / sizeof(value_type).
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(value_type);
+  }
+  // Number of Vectorized<float> (and qint32) vectors that together hold size() elements.
+  static constexpr int float_num_vecs() {
+    return size() / Vectorized<float>::size();
+  }
+  static constexpr int int_num_vecs() {
+    return float_num_vecs();
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, float_num_vecs()>;
+  using int_vec_return_type =
+      std::array<Vectorized<c10::qint32>, int_num_vecs()>;
+
+ private:
+  vinner_type _vec;
+
+ public:
+  Vectorized() {}
+  // Wrap an existing inner vector, or build the inner vector from val.val_.
+  explicit C10_ALWAYS_INLINE Vectorized(vinner_type v) : _vec{v} {}
+  Vectorized(const T& val) : _vec(val.val_) {}
+
+  C10_ALWAYS_INLINE const vinner_type& vec() const {
+    return _vec;
+  }
+  // loadu/store forward to the inner vector; count defaults to size().
+  template <typename U>
+  static Vectorized<T> C10_ALWAYS_INLINE
+  loadu(const U* ptr, int count = size()) {
+    return Vectorized<T>{vinner_type::loadu(ptr, count)};
+  }
+
+  template <typename U>
+  void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
+    _vec.store(ptr, count);
+  }
+  // relu: maximum with zero_point. relu6: the same, then minimum with q_six.
+  Vectorized<T> relu(Vectorized<T> zero_point) const {
+    return Vectorized<T>{_vec.maximum(zero_point._vec)};
+  }
+
+  Vectorized<T> relu6(Vectorized<T> zero_point, Vectorized<T> q_six) const {
+    auto ret_max = _vec.maximum(zero_point._vec);
+    auto ret_min = ret_max.minimum(q_six._vec);
+    return Vectorized<T>{ret_min};
+  }
+  // ---- Overloads selected when float_num_vecs() / int_num_vecs() == 1 ----
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 1, int> = 0>
+  int_vec_return_type widening_subtract(Vectorized<T> b) const {
+    return {*this - b};
+  }
+  // Dequantize as fmadd(scale, value, scale_zp_premul); zero_point is not read here.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 1, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    auto float_val = zvec_convert_to_float(_vec);
+    return {fmadd(scale, float_val, scale_zp_premul)};
+  }
+  // Dequantize as (value - zero_point) * scale.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 1, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    auto float_val = zvec_convert_to_float(_vec);
+    return {(float_val - zero_point) * scale};
+  }
+  // Quantize: rint(rhs[0] * inverse_scale) + zero_point, converted to int; scale is not read.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 1, int> = 0>
+  static Vectorized<T> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    Vectorized<float> vecf = rhs[0];
+    vecf = vecf * Vectorized<float>(inverse_scale);
+    vecf = vecf.rint() + Vectorized<float>((float)(zero_point));
+    auto veci = zvec_convert_to_int(vecf);
+
+    return Vectorized<T>{veci};
+  }
+  // Requantize: rint(inp[0] * multiplier) converted to int, then zero_point is added.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::int_num_vecs() == 1, int> = 0>
+  static Vectorized<T> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized<T> vi = inp[0];
+    auto vecf = zvec_convert_to_float(vi.vec());
+    vecf = vecf * Vectorized<float>(multiplier);
+    vecf = vecf.rint();
+    auto veci = zvec_convert_to_int(vecf) + Vectorized<int>(zero_point);
+
+    return Vectorized<T>{veci};
+  }
+  // ---- Overloads selected when the count is 4: values are unpacked twice first ----
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::int_num_vecs() == 4, int> = 0>
+  int_vec_return_type widening_subtract(Vectorized<U> b) const {
+    auto ret16 = unpack(_vec);
+    auto ret16B = unpack(b.vec());
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+    auto ret32B_0 = unpack(ret16B.first);
+    auto ret32B_1 = unpack(ret16B.second);
+
+    return {
+        Vectorized<c10::qint32>(ret32_0.first - ret32B_0.first),
+        Vectorized<c10::qint32>(ret32_0.second - ret32B_0.second),
+        Vectorized<c10::qint32>(ret32_1.first - ret32B_1.first),
+        Vectorized<c10::qint32>(ret32_1.second - ret32B_1.second)};
+  }
+  // Dequantize the four unpacked vectors with fmadd(scale, value, scale_zp_premul).
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 4, int> = 0>
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    // unpacking unsigned as signed
+    auto ret16 = unpack(_vec);
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+
+    auto vecf_0 = zvec_convert_to_float(ret32_0.first);
+    auto vecf_1 = zvec_convert_to_float(ret32_0.second);
+
+    auto vecf_2 = zvec_convert_to_float(ret32_1.first);
+    auto vecf_3 = zvec_convert_to_float(ret32_1.second);
+    return {
+        fmadd(scale, vecf_0, scale_zp_premul),
+        fmadd(scale, vecf_1, scale_zp_premul),
+        fmadd(scale, vecf_2, scale_zp_premul),
+        fmadd(scale, vecf_3, scale_zp_premul)};
+  }
+  // Dequantize the four unpacked vectors as (value - zero_point) * scale.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 4, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    // unpacking unsigned as signed
+    auto ret16 = unpack(_vec);
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+
+    auto vecf_0 = zvec_convert_to_float(ret32_0.first);
+    auto vecf_1 = zvec_convert_to_float(ret32_0.second);
+
+    auto vecf_2 = zvec_convert_to_float(ret32_1.first);
+    auto vecf_3 = zvec_convert_to_float(ret32_1.second);
+
+    return {
+        (vecf_0 - zero_point) * scale,
+        (vecf_1 - zero_point) * scale,
+        (vecf_2 - zero_point) * scale,
+        (vecf_3 - zero_point) * scale};
+  }
+  // Quantize four float vectors and pack the ints down to U::underlying; scale is not read.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::float_num_vecs() == 4, int> = 0>
+  static Vectorized<T> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto vec_inverse = Vectorized<float>(inverse_scale);
+    auto vec_zero_point = Vectorized<float>((float)zero_point);
+
+    auto vecf0 = rhs[0];
+    auto vecf2 = rhs[1];
+    auto vecf4 = rhs[2];
+    auto vecf6 = rhs[3];
+
+    vecf0 = vecf0 * vec_inverse;
+    vecf2 = vecf2 * vec_inverse;
+    vecf4 = vecf4 * vec_inverse;
+    vecf6 = vecf6 * vec_inverse;
+
+    vecf0 = vecf0.rint() + vec_zero_point;
+    vecf2 = vecf2.rint() + vec_zero_point;
+    vecf4 = vecf4.rint() + vec_zero_point;
+    vecf6 = vecf6.rint() + vec_zero_point;
+
+    auto veci0 = zvec_convert_to_int(vecf0);
+    auto veci2 = zvec_convert_to_int(vecf2);
+    auto veci4 = zvec_convert_to_int(vecf4);
+    auto veci6 = zvec_convert_to_int(vecf6);
+    // Narrow in two pack() steps: pairs of int vectors first, then down to U::underlying.
+    auto vecshi0 = pack(veci0, veci2);
+    auto vecshi2 = pack(veci4, veci6);
+    auto ret = pack<int16_t, typename U::underlying>(vecshi0, vecshi2);
+
+    return Vectorized<T>{ret};
+  }
+  // Requantize four qint32 vectors: scale by multiplier, rint, convert, add zero_point, pack down.
+  template <
+      typename U = T,
+      std::enable_if_t<Vectorized<U>::int_num_vecs() == 4, int> = 0>
+  static Vectorized<U> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized<float> vec_multiplier = Vectorized<float>(multiplier);
+    Vectorized<int32_t> vec_zero_point = Vectorized<int32_t>(zero_point);
+
+    Vectorized<c10::qint32> vi0 = inp[0];
+    Vectorized<c10::qint32> vi1 = inp[1];
+    Vectorized<c10::qint32> vi2 = inp[2];
+    Vectorized<c10::qint32> vi3 = inp[3];
+
+    auto vecf0 = zvec_convert_to_float(vi0.vec());
+    auto vecf2 = zvec_convert_to_float(vi1.vec());
+
+    auto vecf4 = zvec_convert_to_float(vi2.vec());
+    auto vecf6 = zvec_convert_to_float(vi3.vec());
+
+    vecf0 = vecf0 * vec_multiplier;
+    vecf2 = vecf2 * vec_multiplier;
+
+    vecf4 = vecf4 * vec_multiplier;
+    vecf6 = vecf6 * vec_multiplier;
+
+    vecf0 = vecf0.rint();
+    vecf2 = vecf2.rint();
+    vecf4 = vecf4.rint();
+    vecf6 = vecf6.rint();
+
+    auto veci0 = zvec_convert_to_int(vecf0);
+    auto veci2 = zvec_convert_to_int(vecf2);
+    auto veci4 = zvec_convert_to_int(vecf4);
+    auto veci6 = zvec_convert_to_int(vecf6);
+
+    veci0 = veci0 + vec_zero_point;
+    veci2 = veci2 + vec_zero_point;
+
+    veci4 = veci4 + vec_zero_point;
+    veci6 = veci6 + vec_zero_point;
+
+    auto vecshi0 = pack<int32_t, int16_t>(veci0, veci2);
+    auto vecshi2 = pack<int32_t, int16_t>(veci4, veci6);
+
+    auto ret = pack<int16_t, typename U::underlying>(vecshi0, vecshi2);
+
+    return Vectorized<U>{ret};
+  }
+  // Comparisons, clamps and minimum/maximum below all forward to the inner vector.
+  Vectorized<T> C10_ALWAYS_INLINE eq(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.eq(other._vec)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE ne(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.ne(other._vec)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE gt(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.gt(other._vec)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE ge(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.ge(other._vec)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE lt(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.lt(other._vec)};
+  }
+  Vectorized<T> C10_ALWAYS_INLINE le(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.le(other._vec)};
+  }
+
+  Vectorized<T> clamp_min(const Vectorized<T>& min) const {
+    return Vectorized<T>{_vec.clamp_min(min._vec)};
+  }
+
+  Vectorized<T> clamp_max(const Vectorized<T>& max) const {
+    return Vectorized<T>{_vec.clamp_max(max._vec)};
+  }
+
+  Vectorized<T> minimum(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.minimum(other._vec)};
+  }
+
+  Vectorized<T> maximum(const Vectorized<T>& other) const {
+    return Vectorized<T>{_vec.maximum(other._vec)};
+  }
+};
+
+#define ZVECTOR_OPERATORS(typex)                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator+(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() + b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator-(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() - b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator*(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() * b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator/(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() / b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator&(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() & b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator|(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() | b.vec()};                \
+  }                                                             \
+                                                                \
+  template <>                                                   \
+  Vectorized<typex> C10_ALWAYS_INLINE operator^(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() ^ b.vec()};                \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator==(               \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() == b.vec()};               \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator!=(               \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() != b.vec()};               \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() > b.vec()};                \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>=(               \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() >= b.vec()};               \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<(                \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() < b.vec()};                \
+  }                                                             \
+                                                                \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<=(               \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) { \
+    return Vectorized<typex>{a.vec() <= b.vec()};               \
+  }
+
+// Instantiate the element-wise arithmetic, bitwise and comparison operator
+// specializations generated by ZVECTOR_OPERATORS for the quantized types.
+ZVECTOR_OPERATORS(c10::qint32)
+ZVECTOR_OPERATORS(c10::qint8)
+ZVECTOR_OPERATORS(c10::quint8)
+
+// The macro is redefined further down for the complex types, so drop it here.
+#undef ZVECTOR_OPERATORS
+
+// NOTE(review): DEFINE_CLAMP_MAXMIN_FUNCS is defined outside this section;
+// presumably it emits clamp/clamp_min/clamp_max overloads - confirm.
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::quint8)
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint8)
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint32)
+
+// Bit mask for interleaved complex data with four scalar lanes per register:
+// all bits set in the real lanes (0 and 2), zero in the imaginary lanes
+// (1 and 3). AND-ing a vector with it keeps only the real parts.
+// The binary vector type now follows U (as image_mask does) instead of being
+// hard-coded to float; for the default U = float the result is unchanged.
+template <typename U = float>
+constexpr auto real_mask() {
+  return (ZSimdVect<U>)ZSimdVectBinary<U>{0xFFFFFFFF, 0, 0xFFFFFFFF, 0};
+}
+
+// Double-precision variant: one complex value per register, so the mask has
+// all bits set in lane 0 (real) and zero in lane 1 (imaginary).
+template <>
+constexpr auto real_mask<double>() {
+  return (ZSimdVect<double>)ZSimdVectBinary<double>{0xFFFFFFFFFFFFFFFF, 0};
+}
+
+// Bit mask selecting the imaginary lanes of interleaved complex data with
+// four scalar lanes per register: zero in lanes 0 and 2, all bits set in
+// lanes 1 and 3.
+template <typename U = float>
+constexpr auto image_mask() {
+  return (ZSimdVect<U>)ZSimdVectBinary<U>{0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
+}
+
+// Double-precision variant: zero in lane 0 (real), all bits set in lane 1
+// (imaginary).
+template <>
+constexpr auto image_mask<double>() {
+  return (ZSimdVect<double>)ZSimdVectBinary<double>{0, 0xFFFFFFFFFFFFFFFF};
+}
+
+// Vector holding -0.0 in the real lanes and +0.0 in the imaginary lanes, i.e.
+// only the sign bit of each real lane is set. XOR-ing with it flips the sign
+// of the real lanes and leaves the imaginary lanes untouched.
+template <typename U = float>
+constexpr auto rsign_mask() {
+  return ZSimdVect<U>{-0.f, 0.f, -0.f, 0.f};
+}
+
+// Double-precision variant: -0.0 in lane 0 (real), +0.0 in lane 1 (imaginary).
+// Both initializers are double literals (the second used to be the float
+// literal 0.f, which converts to the same value).
+template <>
+constexpr auto rsign_mask<double>() {
+  return ZSimdVect<double>{-0.0, 0.0};
+}
+
+// Vector holding +0.0 in the real lanes and -0.0 in the imaginary lanes, i.e.
+// only the sign bit of each imaginary lane is set. XOR-ing with it negates
+// the imaginary parts (complex conjugate).
+// All initializers are float literals (the zeros used to be double literals,
+// which convert to the same value).
+template <typename U = float>
+constexpr auto isign_mask() {
+  return ZSimdVect<U>{0.f, -0.f, 0.f, -0.f};
+}
+
+// Double-precision variant: +0.0 in lane 0 (real), -0.0 in lane 1 (imaginary).
+template <>
+constexpr auto isign_mask<double>() {
+  return ZSimdVect<double>{0.0, -0.0};
+}
+
+// The imaginary unit i = (0, 1) replicated in both complex slots of a
+// four-lane register.
+template <typename U = float>
+constexpr auto image_one() {
+  return ZSimdVect<U>{0, 1.f, 0, 1.f};
+}
+
+// Double-precision variant: the imaginary unit i = (0, 1) in a two-lane
+// register.
+template <>
+constexpr auto image_one<double>() {
+  return ZSimdVect<double>{0.0, 1.0};
+}
+
+// The complex constant pi/2 + 0i replicated in both complex slots of a
+// four-lane register.
+template <typename U = float>
+constexpr auto pi_half() {
+  constexpr float half_pi = static_cast<float>(M_PI / 2.0);
+  return ZSimdVect<U>{half_pi, 0.f, half_pi, 0.f};
+}
+
+// Double-precision variant: the complex constant pi/2 + 0i in a two-lane
+// register.
+template <>
+constexpr auto pi_half<double>() {
+  constexpr double half_pi = M_PI / 2.0;
+  return ZSimdVect<double>{half_pi, 0.0};
+}
+
+// The complex constant i/2 = (0, 0.5) replicated in both complex slots of a
+// four-lane register.
+template <typename U = float>
+constexpr auto image_half() {
+  return ZSimdVect<U>{0, 0.5f, 0, 0.5f};
+}
+
+// Double-precision variant: the complex constant i/2 = (0, 0.5) in a two-lane
+// register.
+template <>
+constexpr auto image_half<double>() {
+  return ZSimdVect<double>{0.0, 0.5};
+}
+
+// Returns log2(e) = 1 / ln(2) converted to U: the factor that turns a natural
+// logarithm into a base-2 logarithm.
+template <typename U>
+constexpr U log2e_inv() {
+  constexpr double kLog2OfE = 1.4426950408889634;
+  return static_cast<U>(kLog2OfE);
+}
+
+// Returns log10(e) = 1 / ln(10) converted to U: the factor that turns a
+// natural logarithm into a base-10 logarithm.
+template <typename U>
+constexpr U log10e_inv() {
+  constexpr double kLog10OfE = 0.43429448190325176;
+  return static_cast<U>(kLog10OfE);
+}
+
+// Marks every type accepted by is_zarch_implemented_complex<T>() as having a
+// dedicated Vectorized specialization (the trait evaluates to true).
+template <typename T>
+struct is_vec_specialized_for<
+    T,
+    std::enable_if_t<is_zarch_implemented_complex<T>()>>
+    : std::bool_constant<true> {};
+
+// Vectorized specialization for the complex element types accepted by
+// is_zarch_implemented_complex<T>(). Complex values are stored interleaved
+// (real, imag, real, imag, ...) inside a Vectorized of the underlying scalar
+// type, and most operations are expressed on that inner vector.
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented_complex<T>()>> {
+ public:
+  // Scalar type of the real/imaginary components (the type of T::imag()).
+  using underline_type = decltype(std::declval<T>().imag());
+  using value_type = T;
+  using vtype = ZSimdVect<underline_type>;
+  using vmaskType = ZSimdVectBinary<underline_type>;
+  // Vectorized of the component scalar type that actually holds the data.
+  using vinner_type = Vectorized<underline_type>;
+  using size_type = int;
+  using vinner_data = typename Vectorized<underline_type>::vinner_data;
+
+  // Number of complex elements held by one vector.
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(value_type);
+  }
+
+ private:
+  // Interleaved (real, imag) components of all complex elements.
+  vinner_type _vec;
+
+ public:
+  Vectorized() {}
+
+  // Wraps already-interleaved raw component data.
+  C10_ALWAYS_INLINE Vectorized(const vinner_data& v)
+      : _vec{v.first, v.second} {}
+
+  // 16-byte complex type: two elements per vector.
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 16), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2)
+      : _vec{s1.real(), s1.imag(), s2.real(), s2.imag()} {}
+
+  // 8-byte complex type: four elements per vector.
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 8), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
+      : _vec{
+            s1.real(),
+            s1.imag(),
+            s2.real(),
+            s2.imag(),
+            s3.real(),
+            s3.imag(),
+            s4.real(),
+            s4.imag()} {}
+
+  // Broadcast constructors: replicate one complex value into every element.
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 16), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized<T>(s, s) {}
+
+  template <typename U = T, std::enable_if_t<(sizeof(U) == 8), int> = 0>
+  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized<T>(s, s, s, s) {}
+
+  // Accessors for the interleaved component vector and its raw data.
+  C10_ALWAYS_INLINE operator vinner_type() const {
+    return _vec;
+  }
+
+  C10_ALWAYS_INLINE const vinner_type& vec() const {
+    return _vec;
+  }
+
+  C10_ALWAYS_INLINE operator vinner_data() const {
+    return _vec.data();
+  }
+
+  C10_ALWAYS_INLINE vinner_data data() const {
+    return _vec.data();
+  }
+
+  // Loads `count` complex elements from ptr; each element occupies two
+  // component scalars, hence the 2 * count passed to the inner loadu.
+  template <typename U>
+  static Vectorized<T> C10_ALWAYS_INLINE
+  loadu(const U* ptr, int count = size()) {
+    return Vectorized<T>{vinner_type::loadu(ptr, 2 * count)};
+  }
+
+  // Stores `count` complex elements (2 * count component scalars) to ptr.
+  template <typename U>
+  void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
+    _vec.store(ptr, 2 * count);
+  }
+
+  // Blends a and b under a run-time mask by delegating to the inner blendv
+  // with a mask widened to component granularity.
+  static Vectorized<T> blendv(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      const Vectorized<T>& mask) {
+    // convert std::complex<V> index mask to V index mask: xy -> xxyy
+    vinner_type vmask = mask.vec();
+    auto mask_complex = vinner_type(
+        vec_mergeh(vmask.vec0(), vmask.vec0()),
+        vec_mergeh(vmask.vec1(), vmask.vec1()));
+    return Vectorized<T>{vinner_type::blendv(a.vec(), b.vec(), mask_complex)};
+  }
+
+  // Blends a and b under a compile-time mask.
+  // NOTE(review): maskForComplex is defined elsewhere; presumably it expands
+  // each per-element bit to the two component lanes - confirm.
+  template <int64_t mask>
+  static auto C10_ALWAYS_INLINE
+  blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    constexpr int mask_complex = maskForComplex<sizeof(T)>(mask);
+    return Vectorized<T>{
+        vinner_type::template blend<mask_complex>(a.vec(), b.vec())};
+  }
+
+  // Returns {base, base + step} for the two-element (16-byte) complex type.
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 16, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(base, base + step);
+  }
+
+  // Returns {base, base + step, base + 2*step, base + 3*step} for the
+  // four-element (8-byte) complex type.
+  template <typename step_t, typename U = T>
+  static std::enable_if_t<sizeof(U) == 8, Vectorized<T>> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + value_type(2) * step,
+        base + value_type(3) * step);
+  }
+
+  // Recursion terminator for set(): once Z reaches C every candidate count
+  // below C has been tried, so the whole of b is returned.
+  template <int16_t Z, int16_t C>
+  static inline std::enable_if_t<(Z >= C), Vectorized<T>> set_inner(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count) {
+    return b;
+  }
+
+  // Compile-time unrolled search for the run-time count: blend needs a
+  // constant mask, so each candidate Z gets its own instantiation.
+  // NOTE(review): allbitset is defined elsewhere; presumably allbitset(Z)
+  // has the low Z bits set - confirm.
+  template <int16_t Z, int16_t C>
+  static inline std::enable_if_t<(Z < C), Vectorized<T>> set_inner(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count) {
+    if (count == Z)
+      return blend<allbitset(Z)>(a, b);
+    else
+      return set_inner<Z + 1, C>(a, b, count);
+  }
+
+  // Combines a and b depending on count: returns a when count is 0, b when
+  // count is at least size(), and a blend of the two otherwise.
+  static Vectorized<T> set(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      size_t count = size()) {
+    if (count == 0)
+      return a;
+    return set_inner<1, size()>(a, b, count);
+  }
+
+  // Element access is intentionally unavailable for this specialization.
+  const T& operator[](int idx) const = delete;
+  T& operator[](int idx) = delete;
+
+  // mapOrdinary: scalar fallback that rebuilds each complex element from the
+  // interleaved lanes, applies f to it and packs the results into a new
+  // vector. Overloads cover both element widths and by-reference, by-value
+  // and binary function signatures.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  Vectorized<T> mapOrdinary(T (*const f)(const T&)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized<T>{
+        f(T(v0[0], v0[1])),
+        f(T(v0[2], v0[3])),
+        f(T(v1[0], v1[1])),
+        f(T(v1[2], v1[3]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  Vectorized<T> mapOrdinary(T (*const f)(const T&)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized<T>{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  Vectorized<T> mapOrdinary(T (*const f)(T)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized<T>{
+        f(T(v0[0], v0[1])),
+        f(T(v0[2], v0[3])),
+        f(T(v1[0], v1[1])),
+        f(T(v1[2], v1[3]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  Vectorized<T> mapOrdinary(T (*const f)(T)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized<T>{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  inline Vectorized<T> mapOrdinary(
+      T (*const f)(const T&, const T&),
+      const Vectorized<T>& b) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    auto bvec = b.vec();
+    auto b0 = bvec.vec0();
+    auto b1 = bvec.vec1();
+    T a00 = f(T(v0[0], v0[1]), T(b0[0], b0[1]));
+    T a01 = f(T(v0[2], v0[3]), T(b0[2], b0[3]));
+    T a02 = f(T(v1[0], v1[1]), T(b1[0], b1[1]));
+    T a03 = f(T(v1[2], v1[3]), T(b1[2], b1[3]));
+    return Vectorized<T>{a00, a01, a02, a03};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  inline Vectorized<T> mapOrdinary(
+      T (*const f)(const T&, const T&),
+      const Vectorized<T>& b) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    auto bvec = b.vec();
+    auto b0 = bvec.vec0();
+    auto b1 = bvec.vec1();
+    T a00 = f(T(v0[0], v0[1]), T(b0[0], b0[1]));
+    T a01 = f(T(v1[0], v1[1]), T(b1[0], b1[1]));
+    return Vectorized<T>{a00, a01};
+  }
+
+  // Negates the real lanes only: (re, im) -> (-re, im). The float version
+  // picks bytes from the negated vector for the real lanes and from the
+  // original vector for the imaginary lanes with a single vec_perm.
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  static typename Vectorized<T>::vinner_type real_neg(
+      const typename Vectorized<T>::vinner_type& a) {
+    const auto swap_mask = ZSimdVectBinary<uint8_t>{
+        0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31};
+
+    auto a_neg = a.neg();
+    vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask);
+    vtype v1 = vec_perm(a_neg.vec1(), a.vec1(), swap_mask);
+    return {v0, v1};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  static typename Vectorized<T>::vinner_type real_neg(
+      const typename Vectorized<T>::vinner_type& a) {
+    auto a_neg = a.neg();
+    vtype v0 = {a_neg.vec0()[0], a.vec0()[1]};
+    vtype v1 = {a_neg.vec1()[0], a.vec1()[1]};
+    return {v0, v1};
+  }
+
+  // Helper for angle(): lane-wise atan2 of the vector against its
+  // real/imag-swapped copy, swapped back so that atan2(imag, real) lands in
+  // the real lane.
+  Vectorized<T> angle2_() const {
+    auto b_a = _vec.swapped(); // b        a
+    return Vectorized<T>{_vec.atan2(b_a).swapped()};
+  }
+
+  // Argument of each element, returned in the real lane with a zero
+  // imaginary lane.
+  Vectorized<T> angle() const {
+    return angle2_().real();
+  }
+
+  Vectorized<T> atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized<T>{vinner_type(image_one<underline_type>())};
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln *
+        Vectorized<T>{vinner_type(image_half<underline_type>())}; // i/2*ln()
+  }
+
+  Vectorized<T> atanh() const {
+    return mapOrdinary(std::atanh);
+  }
+
+  Vectorized<T> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+#if 1
+    vinner_type cnj = conj().vec();
+    vinner_type b_a = cnj.swapped();
+    vinner_type ab = cnj * b_a;
+    vinner_type im = ab + ab;
+    vinner_type val_2 = _vec * _vec;
+    vinner_type val_2_swapped = val_2.swapped();
+    vinner_type re = vinner_type::horizontal_sub_perm(val_2, val_2_swapped);
+    re = vinner_type(static_cast<underline_type>(1)) - re;
+    constexpr int blend_mask =
+        blend_choice<T>(); // 0x0A for complex<double> , 0xAA for complex<float>
+    vinner_type blendx = vinner_type::template blend<blend_mask>(re, im);
+    auto root = Vectorized<T>(blendx).sqrt();
+    auto ln = Vectorized<T>(Vectorized<T>(b_a) + root).log();
+    return Vectorized<T>(ln.vec().swapped()).conj();
+#else
+    return mapOrdinary(std::asin);
+#endif
+  }
+
+  Vectorized<T> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized<T>(vinner_type(pi_half<underline_type>())) - asin();
+  }
+
+  // Transcendental functions below fall back to the scalar std:: versions
+  // element by element; ceil/floor/round/trunc/neg act on the real and
+  // imaginary components independently through the inner vector.
+  Vectorized<T> sin() const {
+    return mapOrdinary(std::sin);
+  }
+  Vectorized<T> sinh() const {
+    return mapOrdinary(std::sinh);
+  }
+  Vectorized<T> cos() const {
+    return mapOrdinary(std::cos);
+  }
+  Vectorized<T> cosh() const {
+    return mapOrdinary(std::cosh);
+  }
+  Vectorized<T> ceil() const {
+    return Vectorized<T>{_vec.ceil()};
+  }
+  Vectorized<T> floor() const {
+    return Vectorized<T>{_vec.floor()};
+  }
+  Vectorized<T> neg() const {
+    return Vectorized<T>(_vec.neg());
+  }
+  Vectorized<T> round() const {
+    return Vectorized<T>{_vec.round()};
+  }
+  Vectorized<T> tan() const {
+    return mapOrdinary(std::tan);
+  }
+  Vectorized<T> tanh() const {
+    return mapOrdinary(std::tanh);
+  }
+  Vectorized<T> trunc() const {
+    return Vectorized<T>{_vec.trunc()};
+  }
+
+  // Complex equality: the combined result is placed in the real lane of each
+  // element; the imaginary lane of the result is zero.
+  Vectorized<T> C10_ALWAYS_INLINE eq(const Vectorized<T>& other) const {
+    auto eq = _vec.eq(other._vec); // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers
+    // are equal
+    auto real = eq & vinner_type(real_mask<underline_type>());
+    auto imag = (eq & vinner_type(image_mask<underline_type>())).swapped();
+    return Vectorized<T>{real & imag};
+  }
+  // Complex inequality, with the same result layout as eq().
+  Vectorized<T> C10_ALWAYS_INLINE ne(const Vectorized<T>& other) const {
+    auto ne = _vec.ne(other._vec); // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex
+    // numbers are not equal
+    auto real = ne & vinner_type(real_mask<underline_type>());
+    auto imag = (ne & vinner_type(image_mask<underline_type>())).swapped();
+    return Vectorized<T>{real | imag};
+  }
+
+  // Keeps the real lanes and zeroes the imaginary lanes.
+  Vectorized<T> real() const {
+    return Vectorized<T>(_vec & vinner_type(real_mask<underline_type>()));
+  }
+  // Keeps the imaginary lanes in place and zeroes the real lanes.
+  Vectorized<T> imag_() const {
+    return Vectorized<T>(_vec & vinner_type(image_mask<underline_type>()));
+  }
+  // Moves the imaginary parts into the real lanes; the imaginary lanes of
+  // the result are zero.
+  Vectorized<T> imag() const {
+    return Vectorized<T>{
+        (_vec & vinner_type(image_mask<underline_type>())).swapped()};
+  }
+
+  // Complex conjugate: flips the sign bit of every imaginary lane.
+  Vectorized<T> conj() const {
+    return Vectorized<T>(_vec ^ vinner_type(isign_mask<underline_type>()));
+  }
+
+  // Squared magnitude re^2 + im^2 of each element, as raw component data.
+  // After the swapped add both lanes of an element hold the same sum.
+  vinner_data abs_2_() const {
+    auto a = _vec * _vec;
+    a = a + a.swapped();
+    return a.mergee().data();
+  }
+
+  // Adapter giving std::abs the T(const T&) signature mapOrdinary expects;
+  // the real-valued magnitude is converted back to T.
+  static T abs_helper(const T& value) {
+    return T(std::abs(value));
+  }
+
+  Vectorized<T> abs() const {
+    return mapOrdinary(abs_helper);
+  }
+
+  Vectorized<T> exp() const {
+    return mapOrdinary(std::exp);
+  }
+
+  Vectorized<T> exp2() const {
+    return mapOrdinary(exp2_impl);
+  }
+
+  Vectorized<T> expm1() const {
+    return mapOrdinary(std::expm1);
+  }
+
+  Vectorized<T> log() const {
+    return mapOrdinary(std::log);
+  }
+
+  Vectorized<T> log2() const {
+    // log2(z) = ln(z) * log2(e); the scalar factor scales both components.
+    auto ret = log();
+    return Vectorized<T>{ret._vec * vinner_type(log2e_inv<underline_type>())};
+  }
+
+  Vectorized<T> log10() const {
+    // log10(z) = ln(z) * log10(e); the scalar factor scales both components.
+    auto ret = log();
+    return Vectorized<T>{ret._vec * vinner_type(log10e_inv<underline_type>())};
+  }
+
+  Vectorized<T> log1p() const {
+    return mapOrdinary(std::log1p);
+  }
+
+  Vectorized<T> sgn() const {
+    return mapOrdinary(at::native::sgn_impl);
+  }
+
+  Vectorized<T> pow(const Vectorized<T>& exp) const {
+    return mapOrdinary(std::pow, exp);
+  }
+
+  Vectorized<T> sqrt() const {
+    return mapOrdinary(std::sqrt);
+  }
+
+  Vectorized<T> reciprocal() const {
+    // 1 / (c + di) = conj(c + di) / |c + di|^2
+    // re =  c / abs_2()
+    // im = -d / abs_2()
+    vinner_type c_d = _vec ^ vinner_type(isign_mask<underline_type>());
+    vinner_type abs = abs_2_();
+    return Vectorized<T>{c_d / abs};
+  }
+
+  Vectorized<T> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  // Ordering comparisons are undefined for complex numbers; each of these
+  // unconditionally fails the TORCH_CHECK.
+  Vectorized<T> lt(const Vectorized<T>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<T> le(const Vectorized<T>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<T> gt(const Vectorized<T>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<T> ge(const Vectorized<T>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+};
+
+// Generates the free operators for a complex Vectorized<typex>:
+//   +, -, &, |, ^  act lane-wise on the interleaved (real, imag) vector;
+//   *              is a complex multiply built from mergee/mergeo, a swapped
+//                  copy of the left operand and one fmadd;
+//   /              scales both operands by 1 / max(|c|, |d|) before dividing;
+//   ==, !=         compare the real and imaginary lanes individually;
+//   <, <=, >, >=   always fail via TORCH_CHECK, since complex numbers have no
+//                  ordering.
+#define ZVECTOR_OPERATORS(typex)                                              \
+  template <>                                                                 \
+  Vectorized<typex> C10_ALWAYS_INLINE operator+(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() + b.vec()};                              \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> C10_ALWAYS_INLINE operator-(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() - b.vec()};                              \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> inline operator*(                                         \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    /* (a + bi)  * (c + di) = (ac - bd) + (ad + bc)i */                       \
+    Vectorized<typex>::vinner_type bv = b.vec();                              \
+                                                                              \
+    /* this is more z arch friendly than simulating horizontal from x86 */    \
+    Vectorized<typex>::vinner_type vi = bv.mergeo();                          \
+    Vectorized<typex>::vinner_type vr = bv.mergee();                          \
+    vi = vi ^                                                                 \
+        Vectorized<typex>::vinner_type(                                       \
+             rsign_mask<Vectorized<typex>::underline_type>());                \
+    Vectorized<typex>::vinner_type ret = a.vec() * vr;                        \
+    Vectorized<typex>::vinner_type vx_swapped = a.vec().swapped();            \
+    ret = fmadd(vx_swapped, vi, ret);                                         \
+                                                                              \
+    return Vectorized<typex>{ret};                                            \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> inline operator/(                                         \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    /* Unfortunately, this breaks some tests */                               \
+    /* Implement it like it's done for avx2 */                                \
+    auto fabs_cd = b.vec().abs(); /* |c|    |d| */                            \
+    auto fabs_dc = fabs_cd.swapped(); /* |d|    |c| */                        \
+    auto scale = Vectorized<typex>::vinner_type{1.0} /                        \
+        maximum(fabs_cd, fabs_dc); /* 1/sc     1/sc */                        \
+    auto a2 = a.vec() * scale; /* a/sc     b/sc */                            \
+    auto b2 = b.vec() * scale; /* c/sc     d/sc */                            \
+    auto acbd2 = a2 * b2; /* ac/sc^2  bd/sc^2 */                              \
+                                                                              \
+    auto dc2 = b2.swapped(); /* d/sc         c/sc */                          \
+    dc2 = Vectorized<typex>::real_neg(dc2); /* -d/|c,d|        c/sc */        \
+    auto adbc2 = a2 * dc2; /* -ad/sc^2      bc/sc^2 */                        \
+    auto sum1 = acbd2 + acbd2.swapped(); /* (ac+bd)/sc^2  (ac+bd)/sc^2 */     \
+    auto sum2 = adbc2 + adbc2.swapped(); /* (bc-ad)/sc^2  (bc-ad)/sc^2 */     \
+    auto res2 = Vectorized<typex>::vinner_type::mergee(                       \
+        sum1, sum2); /* (ac+bd)/sc^2  (bc-ad)/sc^2 */                         \
+                                                                              \
+    /* get the denominator */                                                 \
+    Vectorized<typex>::vinner_type denom2 =                                   \
+        Vectorized<typex>{b2}.abs_2_(); /* (c^2+d^2)/sc^2   (c^2+d^2)/sc^2 */ \
+    res2 = res2 / denom2;                                                     \
+    return Vectorized<typex>{res2};                                           \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> C10_ALWAYS_INLINE operator&(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() & b.vec()};                              \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> C10_ALWAYS_INLINE operator|(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() | b.vec()};                              \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  Vectorized<typex> C10_ALWAYS_INLINE operator^(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() ^ b.vec()};                              \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator==(                             \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() == b.vec()};                             \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator!=(                             \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    return Vectorized<typex>{a.vec() != b.vec()};                             \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    TORCH_CHECK(false, "not supported for complex numbers");                  \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator<=(                             \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    TORCH_CHECK(false, "not supported for complex numbers");                  \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>(                              \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    TORCH_CHECK(false, "not supported for complex numbers");                  \
+  }                                                                           \
+                                                                              \
+  Vectorized<typex> C10_ALWAYS_INLINE operator>=(                             \
+      const Vectorized<typex>& a, const Vectorized<typex>& b) {               \
+    TORCH_CHECK(false, "not supported for complex numbers");                  \
+  }
+
+// Instantiate the complex operator set for both supported precisions, then
+// retire the helper macro.
+ZVECTOR_OPERATORS(c10::complex<float>)
+ZVECTOR_OPERATORS(c10::complex<double>)
+
+#undef ZVECTOR_OPERATORS
+
+// Interleaves two vectors of 8-byte elements (four elements each, stored as
+// two 2-lane halves):
+//   a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
+//   -> first  = {a0, b0, a1, b1}
+//      second = {a2, b2, a3, b3}
+template <typename T, std::enable_if_t<(sizeof(T) == 8), int> = 0>
+std::pair<Vectorized<T>, Vectorized<T>> inline inner_interleave2(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using vtype = typename Vectorized<T>::vtype;
+
+  // Pair up matching elements from the low halves ...
+  vtype pair0 = {a.vec0()[0], b.vec0()[0]};
+  vtype pair1 = {a.vec0()[1], b.vec0()[1]};
+  // ... and from the high halves.
+  vtype pair2 = {a.vec1()[0], b.vec1()[0]};
+  vtype pair3 = {a.vec1()[1], b.vec1()[1]};
+
+  Vectorized<T> first{pair0, pair1};
+  Vectorized<T> second{pair2, pair3};
+  return std::make_pair(first, second);
+}
+
+// Inverse of inner_interleave2 for 8-byte elements:
+//   a = {a0, b0, a1, b1}, b = {a2, b2, a3, b3}
+//   -> first  = {a0, a1, a2, a3}
+//      second = {b0, b1, b2, b3}
+template <typename T, std::enable_if_t<(sizeof(T) == 8), int> = 0>
+std::pair<Vectorized<T>, Vectorized<T>> inline inner_deinterleave2(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using vtype = typename Vectorized<T>::vtype;
+
+  // Even positions of every half carry the "a" stream.
+  vtype evens_lo = {a.vec0()[0], a.vec1()[0]};
+  vtype evens_hi = {b.vec0()[0], b.vec1()[0]};
+
+  // Odd positions of every half carry the "b" stream.
+  vtype odds_lo = {a.vec0()[1], a.vec1()[1]};
+  vtype odds_hi = {b.vec0()[1], b.vec1()[1]};
+
+  Vectorized<T> first{evens_lo, evens_hi};
+  Vectorized<T> second{odds_lo, odds_hi};
+  return std::make_pair(first, second);
+}
+
+// Interleaves two vectors of 4-byte elements (eight elements each, stored as
+// two 4-lane halves):
+//   a = {a0..a3,, a4..a7}, b = {b0..b3,, b4..b7}
+//   -> first  = {a0, b0, a1, b1,, a2, b2, a3, b3}
+//      second = {a4, b4, a5, b5,, a6, b6, a7, b7}
+template <typename T, std::enable_if_t<(sizeof(T) == 4), int> = 0>
+std::pair<Vectorized<T>, Vectorized<T>> inline inner_interleave2(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using vtype = typename Vectorized<T>::vtype;
+
+  // Low halves of a and b merged element by element.
+  vtype lo_front = vec_mergeh(a.vec0(), b.vec0());
+  vtype lo_back = vec_mergel(a.vec0(), b.vec0());
+
+  // High halves of a and b merged element by element.
+  vtype hi_front = vec_mergeh(a.vec1(), b.vec1());
+  vtype hi_back = vec_mergel(a.vec1(), b.vec1());
+
+  Vectorized<T> first{lo_front, lo_back};
+  Vectorized<T> second{hi_front, hi_back};
+  return std::make_pair(first, second);
+}
+
+// Inverse of inner_interleave2 for 4-byte elements:
+//   a = {a0, b0, a1, b1,, a2, b2, a3, b3}
+//   b = {a4, b4, a5, b5,, a6, b6, a7, b7}
+//   -> first  = {a0, a1, a2, a3,, a4, a5, a6, a7}
+//      second = {b0, b1, b2, b3,, b4, b5, b6, b7}
+// Each input is untangled with two rounds of merge-high / merge-low (the same
+// could be done with vec_perm).
+template <typename T, std::enable_if_t<(sizeof(T) == 4), int> = 0>
+std::pair<Vectorized<T>, Vectorized<T>> inline inner_deinterleave2(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b) {
+  using vtype = typename Vectorized<T>::vtype;
+
+  // Round 1 on a: {a0, a2, b0, b2} and {a1, a3, b1, b3}.
+  vtype a_even_mix = vec_mergeh(a.vec0(), a.vec1());
+  vtype a_odd_mix = vec_mergel(a.vec0(), a.vec1());
+  // Round 2 on a: {a0, a1, a2, a3} and {b0, b1, b2, b3}.
+  vtype first_lo = vec_mergeh(a_even_mix, a_odd_mix);
+  vtype second_lo = vec_mergel(a_even_mix, a_odd_mix);
+
+  // Same two rounds on b, yielding {a4..a7} and {b4..b7}.
+  vtype b_even_mix = vec_mergeh(b.vec0(), b.vec1());
+  vtype b_odd_mix = vec_mergel(b.vec0(), b.vec1());
+  vtype first_hi = vec_mergeh(b_even_mix, b_odd_mix);
+  vtype second_hi = vec_mergel(b_even_mix, b_odd_mix);
+
+  Vectorized<T> first{first_lo, first_hi};
+  Vectorized<T> second{second_lo, second_hi};
+  return std::make_pair(first, second);
+}
+
+// interleave2 specialization for float: forwards to inner_interleave2.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  return inner_interleave2<float>(a, b);
+}
+
+// interleave2 specialization for int32_t: forwards to inner_interleave2.
+template <>
+std::pair<Vectorized<int32_t>, Vectorized<int32_t>> inline interleave2<int32_t>(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  return inner_interleave2<int32_t>(a, b);
+}
+
+// interleave2 specialization for double: forwards to inner_interleave2.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return inner_interleave2<double>(a, b);
+}
+
+// interleave2 specialization for int64_t: forwards to inner_interleave2.
+template <>
+std::pair<Vectorized<int64_t>, Vectorized<int64_t>> inline interleave2<int64_t>(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return inner_interleave2<int64_t>(a, b);
+}
+
+// deinterleave2 specialization for float: forwards to inner_deinterleave2.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  return inner_deinterleave2<float>(a, b);
+}
+
+// deinterleave2 specialization for int32_t: forwards to inner_deinterleave2.
+template <>
+std::pair<Vectorized<int32_t>, Vectorized<int32_t>> inline deinterleave2<
+    int32_t>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return inner_deinterleave2<int32_t>(a, b);
+}
+
+// deinterleave2 specialization for double: forwards to inner_deinterleave2.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return inner_deinterleave2<double>(a, b);
+}
+
+// deinterleave2 specialization for int64_t: forwards to inner_deinterleave2.
+template <>
+std::pair<Vectorized<int64_t>, Vectorized<int64_t>> inline deinterleave2<
+    int64_t>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return inner_deinterleave2<int64_t>(a, b);
+}
+
+// Converts the leading uint8_t elements of src to a Vectorized<float>.
+// Only enabled for T = uint8_t.
+template <typename T>
+std::enable_if_t<
+    std::is_same_v<T, uint8_t>,
+    at::vec::Vectorized<
+        float>> inline convert_int8_to_float(const Vectorized<T>& src) {
+  // Note: this function only converts as many input elements as fit in an
+  // at::vec::Vectorized<float> (Vectorized<float>::size()), i.e. it only
+  // handles the first 64 bits of src.
+  auto vec_int = src.to_vec_float_helper();
+
+  return zvec_convert_to_float(vec_int);
+}
+
+// Converts a Vectorized<float> to uint8_t elements: the floats are converted
+// to int32, clamped to the representable range of T, then narrowed.
+// Only enabled for T = uint8_t.
+template <typename T>
+std::enable_if_t<
+    std::is_same_v<T, uint8_t>,
+    at::vec::Vectorized<
+        T>> inline convert_float_to_int8(const Vectorized<float>& src) {
+  using limits = std::numeric_limits<T>;
+  constexpr auto lowest = limits::min();
+  constexpr auto highest = limits::max();
+
+  // Saturate in the int32 domain so the narrowing step cannot wrap around.
+  auto as_int = zvec_convert_to_int(src);
+  auto saturated =
+      clamp(as_int, Vectorized<int32_t>(lowest), Vectorized<int32_t>(highest));
+
+  return saturated.to_vec_uint8_helper();
+}
+
+#undef DEFINE_CLAMP_MAXMIN_FUNCS
+#undef DEFINE_MAXMIN_FUNCS
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0250e40e3a7ecb2dfdf5ce4da5e2f22289b1a83
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h
@@ -0,0 +1,414 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+// clang-format off
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec512/vec512_float.h>
+#include <ATen/cpu/vec/vec512/vec512_bfloat16.h>
+#include <ATen/cpu/vec/vec512/vec512_float8.h>
+#include <ATen/cpu/vec/vec512/vec512_double.h>
+#include <ATen/cpu/vec/vec512/vec512_int.h>
+#include <ATen/cpu/vec/vec512/vec512_qint.h>
+#include <ATen/cpu/vec/vec512/vec512_complex_float.h>
+#include <ATen/cpu/vec/vec512/vec512_complex_double.h>
+#include <ATen/cpu/vec/vec512/vec512_convert.h>
+#include <ATen/cpu/vec/vec512/vec512_mask.h>
+// clang-format on
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+namespace at {
+namespace vec {
+
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+// Streams the underlying integer value of a qint32.
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  return stream << val.val_;
+}
+// Streams a qint8 as a number; the cast to int avoids printing it as a
+// character.
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  return stream << static_cast<int>(val.val_);
+}
+// Streams a quint8 as a number; the cast to unsigned int avoids printing it
+// as a character.
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  return stream << static_cast<unsigned int>(val.val_);
+}
+
+// Prints a vector as "vec[e0, e1, ...]" by first storing all lanes into a
+// scalar buffer.
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  constexpr auto lane_count = Vectorized<T>::size();
+  T lanes[lane_count];
+  vec.store(lanes);
+  stream << "vec[";
+  for (int lane = 0; lane != lane_count; ++lane) {
+    // Separator goes before every element except the first.
+    if (lane != 0) {
+      stream << ", ";
+    }
+    stream << lanes[lane];
+  }
+  stream << ']';
+  return stream;
+}
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Reinterprets the 512 bits of a double vector as a float vector; this is a
+// bit cast, not a numeric conversion.
+template <>
+inline Vectorized<float> cast<float, double>(const Vectorized<double>& src) {
+  return _mm512_castpd_ps(src);
+}
+
+// Reinterprets the 512 bits of a float vector as a double vector; this is a
+// bit cast, not a numeric conversion.
+template <>
+inline Vectorized<double> cast<double, float>(const Vectorized<float>& src) {
+  return _mm512_castps_pd(src);
+}
+
+// Reinterprets the 512 bits of an int32_t vector as a float vector; this is a
+// bit cast, not a numeric conversion.
+template <>
+inline Vectorized<float> cast<float, int32_t>(const Vectorized<int32_t>& src) {
+  return _mm512_castsi512_ps(src);
+}
+
+// Reinterprets the 512 bits of an int64_t vector as a double vector; this is
+// a bit cast, not a numeric conversion.
+template <>
+inline Vectorized<double> cast<double, int64_t>(
+    const Vectorized<int64_t>& src) {
+  return _mm512_castsi512_pd(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#ifndef _MSC_VER
+// MSVC is not working well on complex function overload.
+// Gathers 8 doubles: lane i is loaded from the byte address
+// base_addr + vindex[i] * scale. The enable_if limits `scale` to 1, 2, 4
+// or 8.
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        double>> inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
+  return _mm512_i64gather_pd(vindex, base_addr, scale);
+}
+
+// Gathers 16 floats: lane i is loaded from the byte address
+// base_addr + vindex[i] * scale. The enable_if limits `scale` to 1, 2, 4
+// or 8.
+template <int64_t scale = 1>
+std::enable_if_t<
+    scale == 1 || scale == 2 || scale == 4 || scale == 8,
+    Vectorized<
+        float>> inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
+  return _mm512_i32gather_ps(vindex, base_addr, scale);
+}
+#endif
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#ifndef _MSC_VER
+// MSVC is not working well on complex function overload.
+// Masked gather of 8 doubles. Lanes whose `mask` element has every bit set are
+// loaded from the byte address base_addr + vindex[i] * scale; all other lanes
+// keep the corresponding element of `src`. `scale` must be 1, 2, 4 or 8.
+// `mask` is taken by non-const reference but is not modified here.
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>> inline mask_gather(
+        const Vectorized<double>& src,
+        const double* base_addr,
+        const Vectorized<int64_t>& vindex,
+        Vectorized<double>& mask) {
+  // Compare the raw bits as integers. An all-ones lane is a NaN when viewed
+  // as a double, so an ordered floating-point compare (_CMP_EQ_OQ) against it
+  // can never be true and would leave every lane ungathered.
+  auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+  auto mask_ = _mm512_cmp_epi64_mask(
+      _mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+  return _mm512_mask_i64gather_pd(src, mask_, vindex, base_addr, scale);
+}
+
+// Masked gather of 16 floats. Lanes whose `mask` element has every bit set are
+// loaded from the byte address base_addr + vindex[i] * scale; all other lanes
+// keep the corresponding element of `src`. `scale` must be 1, 2, 4 or 8.
+// `mask` is taken by non-const reference but is not modified here.
+template <int64_t scale = 1>
+std::
+    enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>> inline mask_gather(
+        const Vectorized<float>& src,
+        const float* base_addr,
+        const Vectorized<int32_t>& vindex,
+        Vectorized<float>& mask) {
+  // Compare the raw bits as integers. An all-ones lane is a NaN when viewed
+  // as a float, so an ordered floating-point compare (_CMP_EQ_OQ) against it
+  // can never be true and would leave every lane ungathered.
+  auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+  auto mask_ = _mm512_cmp_epi32_mask(
+      _mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+  return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
+}
+#endif
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Converts 8 doubles to 8 int64_t values with _mm512_cvtpd_epi64.
+// NOTE(review): this intrinsic rounds according to the current rounding mode,
+// whereas the float specialization uses the truncating _mm512_cvttps_epi32 --
+// confirm the difference is intended.
+template <>
+Vectorized<int64_t> inline convert_to_int_of_same_size<double>(
+    const Vectorized<double>& src) {
+  return _mm512_cvtpd_epi64(src);
+}
+
+// Converts 16 floats to 16 int32_t values, truncating toward zero
+// (_mm512_cvttps_epi32).
+template <>
+Vectorized<int32_t> inline convert_to_int_of_same_size<float>(
+    const Vectorized<float>& src) {
+  return _mm512_cvttps_epi32(src);
+}
+
+// Numerically converts 8 int64_t values to 8 doubles.
+template <>
+Vectorized<double> inline convert_to_fp_of_same_size<double>(
+    const Vectorized<int64_t>& src) {
+  return _mm512_cvtepi64_pd(src);
+}
+
+// Numerically converts 16 int32_t values to 16 floats.
+template <>
+Vectorized<float> inline convert_to_fp_of_same_size<float>(
+    const Vectorized<int32_t>& src) {
+  return _mm512_cvtepi32_ps(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Interleaves the elements of two double vectors and returns the low and high
+// halves of the interleaved sequence as a pair.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5, a6, b6, a7, b7}
+  // Index values 0-7 select from `a`, 8-15 select from `b`. _mm512_set_epi64
+  // lists its arguments from the highest lane down to lane 0.
+  __m512i idx1 = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
+  // Mask 0xff enables all 8 lanes, so every output lane comes from the permute.
+  return std::make_pair(
+      _mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+      _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+}
+
+// Interleaves the elements of two float vectors and returns the low and high
+// halves of the interleaved sequence as a pair.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
+  //        a15}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14,
+  //        b15}
+  //
+  //  return:
+  //    {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+  //    {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15,
+  //    b15}
+  // Index values 0-15 select from `a`, 16-31 select from `b`. _mm512_set_epi32
+  // lists its arguments from the highest lane down to lane 0.
+  __m512i idx1 =
+      _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
+  __m512i idx2 = _mm512_set_epi32(
+      31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
+  // Mask 0xffff enables all 16 lanes.
+  return std::make_pair(
+      _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+      _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Splits two vectors holding interleaved (a, b) doubles back into one vector
+// of `a` elements and one vector of `b` elements.
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+  // output:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7}
+  // idx1 picks the even positions and idx2 the odd positions of the
+  // 16-element concatenation of the two inputs (0-7 = first input, 8-15 =
+  // second input). _mm512_set_epi64 lists the highest lane first.
+  __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
+
+  return std::make_pair(
+      _mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+      _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+}
+
+// Splits two vectors holding interleaved (a, b) floats back into one vector
+// of `a` elements and one vector of `b` elements.
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+  //   b = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14,
+  //   a15, b15}
+  // output:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
+  //   a15}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14,
+  //          b15}
+  // idx1 picks the even positions and idx2 the odd positions of the
+  // 32-element concatenation of the two inputs (0-15 = first input, 16-31 =
+  // second input).
+  __m512i idx1 = _mm512_set_epi32(
+      30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m512i idx2 = _mm512_set_epi32(
+      31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+
+  return std::make_pair(
+      _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+      _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Reverses the 16 float lanes: output lane i holds input lane 15 - i.
+template <>
+inline Vectorized<float> flip(const Vectorized<float>& v) {
+  // _mm512_set_epi32 lists the highest lane first, so lane 0 gets index 15.
+  const __m512i mask =
+      _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_ps(mask, v);
+}
+
+// Reverses the 8 double lanes: output lane i holds input lane 7 - i.
+template <>
+inline Vectorized<double> flip(const Vectorized<double>& v) {
+  // _mm512_set_epi64 lists the highest lane first, so lane 0 gets index 7.
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_pd(mask, v);
+}
+
+// Reverses the 8 int64_t lanes: output lane i holds input lane 7 - i.
+template <>
+inline Vectorized<int64_t> flip(const Vectorized<int64_t>& v) {
+  // _mm512_set_epi64 lists the highest lane first, so lane 0 gets index 7.
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_epi64(mask, v);
+}
+
+// Reverses the 16 int32_t lanes: output lane i holds input lane 15 - i.
+template <>
+inline Vectorized<int32_t> flip(const Vectorized<int32_t>& v) {
+  // _mm512_set_epi32 lists the highest lane first, so lane 0 gets index 15.
+  const __m512i mask =
+      _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_epi32(mask, v);
+}
+
+// Reverses the 32 int16_t lanes: output lane i holds input lane 31 - i.
+template <>
+inline Vectorized<int16_t> flip(const Vectorized<int16_t>& v) {
+  // _mm512_set_epi16 lists the highest lane first, so lane 0 gets index 31
+  // and lane 31 gets index 0.
+  const __m512i mask = _mm512_set_epi16(
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      16,
+      17,
+      18,
+      19,
+      20,
+      21,
+      22,
+      23,
+      24,
+      25,
+      26,
+      27,
+      28,
+      29,
+      30,
+      31);
+  return _mm512_permutexvar_epi16(mask, v);
+}
+
+// Reverses the order of all 64 bytes of a 512-bit register in two steps:
+// a byte shuffle inside each 128-bit lane, then a permutation of 64-bit
+// chunks that reverses the order of the 128-bit lanes.
+inline __m512i flip8(const __m512i& v) {
+  // Step 1 control: the pattern 0..15 (highest byte first) is repeated for
+  // each of the four 128-bit lanes, so byte j of a lane takes byte 15 - j of
+  // the same lane.
+  const __m512i mask1 = _mm512_set_epi8(
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15);
+  // Step 2 control: 64-bit chunks are reordered as {6, 7, 4, 5, 2, 3, 0, 1}
+  // (lane 0 first), i.e. the 128-bit lanes are reversed while the two halves
+  // of each lane keep their relative order.
+  const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+  auto reversed_vec = _mm512_shuffle_epi8(v, mask1);
+  return _mm512_permutexvar_epi64(mask2, reversed_vec);
+}
+
+// Reverses the int8_t lanes by delegating to the byte-wise flip8 helper.
+template <>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t>& v) {
+  return flip8(v);
+}
+
+// Reverses the uint8_t lanes by delegating to the byte-wise flip8 helper.
+template <>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t>& v) {
+  return flip8(v);
+}
+
+// Combines two bool vectors with a single 512-bit bitwise AND of their
+// underlying bytes.
+inline Vectorized<bool> operator&&(
+    const Vectorized<bool>& self,
+    const Vectorized<bool>& other) {
+  const __m512i* lhs_bits = reinterpret_cast<const __m512i*>(self.as_bytes());
+  const __m512i* rhs_bits = reinterpret_cast<const __m512i*>(other.as_bytes());
+  __m512i anded = _mm512_and_si512(*lhs_bits, *rhs_bits);
+  // Vectorized<bool> has no constructor taking __m512i, so the result bytes
+  // are copied into a default-constructed vector instead.
+  Vectorized<bool> result;
+  std::memcpy(result, &anded, result.size() * sizeof(bool));
+  return result;
+}
+
+#endif // defined(CPU_CAPABILITY_AVX512)
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a632b3fb6ef40b766b95446efd36d3e4d72657
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h
@@ -0,0 +1,1947 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#if defined(CPU_CAPABILITY_AVX512)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+#ifndef SLEEF_CONST
+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
+#define SLEEF_CONST const
+#else
+#define SLEEF_CONST
+#endif
+#define SLEEF_CONST_OLD SLEEF_CONST
+#else
+#define SLEEF_CONST_OLD
+#endif
+
+// bfloat16 conversion
+// Widens 16 bfloat16 bit patterns (in `a`) to 16 fp32 values written to `o`.
+static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
+  // Zero-extend each 16-bit element to 32 bits ...
+  const __m512i widened = _mm512_cvtepu16_epi32(a);
+  // ... move it into the upper half of the 32-bit lane, leaving zeros in the
+  // lower half ...
+  const __m512i as_fp32_bits = _mm512_slli_epi32(widened, 16);
+  // ... and reinterpret the bits as floats.
+  o = _mm512_castsi512_ps(as_fp32_bits);
+}
+
+// Widens 32 bfloat16 bit patterns to two fp32 vectors: the lower 256 bits of
+// `a` go to `o1`, the upper 256 bits go to `o2`.
+static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  __m256i lo = _mm512_extracti32x8_epi32(a, 0);
+  __m256i hi = _mm512_extracti32x8_epi32(a, 1);
+  cvtbf16_fp32(lo, o1);
+  cvtbf16_fp32(hi, o2);
+}
+
+// Narrows 16 fp32 values to 16 bfloat16 bit patterns. The low 16 bits are
+// rounded away using a bias of 0x7fff plus the lowest kept bit; NaN inputs
+// produce the pattern 0xffff.
+static inline __m256i cvtfp32_bf16(const __m512& src) {
+  __m512i value = _mm512_castps_si512(src);
+  __m512i nan = _mm512_set1_epi32(0xffff);
+  // Bit set for every lane that is ordered with itself, i.e. is not NaN.
+  auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q);
+  __m512i ones = _mm512_set1_epi32(0x1);
+  __m512i vec_bias = _mm512_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_value = _mm512_add_epi32(t_value, vec_bias);
+  // input += rounding_bias;
+  t_value = _mm512_add_epi32(t_value, value);
+  // input = input >> 16;
+  t_value = _mm512_srli_epi32(t_value, 16);
+  // Check NaN before converting back to bf16
+  // (lanes with the mask bit set keep t_value, the others take `nan`).
+  t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value);
+  // Narrow each 32-bit lane to 16 bits with unsigned saturation.
+  return _mm512_cvtusepi32_epi16(t_value);
+}
+
+// Narrows two fp32 vectors to one vector of 32 bfloat16 bit patterns: the
+// results for `a` occupy the lower 256 bits and those for `b` the upper 256
+// bits. Uses the same rounding and NaN handling as the single-vector
+// overload (bias 0x7fff + lsb, NaN -> 0xffff).
+static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) {
+  __m512i lo = _mm512_castps_si512(a);
+  __m512i hi = _mm512_castps_si512(b);
+  __m512i nan = _mm512_set1_epi32(0xffff);
+  // Bits set for lanes that are not NaN.
+  auto mask_lo = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+  auto mask_hi = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q);
+  __m512i ones = _mm512_set1_epi32(0x1);
+  __m512i vec_bias = _mm512_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_lo = _mm512_and_si512(_mm512_srli_epi32(lo, 16), ones);
+  auto t_hi = _mm512_and_si512(_mm512_srli_epi32(hi, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_lo = _mm512_add_epi32(t_lo, vec_bias);
+  t_hi = _mm512_add_epi32(t_hi, vec_bias);
+  // input += rounding_bias;
+  t_lo = _mm512_add_epi32(t_lo, lo);
+  t_hi = _mm512_add_epi32(t_hi, hi);
+  // input = input >> 16;
+  t_lo = _mm512_srli_epi32(t_lo, 16);
+  t_hi = _mm512_srli_epi32(t_hi, 16);
+  // Check NaN before converting back to bf16
+  t_lo = _mm512_mask_blend_epi32(mask_lo, nan, t_lo);
+  t_hi = _mm512_mask_blend_epi32(mask_hi, nan, t_hi);
+
+  // The pack works per 128-bit lane, so the halves come out interleaved.
+  t_lo = _mm512_packus_epi32(
+      t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-3] t_lo[0-3]
+  // Reorder the 64-bit chunks so all t_lo results precede all t_hi results.
+  __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+  return _mm512_permutexvar_epi64(idx, t_lo);
+}
+
+// Packs two vectors of 32-bit lanes into one vector of 16-bit lanes by
+// keeping only the upper 16 bits of each lane; lanes from `a` end up in the
+// lower 256 bits and lanes from `b` in the upper 256 bits. No rounding or NaN
+// handling is applied.
+static inline __m512i merge_compare_result(const __m512& a, const __m512& b) {
+  __m512i lo = _mm512_castps_si512(a);
+  __m512i hi = _mm512_castps_si512(b);
+  lo = _mm512_srli_epi32(lo, 16);
+  hi = _mm512_srli_epi32(hi, 16);
+  // The pack works per 128-bit lane, so the halves come out interleaved ...
+  auto out = _mm512_packus_epi32(lo, hi);
+  // ... and this permutation of 64-bit chunks puts them back in order.
+  __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+  return _mm512_permutexvar_epi64(idx, out);
+}
+
+// float16 conversion
+// Widens 16 half-precision values (in `a`) to 16 fp32 values written to `o`.
+static inline void cvtfp16_fp32(const __m256i& a, __m512& o) {
+  o = _mm512_cvtph_ps(a);
+}
+
+// Widens 32 half-precision values to two fp32 vectors: the lower 256 bits of
+// `a` go to `o1`, the upper 256 bits go to `o2`.
+static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  __m256i lo = _mm512_extracti32x8_epi32(a, 0);
+  __m256i hi = _mm512_extracti32x8_epi32(a, 1);
+  cvtfp16_fp32(lo, o1);
+  cvtfp16_fp32(hi, o2);
+}
+
+// Narrows 16 fp32 values to 16 half-precision values, rounding to nearest
+// with floating-point exceptions suppressed.
+static inline __m256i cvtfp32_fp16(const __m512& src) {
+  return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+}
+
+// Narrows two fp32 vectors to one vector of 32 half-precision values: the
+// results for `a` occupy the lower 256 bits and those for `b` the upper 256
+// bits. Rounds to nearest with floating-point exceptions suppressed.
+static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) {
+  __m256i lo =
+      _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  __m256i hi =
+      _mm512_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  // Place `lo` in the lower half of a 512-bit register, then insert `hi`
+  // into the upper half.
+  __m512 t_lo = _mm512_castsi512_ps(_mm512_castsi256_si512(lo));
+  __m256 t_hi = _mm256_castsi256_ps(hi);
+  return _mm512_castps_si512(_mm512_insertf32x8(t_lo, t_hi, 1));
+}
+
+// dtype conversion between float16/bfloat16 and float32
+// Type-dispatched widening of 16 reduced-precision values to fp32. The
+// primary template is only declared; the specializations select the bfloat16
+// or float16 converter according to T.
+template <
+    typename T,
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline void cvt_to_fp32(const __m256i& a, __m512& o);
+// BFloat16: forwards to cvtbf16_fp32.
+template <>
+inline void cvt_to_fp32<BFloat16>(const __m256i& a, __m512& o) {
+  cvtbf16_fp32(a, o);
+}
+// Half: forwards to cvtfp16_fp32.
+template <>
+inline void cvt_to_fp32<Half>(const __m256i& a, __m512& o) {
+  cvtfp16_fp32(a, o);
+}
+
+// Type-dispatched widening of 32 reduced-precision values to two fp32
+// vectors. The primary template is only declared; the specializations select
+// the bfloat16 or float16 converter according to T.
+template <
+    typename T,
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2);
+// BFloat16: forwards to cvtbf16_fp32.
+template <>
+inline void cvt_to_fp32<BFloat16>(const __m512i& a, __m512& o1, __m512& o2) {
+  cvtbf16_fp32(a, o1, o2);
+}
+// Half: forwards to cvtfp16_fp32.
+template <>
+inline void cvt_to_fp32<Half>(const __m512i& a, __m512& o1, __m512& o2) {
+  cvtfp16_fp32(a, o1, o2);
+}
+
+// Type-dispatched narrowing of two fp32 vectors to one vector of 32
+// reduced-precision values. `is_compare_op` selects how the fp32 lanes are
+// narrowed; the primary template is only declared.
+template <
+    typename T,
+    bool is_compare_op = false,
+    typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline __m512i cvt_from_fp32(const __m512& a, const __m512& b);
+// BFloat16, regular values: rounding conversion via cvtfp32_bf16.
+template <>
+inline __m512i cvt_from_fp32<BFloat16, false>(
+    const __m512& a,
+    const __m512& b) {
+  return cvtfp32_bf16(a, b);
+}
+// BFloat16, compare results: keep the upper 16 bits of each lane unchanged
+// via merge_compare_result.
+template <>
+inline __m512i cvt_from_fp32<BFloat16, true>(const __m512& a, const __m512& b) {
+  return merge_compare_result(a, b);
+}
+// Half, regular values: conversion via cvtfp32_fp16.
+template <>
+inline __m512i cvt_from_fp32<Half, false>(const __m512& a, const __m512& b) {
+  return cvtfp32_fp16(a, b);
+}
+// Half, compare results: uses the same cvtfp32_fp16 conversion as the
+// regular case.
+template <>
+inline __m512i cvt_from_fp32<Half, true>(const __m512& a, const __m512& b) {
+  return cvtfp32_fp16(a, b);
+}
+
+template <typename T>
+class Vectorized16 {
+  static_assert(
+      is_reduced_floating_point_v<T>,
+      "Support only float16 and bfloat16.");
+
+ private:
+  __m512i values;
+
+ public:
+  using value_type = uint16_t;
+  using size_type = int;
+  // Number of 16-bit elements held by one vector (always 32).
+  static constexpr size_type size() {
+    return 32;
+  }
+  // Default constructor: every bit of the vector is zero.
+  Vectorized16() : values(_mm512_setzero_si512()) {}
+  // Wraps an existing 512-bit register without modifying its bits.
+  Vectorized16(__m512i v) : values(v) {}
+  // Broadcasts the raw 16-bit representation of `val` to all 32 lanes.
+  Vectorized16(T val)
+      : values(_mm512_set1_epi16(static_cast<value_type>(val.x))) {}
+  // Builds a vector from 32 individual elements; val1 ends up in lane 0 and
+  // val32 in lane 31.
+  Vectorized16(
+      T val1,
+      T val2,
+      T val3,
+      T val4,
+      T val5,
+      T val6,
+      T val7,
+      T val8,
+      T val9,
+      T val10,
+      T val11,
+      T val12,
+      T val13,
+      T val14,
+      T val15,
+      T val16,
+      T val17,
+      T val18,
+      T val19,
+      T val20,
+      T val21,
+      T val22,
+      T val23,
+      T val24,
+      T val25,
+      T val26,
+      T val27,
+      T val28,
+      T val29,
+      T val30,
+      T val31,
+      T val32) {
+    // _mm512_set_epi16 takes the highest lane first, hence the reversed
+    // argument order.
+    values = _mm512_set_epi16(
+        val32.x,
+        val31.x,
+        val30.x,
+        val29.x,
+        val28.x,
+        val27.x,
+        val26.x,
+        val25.x,
+        val24.x,
+        val23.x,
+        val22.x,
+        val21.x,
+        val20.x,
+        val19.x,
+        val18.x,
+        val17.x,
+        val16.x,
+        val15.x,
+        val14.x,
+        val13.x,
+        val12.x,
+        val11.x,
+        val10.x,
+        val9.x,
+        val8.x,
+        val7.x,
+        val6.x,
+        val5.x,
+        val4.x,
+        val3.x,
+        val2.x,
+        val1.x);
+  }
+  // Implicit conversion to the underlying 512-bit register.
+  operator __m512i() const {
+    return values;
+  }
+  // Per-element indexing is deliberately disabled for this vector type.
+  T& operator[](int idx) = delete;
+  const T& operator[](int idx) const = delete;
+  // Returns an integer bit mask: bit i is 1 when the 16-bit pattern of
+  // element i is all zeros and 0 otherwise.
+  int zero_mask() const {
+    const __mmask32 is_zero =
+        _mm512_cmpeq_epi16_mask(values, _mm512_setzero_si512());
+    return is_zero;
+  }
+  // Loads `count` 16-bit elements from unaligned memory at `ptr`. A full-width
+  // count uses a plain 512-bit load; otherwise only the first `count` lanes
+  // are read and the remaining lanes are zeroed.
+  static Vectorized<T> loadu(const void* ptr, int16_t count = size()) {
+    if (count != size()) {
+      // One mask bit per lane to load: the low `count` bits are set.
+      const __mmask32 active_lanes = (1ULL << count) - 1;
+      return _mm512_maskz_loadu_epi16(active_lanes, ptr);
+    }
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  // Stores the first `count` 16-bit elements to unaligned memory at `ptr`. A
+  // full-width count uses a plain 512-bit store, a smaller positive count
+  // uses a masked store, and count <= 0 writes nothing.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+      return;
+    }
+    if (count <= 0) {
+      return;
+    }
+    // One mask bit per lane to write: the low `count` bits are set.
+    const __mmask32 active_lanes = (1ULL << count) - 1;
+    _mm512_mask_storeu_epi16(ptr, active_lanes, values);
+  }
+  // Compile-time blend: lane i is taken from `b` when bit i of `mask` is set
+  // and from `a` otherwise.
+  template <int64_t mask>
+  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    return _mm512_mask_blend_epi16(mask, a.values, b.values);
+  }
+  // Run-time blend: lane i is taken from `b` when the 16-bit element of
+  // `mask` in that lane is all ones (0xFFFF) and from `a` otherwise.
+  static Vectorized<T> blendv(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      const Vectorized<T>& mask) {
+    auto all_ones = _mm512_set1_epi16(0xFFFF);
+    // Integer equality on the raw bits yields one mask bit per lane.
+    auto mask_ = _mm512_cmp_epi16_mask(mask, all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi16(mask_, a.values, b.values);
+  }
+  // Builds the sequence {base, base + step, ..., base + 31 * step}. Each
+  // element is computed as a scalar expression and passed to the 32-argument
+  // constructor, whose parameters are of type T.
+  template <typename step_t>
+  static Vectorized<T> arange(
+      T base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step,
+        base + 16 * step,
+        base + 17 * step,
+        base + 18 * step,
+        base + 19 * step,
+        base + 20 * step,
+        base + 21 * step,
+        base + 22 * step,
+        base + 23 * step,
+        base + 24 * step,
+        base + 25 * step,
+        base + 26 * step,
+        base + 27 * step,
+        base + 28 * step,
+        base + 29 * step,
+        base + 30 * step,
+        base + 31 * step);
+  }
+  // Returns a vector whose first `count` lanes come from `b` and whose
+  // remaining lanes come from `a`. Any `count` outside 0..31 (including the
+  // default size()) returns `b` unchanged.
+  static Vectorized<T> set(
+      const Vectorized<T>& a,
+      const Vectorized<T>& b,
+      int64_t count = size()) {
+    // blend<> needs a compile-time mask, so each count maps to the constant
+    // (2^count - 1), i.e. the low `count` bits set.
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+      case 16:
+        return blend<65535>(a, b);
+      case 17:
+        return blend<131071>(a, b);
+      case 18:
+        return blend<262143>(a, b);
+      case 19:
+        return blend<524287>(a, b);
+      case 20:
+        return blend<1048575>(a, b);
+      case 21:
+        return blend<2097151>(a, b);
+      case 22:
+        return blend<4194303>(a, b);
+      case 23:
+        return blend<8388607>(a, b);
+      case 24:
+        return blend<16777215>(a, b);
+      case 25:
+        return blend<33554431>(a, b);
+      case 26:
+        return blend<67108863>(a, b);
+      case 27:
+        return blend<134217727>(a, b);
+      case 28:
+        return blend<268435455>(a, b);
+      case 29:
+        return blend<536870911>(a, b);
+      case 30:
+        return blend<1073741823>(a, b);
+      case 31:
+        return blend<2147483647>(a, b);
+    }
+    return b;
+  }
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wignored-qualifiers"
+
+  // Applies a unary fp32 vector function to all 32 elements: widens the
+  // values to two fp32 vectors, calls `vop` on each half, and narrows the
+  // results back to T.
+  Vectorized<T> map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    const auto o1 = vop(lo);
+    const auto o2 = vop(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Returns a mask vector: lanes holding NaN become all ones (0xFFFF), every
+  // other lane becomes zero.
+  Vectorized<T> isnan() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    __mmask16 lo_mask, hi_mask;
+    __m512 zero = _mm512_set1_ps(0.0);
+    __m512i zeroi = _mm512_castps_si512(zero);
+    // An unordered compare against zero is true exactly for the NaN lanes.
+    lo_mask = _mm512_cmp_ps_mask(lo, zero, _CMP_UNORD_Q);
+    lo = _mm512_castsi512_ps(
+        _mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF));
+    hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q);
+    hi = _mm512_castsi512_ps(
+        _mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF));
+    // Narrow by bit truncation so the all-ones pattern is preserved.
+    return merge_compare_result(lo, hi);
+  }
+#pragma clang diagnostic pop
+  // Absolute value: clears bit 15 (0x8000, the sign bit) of every 16-bit
+  // element.
+  Vectorized<T> abs() const {
+    return _mm512_andnot_si512(_mm512_set1_epi16(0x8000), values);
+  }
+  // Angle of each element treated as a real number: pi for negative values,
+  // 0 for other non-NaN values, and NaN for NaN inputs.
+  Vectorized<T> angle() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto angle_lambda = [](__m512 values) {
+      const auto zero_vec = _mm512_set1_ps(0.f);
+      const auto nan_vec = _mm512_set1_ps(NAN);
+      // A value compares equal to itself only when it is not NaN.
+      const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ);
+      // All-ones in the non-NaN lanes, zero in the NaN lanes ...
+      const auto non_nan_mask_vec = _mm512_mask_set1_epi32(
+          _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF);
+      // ... so comparing that against zero selects exactly the NaN lanes.
+      const auto nan_mask = _mm512_cmp_ps_mask(
+          _mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ);
+      const auto pi = _mm512_set1_ps(c10::pi<float>);
+
+      const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ);
+      auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi);
+      angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec);
+      return angle;
+    };
+    auto o1 = angle_lambda(lo);
+    auto o2 = angle_lambda(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Real part: returns the vector itself.
+  Vectorized<T> real() const {
+    return *this;
+  }
+  // Imaginary part: returns a vector with all bits zero.
+  Vectorized<T> imag() const {
+    return _mm512_set1_epi16(0);
+  }
+  // Complex conjugate: returns the vector itself.
+  Vectorized<T> conj() const {
+    return *this;
+  }
+  // Elementwise arc cosine; applies Sleef_acosf16_u10 through map().
+  Vectorized<T> acos() const {
+    return map(Sleef_acosf16_u10);
+  }
+  // Elementwise inverse hyperbolic cosine; applies Sleef_acoshf16_u10 through
+  // map().
+  Vectorized<T> acosh() const {
+    return map(Sleef_acoshf16_u10);
+  }
+  // Elementwise arc sine; applies Sleef_asinf16_u10 through map().
+  Vectorized<T> asin() const {
+    return map(Sleef_asinf16_u10);
+  }
+  // Elementwise inverse hyperbolic sine; applies Sleef_asinhf16_u10 through
+  // map().
+  Vectorized<T> asinh() const {
+    return map(Sleef_asinhf16_u10);
+  }
+  // Elementwise arc tangent; applies Sleef_atanf16_u10 through map().
+  Vectorized<T> atan() const {
+    return map(Sleef_atanf16_u10);
+  }
+  // Elementwise inverse hyperbolic tangent; applies Sleef_atanhf16_u10
+  // through map().
+  Vectorized<T> atanh() const {
+    return map(Sleef_atanhf16_u10);
+  }
+  // Elementwise atan2 with this vector as the first argument and `b` as the
+  // second; both are widened to fp32, passed to Sleef_atan2f16_u10, and the
+  // result is narrowed back to T.
+  Vectorized<T> atan2(const Vectorized<T>& b) const {
+    __m512 lo, hi;
+    __m512 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_atan2f16_u10(lo, b1);
+    auto o2 = Sleef_atan2f16_u10(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Returns the elements of this vector with their sign bits replaced by the
+  // sign bits of `sign`; done purely with bit operations.
+  Vectorized<T> copysign(const Vectorized<T>& sign) const {
+    // copy sign bit (0x8000) from sign and remaining bits from values
+    // (each 32-bit constant covers two adjacent 16-bit elements).
+    __m512i mask_value = _mm512_set1_epi32(~0x80008000);
+    __m512i mask_signbit = _mm512_set1_epi32(0x80008000);
+    return Vectorized<T>(_mm512_or_si512(
+        _mm512_and_si512(values, mask_value),
+        _mm512_and_si512(sign, mask_signbit)));
+  }
+  // Elementwise error function; applies Sleef_erff16_u10 through map().
+  Vectorized<T> erf() const {
+    return map(Sleef_erff16_u10);
+  }
+  // Elementwise complementary error function; applies Sleef_erfcf16_u15
+  // through map().
+  Vectorized<T> erfc() const {
+    return map(Sleef_erfcf16_u15);
+  }
+  // Elementwise inverse error function. There is no vector routine used
+  // here: the fp32 values are spilled to two scalar arrays, calc_erfinv is
+  // applied to each element, and the results are reloaded and narrowed to T.
+  Vectorized<T> erfinv() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_erfinv(tmp1[i]);
+      tmp2[i] = calc_erfinv(tmp2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise e^x; applies Sleef_expf16_u10 through map().
+  Vectorized<T> exp() const {
+    return map(Sleef_expf16_u10);
+  }
+  // Elementwise 2^x; applies Sleef_exp2f16_u10 through map().
+  Vectorized<T> exp2() const {
+    return map(Sleef_exp2f16_u10);
+  }
+  // Elementwise e^x - 1; applies Sleef_expm1f16_u10 through map().
+  Vectorized<T> expm1() const {
+    return map(Sleef_expm1f16_u10);
+  }
+  // Alias of exp(): no separate implementation is provided for this type.
+  Vectorized<T> fexp_u20() const {
+    return exp();
+  }
+  // Alias of exp(): no separate implementation is provided for this type.
+  Vectorized<T> exp_u20() const {
+    return exp();
+  }
+  // Elementwise floating-point remainder of this vector divided by `q`. Both
+  // operands are widened to fp32, passed to Sleef_fmodf16, and the result is
+  // narrowed back to T.
+  Vectorized<T> fmod(const Vectorized<T>& q) const {
+    __m512 x_lo, x_hi;
+    cvt_to_fp32<T>(values, x_lo, x_hi);
+    __m512 q_lo, q_hi;
+    // Widen the divisor with the T-aware converter, like the dividend: the
+    // bfloat16-only cvtbf16_fp32 would misread the bits when T is Half.
+    cvt_to_fp32<T>(q.values, q_lo, q_hi);
+    auto o1 = Sleef_fmodf16(x_lo, q_lo);
+    auto o2 = Sleef_fmodf16(x_hi, q_hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise hypot of this vector and `b`; both are widened to fp32,
+  // passed to Sleef_hypotf16_u05, and the result is narrowed back to T.
+  Vectorized<T> hypot(const Vectorized<T>& b) const {
+    __m512 lo, hi;
+    __m512 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_hypotf16_u05(lo, b1);
+    auto o2 = Sleef_hypotf16_u05(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise calc_i0. The fp32 values are spilled to two scalar arrays,
+  // calc_i0 is applied to each element, and the results are reloaded and
+  // narrowed to T.
+  Vectorized<T> i0() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_i0(tmp1[i]);
+      tmp2[i] = calc_i0(tmp2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise calc_i0e. The fp32 values are spilled to two scalar arrays,
+  // calc_i0e is applied to each element, and the results are reloaded and
+  // narrowed to T.
+  Vectorized<T> i0e() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_i0e(tmp1[i]);
+      tmp2[i] = calc_i0e(tmp2[i]);
+    }
+    const auto o1 = _mm512_loadu_ps(tmp1);
+    const auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise calc_digamma. The fp32 values are spilled to two scalar
+  // arrays, calc_digamma is applied to each element, and the results are
+  // reloaded and narrowed to T.
+  Vectorized<T> digamma() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_digamma(tmp1[i]);
+      tmp2[i] = calc_digamma(tmp2[i]);
+    }
+    const auto o1 = _mm512_loadu_ps(tmp1);
+    const auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Elementwise calc_igamma(this[i], x[i]). Both operands are widened to
+  // fp32 and spilled to scalar arrays; calc_igamma is applied per element and
+  // the results are reloaded and narrowed to T.
+  Vectorized<T> igamma(const Vectorized<T>& x) const {
+    __m512 lo, hi;
+    __m512 xlo, xhi;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmpx1), xlo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+
+  // Elementwise calc_igammac(this[i], x[i]). Both operands are widened to
+  // fp32 and spilled to scalar arrays; calc_igammac is applied per element
+  // and the results are reloaded and narrowed to T.
+  Vectorized<T> igammac(const Vectorized<T>& x) const {
+    __m512 lo, hi;
+    __m512 xlo, xhi;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmpx1), xlo);
+    _mm512_storeu_ps(reinterpret_cast<float*>(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  // Element-wise log: hands the SLEEF kernel Sleef_logf16_u10 to map().
+  Vectorized<T> log() const {
+    return map(Sleef_logf16_u10);
+  }
+  // Element-wise log2: hands the SLEEF kernel Sleef_log2f16_u10 to map().
+  Vectorized<T> log2() const {
+    return map(Sleef_log2f16_u10);
+  }
+  // Element-wise log10: hands the SLEEF kernel Sleef_log10f16_u10 to map().
+  Vectorized<T> log10() const {
+    return map(Sleef_log10f16_u10);
+  }
+  // Element-wise log1p: hands the SLEEF kernel Sleef_log1pf16_u10 to map().
+  Vectorized<T> log1p() const {
+    return map(Sleef_log1pf16_u10);
+  }
+  // Element-wise sin: hands the SLEEF kernel Sleef_sinf16_u10 to map().
+  Vectorized<T> sin() const {
+    return map(Sleef_sinf16_u10);
+  }
+  // Element-wise sinh: hands the SLEEF kernel Sleef_sinhf16_u10 to map().
+  Vectorized<T> sinh() const {
+    return map(Sleef_sinhf16_u10);
+  }
+  // Element-wise cos: hands the SLEEF kernel Sleef_cosf16_u10 to map().
+  Vectorized<T> cos() const {
+    return map(Sleef_cosf16_u10);
+  }
+  // Element-wise cosh: hands the SLEEF kernel Sleef_coshf16_u10 to map().
+  Vectorized<T> cosh() const {
+    return map(Sleef_coshf16_u10);
+  }
+  // Element-wise ceil, evaluated in fp32 with _mm512_ceil_ps and narrowed
+  // back to T.
+  Vectorized<T> ceil() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    return cvt_from_fp32<T>(_mm512_ceil_ps(lower), _mm512_ceil_ps(upper));
+  }
+  // Element-wise floor, evaluated in fp32 with _mm512_floor_ps and narrowed
+  // back to T.
+  Vectorized<T> floor() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    return cvt_from_fp32<T>(_mm512_floor_ps(lower), _mm512_floor_ps(upper));
+  }
+  // Negation without leaving the 16-bit domain: XOR flips bit 15 (0x8000) of
+  // every 16-bit lane.
+  Vectorized<T> neg() const {
+    const __m512i top_bit_per_lane = _mm512_set1_epi16(0x8000);
+    return _mm512_xor_si512(values, top_bit_per_lane);
+  }
+  // Element-wise rounding in fp32 using the round-to-nearest-int mode with
+  // floating-point exceptions suppressed, then narrowed back to T.
+  Vectorized<T> round() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    const auto rounded_lo = _mm512_roundscale_ps(
+        lower, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    const auto rounded_hi = _mm512_roundscale_ps(
+        upper, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(rounded_lo, rounded_hi);
+  }
+  // Element-wise tan: hands the SLEEF kernel Sleef_tanf16_u10 to map().
+  Vectorized<T> tan() const {
+    return map(Sleef_tanf16_u10);
+  }
+  // Element-wise tanh: hands the SLEEF kernel Sleef_tanhf16_u10 to map().
+  Vectorized<T> tanh() const {
+    return map(Sleef_tanhf16_u10);
+  }
+  // Element-wise truncation in fp32 using the round-toward-zero mode with
+  // floating-point exceptions suppressed, then narrowed back to T.
+  Vectorized<T> trunc() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    const auto chopped_lo =
+        _mm512_roundscale_ps(lower, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    const auto chopped_hi =
+        _mm512_roundscale_ps(upper, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(chopped_lo, chopped_hi);
+  }
+  // Element-wise lgamma: hands the SLEEF kernel Sleef_lgammaf16_u10 to map().
+  Vectorized<T> lgamma() const {
+    return map(Sleef_lgammaf16_u10);
+  }
+  // Element-wise square root, evaluated in fp32 and narrowed back to T.
+  Vectorized<T> sqrt() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    return cvt_from_fp32<T>(_mm512_sqrt_ps(lower), _mm512_sqrt_ps(upper));
+  }
+  // Element-wise 1 / x, computed with a full fp32 division and narrowed
+  // back to T.
+  Vectorized<T> reciprocal() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    const __m512 unit = _mm512_set1_ps(1);
+    return cvt_from_fp32<T>(
+        _mm512_div_ps(unit, lower), _mm512_div_ps(unit, upper));
+  }
+  // Element-wise 1 / sqrt(x), computed as an fp32 square root followed by an
+  // fp32 division, then narrowed back to T.
+  Vectorized<T> rsqrt() const {
+    __m512 lower, upper;
+    cvt_to_fp32<T>(values, lower, upper);
+    const __m512 unit = _mm512_set1_ps(1);
+    const auto inv_root_lo = _mm512_div_ps(unit, _mm512_sqrt_ps(lower));
+    const auto inv_root_hi = _mm512_div_ps(unit, _mm512_sqrt_ps(upper));
+    return cvt_from_fp32<T>(inv_root_lo, inv_root_hi);
+  }
+  // Element-wise power with *this as the base and `b` as the exponent; both
+  // are widened to fp32 and passed to Sleef_powf16_u10.
+  Vectorized<T> pow(const Vectorized<T>& b) const {
+    __m512 base_lo, base_hi;
+    cvt_to_fp32<T>(values, base_lo, base_hi);
+    __m512 exp_lo, exp_hi;
+    cvt_to_fp32<T>(b.values, exp_lo, exp_hi);
+    const auto pow_lo = Sleef_powf16_u10(base_lo, exp_lo);
+    const auto pow_hi = Sleef_powf16_u10(base_hi, exp_hi);
+    return cvt_from_fp32<T>(pow_lo, pow_hi);
+  }
+
+ private:
+  // Shared driver for the comparison operators: widens both operands to
+  // fp32, applies `op` to the low halves and then the high halves, and
+  // repacks the two results via the is_compare_op variant of cvt_from_fp32.
+  template <typename Op, typename VectorizedType>
+  Vectorized<T> inline binary_compare(const VectorizedType& b, Op op) const {
+    __m512 lhs_lo, lhs_hi;
+    cvt_to_fp32<T>(values, lhs_lo, lhs_hi);
+    __m512 rhs_lo, rhs_hi;
+    cvt_to_fp32<T>(b.values, rhs_lo, rhs_hi);
+    auto res_lo = op(lhs_lo, rhs_lo);
+    auto res_hi = op(lhs_hi, rhs_hi);
+    return cvt_from_fp32<T, /*is_compare_op*/ true>(res_lo, res_hi);
+  }
+
+ public:
+  // Lane-wise `>` (ordered, quiet predicate). The lambda yields an fp32
+  // vector that is all-ones where the predicate holds and zero elsewhere;
+  // binary_compare repacks it to T.
+  Vectorized<T> inline operator>(const Vectorized<T>& other) const {
+    auto greater_than = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_GT_OQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, greater_than);
+  }
+  // Lane-wise `<` (ordered, quiet predicate). The lambda yields an fp32
+  // vector that is all-ones where the predicate holds and zero elsewhere;
+  // binary_compare repacks it to T.
+  Vectorized<T> inline operator<(const Vectorized<T>& other) const {
+    auto less_than = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_LT_OQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, less_than);
+  }
+  // Lane-wise `>=` (ordered, quiet predicate). The lambda yields an fp32
+  // vector that is all-ones where the predicate holds and zero elsewhere;
+  // binary_compare repacks it to T.
+  Vectorized<T> inline operator>=(const Vectorized<T>& other) const {
+    auto greater_equal = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_GE_OQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, greater_equal);
+  }
+  // Lane-wise `<=` (ordered, quiet predicate). The lambda yields an fp32
+  // vector that is all-ones where the predicate holds and zero elsewhere;
+  // binary_compare repacks it to T.
+  Vectorized<T> inline operator<=(const Vectorized<T>& other) const {
+    auto less_equal = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_LE_OQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, less_equal);
+  }
+  // Lane-wise `==` (ordered, quiet predicate, so NaN lanes compare false).
+  // The lambda yields an fp32 vector that is all-ones where the predicate
+  // holds and zero elsewhere; binary_compare repacks it to T.
+  Vectorized<T> inline operator==(const Vectorized16<T>& other) const {
+    auto equal_to = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_EQ_OQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, equal_to);
+  }
+  // Lane-wise `!=` (unordered, quiet predicate, so NaN lanes compare true).
+  // The lambda yields an fp32 vector that is all-ones where the predicate
+  // holds and zero elsewhere; binary_compare repacks it to T.
+  Vectorized<T> inline operator!=(const Vectorized16<T>& other) const {
+    auto not_equal_to = [](__m512 lhs, __m512 rhs) {
+      const auto hits = _mm512_cmp_ps_mask(lhs, rhs, _CMP_NEQ_UQ);
+      const auto filled =
+          _mm512_mask_set1_epi32(_mm512_set1_epi32(0), hits, 0xFFFFFFFF);
+      return _mm512_castsi512_ps(filled);
+    };
+    return binary_compare(other, not_equal_to);
+  }
+};
+
+// Applies a binary fp32 operation to two 16-bit vectors: each operand is
+// widened into a low and a high fp32 vector, `op` runs on the low pair and
+// then the high pair, and the two results are narrowed back to T.
+template <typename T, typename Op>
+static inline Vectorized<T> binary_op_as_fp32(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  __m512 lhs_lo, lhs_hi;
+  cvt_to_fp32<T>(__m512i(a), lhs_lo, lhs_hi);
+  __m512 rhs_lo, rhs_hi;
+  cvt_to_fp32<T>(__m512i(b), rhs_lo, rhs_hi);
+  auto res_lo = op(lhs_lo, rhs_lo);
+  auto res_hi = op(lhs_hi, rhs_hi);
+  return cvt_from_fp32<T>(res_lo, res_hi);
+}
+
+// Marks BFloat16 as having a dedicated Vectorized specialization.
+template <>
+struct is_vec_specialized_for<BFloat16> : std::true_type {};
+
+// BFloat16 specialization of Vectorized, built on Vectorized16<BFloat16>.
+// Only declarations live here; frac() and the eq/ne/gt/ge/lt/le helpers are
+// defined out of line further down in this file.
+template <>
+class Vectorized<BFloat16> : public Vectorized16<BFloat16> {
+ public:
+  // Inherit all Vectorized16 constructors.
+  using Vectorized16::Vectorized16;
+
+  using value_type = BFloat16;
+
+  // Defined as *this - trunc() once operator- is available.
+  Vectorized<BFloat16> frac() const;
+
+  // Comparison helpers: each is the matching comparison operator's result
+  // ANDed with a vector of 1.0f (see the out-of-line definitions).
+  Vectorized<BFloat16> eq(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ne(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> gt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ge(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> lt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+};
+
+// Element-wise addition, carried out in fp32 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator+(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  auto add_fp32 = [](const __m512& lhs, const __m512& rhs) {
+    return _mm512_add_ps(lhs, rhs);
+  };
+  return binary_op_as_fp32(a, b, add_fp32);
+}
+// Element-wise subtraction (a - b), carried out in fp32 via
+// binary_op_as_fp32.
+Vectorized<BFloat16> inline operator-(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  auto sub_fp32 = [](const __m512& lhs, const __m512& rhs) {
+    return _mm512_sub_ps(lhs, rhs);
+  };
+  return binary_op_as_fp32(a, b, sub_fp32);
+}
+// Element-wise multiplication, carried out in fp32 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator*(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  auto mul_fp32 = [](const __m512& lhs, const __m512& rhs) {
+    return _mm512_mul_ps(lhs, rhs);
+  };
+  return binary_op_as_fp32(a, b, mul_fp32);
+}
+// Element-wise division (a / b), carried out in fp32 via binary_op_as_fp32.
+Vectorized<BFloat16> inline operator/(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  auto div_fp32 = [](const __m512& lhs, const __m512& rhs) {
+    return _mm512_div_ps(lhs, rhs);
+  };
+  return binary_op_as_fp32(a, b, div_fp32);
+}
+// Bitwise AND over the full 512-bit register contents.
+Vectorized<BFloat16> inline operator&(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm512_and_si512(__m512i(a), __m512i(b));
+}
+// Bitwise OR over the full 512-bit register contents.
+Vectorized<BFloat16> inline operator|(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm512_or_si512(__m512i(a), __m512i(b));
+}
+// Bitwise XOR over the full 512-bit register contents.
+Vectorized<BFloat16> inline operator^(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  return _mm512_xor_si512(__m512i(a), __m512i(b));
+}
+
+// Masks a vector of 1.0f with the result of operator==.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::eq(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this == other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Masks a vector of 1.0f with the result of operator!=.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ne(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this != other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Masks a vector of 1.0f with the result of operator>.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::gt(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this > other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Masks a vector of 1.0f with the result of operator>=.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ge(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this >= other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Masks a vector of 1.0f with the result of operator<.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::lt(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this < other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Masks a vector of 1.0f with the result of operator<=.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::le(
+    const Vectorized<BFloat16>& other) const {
+  const Vectorized<BFloat16> lane_mask = (*this <= other);
+  return lane_mask & Vectorized<BFloat16>(1.0f);
+}
+
+// Fractional part: x - trunc(x). Defined out of line because it needs the
+// free operator- declared above.
+inline Vectorized<BFloat16> Vectorized<BFloat16>::frac() const {
+  const Vectorized<BFloat16> whole_part = this->trunc();
+  return *this - whole_part;
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+//
+// Both operands are widened to fp32. _mm512_max_ps alone does not propagate
+// a NaN held only in its first operand, so lanes where either input is NaN
+// are additionally OR-ed with an all-ones pattern (which is itself a NaN).
+template <>
+Vectorized<BFloat16> inline maximum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  __m512i zero_vec = _mm512_set1_epi32(0);
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(b), b_lo, b_hi);
+  auto max_lo = _mm512_max_ps(a_lo, b_lo);
+  auto max_hi = _mm512_max_ps(a_hi, b_hi);
+  // Bit i of each mask is set when lane i of `a` or `b` is NaN.
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  // Expand each mask bit into a full 32-bit lane (all-ones or zero).
+  // Broadcasting the mask's integer value with _mm512_set1_epi32 would put
+  // the same 16-bit number into every lane, which neither forces NaN in the
+  // flagged lanes nor leaves the other lanes untouched.
+  auto nan_lo = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF));
+  auto nan_hi = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm512_or_ps(max_lo, nan_lo);
+  auto o2 = _mm512_or_ps(max_hi, nan_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+// IEEE 754 201X `minimum`: the lane-wise smaller value, with NaN propagated
+// whenever either input lane is NaN.
+template <>
+Vectorized<BFloat16> inline minimum(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b) {
+  __m512 a_lo, a_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  __m512 b_lo, b_hi;
+  cvtbf16_fp32(__m512i(b), b_lo, b_hi);
+
+  // Bit i of each mask is set when lane i of `a` or `b` is NaN.
+  const auto unordered_lo = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  const auto unordered_hi = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+
+  // All-ones in the flagged lanes, zero elsewhere; all-ones is a NaN, so
+  // OR-ing it into the min result forces NaN exactly where required.
+  const __m512i zeros = _mm512_set1_epi32(0);
+  const auto nan_fill_lo = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zeros, unordered_lo, 0xFFFFFFFF));
+  const auto nan_fill_hi = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zeros, unordered_hi, 0xFFFFFFFF));
+
+  const auto res_lo = _mm512_or_ps(_mm512_min_ps(a_lo, b_lo), nan_fill_lo);
+  const auto res_hi = _mm512_or_ps(_mm512_min_ps(a_hi, b_hi), nan_fill_hi);
+  return cvtfp32_bf16(res_lo, res_hi);
+}
+
+// Lane-wise clamp of `a` into [min, max], evaluated in fp32 as
+// min_ps(max, max_ps(min, a)).
+template <>
+Vectorized<BFloat16> inline clamp(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min,
+    const Vectorized<BFloat16>& max) {
+  __m512 val_lo, val_hi;
+  cvtbf16_fp32(__m512i(a), val_lo, val_hi);
+  __m512 floor_lo, floor_hi;
+  cvtbf16_fp32(__m512i(min), floor_lo, floor_hi);
+  __m512 ceil_lo, ceil_hi;
+  cvtbf16_fp32(__m512i(max), ceil_lo, ceil_hi);
+
+  const auto raised_lo = _mm512_max_ps(floor_lo, val_lo);
+  const auto raised_hi = _mm512_max_ps(floor_hi, val_hi);
+  return cvtfp32_bf16(
+      _mm512_min_ps(ceil_lo, raised_lo), _mm512_min_ps(ceil_hi, raised_hi));
+}
+
+// Lane-wise upper clamp, evaluated in fp32 as min_ps(max, a).
+template <>
+Vectorized<BFloat16> inline clamp_max(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& max) {
+  __m512 val_lo, val_hi;
+  cvtbf16_fp32(__m512i(a), val_lo, val_hi);
+  __m512 ceil_lo, ceil_hi;
+  cvtbf16_fp32(__m512i(max), ceil_lo, ceil_hi);
+  return cvtfp32_bf16(
+      _mm512_min_ps(ceil_lo, val_lo), _mm512_min_ps(ceil_hi, val_hi));
+}
+
+// Lane-wise lower clamp, evaluated in fp32 as max_ps(min, a).
+template <>
+Vectorized<BFloat16> inline clamp_min(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min) {
+  __m512 val_lo, val_hi;
+  cvtbf16_fp32(__m512i(a), val_lo, val_hi);
+  __m512 floor_lo, floor_hi;
+  cvtbf16_fp32(__m512i(min), floor_lo, floor_hi);
+  return cvtfp32_bf16(
+      _mm512_max_ps(floor_lo, val_lo), _mm512_max_ps(floor_hi, val_hi));
+}
+
+// BFloat16 -> BFloat16 "conversion": a plain copy of n elements. Full
+// vectors are moved with unaligned 512-bit loads/stores; the remainder is
+// copied element by element.
+template <>
+inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
+  int64_t pos;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (pos = 0; pos <= (n - Vectorized<BFloat16>::size());
+       pos += Vectorized<BFloat16>::size()) {
+    void* from = (void*)(src + pos);
+    void* to = (void*)(dst + pos);
+    const auto chunk = _mm512_loadu_si512(reinterpret_cast<__m512i*>(from));
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), chunk);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos < n; pos++) {
+    dst[pos] = src[pos];
+  }
+}
+
+// float -> BFloat16 conversion of n elements. Each vector iteration reads
+// two 16-float vectors and packs them into one BFloat16 vector; leftovers
+// go through the scalar c10::convert.
+template <>
+inline void convert(const float* src, BFloat16* dst, int64_t n) {
+  int64_t pos = 0;
+  while (pos + Vectorized<BFloat16>::size() <= n) {
+    const __m512 first_half = _mm512_loadu_ps(&src[pos]);
+    const __m512 second_half = _mm512_loadu_ps(&src[pos + 16]);
+    const __m512i packed = cvtfp32_bf16(first_half, second_half);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[pos]), packed);
+    pos += Vectorized<BFloat16>::size();
+  }
+  while (pos < n) {
+    dst[pos] = c10::convert<BFloat16>(src[pos]);
+    pos++;
+  }
+}
+
+// double -> BFloat16 conversion of n elements. Doubles are first narrowed to
+// fp32 (16 at a time), two such vectors are packed into one BFloat16 vector,
+// and leftovers go through the scalar c10::convert.
+template <>
+inline void convert(const double* src, BFloat16* dst, int64_t n) {
+  // Builds one 16-float vector from 16 consecutive doubles.
+  auto narrow16 = [](const double* p) -> __m512 {
+    const __m256 first8 = _mm512_cvtpd_ps(_mm512_loadu_pd(p));
+    const __m256 next8 = _mm512_cvtpd_ps(_mm512_loadu_pd(p + 8));
+    return _mm512_insertf32x8(_mm512_castps256_ps512(first8), next8, 1);
+  };
+
+  int64_t pos = 0;
+  while (pos + Vectorized<BFloat16>::size() <= n) {
+    const __m512 first_half = narrow16(&src[pos]);
+    const __m512 second_half = narrow16(&src[pos + 16]);
+    const __m512i packed = cvtfp32_bf16(first_half, second_half);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[pos]), packed);
+    pos += Vectorized<BFloat16>::size();
+  }
+  while (pos < n) {
+    dst[pos] = c10::convert<BFloat16>(src[pos]);
+    pos++;
+  }
+}
+
+// Lane-wise a * b + c, evaluated with the fp32 fused multiply-add intrinsic
+// and narrowed back to BFloat16.
+template <>
+Vectorized<BFloat16> inline fmadd(
+    const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b,
+    const Vectorized<BFloat16>& c) {
+  __m512 mul1_lo, mul1_hi;
+  cvtbf16_fp32(__m512i(a), mul1_lo, mul1_hi);
+  __m512 mul2_lo, mul2_hi;
+  cvtbf16_fp32(__m512i(b), mul2_lo, mul2_hi);
+  __m512 add_lo, add_hi;
+  cvtbf16_fp32(__m512i(c), add_lo, add_hi);
+  return cvtfp32_bf16(
+      _mm512_fmadd_ps(mul1_lo, mul2_lo, add_lo),
+      _mm512_fmadd_ps(mul1_hi, mul2_hi, add_hi));
+}
+
+// Register-level core of the 16x16 transpose for 16-bit element types.
+//   t: 16 input rows (named a..p below), one __m256i of 16 elements each.
+//   u: 8 output registers. The callers in this file store the low 256 bits of
+//      u[i] as output row 2*i and the high 256 bits as output row 2*i+1.
+// The layout comments list 16-bit elements as <row letter><column index>.
+static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
+  __m512i r[8];
+  // Pack two input rows into each 512-bit register (low half | high half):
+  //   r0: a0-a15 | e0-e15
+  //   r1: b0-b15 | f0-f15
+  //   r2: c0-c15 | g0-g15
+  //   r3: d0-d15 | h0-h15
+  //   r4: i0-i15 | m0-m15
+  //   r5: j0-j15 | n0-n15
+  //   r6: k0-k15 | o0-o15
+  //   r7: l0-l15 | p0-p15
+#ifndef __msvc_cl__
+#pragma unroll(4)
+#endif
+  for (int i = 0; i < 4; i++) {
+    r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01);
+    r[i + 4] =
+        _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01);
+  }
+
+  // Interleave 32-bit pairs of neighbouring registers:
+  //   u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11
+  //       e0e1 f0f1 e2e3 f2f3 e8e9 f8f9 e10e11 f10f11
+  //   u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15
+  //       e4e5 f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15
+  //   u2: c0c1 d0d1 c2c3 d2d3 c8c9 d8d9 c10c11 d10d11
+  //       g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11
+  //   u3: c4c5 d4d5 c6c7 d6d7 c12c13 d12d13 c14c15 d14d15
+  //       g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15
+  //   u4, u5: same pattern for rows i j | m n
+  //   u6, u7: same pattern for rows k l | o p
+#ifndef __msvc_cl__
+#pragma unroll(4)
+#endif
+  for (int i = 0; i < 8; i += 2) {
+    u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]);
+    u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]);
+  }
+
+  // Interleave 64-bit groups so each register holds four source rows:
+  //   r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9
+  //       e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9
+  //   r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11
+  //       e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11
+  //   r2: a4a5 b4b5 c4c5 d4d5 a12a13 b12b13 c12c13 d12d13 ...
+  //   r3: a6a7 b6b7 c6c7 d6d7 a14a15 b14b15 c14c15 d14d15 ...
+  //   r4-r7: same pattern for rows i j k l | m n o p
+  r[0] = _mm512_unpacklo_epi64(u[0], u[2]);
+  r[1] = _mm512_unpackhi_epi64(u[0], u[2]);
+  r[2] = _mm512_unpacklo_epi64(u[1], u[3]);
+  r[3] = _mm512_unpackhi_epi64(u[1], u[3]);
+  r[4] = _mm512_unpacklo_epi64(u[4], u[6]);
+  r[5] = _mm512_unpackhi_epi64(u[4], u[6]);
+  r[6] = _mm512_unpacklo_epi64(u[5], u[7]);
+  r[7] = _mm512_unpackhi_epi64(u[5], u[7]);
+
+  // 16-bit selection indices for _mm512_permutex2var_epi16: values 0x00-0x1f
+  // pick from the first source, 0x20-0x3f from the second. const1 gathers the
+  // even-numbered 16-bit elements of each pair, const2 the odd-numbered ones.
+  __m512i const1 = _mm512_set_epi32(
+      0x00370035,
+      0x00330031,
+      0x00270025,
+      0x00230021,
+      0x00170015,
+      0x00130011,
+      0x00070005,
+      0x00030001,
+      0x00360034,
+      0x00320030,
+      0x00260024,
+      0x00220020,
+      0x00160014,
+      0x00120010,
+      0x00060004,
+      0x00020000);
+  __m512i const2 = _mm512_set_epi32(
+      0x003f003d,
+      0x003b0039,
+      0x002f002d,
+      0x002b0029,
+      0x001f001d,
+      0x001b0019,
+      0x000f000d,
+      0x000b0009,
+      0x003e003c,
+      0x003a0038,
+      0x002e002c,
+      0x002a0028,
+      0x001e001c,
+      0x001a0018,
+      0x000e000c,
+      0x000a0008);
+  // merge values from two regs
+  // (output column pairs held by u[0]..u[7], low half then high half)
+  // 0-- 1--
+  // 8-- 9--
+  // 2-- 3--
+  // 10-- 11--
+  // 4-- 5--
+  // 12-- 13--
+  // 6-- 7--
+  // 14-- 15--
+#ifndef __msvc_cl__
+#pragma unroll(4)
+#endif
+  for (int i = 0; i < 4; i++) {
+    u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]);
+    u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]);
+  }
+}
+
+// TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607
+// Transposes a 16x16 BFloat16 tile.
+//   src / ld_src: first element of the tile and the element stride between
+//                 consecutive source rows.
+//   dst / ld_dst: first element of the output tile and its row stride.
+// Every source row is loaded as one 256-bit vector, the register shuffle
+// network in _transpose_mxn_half_16_16 does the transposition, and each
+// returned 512-bit register is written out as two consecutive output rows.
+template <>
+inline void transpose_mxn<BFloat16, 16, 16>(
+    const BFloat16* src,
+    int64_t ld_src,
+    BFloat16* dst,
+    int64_t ld_dst) {
+  __m256i in_rows[16];
+#ifndef __msvc_cl__
+#pragma unroll(16)
+#endif
+  for (int row = 0; row < 16; row++) {
+    const BFloat16* row_ptr = src + row * ld_src;
+    in_rows[row] =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ptr));
+  }
+
+  __m512i out_pairs[8];
+  _transpose_mxn_half_16_16(in_rows, out_pairs);
+
+#ifndef __msvc_cl__
+#pragma unroll(8)
+#endif
+  for (int pair = 0; pair < 8; pair++) {
+    // Low 256 bits -> even output row, high 256 bits -> odd output row.
+    BFloat16* even_row = dst + (pair * 2) * ld_dst;
+    BFloat16* odd_row = dst + (pair * 2 + 1) * ld_dst;
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(even_row),
+        _mm512_extracti32x8_epi32(out_pairs[pair], 0x0));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(odd_row),
+        _mm512_extracti32x8_epi32(out_pairs[pair], 0x01));
+  }
+}
+
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607
+// Transposes a 16x16 Half tile.
+//   src / ld_src: first element of the tile and the element stride between
+//                 consecutive source rows.
+//   dst / ld_dst: first element of the output tile and its row stride.
+// Every source row is loaded as one 256-bit vector, the register shuffle
+// network in _transpose_mxn_half_16_16 does the transposition, and each
+// returned 512-bit register is written out as two consecutive output rows.
+template <>
+inline void transpose_mxn<Half, 16, 16>(
+    const Half* src,
+    int64_t ld_src,
+    Half* dst,
+    int64_t ld_dst) {
+  __m256i in_rows[16];
+#ifndef __msvc_cl__
+#pragma unroll(16)
+#endif
+  for (int row = 0; row < 16; row++) {
+    const Half* row_ptr = src + row * ld_src;
+    in_rows[row] =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ptr));
+  }
+
+  __m512i out_pairs[8];
+  _transpose_mxn_half_16_16(in_rows, out_pairs);
+
+#ifndef __msvc_cl__
+#pragma unroll(8)
+#endif
+  for (int pair = 0; pair < 8; pair++) {
+    // Low 256 bits -> even output row, high 256 bits -> odd output row.
+    Half* even_row = dst + (pair * 2) * ld_dst;
+    Half* odd_row = dst + (pair * 2 + 1) * ld_dst;
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(even_row),
+        _mm512_extracti32x8_epi32(out_pairs[pair], 0x0));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(odd_row),
+        _mm512_extracti32x8_epi32(out_pairs[pair], 0x01));
+  }
+}
+
+// Register-level core of the 32x32 transpose for 16-bit element types.
+//   r: 32 input rows, one __m512i of 32 elements each. The array is also used
+//      as scratch space and is overwritten.
+//   d: 32 output rows of the transposed tile.
+// Five shuffle stages ping-pong between the two arrays (r -> d -> r -> d ->
+// r -> d). In the layout comments, t[k] is the k-th register produced by the
+// stage that follows the comment, and each number is the flat index
+// (row * 32 + column) of an element of the input tile.
+static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
+  // Stage 1: interleave 16-bit elements of adjacent rows (result in d).
+  // t[0]: 0 32 1 33 2 34 3 35 8 40 9 41 10 42 11 43 16 ... 59
+  // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63
+  // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123
+  // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127
+  // ...
+  // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007
+  //        980 ... 1023
+#ifndef __msvc_cl__
+#pragma unroll(16)
+#endif
+  for (int i = 0; i < 16; ++i) {
+    d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]);
+    d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]);
+  }
+
+  // Stage 2: interleave 32-bit groups (result in r).
+  // t[0]: 0 32 64 96 1 33 65 97 8 40 72 104 9 41 73 105 16 ... 121
+  // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123
+  // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125
+  // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127
+  // ...
+  // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007
+  //        918 ... 1023
+#ifndef __msvc_cl__
+#pragma unroll(8)
+#endif
+  for (int i = 0; i < 8; ++i) {
+    r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]);
+    r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]);
+    r[i * 4 + 2] = _mm512_unpacklo_epi32(d[i * 4 + 1], d[i * 4 + 3]);
+    r[i * 4 + 3] = _mm512_unpackhi_epi32(d[i * 4 + 1], d[i * 4 + 3]);
+  }
+
+  // Stage 3: interleave 64-bit groups (result in d).
+  // t[0]: 0 32 64 96 128 160 192 224 8 40 72 104 136 168 200 232 16 ... 248
+  // t[1]: 1 33 65 97 129 161 193 225 9 41 73 105 137 169 201 233 17 ... 249
+  // t[2]: 2 34 66 98 130 162 194 226 10 42 74 106 138 170 202 234 18 ... 250
+  // t[3]: 3 35 67 99 131 163 195 227 11 43 75 107 139 171 203 235 19 ... 251
+  // ...
+  // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791
+  //        ... 1023
+#ifndef __msvc_cl__
+#pragma unroll(4)
+#endif
+  for (int i = 0; i < 4; ++i) {
+    d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]);
+    d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]);
+    d[i * 8 + 2] = _mm512_unpacklo_epi64(r[i * 8 + 1], r[i * 8 + 5]);
+    d[i * 8 + 3] = _mm512_unpackhi_epi64(r[i * 8 + 1], r[i * 8 + 5]);
+    d[i * 8 + 4] = _mm512_unpacklo_epi64(r[i * 8 + 2], r[i * 8 + 6]);
+    d[i * 8 + 5] = _mm512_unpackhi_epi64(r[i * 8 + 2], r[i * 8 + 6]);
+    d[i * 8 + 6] = _mm512_unpacklo_epi64(r[i * 8 + 3], r[i * 8 + 7]);
+    d[i * 8 + 7] = _mm512_unpackhi_epi64(r[i * 8 + 3], r[i * 8 + 7]);
+  }
+
+  // Stage 4: merge 128-bit lanes of register pairs (result in r).
+  // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 16 ... 496
+  // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497
+  // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498
+  // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499
+  // ...
+  // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543
+  //        ... 1023
+  // 64-bit selection indices for _mm512_permutex2var_epi64: 0-7 pick from the
+  // first source, 8-15 from the second. const1 takes 128-bit lanes 0 and 2 of
+  // both sources, const2 takes lanes 1 and 3.
+  __m512i const1 = _mm512_set_epi64(
+      0x000000000000000d,
+      0x000000000000000c,
+      0x0000000000000005,
+      0x0000000000000004,
+      0x0000000000000009,
+      0x0000000000000008,
+      0x0000000000000001,
+      0x0000000000000000);
+  __m512i const2 = _mm512_set_epi64(
+      0x000000000000000f,
+      0x000000000000000e,
+      0x0000000000000007,
+      0x0000000000000006,
+      0x000000000000000b,
+      0x000000000000000a,
+      0x0000000000000003,
+      0x0000000000000002);
+#ifndef __msvc_cl__
+#pragma unroll(8)
+#endif
+  for (int i = 0; i < 8; ++i) {
+    r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/ const1, d[i + 8]);
+    r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/ const2, d[i + 8]);
+    r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const1, d[i + 24]);
+    r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const2, d[i + 24]);
+  }
+
+  // Stage 5: merge 256-bit halves (result in d, the fully transposed tile).
+  // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544
+  //       ... 992
+  // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 513 545
+  //       ... 993
+  // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 514 546
+  //       ... 994
+  // ...
+  // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575
+  //        ... 1023
+  // const3 takes the low 256 bits of both sources, const4 the high 256 bits.
+  __m512i const3 = _mm512_set_epi64(
+      0x000000000000000b,
+      0x000000000000000a,
+      0x0000000000000009,
+      0x0000000000000008,
+      0x0000000000000003,
+      0x0000000000000002,
+      0x0000000000000001,
+      0x0000000000000000);
+  __m512i const4 = _mm512_set_epi64(
+      0x000000000000000f,
+      0x000000000000000e,
+      0x000000000000000d,
+      0x000000000000000c,
+      0x0000000000000007,
+      0x0000000000000006,
+      0x0000000000000005,
+      0x0000000000000004);
+#ifndef __msvc_cl__
+#pragma unroll(16)
+#endif
+  for (int i = 0; i < 16; ++i) {
+    d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/ const3, r[i + 16]);
+    d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/ const4, r[i + 16]);
+  }
+}
+
+// Transposes an M x N BFloat16 tile (M, N <= 32) from src into dst.
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6
+template <>
+inline void transpose_mxn<BFloat16>(
+    const BFloat16* src,
+    int64_t ld_src,
+    BFloat16* dst,
+    int64_t ld_dst,
+    int M,
+    int N) {
+  TORCH_CHECK(
+      M <= 32 && N <= 32, "transpose_mxn<BFloat16> expects M, N <= 32.");
+  // Load M rows of N elements each; lanes past N are zero-filled.
+  __m512i r[32];
+  int i;
+  if (N == 32) {
+    for (i = 0; i < M; ++i) {
+      r[i] = _mm512_loadu_si512(&src[i * ld_src]);
+    }
+  } else {
+    // Unsigned shift: the signed `(1 << N) - 1` overflows int when N == 31.
+    __mmask32 src_mask = (1U << N) - 1;
+    for (i = 0; i < M; ++i) {
+      r[i] = _mm512_maskz_loadu_epi16(src_mask, &src[i * ld_src]);
+    }
+  }
+  for (; i < 32; ++i) {
+    r[i] = _mm512_setzero_si512(); // zero the rows beyond M
+  }
+  __m512i d[32];
+  _transpose_mxn_half_32_32(r, d);
+  // Store N rows of M elements each; lanes past M are not written.
+  if (M == 32) {
+    for (i = 0; i < N; ++i) {
+      _mm512_storeu_si512(&dst[i * ld_dst], d[i]);
+    }
+  } else {
+    __mmask32 dst_mask = (1U << M) - 1; // unsigned, as for src_mask
+    for (i = 0; i < N; ++i) {
+      _mm512_mask_storeu_epi16(&dst[i * ld_dst], dst_mask, d[i]);
+    }
+  }
+}
+
+template < // fixed-size BFloat16 transpose: M and N are template arguments
+    typename T,
+    int M,
+    int N,
+    typename std::enable_if_t<
+        std::is_same_v<T, BFloat16> &&
+            ((M <= 32 && M != 16) || (N <= 32 && N != 16)),
+        int> = 0> // SFINAE gate on the element type and tile extents above
+inline void transpose_mxn(
+    const BFloat16* src,
+    int64_t ld_src,
+    BFloat16* dst,
+    int64_t ld_dst) {
+  transpose_mxn<BFloat16>(src, ld_src, dst, ld_dst, M, N); // runtime-sized overload
+}
+
+// Transposes an M x N Half tile (M, N <= 32) from src into dst.
+template <>
+inline void transpose_mxn<Half>(
+    const Half* src,
+    int64_t ld_src,
+    Half* dst,
+    int64_t ld_dst,
+    int M,
+    int N) {
+  TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn<Half> expects M, N <= 32.");
+  // Load M rows of N elements each; lanes past N are zero-filled.
+  __m512i r[32];
+  int i;
+  if (N == 32) {
+    for (i = 0; i < M; ++i) {
+      r[i] = _mm512_loadu_si512(&src[i * ld_src]);
+    }
+  } else {
+    // Unsigned shift: the signed `(1 << N) - 1` overflows int when N == 31.
+    __mmask32 src_mask = (1U << N) - 1;
+    for (i = 0; i < M; ++i) {
+      r[i] = _mm512_maskz_loadu_epi16(src_mask, &src[i * ld_src]);
+    }
+  }
+  for (; i < 32; ++i) {
+    r[i] = _mm512_setzero_si512(); // zero the rows beyond M
+  }
+  __m512i d[32];
+  _transpose_mxn_half_32_32(r, d);
+  // Store N rows of M elements each; lanes past M are not written.
+  if (M == 32) {
+    for (i = 0; i < N; ++i) {
+      _mm512_storeu_si512(&dst[i * ld_dst], d[i]);
+    }
+  } else {
+    __mmask32 dst_mask = (1U << M) - 1; // unsigned, as for src_mask
+    for (i = 0; i < N; ++i) {
+      _mm512_mask_storeu_epi16(&dst[i * ld_dst], dst_mask, d[i]);
+    }
+  }
+}
+
+template < // fixed-size Half transpose: M and N are template arguments
+    typename T,
+    int M,
+    int N,
+    typename std::enable_if_t<
+        std::is_same_v<T, Half> &&
+            ((M <= 32 && M != 16) || (N <= 32 && N != 16)),
+        int> = 0> // SFINAE gate on the element type and tile extents above
+inline void transpose_mxn(
+    const Half* src,
+    int64_t ld_src,
+    Half* dst,
+    int64_t ld_dst) {
+  transpose_mxn<Half>(src, ld_src, dst, ld_dst, M, N); // runtime-sized overload
+}
+
+template <>
+struct is_vec_specialized_for<Half> : std::bool_constant<true> {}; // trait is true for Half
+
+template <>
+class Vectorized<Half> : public Vectorized16<Half> {
+ public:
+  using Vectorized16::Vectorized16; // inherit the base-class constructors
+
+  using value_type = Half;
+
+  Vectorized<Half> frac() const; // defined out of line as *this - trunc()
+  // Comparison helpers, defined out of line: operator result & Half(1.0f).
+  Vectorized<Half> eq(const Vectorized<Half>& other) const;
+  Vectorized<Half> ne(const Vectorized<Half>& other) const;
+  Vectorized<Half> gt(const Vectorized<Half>& other) const;
+  Vectorized<Half> ge(const Vectorized<Half>& other) const;
+  Vectorized<Half> lt(const Vectorized<Half>& other) const;
+  Vectorized<Half> le(const Vectorized<Half>& other) const;
+};
+
+Vectorized<Half> inline operator+( // lane-wise a + b via fp32 _mm512_add_ps
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_add_ps(x, y);
+  });
+}
+Vectorized<Half> inline operator-( // lane-wise a - b via fp32 _mm512_sub_ps
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_sub_ps(x, y);
+  });
+}
+Vectorized<Half> inline operator*( // lane-wise a * b via fp32 _mm512_mul_ps
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_mul_ps(x, y);
+  });
+}
+Vectorized<Half> inline operator/( // lane-wise a / b via fp32 _mm512_div_ps
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_div_ps(x, y);
+  });
+}
+
+Vectorized<Half> inline operator&( // bitwise AND of the raw 512-bit values
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return _mm512_and_si512(a, b);
+}
+Vectorized<Half> inline operator|( // bitwise OR of the raw 512-bit values
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return _mm512_or_si512(a, b);
+}
+Vectorized<Half> inline operator^( // bitwise XOR of the raw 512-bit values
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+inline Vectorized<Half> Vectorized<Half>::eq( // (== result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this == other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::ne( // (!= result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this != other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::gt( // (> result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this > other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::ge( // (>= result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this >= other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::lt( // (< result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this < other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::le( // (<= result) & bits of Half(1.0f)
+    const Vectorized<Half>& other) const {
+  return (*this <= other) & Vectorized<Half>(1.0f);
+}
+
+// frac: x - trunc(x). Defined here, after operator-, so subtraction is usable.
+inline Vectorized<Half> Vectorized<Half>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN (computed in fp32, then narrowed back to fp16).
+template <>
+Vectorized<Half> inline maximum(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  __m512 a_lo, a_hi, b_lo, b_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(b), b_lo, b_hi);
+  auto max_lo = _mm512_max_ps(a_lo, b_lo);
+  auto max_hi = _mm512_max_ps(a_hi, b_hi);
+  // One mask bit per fp32 lane, set when either operand of that lane is NaN.
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  // Expand each set mask bit into an all-ones fp32 lane (zero elsewhere).
+  auto nan_lo = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(nan_lo_mask, -1));
+  auto nan_hi = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(nan_hi_mask, -1));
+  // All-ones is a NaN bit pattern, so OR-ing it in forces NaN in those lanes.
+  return cvtfp32_fp16(
+      _mm512_or_ps(max_lo, nan_lo), _mm512_or_ps(max_hi, nan_hi));
+}
+
+// IEEE 754 201X `minimum` for Half: the lane-wise smaller value, with NaN
+// propagated whenever either input lane is NaN.
+template <>
+Vectorized<Half> inline minimum(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b) {
+  // Convert both operands into their low/high fp32 vectors.
+  __m512 lhs_lo, lhs_hi, rhs_lo, rhs_hi;
+  cvtfp16_fp32(__m512i(a), lhs_lo, lhs_hi);
+  cvtfp16_fp32(__m512i(b), rhs_lo, rhs_hi);
+  const __m512i zeros = _mm512_set1_epi32(0);
+  // Mask bits are set for lanes where the comparison is unordered (NaN).
+  const auto unord_lo = _mm512_cmp_ps_mask(lhs_lo, rhs_lo, _CMP_UNORD_Q);
+  const auto unord_hi = _mm512_cmp_ps_mask(lhs_hi, rhs_hi, _CMP_UNORD_Q);
+  // All-ones (a NaN bit pattern) in unordered lanes, zero elsewhere.
+  const __m512 force_nan_lo = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zeros, unord_lo, 0xFFFFFFFF));
+  const __m512 force_nan_hi = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(zeros, unord_hi, 0xFFFFFFFF));
+  // OR the NaN pattern over the plain fp32 minimum, then narrow to fp16.
+  return cvtfp32_fp16(
+      _mm512_or_ps(_mm512_min_ps(lhs_lo, rhs_lo), force_nan_lo),
+      _mm512_or_ps(_mm512_min_ps(lhs_hi, rhs_hi), force_nan_hi));
+}
+
+// Lane-wise clamp of `a`: computes min(max, max(min, a)) on fp32 vectors.
+template <>
+Vectorized<Half> inline clamp(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& min,
+    const Vectorized<Half>& max) {
+  __m512 val_lo, val_hi, lower_lo, lower_hi, upper_lo, upper_hi;
+  cvtfp16_fp32(__m512i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m512i(min), lower_lo, lower_hi);
+  cvtfp16_fp32(__m512i(max), upper_lo, upper_hi);
+  // Apply the lower bound first, then the upper bound, on each fp32 half.
+  const __m512 lo = _mm512_min_ps(upper_lo, _mm512_max_ps(lower_lo, val_lo));
+  const __m512 hi = _mm512_min_ps(upper_hi, _mm512_max_ps(lower_hi, val_hi));
+  return cvtfp32_fp16(lo, hi);
+}
+
+// Lane-wise upper bound: min(max, a) evaluated on fp32 vectors.
+template <>
+Vectorized<Half> inline clamp_max(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& max) {
+  __m512 val_lo, val_hi, upper_lo, upper_hi;
+  cvtfp16_fp32(__m512i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m512i(max), upper_lo, upper_hi);
+  const __m512 lo = _mm512_min_ps(upper_lo, val_lo);
+  const __m512 hi = _mm512_min_ps(upper_hi, val_hi);
+  return cvtfp32_fp16(lo, hi);
+}
+
+// Lane-wise lower bound: max(min, a) evaluated on fp32 vectors.
+template <>
+Vectorized<Half> inline clamp_min(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& min) {
+  __m512 val_lo, val_hi, lower_lo, lower_hi;
+  cvtfp16_fp32(__m512i(a), val_lo, val_hi);
+  cvtfp16_fp32(__m512i(min), lower_lo, lower_hi);
+  const __m512 lo = _mm512_max_ps(lower_lo, val_lo);
+  const __m512 hi = _mm512_max_ps(lower_hi, val_hi);
+  return cvtfp32_fp16(lo, hi);
+}
+
+template <>
+inline void convert(const Half* src, Half* dst, int64_t n) { // Half -> Half copy
+  int64_t i; // shared by both loops: the tail resumes where the vector loop ends
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (i = 0; i <= (n - Vectorized<Half>::size());
+       i += Vectorized<Half>::size()) {
+    auto vsrc = // one full 512-bit block copied per iteration, no conversion
+        _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = src[i]; // scalar copy of the remaining elements
+  }
+}
+
+// Converts n floats to Half: full vectors via AVX512, scalar for the tail.
+template <>
+inline void convert(const float* src, Half* dst, int64_t n) {
+  const int64_t step = Vectorized<Half>::size();
+  int64_t pos = 0;
+  for (; pos + step <= n; pos += step) {
+    const __m512 first = _mm512_loadu_ps(src + pos);
+    const __m512 second = _mm512_loadu_ps(src + pos + 16);
+    const __m512i packed = cvtfp32_fp16(first, second);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + pos), packed);
+  }
+  for (; pos < n; ++pos) {
+    dst[pos] = c10::convert<Half>(src[pos]);
+  }
+}
+
+// Converts n doubles to Half, narrowing through fp32; scalar loop for the tail.
+template <>
+inline void convert(const double* src, Half* dst, int64_t n) {
+  // Reads 16 doubles and returns them as one vector of 16 floats.
+  auto narrow16 = [](const double* p) -> __m512 {
+    const __m256 lower = _mm512_cvtpd_ps(_mm512_loadu_pd(p));
+    const __m256 upper = _mm512_cvtpd_ps(_mm512_loadu_pd(p + 8));
+    return _mm512_insertf32x8(_mm512_castps256_ps512(lower), upper, 1);
+  };
+
+  const int64_t step = Vectorized<Half>::size();
+  int64_t pos = 0;
+  for (; pos + step <= n; pos += step) {
+    const __m512 first = narrow16(src + pos);
+    const __m512 second = narrow16(src + pos + 16);
+    const __m512i packed = cvtfp32_fp16(first, second);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + pos), packed);
+  }
+  for (; pos < n; ++pos) {
+    dst[pos] = c10::convert<Half>(src[pos]);
+  }
+}
+
+// Fused multiply-add a * b + c, evaluated on fp32 vectors and narrowed to fp16.
+template <>
+Vectorized<Half> inline fmadd(
+    const Vectorized<Half>& a,
+    const Vectorized<Half>& b,
+    const Vectorized<Half>& c) {
+  __m512 mul1_lo, mul1_hi, mul2_lo, mul2_hi, add_lo, add_hi;
+  cvtfp16_fp32(__m512i(a), mul1_lo, mul1_hi);
+  cvtfp16_fp32(__m512i(b), mul2_lo, mul2_hi);
+  cvtfp16_fp32(__m512i(c), add_lo, add_hi);
+  // _mm512_fmadd_ps computes (first * second) + third per lane.
+  const __m512 res_lo = _mm512_fmadd_ps(mul1_lo, mul2_lo, add_lo);
+  const __m512 res_hi = _mm512_fmadd_ps(mul1_hi, mul2_hi, add_hi);
+  return cvtfp32_fp16(res_lo, res_hi);
+}
+
+#define CONVERT_VECTORIZED_INIT(type, name) /* AVX512 path */   \
+  inline std::tuple<Vectorized<float>, Vectorized<float>>       \
+      convert_##name##_float(const Vectorized<type>& a) {       \
+    __m512 o1, o2;                                              \
+    cvt_to_fp32<type>(__m512i(a), o1, o2);                      \
+    return std::make_tuple(o1, o2);                             \
+  }                                                             \
+  /* Inverse: pack two fp32 vectors into one Vectorized<type>. */ \
+  inline Vectorized<type> convert_float_##name(                 \
+      const Vectorized<float>& a, const Vectorized<float>& b) { \
+    return cvt_from_fp32<type>(__m512(a), __m512(b));           \
+  }
+CONVERT_VECTORIZED_INIT(BFloat16, bfloat16) // convert_bfloat16_float, convert_float_bfloat16
+CONVERT_VECTORIZED_INIT(Half, half) // convert_half_float, convert_float_half
+
+#else // defined(CPU_CAPABILITY_AVX512)
+
+#define CONVERT_NON_VECTORIZED_INIT(type, name) /* scalar fallback */ \
+  inline std::tuple<Vectorized<float>, Vectorized<float>>           \
+      convert_##name##_float(const Vectorized<type>& a) {           \
+    constexpr int64_t K = Vectorized<type>::size();                 \
+    __at_align__ float arr[K];                                      \
+    __at_align__ type arr2[K];                                      \
+    a.store(arr2);                                                  \
+    for (const auto k : c10::irange(K)) {                           \
+      arr[k] = c10::convert<float>(arr2[k]);                        \
+    }                                                               \
+    return std::make_tuple(                                         \
+        Vectorized<float>::loadu(arr),                              \
+        Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
+  }                                                                 \
+  /* Inverse: element-wise convert two float vectors back to `type`. */ \
+  inline Vectorized<type> convert_float_##name(                     \
+      const Vectorized<float>& a, const Vectorized<float>& b) {     \
+    constexpr int64_t K = Vectorized<type>::size();                 \
+    __at_align__ float arr[K];                                      \
+    __at_align__ type arr2[K];                                      \
+    a.store(arr);                                                   \
+    b.store(arr + Vectorized<float>::size());                       \
+    for (const auto k : c10::irange(K)) {                           \
+      arr2[k] = c10::convert<type>(arr[k]);                         \
+    }                                                               \
+    return Vectorized<type>::loadu(arr2);                           \
+  }
+CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) // convert_bfloat16_float, convert_float_bfloat16
+CONVERT_NON_VECTORIZED_INIT(Half, half) // convert_half_float, convert_float_half
+
+#endif // defined(CPU_CAPABILITY_AVX512)
+
+#if defined(CPU_CAPABILITY_AVX512)
+#define LOAD_FP32_VECTORIZED_INIT(type, name) /* 256-bit load -> one fp32 vec */ \
+  inline void load_fp32_from_##name(                                          \
+      const type* data, Vectorized<float>& out) {                             \
+    auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
+    __m512 out_values;                                                        \
+    cvt_to_fp32<type>(values, out_values);                                    \
+    out = out_values;                                                         \
+  }                                                                           \
+  /* Overload: one full Vectorized<type> load -> two fp32 vectors. */         \
+  inline void load_fp32_from_##name(                                          \
+      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) {   \
+    auto vec = Vectorized<type>::loadu(data);                                 \
+    __m512 out1_values, out2_values;                                          \
+    cvt_to_fp32<type>(vec, out1_values, out2_values);                         \
+    out1 = out1_values;                                                       \
+    out2 = out2_values;                                                       \
+  }
+LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) // defines load_fp32_from_bf16
+LOAD_FP32_VECTORIZED_INIT(Half, fp16) // defines load_fp32_from_fp16
+
+#else // defined(CPU_CAPABILITY_AVX512)
+#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) /* scalar fallback */     \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out) {                           \
+    __at_align__ float values[Vectorized<float>::size()];                   \
+    for (const auto k : c10::irange(Vectorized<float>::size())) {           \
+      values[k] = data[k];                                                  \
+    }                                                                       \
+    out = Vectorized<float>::loadu(values);                                 \
+  }                                                                         \
+  /* Overload: fills out1, then out2 from the next size() elements. */      \
+  inline void load_fp32_from_##name(                                        \
+      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+    load_fp32_from_##name(data, out1);                                      \
+    data += Vectorized<float>::size();                                      \
+    load_fp32_from_##name(data, out2);                                      \
+  }
+LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) // defines load_fp32_from_bf16
+LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) // defines load_fp32_from_fp16
+
+#endif
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..0779363c788634d77d10dd700b7c203cae2c206d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h
@@ -0,0 +1,661 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX512)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+template <>
+struct is_vec_specialized_for<c10::complex<double>> : std::bool_constant<true> {
+};
+
+template <>
+class Vectorized<c10::complex<double>> {
+ private:
+  __m512d values;
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+
+ public:
+  using value_type = c10::complex<double>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {
+    values = _mm512_setzero_pd();
+  }
+  Vectorized(__m512d v) : values(v) {}
+  Vectorized(c10::complex<double> val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    values = _mm512_setr_pd(
+        real_value,
+        imag_value,
+        real_value,
+        imag_value,
+        real_value,
+        imag_value,
+        real_value,
+        imag_value);
+  }
+  Vectorized(
+      c10::complex<double> val1,
+      c10::complex<double> val2,
+      c10::complex<double> val3,
+      c10::complex<double> val4) {
+    values = _mm512_setr_pd(
+        val1.real(),
+        val1.imag(),
+        val2.real(),
+        val2.imag(),
+        val3.real(),
+        val3.imag(),
+        val4.real(),
+        val4.imag());
+  }
+  operator __m512d() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<double>> blend(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    // NOLINTNEXTLINE(clang-diagnostic-warning)
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm512_mask_blend_pd(
+            0x03, a.values, b.values); // b0000 0001 = b0000 0011
+      case 2:
+        return _mm512_mask_blend_pd(
+            0x0C, a.values, b.values); // b0000 0010 = b0000 1100
+      case 3:
+        return _mm512_mask_blend_pd(
+            0x0F, a.values, b.values); // b0000 0011 = b0000 1111
+      case 4:
+        return _mm512_mask_blend_pd(
+            0x30, a.values, b.values); // b0000 0100 = b0011 0000
+      case 5:
+        return _mm512_mask_blend_pd(
+            0x33, a.values, b.values); // b0000 0101 = b0011 0011
+      case 6:
+        return _mm512_mask_blend_pd(
+            0x3C, a.values, b.values); // b0000 0110 = b0011 1100
+      case 7:
+        return _mm512_mask_blend_pd(
+            0x3F, a.values, b.values); // b0000 0111 = b0011 1111
+      case 8:
+        return _mm512_mask_blend_pd(
+            0xC0, a.values, b.values); // b0000 1000 = b1100 0000
+      case 9:
+        return _mm512_mask_blend_pd(
+            0xC3, a.values, b.values); // b0000 1001 = b1100 0011
+      case 10:
+        return _mm512_mask_blend_pd(
+            0xCC, a.values, b.values); // b0000 1010 = b1100 1100
+      case 11:
+        return _mm512_mask_blend_pd(
+            0xCF, a.values, b.values); // b0000 1011 = b1100 1111
+      case 12:
+        return _mm512_mask_blend_pd(
+            0xF0, a.values, b.values); // b0000 1100 = b1111 0000
+      case 13:
+        return _mm512_mask_blend_pd(
+            0xF3, a.values, b.values); // b0000 1101 = b1111 0011
+      case 14:
+        return _mm512_mask_blend_pd(
+            0xFC, a.values, b.values); // b0000 1110 = b1111 1100
+      case 15:
+        return _mm512_mask_blend_pd(
+            0xFF, a.values, b.values); // b0000 1111 = b1111 1111
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> blendv(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b,
+      const Vectorized<c10::complex<double>>& mask) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm512_unpacklo_pd(mask.values, mask.values);
+    auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+    auto mmask = _mm512_cmp_epi64_mask(
+        _mm512_castpd_si512(mask_), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_pd(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<c10::complex<double>> arange(
+      c10::complex<double> base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<double>>(
+        base,
+        base + c10::complex<double>(1) * step,
+        base + c10::complex<double>(2) * step,
+        base + c10::complex<double>(3) * step);
+  }
+  static Vectorized<c10::complex<double>> set(
+      const Vectorized<c10::complex<double>>& a,
+      const Vectorized<c10::complex<double>>& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> loadu(
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+    __at_align__ double tmp_values[2 * size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2 * size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const double*>(ptr),
+        count * sizeof(c10::complex<double>));
+    return _mm512_load_pd(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) {
+      double tmp_values[2 * size()];
+      _mm512_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<double>));
+    }
+  }
+  const c10::complex<double>& operator[](int idx) const = delete;
+  c10::complex<double>& operator[](int idx) = delete;
+  Vectorized<c10::complex<double>> map(
+      c10::complex<double> (*const f)(const c10::complex<double>&)) const {
+    __at_align__ c10::complex<double> tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  // AVX512 doesn't have horizontal add & horizontal sub instructions.
+  // TODO: hadd_pd() & hsub_pd() may have scope for improvement.
+  static inline __m512d hadd_pd(__m512d a, __m512d b) {
+    __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0);
+    __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1);
+    return _mm512_add_pd(
+        _mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+        _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+  }
+  static inline __m512d hsub_pd(__m512d a, __m512d b) {
+    __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0);
+    __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1);
+    return _mm512_sub_pd(
+        _mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+        _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+  }
+  __m512d abs_2_() const {
+    auto val_2 = _mm512_mul_pd(values, values); // a*a     b*b
+    return hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b
+  }
+  __m512d abs_() const {
+    auto real = _mm512_movedup_pd(values); // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm512_permute_pd(values, 0xff); // imag imag
+    return Sleef_hypotd8_u05(real, imag); // abs  abs
+  }
+  Vectorized<c10::complex<double>> abs() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    return _mm512_and_pd(abs_(), real_mask); // abs     0
+  }
+  __m512d angle_() const {
+    // angle = atan2(b, a) for each a + bi; it lands in the imaginary slot.
+    auto b_a = _mm512_permute_pd(values, 0x55); // b        a
+    return Sleef_atan2d8_u10(values, b_a); // 90-angle angle
+  }
+  Vectorized<c10::complex<double>> angle() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    auto angle = _mm512_permute_pd(angle_(), 0x55); // angle    90-angle
+    return _mm512_and_pd(angle, real_mask); // angle    0
+  }
+  Vectorized<c10::complex<double>> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm512_setzero_pd();
+    auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
+    auto div = _mm512_div_pd(values, abs);
+    return _mm512_mask_blend_pd(mask, div, zero);
+  }
+  __m512d real_() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000));
+    return _mm512_and_pd(values, real_mask);
+  }
+  Vectorized<c10::complex<double>> real() const {
+    return real_();
+  }
+  __m512d imag_() const {
+    const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64(
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF,
+        0x0000000000000000,
+        0xFFFFFFFFFFFFFFFF));
+    return _mm512_and_pd(values, imag_mask);
+  }
+  Vectorized<c10::complex<double>> imag() const {
+    return _mm512_permute_pd(imag_(), 0x55); // b        a
+  }
+  __m512d conj_() const {
+    const __m512d sign_mask =
+        _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+    return _mm512_xor_pd(values, sign_mask); // a       -b
+  }
+  Vectorized<c10::complex<double>> conj() const {
+    return conj_();
+  }
+  Vectorized<c10::complex<double>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+  Vectorized<c10::complex<double>> log2() const {
+    const __m512d log2_ = _mm512_set1_pd(std::log(2));
+    return _mm512_div_pd(log(), log2_);
+  }
+  Vectorized<c10::complex<double>> log10() const {
+    const __m512d log10_ = _mm512_set1_pd(std::log(10));
+    return _mm512_div_pd(log(), log10_);
+  }
+  Vectorized<c10::complex<double>> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized<c10::complex<double>> asin() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // // asin(x)
+    // // = -i*ln(iz + sqrt(1 -z^2))
+    // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    // const __m512d one = _mm512_set1_pd(1);
+
+    // auto conj = conj_();
+    // auto b_a = _mm512_permute_pd(conj, 0x55); // -b        a
+    // auto ab = _mm512_mul_pd(conj, b_a); // -ab       -ab
+    // auto im = _mm512_add_pd(ab, ab); // -2ab      -2ab
+
+    // auto val_2 = _mm512_mul_pd(values, values); // a*a       b*b
+    // auto re = hsub_pd(val_2, _mm512_permute_pd(val_2, 0x55)); // a*a-b*b  b*b-a*a
+    // re = _mm512_sub_pd(one, re);
+
+    // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); // sqrt(re + i*im)
+    // auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); // ln(iz + sqrt())
+    // return Vectorized(_mm512_permute_pd(ln.values, 0x55)).conj(); // -i*ln()
+    // Until then, fall back to the scalar std::asin applied per element.
+    return map(std::asin);
+  }
+  Vectorized<c10::complex<double>> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    constexpr auto pi_2d = c10::pi<double> / 2;
+    const __m512d pi_2 =
+        _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0);
+    return _mm512_sub_pd(pi_2, asin());
+  }
+  Vectorized<c10::complex<double>> atan() const;
+  Vectorized<c10::complex<double>> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized<c10::complex<double>> exp() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // //exp(a + bi)
+    // // = exp(a)*(cos(b) + sin(b)i)
+    // auto exp = Sleef_expd8_u10(values); // exp(a)  exp(b)
+    // exp = _mm512_mask_blend_pd(0xAA, exp, _mm512_permute_pd(exp, 0x55));
+    //                                     // exp(a)  exp(a)
+
+    // auto sin_cos = Sleef_sincosd8_u10(values);
+    //                                 // [sin(a), cos(a)] [sin(b), cos(b)]
+    // auto cos_sin = _mm512_mask_blend_pd(
+    //     0xAA, _mm512_permute_pd(sin_cos.y, 0x55), sin_cos.x);
+    //                                 // cos(b)  sin(b)
+    // return _mm512_mul_pd(exp, cos_sin);
+    return map(std::exp);
+  }
+  Vectorized<c10::complex<double>> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m512d ln_2 = _mm512_set1_pd(c10::ln_2<double>);
+    Vectorized<c10::complex<double>> scaled_values =
+        _mm512_mul_pd(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized<c10::complex<double>> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized<c10::complex<double>> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<c10::complex<double>> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<c10::complex<double>> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<c10::complex<double>> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized<c10::complex<double>> ceil() const {
+    return _mm512_ceil_pd(values);
+  }
+  Vectorized<c10::complex<double>> floor() const {
+    return _mm512_floor_pd(values);
+  }
+  Vectorized<c10::complex<double>> neg() const {
+    // Negation is computed as (0 - x) on every double lane.
+    return _mm512_sub_pd(_mm512_setzero_pd(), values);
+  }
+  Vectorized<c10::complex<double>> round() const {
+    // Rounds every double lane (real and imaginary parts alike) with the
+    // round-to-nearest mode; _MM_FROUND_NO_EXC suppresses FP exceptions.
+    const __m512d nearest = _mm512_roundscale_pd(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return nearest;
+  }
+  // Complex tangent: passes std::tan to map() (map() is defined outside this
+  // excerpt).
+  Vectorized<c10::complex<double>> tan() const {
+    return map(std::tan);
+  }
+  // Complex hyperbolic tangent: passes std::tanh to map() (map() is defined
+  // outside this excerpt).
+  Vectorized<c10::complex<double>> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<c10::complex<double>> trunc() const {
+    // Rounds every double lane (real and imaginary parts alike) toward zero;
+    // _MM_FROUND_NO_EXC suppresses FP exceptions.
+    const __m512d toward_zero = _mm512_roundscale_pd(
+        values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return toward_zero;
+  }
+  // Complex square root: passes std::sqrt to map() (map() is defined outside
+  // this excerpt).
+  Vectorized<c10::complex<double>> sqrt() const {
+    return map(std::sqrt);
+  }
+  // 1 / z for each complex element. Only declared here; the definition is
+  // provided out of line later in this header.
+  Vectorized<c10::complex<double>> reciprocal() const;
+  Vectorized<c10::complex<double>> rsqrt() const {
+    // Reciprocal square root, composed from sqrt() followed by reciprocal().
+    const Vectorized<c10::complex<double>> root = sqrt();
+    return root.reciprocal();
+  }
+  Vectorized<c10::complex<double>> pow(
+      const Vectorized<c10::complex<double>>& exp) const {
+    // Scalar fallback: spill both operands to __at_align__ stack buffers,
+    // raise each base to the matching exponent with std::pow, then reload.
+    __at_align__ c10::complex<double> bases[size()];
+    __at_align__ c10::complex<double> exponents[size()];
+    __at_align__ c10::complex<double> powers[size()];
+    store(bases);
+    exp.store(exponents);
+    for (const auto lane : c10::irange(size())) {
+      powers[lane] = std::pow(bases[lane], exponents[lane]);
+    }
+    return loadu(powers);
+  }
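+  // Lane-wise comparisons. operator== uses the _CMP_EQ_OQ predicate and
+  // operator!= uses _CMP_NEQ_UQ:
+  //   `O` (ordered): get false if an operand is NaN
+  //   `U` (unordered): get true if an operand is NaN
+  //   `Q` (quiet): do not raise if an operand is NaN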
+  Vectorized<c10::complex<double>> operator==(
+      const Vectorized<c10::complex<double>>& other) const {
+    // Compares every double lane on its own (real and imaginary parts are not
+    // combined here). _CMP_EQ_OQ is ordered and quiet: a NaN operand yields
+    // "not equal" without raising.
+    const auto equal_lanes =
+        _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ);
+    // Expand the bitmask: selected lanes become all-ones, the remaining lanes
+    // are taken from zero_vector.
+    const __m512i lane_bits =
+        _mm512_mask_set1_epi64(zero_vector, equal_lanes, 0xFFFFFFFFFFFFFFFF);
+    return _mm512_castsi512_pd(lane_bits);
+  }
+  Vectorized<c10::complex<double>> operator!=(
+      const Vectorized<c10::complex<double>>& other) const {
+    // Compares every double lane on its own (real and imaginary parts are not
+    // combined here). _CMP_NEQ_UQ is unordered and quiet: a NaN operand
+    // yields "not equal" without raising.
+    const auto differing_lanes =
+        _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ);
+    // Expand the bitmask: selected lanes become all-ones, the remaining lanes
+    // are taken from zero_vector.
+    const __m512i lane_bits = _mm512_mask_set1_epi64(
+        zero_vector, differing_lanes, 0xFFFFFFFFFFFFFFFF);
+    return _mm512_castsi512_pd(lane_bits);
+  }
+  // Ordering comparisons are not provided for complex vectors: the body
+  // unconditionally trips TORCH_CHECK with the message below and never
+  // produces a value.
+  Vectorized<c10::complex<double>> operator<(
+      const Vectorized<c10::complex<double>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  // Ordering comparisons are not provided for complex vectors: the body
+  // unconditionally trips TORCH_CHECK with the message below and never
+  // produces a value.
+  Vectorized<c10::complex<double>> operator<=(
+      const Vectorized<c10::complex<double>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  // Ordering comparisons are not provided for complex vectors: the body
+  // unconditionally trips TORCH_CHECK with the message below and never
+  // produces a value.
+  Vectorized<c10::complex<double>> operator>(
+      const Vectorized<c10::complex<double>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  // Ordering comparisons are not provided for complex vectors: the body
+  // unconditionally trips TORCH_CHECK with the message below and never
+  // produces a value.
+  Vectorized<c10::complex<double>> operator>=(
+      const Vectorized<c10::complex<double>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  // Whole-element equality (real and imaginary parts must both match).
+  // Only declared here; the definition is provided out of line later in this
+  // header.
+  Vectorized<c10::complex<double>> eq(
+      const Vectorized<c10::complex<double>>& other) const;
+  // Whole-element inequality (real or imaginary part differs). Only declared
+  // here; the definition is provided out of line later in this header.
+  Vectorized<c10::complex<double>> ne(
+      const Vectorized<c10::complex<double>>& other) const;
+};
+
+// Complex addition is component-wise, so a single packed add over all double
+// lanes handles the real and imaginary parts together.
+template <>
+Vectorized<c10::complex<double>> inline operator+(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const __m512d sum = _mm512_add_pd(a, b);
+  return sum;
+}
+
+// Complex subtraction is component-wise, so a single packed subtract over all
+// double lanes handles the real and imaginary parts together.
+template <>
+Vectorized<c10::complex<double>> inline operator-(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const __m512d difference = _mm512_sub_pd(a, b);
+  return difference;
+}
+
+// Complex multiplication: (a + bi) * (c + di) = (ac - bd) + (ad + bc)i.
+// Lane comments show the (real, imag) pair of one element.
+template <>
+Vectorized<c10::complex<double>> inline operator*(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // Sign bit set only in the second (imaginary) lane of every pair.
+  const __m512d imag_sign_bit =
+      _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+
+  const __m512d ac_bd = _mm512_mul_pd(a, b); // ac       bd
+
+  // Swap the two lanes of each element of b, then flip the sign of the lane
+  // that now holds c.
+  const __m512d d_c = _mm512_permute_pd(b, 0x55); // d        c
+  const __m512d d_negc = _mm512_xor_pd(imag_sign_bit, d_c); // d       -c
+  const __m512d ad_negbc = _mm512_mul_pd(a, d_negc); // ad      -bc
+
+  // Per the lane notes above, hsub_pd yields: ac - bd, ad - (-bc) = ad + bc.
+  return Vectorized<c10::complex<double>>::hsub_pd(ac_bd, ad_negbc);
+}
+
+// Complex division, currently done one element at a time with the scalar
+// c10::complex operator/.
+template <>
+Vectorized<c10::complex<double>> inline operator/(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN. Disabled sketch:
+  //
+  // // re + im*i = (a + bi) / (c + di)
+  // auto mask = _mm512_set1_pd(-0.f);
+  // auto fabs_cd = _mm512_andnot_pd(mask, b);          // |c|    |d|
+  // auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55);   // |d|    |c|
+  // auto scale =
+  //     _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc   1/sc
+  // auto a2 = _mm512_mul_pd(a, scale);                 // a/sc   b/sc
+  // auto b2 = _mm512_mul_pd(b, scale);                 // c/sc   d/sc
+  // auto acbd2 = _mm512_mul_pd(a2, b2);
+  //
+  // const __m512d sign_mask =
+  //     _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  // auto dc2 = _mm512_permute_pd(b2, 0x55);     // d/sc         c/sc
+  // dc2 = _mm512_xor_pd(sign_mask, dc2);        // -d/|c,d|     c/sc
+  // auto adbc2 = _mm512_mul_pd(a2, dc2);        // -ad/sc^2     bc/sc^2
+  // auto res2 = Vectorized<c10::complex<double>>::hadd_pd(acbd2, adbc2);
+  //                                             // (ac+bd)/sc^2 (bc-ad)/sc^2
+  //
+  // // get the denominator
+  // auto denom2 = Vectorized<c10::complex<double>>(b2).abs_2_();
+  //                                 // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  // res2 = _mm512_div_pd(res2, denom2);
+  // return res2;
+  using Vec = Vectorized<c10::complex<double>>;
+  __at_align__ c10::complex<double> numer[Vec::size()];
+  __at_align__ c10::complex<double> denom[Vec::size()];
+  __at_align__ c10::complex<double> quot[Vec::size()];
+  a.store(numer);
+  b.store(denom);
+  for (const auto lane : c10::irange(Vec::size())) {
+    quot[lane] = numer[lane] / denom[lane];
+  }
+  // The quotient buffer is reloaded as raw doubles.
+  return _mm512_loadu_pd(reinterpret_cast<const double*>(quot));
+}
+
+// reciprocal(): out-of-line definition, placed after the arithmetic operators.
+// Currently computes 1 / z one element at a time with scalar c10::complex
+// division.
+inline Vectorized<c10::complex<double>> Vectorized<
+    c10::complex<double>>::reciprocal() const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN. Disabled sketch:
+  //
+  // // re + im*i = (a + bi) / (c + di), with a = 1 and b = 0
+  // // re = (ac + bd)/abs_2() =  c/abs_2()
+  // // im = (bc - ad)/abs_2() = -d/abs_2()
+  // const __m512d sign_mask =
+  //     _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  // auto c_d = _mm512_xor_pd(sign_mask, values);    // c       -d
+  // return _mm512_div_pd(c_d, abs_2_());
+  __at_align__ c10::complex<double> elems[size()];
+  store(elems);
+  const c10::complex<double> one(1);
+  for (const auto lane : c10::irange(size())) {
+    elems[lane] = one / elems[lane];
+  }
+  return loadu(elems);
+}
+
+// Complex arctangent. The active code passes std::atan to map(); the SIMD
+// version sketched in the body is kept disabled (see the TODO).
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::atan()
+    const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // // atan(x) = i/2 * ln((i + z)/(i - z))
+  // const __m512d i = _mm512_setr_pd(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  // const Vectorized i_half =
+  //     _mm512_setr_pd(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
+
+  // auto sum = Vectorized(_mm512_add_pd(i, values));  // a        1+b
+  // auto sub = Vectorized(_mm512_sub_pd(i, values));  // -a       1-b
+  // auto ln = (sum/sub).log();                        // ln((i + z)/(i - z))
+  // return i_half*ln;                                 // i/2*ln()
+  return map(std::atan);
+}
+
+// Element-wise maximum, ordered by the value abs_2_() reports for each
+// operand. NOTE(review): abs_2_() is defined outside this excerpt; presumably
+// the squared magnitude -- confirm. Lanes whose comparison is unordered (a NaN
+// is involved) are forced to NaN.
+template <>
+Vectorized<c10::complex<double>> inline maximum(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const auto mag_a = a.abs_2_();
+  const auto mag_b = b.abs_2_();
+
+  // Take b in the lanes where mag_a < mag_b (ordered compare, so false on
+  // NaN); keep a everywhere else.
+  const auto b_is_larger = _mm512_cmp_pd_mask(mag_a, mag_b, _CMP_LT_OQ);
+  const auto larger = _mm512_mask_blend_pd(b_is_larger, a, b);
+
+  // An all-ones lane is a NaN bit pattern, so OR-ing all-ones into the
+  // unordered lanes turns them into NaN and leaves the others untouched.
+  const auto unordered = _mm512_cmp_pd_mask(mag_a, mag_b, _CMP_UNORD_Q);
+  const auto nan_bits = _mm512_mask_set1_epi64(
+      _mm512_set1_epi64(0), unordered, 0xFFFFFFFFFFFFFFFF);
+  return _mm512_or_pd(larger, _mm512_castsi512_pd(nan_bits));
+}
+
+// Element-wise minimum, ordered by the value abs_2_() reports for each
+// operand. NOTE(review): abs_2_() is defined outside this excerpt; presumably
+// the squared magnitude -- confirm. Lanes whose comparison is unordered (a NaN
+// is involved) are forced to NaN.
+template <>
+Vectorized<c10::complex<double>> inline minimum(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const auto mag_a = a.abs_2_();
+  const auto mag_b = b.abs_2_();
+
+  // Take b in the lanes where mag_a > mag_b (ordered compare, so false on
+  // NaN); keep a everywhere else.
+  const auto b_is_smaller = _mm512_cmp_pd_mask(mag_a, mag_b, _CMP_GT_OQ);
+  const auto smaller = _mm512_mask_blend_pd(b_is_smaller, a, b);
+
+  // An all-ones lane is a NaN bit pattern, so OR-ing all-ones into the
+  // unordered lanes turns them into NaN and leaves the others untouched.
+  const auto unordered = _mm512_cmp_pd_mask(mag_a, mag_b, _CMP_UNORD_Q);
+  const auto nan_bits = _mm512_mask_set1_epi64(
+      _mm512_set1_epi64(0), unordered, 0xFFFFFFFFFFFFFFFF);
+  return _mm512_or_pd(smaller, _mm512_castsi512_pd(nan_bits));
+}
+
+// Bitwise AND over the raw 512-bit representation of both operands.
+template <>
+Vectorized<c10::complex<double>> inline operator&(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const __m512d anded = _mm512_and_pd(a, b);
+  return anded;
+}
+
+// Bitwise OR over the raw 512-bit representation of both operands.
+template <>
+Vectorized<c10::complex<double>> inline operator|(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const __m512d ored = _mm512_or_pd(a, b);
+  return ored;
+}
+
+// Bitwise XOR over the raw 512-bit representation of both operands.
+template <>
+Vectorized<c10::complex<double>> inline operator^(
+    const Vectorized<c10::complex<double>>& a,
+    const Vectorized<c10::complex<double>>& b) {
+  const __m512d xored = _mm512_xor_pd(a, b);
+  return xored;
+}
+
+// Whole-element equality: an element counts as equal only when both its real
+// and its imaginary parts compare equal.
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::eq(
+    const Vectorized<c10::complex<double>>& other) const {
+  // operator== compares the real and imaginary lanes individually.
+  auto lane_eq = (*this == other);
+  // Require both per-part results to be set.
+  auto both_equal = lane_eq.real() & lane_eq.imag();
+  // AND with the bit pattern of 1.0: an all-ones lane becomes 1.0, a zero
+  // lane stays 0.0.
+  const Vectorized<c10::complex<double>> ones(_mm512_set1_pd(1.0));
+  return both_equal & ones;
+}
+
+// Whole-element inequality: an element counts as different when its real or
+// its imaginary part differs.
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::ne(
+    const Vectorized<c10::complex<double>>& other) const {
+  // operator!= compares the real and imaginary lanes individually.
+  auto lane_ne = (*this != other);
+  // Either per-part result being set is enough.
+  auto any_differs = lane_ne.real() | lane_ne.imag();
+  // AND with the bit pattern of 1.0: an all-ones lane becomes 1.0, a zero
+  // lane stays 0.0.
+  const Vectorized<c10::complex<double>> ones(_mm512_set1_pd(1.0));
+  return any_differs & ones;
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..59fce4ea931c3671dfe3c87387a524bcc6666690
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h
@@ -0,0 +1,1229 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX512)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+// Trait specialization reporting `true` for c10::complex<float>; it sits next
+// to the Vectorized<c10::complex<float>> specialization that follows and is
+// only compiled when CPU_CAPABILITY_AVX512 is defined.
+template <>
+struct is_vec_specialized_for<c10::complex<float>> : std::bool_constant<true> {
+};
+
+template <>
+class Vectorized<c10::complex<float>> {
+ private:
+  __m512 values;
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+
+ public:
+  using value_type = c10::complex<float>;
+  using size_type = int;
+  // Number of c10::complex<float> elements per vector: the 16 float lanes of
+  // an __m512 hold 8 (real, imag) pairs.
+  static constexpr size_type size() {
+    return 8;
+  }
+  // Default constructor: every float lane starts at zero.
+  Vectorized() : values(_mm512_setzero_ps()) {}
+  // Wraps an existing raw register as-is. Not `explicit`, so an __m512 result
+  // converts implicitly to this type.
+  Vectorized(__m512 v) : values(v) {}
+  // Broadcast constructor: replicates one complex value into all eight
+  // elements, stored as interleaved (real, imag) float pairs.
+  Vectorized(c10::complex<float> val) {
+    const float re = val.real();
+    const float im = val.imag();
+    // _mm512_setr_ps takes its 16 floats in lane order, lowest lane first.
+    values = _mm512_setr_ps(
+        re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im);
+  }
+  // Builds a vector from eight distinct complex values. val1 lands in the
+  // lowest pair of float lanes and val8 in the highest; each value is stored
+  // as a (real, imag) pair.
+  Vectorized(
+      c10::complex<float> val1,
+      c10::complex<float> val2,
+      c10::complex<float> val3,
+      c10::complex<float> val4,
+      c10::complex<float> val5,
+      c10::complex<float> val6,
+      c10::complex<float> val7,
+      c10::complex<float> val8) {
+    values = _mm512_setr_ps(
+        val1.real(),
+        val1.imag(),
+        val2.real(),
+        val2.imag(),
+        val3.real(),
+        val3.imag(),
+        val4.real(),
+        val4.imag(),
+        val5.real(),
+        val5.imag(),
+        val6.real(),
+        val6.imag(),
+        val7.real(),
+        val7.imag(),
+        val8.real(),
+        val8.imag());
+  }
+  // Implicit conversion back to the raw register, which lets a Vectorized be
+  // passed directly to _mm512_* intrinsics.
+  operator __m512() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<float>> blend(
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    static_assert(mask > -1 && mask < 256, "Unexpected mask value");
+    // The compiler would hopefully convert this switch condition
+    // into a jump table
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm512_mask_blend_ps(0x03, a.values, b.values);
+      case 2:
+        return _mm512_mask_blend_ps(0x0C, a.values, b.values);
+      case 3:
+        return _mm512_mask_blend_ps(0x0F, a.values, b.values);
+      case 4:
+        return _mm512_mask_blend_ps(0x30, a.values, b.values);
+      case 5:
+        return _mm512_mask_blend_ps(0x33, a.values, b.values);
+      case 6:
+        return _mm512_mask_blend_ps(0x3C, a.values, b.values);
+      case 7:
+        return _mm512_mask_blend_ps(0x3F, a.values, b.values);
+      case 8:
+        return _mm512_mask_blend_ps(0xC0, a.values, b.values);
+      case 9:
+        return _mm512_mask_blend_ps(0xC3, a.values, b.values);
+      case 10:
+        return _mm512_mask_blend_ps(0xCC, a.values, b.values);
+      case 11:
+        return _mm512_mask_blend_ps(0xCF, a.values, b.values);
+      case 12:
+        return _mm512_mask_blend_ps(0xF0, a.values, b.values);
+      case 13:
+        return _mm512_mask_blend_ps(0xF3, a.values, b.values);
+      case 14:
+        return _mm512_mask_blend_ps(0xFC, a.values, b.values);
+      case 15:
+        return _mm512_mask_blend_ps(0xFF, a.values, b.values);
+      case 16:
+        return _mm512_mask_blend_ps(0x300, a.values, b.values);
+      case 17:
+        return _mm512_mask_blend_ps(0x303, a.values, b.values);
+      case 18:
+        return _mm512_mask_blend_ps(0x30C, a.values, b.values);
+      case 19:
+        return _mm512_mask_blend_ps(0x30F, a.values, b.values);
+      case 20:
+        return _mm512_mask_blend_ps(0x330, a.values, b.values);
+      case 21:
+        return _mm512_mask_blend_ps(0x333, a.values, b.values);
+      case 22:
+        return _mm512_mask_blend_ps(0x33C, a.values, b.values);
+      case 23:
+        return _mm512_mask_blend_ps(0x33F, a.values, b.values);
+      case 24:
+        return _mm512_mask_blend_ps(0x3C0, a.values, b.values);
+      case 25:
+        return _mm512_mask_blend_ps(0x3C3, a.values, b.values);
+      case 26:
+        return _mm512_mask_blend_ps(0x3CC, a.values, b.values);
+      case 27:
+        return _mm512_mask_blend_ps(0x3CF, a.values, b.values);
+      case 28:
+        return _mm512_mask_blend_ps(0x3F0, a.values, b.values);
+      case 29:
+        return _mm512_mask_blend_ps(0x3F3, a.values, b.values);
+      case 30:
+        return _mm512_mask_blend_ps(0x3FC, a.values, b.values);
+      case 31:
+        return _mm512_mask_blend_ps(0x3FF, a.values, b.values);
+      case 32:
+        return _mm512_mask_blend_ps(0xC00, a.values, b.values);
+      case 33:
+        return _mm512_mask_blend_ps(0xC03, a.values, b.values);
+      case 34:
+        return _mm512_mask_blend_ps(0xC0C, a.values, b.values);
+      case 35:
+        return _mm512_mask_blend_ps(0xC0F, a.values, b.values);
+      case 36:
+        return _mm512_mask_blend_ps(0xC30, a.values, b.values);
+      case 37:
+        return _mm512_mask_blend_ps(0xC33, a.values, b.values);
+      case 38:
+        return _mm512_mask_blend_ps(0xC3C, a.values, b.values);
+      case 39:
+        return _mm512_mask_blend_ps(0xC3F, a.values, b.values);
+      case 40:
+        return _mm512_mask_blend_ps(0xCC0, a.values, b.values);
+      case 41:
+        return _mm512_mask_blend_ps(0xCC3, a.values, b.values);
+      case 42:
+        return _mm512_mask_blend_ps(0xCCC, a.values, b.values);
+      case 43:
+        return _mm512_mask_blend_ps(0xCCF, a.values, b.values);
+      case 44:
+        return _mm512_mask_blend_ps(0xCF0, a.values, b.values);
+      case 45:
+        return _mm512_mask_blend_ps(0xCF3, a.values, b.values);
+      case 46:
+        return _mm512_mask_blend_ps(0xCFC, a.values, b.values);
+      case 47:
+        return _mm512_mask_blend_ps(0xCFF, a.values, b.values);
+      case 48:
+        return _mm512_mask_blend_ps(0xF00, a.values, b.values);
+      case 49:
+        return _mm512_mask_blend_ps(0xF03, a.values, b.values);
+      case 50:
+        return _mm512_mask_blend_ps(0xF0C, a.values, b.values);
+      case 51:
+        return _mm512_mask_blend_ps(0xF0F, a.values, b.values);
+      case 52:
+        return _mm512_mask_blend_ps(0xF30, a.values, b.values);
+      case 53:
+        return _mm512_mask_blend_ps(0xF33, a.values, b.values);
+      case 54:
+        return _mm512_mask_blend_ps(0xF3C, a.values, b.values);
+      case 55:
+        return _mm512_mask_blend_ps(0xF3F, a.values, b.values);
+      case 56:
+        return _mm512_mask_blend_ps(0xFC0, a.values, b.values);
+      case 57:
+        return _mm512_mask_blend_ps(0xFC3, a.values, b.values);
+      case 58:
+        return _mm512_mask_blend_ps(0xFCC, a.values, b.values);
+      case 59:
+        return _mm512_mask_blend_ps(0xFCF, a.values, b.values);
+      case 60:
+        return _mm512_mask_blend_ps(0xFF0, a.values, b.values);
+      case 61:
+        return _mm512_mask_blend_ps(0xFF3, a.values, b.values);
+      case 62:
+        return _mm512_mask_blend_ps(0xFFC, a.values, b.values);
+      case 63:
+        return _mm512_mask_blend_ps(0xFFF, a.values, b.values);
+      case 64:
+        return _mm512_mask_blend_ps(0x3000, a.values, b.values);
+      case 65:
+        return _mm512_mask_blend_ps(0x3003, a.values, b.values);
+      case 66:
+        return _mm512_mask_blend_ps(0x300C, a.values, b.values);
+      case 67:
+        return _mm512_mask_blend_ps(0x300F, a.values, b.values);
+      case 68:
+        return _mm512_mask_blend_ps(0x3030, a.values, b.values);
+      case 69:
+        return _mm512_mask_blend_ps(0x3033, a.values, b.values);
+      case 70:
+        return _mm512_mask_blend_ps(0x303C, a.values, b.values);
+      case 71:
+        return _mm512_mask_blend_ps(0x303F, a.values, b.values);
+      case 72:
+        return _mm512_mask_blend_ps(0x30C0, a.values, b.values);
+      case 73:
+        return _mm512_mask_blend_ps(0X30C3, a.values, b.values);
+      case 74:
+        return _mm512_mask_blend_ps(0x30CC, a.values, b.values);
+      case 75:
+        return _mm512_mask_blend_ps(0x30CF, a.values, b.values);
+      case 76:
+        return _mm512_mask_blend_ps(0x30F0, a.values, b.values);
+      case 77:
+        return _mm512_mask_blend_ps(0x30F3, a.values, b.values);
+      case 78:
+        return _mm512_mask_blend_ps(0x30FC, a.values, b.values);
+      case 79:
+        return _mm512_mask_blend_ps(0x30FF, a.values, b.values);
+      case 80:
+        return _mm512_mask_blend_ps(0x3300, a.values, b.values);
+      case 81:
+        return _mm512_mask_blend_ps(0X3303, a.values, b.values);
+      case 82:
+        return _mm512_mask_blend_ps(0x330C, a.values, b.values);
+      case 83:
+        return _mm512_mask_blend_ps(0x330F, a.values, b.values);
+      case 84:
+        return _mm512_mask_blend_ps(0x3330, a.values, b.values);
+      case 85:
+        return _mm512_mask_blend_ps(0x3333, a.values, b.values);
+      case 86:
+        return _mm512_mask_blend_ps(0x333C, a.values, b.values);
+      case 87:
+        return _mm512_mask_blend_ps(0X333F, a.values, b.values);
+      case 88:
+        return _mm512_mask_blend_ps(0x33C0, a.values, b.values);
+      case 89:
+        return _mm512_mask_blend_ps(0x33C3, a.values, b.values);
+      case 90:
+        return _mm512_mask_blend_ps(0x33CC, a.values, b.values);
+      case 91:
+        return _mm512_mask_blend_ps(0x33CF, a.values, b.values);
+      case 92:
+        return _mm512_mask_blend_ps(0x33F0, a.values, b.values);
+      case 93:
+        return _mm512_mask_blend_ps(0x33F3, a.values, b.values);
+      case 94:
+        return _mm512_mask_blend_ps(0x33FC, a.values, b.values);
+      case 95:
+        return _mm512_mask_blend_ps(0x33FF, a.values, b.values);
+      case 96:
+        return _mm512_mask_blend_ps(0X3C00, a.values, b.values);
+      case 97:
+        return _mm512_mask_blend_ps(0x3C03, a.values, b.values);
+      case 98:
+        return _mm512_mask_blend_ps(0x3C0C, a.values, b.values);
+      case 99:
+        return _mm512_mask_blend_ps(0x3C0F, a.values, b.values);
+      case 100:
+        return _mm512_mask_blend_ps(0x3C30, a.values, b.values);
+      case 101:
+        return _mm512_mask_blend_ps(0x3C33, a.values, b.values);
+      case 102:
+        return _mm512_mask_blend_ps(0x3C3C, a.values, b.values);
+      case 103:
+        return _mm512_mask_blend_ps(0x3C3F, a.values, b.values);
+      case 104:
+        return _mm512_mask_blend_ps(0x3CC0, a.values, b.values);
+      case 105:
+        return _mm512_mask_blend_ps(0x3CC3, a.values, b.values);
+      case 106:
+        return _mm512_mask_blend_ps(0x3CCC, a.values, b.values);
+      case 107:
+        return _mm512_mask_blend_ps(0x3CCF, a.values, b.values);
+      case 108:
+        return _mm512_mask_blend_ps(0x3CF0, a.values, b.values);
+      case 109:
+        return _mm512_mask_blend_ps(0x3CF3, a.values, b.values);
+      case 110:
+        return _mm512_mask_blend_ps(0x3CFC, a.values, b.values);
+      case 111:
+        return _mm512_mask_blend_ps(0x3CFF, a.values, b.values);
+      case 112:
+        return _mm512_mask_blend_ps(0x3F00, a.values, b.values);
+      case 113:
+        return _mm512_mask_blend_ps(0x3F03, a.values, b.values);
+      case 114:
+        return _mm512_mask_blend_ps(0x3F0C, a.values, b.values);
+      case 115:
+        return _mm512_mask_blend_ps(0x3F0F, a.values, b.values);
+      case 116:
+        return _mm512_mask_blend_ps(0x3F30, a.values, b.values);
+      case 117:
+        return _mm512_mask_blend_ps(0x3F33, a.values, b.values);
+      case 118:
+        return _mm512_mask_blend_ps(0x3F3C, a.values, b.values);
+      case 119:
+        return _mm512_mask_blend_ps(0x3F3F, a.values, b.values);
+      case 120:
+        return _mm512_mask_blend_ps(0x3FC0, a.values, b.values);
+      case 121:
+        return _mm512_mask_blend_ps(0x3FC3, a.values, b.values);
+      case 122:
+        return _mm512_mask_blend_ps(0x3FCC, a.values, b.values);
+      case 123:
+        return _mm512_mask_blend_ps(0x3FCF, a.values, b.values);
+      case 124:
+        return _mm512_mask_blend_ps(0x3FF0, a.values, b.values);
+      case 125:
+        return _mm512_mask_blend_ps(0x3FF3, a.values, b.values);
+      case 126:
+        return _mm512_mask_blend_ps(0x3FFC, a.values, b.values);
+      case 127:
+        return _mm512_mask_blend_ps(0x3FFF, a.values, b.values);
+      case 128:
+        return _mm512_mask_blend_ps(0xC000, a.values, b.values);
+      case 129:
+        return _mm512_mask_blend_ps(0xC003, a.values, b.values);
+      case 130:
+        return _mm512_mask_blend_ps(0xC00C, a.values, b.values);
+      case 131:
+        return _mm512_mask_blend_ps(0xC00F, a.values, b.values);
+      case 132:
+        return _mm512_mask_blend_ps(0xC030, a.values, b.values);
+      case 133:
+        return _mm512_mask_blend_ps(0xC033, a.values, b.values);
+      case 134:
+        return _mm512_mask_blend_ps(0xC03C, a.values, b.values);
+      case 135:
+        return _mm512_mask_blend_ps(0xC03F, a.values, b.values);
+      case 136:
+        return _mm512_mask_blend_ps(0xC0C0, a.values, b.values);
+      case 137:
+        return _mm512_mask_blend_ps(0xC0C3, a.values, b.values);
+      case 138:
+        return _mm512_mask_blend_ps(0xC0CC, a.values, b.values);
+      case 139:
+        return _mm512_mask_blend_ps(0xC0CF, a.values, b.values);
+      case 140:
+        return _mm512_mask_blend_ps(0xC0F0, a.values, b.values);
+      case 141:
+        return _mm512_mask_blend_ps(0xC0F3, a.values, b.values);
+      case 142:
+        return _mm512_mask_blend_ps(0xC0FC, a.values, b.values);
+      case 143:
+        return _mm512_mask_blend_ps(0xC0FF, a.values, b.values);
+      case 144:
+        return _mm512_mask_blend_ps(0xC300, a.values, b.values);
+      case 145:
+        return _mm512_mask_blend_ps(0xC303, a.values, b.values);
+      case 146:
+        return _mm512_mask_blend_ps(0xC30C, a.values, b.values);
+      case 147:
+        return _mm512_mask_blend_ps(0xC30F, a.values, b.values);
+      case 148:
+        return _mm512_mask_blend_ps(0xC330, a.values, b.values);
+      case 149:
+        return _mm512_mask_blend_ps(0xC333, a.values, b.values);
+      case 150:
+        return _mm512_mask_blend_ps(0xC33C, a.values, b.values);
+      case 151:
+        return _mm512_mask_blend_ps(0xC33F, a.values, b.values);
+      case 152:
+        return _mm512_mask_blend_ps(0xC3C0, a.values, b.values);
+      case 153:
+        return _mm512_mask_blend_ps(0xC3C3, a.values, b.values);
+      case 154:
+        return _mm512_mask_blend_ps(0xC3CC, a.values, b.values);
+      case 155:
+        return _mm512_mask_blend_ps(0xC3CF, a.values, b.values);
+      case 156:
+        return _mm512_mask_blend_ps(0xC3F0, a.values, b.values);
+      case 157:
+        return _mm512_mask_blend_ps(0xC3F3, a.values, b.values);
+      case 158:
+        return _mm512_mask_blend_ps(0xC3FC, a.values, b.values);
+      case 159:
+        return _mm512_mask_blend_ps(0xC3FF, a.values, b.values);
+      case 160:
+        return _mm512_mask_blend_ps(0xCC00, a.values, b.values);
+      case 161:
+        return _mm512_mask_blend_ps(0xCC03, a.values, b.values);
+      case 162:
+        return _mm512_mask_blend_ps(0xCC0C, a.values, b.values);
+      case 163:
+        return _mm512_mask_blend_ps(0xCC0F, a.values, b.values);
+      case 164:
+        return _mm512_mask_blend_ps(0xCC30, a.values, b.values);
+      case 165:
+        return _mm512_mask_blend_ps(0xCC33, a.values, b.values);
+      case 166:
+        return _mm512_mask_blend_ps(0xCC3C, a.values, b.values);
+      case 167:
+        return _mm512_mask_blend_ps(0xCC3F, a.values, b.values);
+      case 168:
+        return _mm512_mask_blend_ps(0xCCC0, a.values, b.values);
+      case 169:
+        return _mm512_mask_blend_ps(0xCCC3, a.values, b.values);
+      case 170:
+        return _mm512_mask_blend_ps(0xCCCC, a.values, b.values);
+      case 171:
+        return _mm512_mask_blend_ps(0xCCCF, a.values, b.values);
+      case 172:
+        return _mm512_mask_blend_ps(0xCCF0, a.values, b.values);
+      case 173:
+        return _mm512_mask_blend_ps(0xCCF3, a.values, b.values);
+      case 174:
+        return _mm512_mask_blend_ps(0xCCFC, a.values, b.values);
+      case 175:
+        return _mm512_mask_blend_ps(0xCCFF, a.values, b.values);
+      case 176:
+        return _mm512_mask_blend_ps(0xCF00, a.values, b.values);
+      case 177:
+        return _mm512_mask_blend_ps(0xCF03, a.values, b.values);
+      case 178:
+        return _mm512_mask_blend_ps(0xCF0C, a.values, b.values);
+      case 179:
+        return _mm512_mask_blend_ps(0xCF0F, a.values, b.values);
+      case 180:
+        return _mm512_mask_blend_ps(0xCF30, a.values, b.values);
+      case 181:
+        return _mm512_mask_blend_ps(0xCF33, a.values, b.values);
+      case 182:
+        return _mm512_mask_blend_ps(0xCF3C, a.values, b.values);
+      case 183:
+        return _mm512_mask_blend_ps(0xCF3F, a.values, b.values);
+      case 184:
+        return _mm512_mask_blend_ps(0xCFC0, a.values, b.values);
+      case 185:
+        return _mm512_mask_blend_ps(0xCFC3, a.values, b.values);
+      case 186:
+        return _mm512_mask_blend_ps(0xCFCC, a.values, b.values);
+      case 187:
+        return _mm512_mask_blend_ps(0xCFCF, a.values, b.values);
+      case 188:
+        return _mm512_mask_blend_ps(0xCFF0, a.values, b.values);
+      case 189:
+        return _mm512_mask_blend_ps(0xCFF3, a.values, b.values);
+      case 190:
+        return _mm512_mask_blend_ps(0xCFFC, a.values, b.values);
+      case 191:
+        return _mm512_mask_blend_ps(0xCFFF, a.values, b.values);
+      case 192:
+        return _mm512_mask_blend_ps(0xF000, a.values, b.values);
+      case 193:
+        return _mm512_mask_blend_ps(0xF003, a.values, b.values);
+      case 194:
+        return _mm512_mask_blend_ps(0xF00C, a.values, b.values);
+      case 195:
+        return _mm512_mask_blend_ps(0xF00F, a.values, b.values);
+      case 196:
+        return _mm512_mask_blend_ps(0xF030, a.values, b.values);
+      case 197:
+        return _mm512_mask_blend_ps(0xF033, a.values, b.values);
+      case 198:
+        return _mm512_mask_blend_ps(0xF03C, a.values, b.values);
+      case 199:
+        return _mm512_mask_blend_ps(0xF03F, a.values, b.values);
+      case 200:
+        return _mm512_mask_blend_ps(0XF0C0, a.values, b.values);
+      case 201:
+        return _mm512_mask_blend_ps(0xF0C3, a.values, b.values);
+      case 202:
+        return _mm512_mask_blend_ps(0xF0CC, a.values, b.values);
+      case 203:
+        return _mm512_mask_blend_ps(0xF0CF, a.values, b.values);
+      case 204:
+        return _mm512_mask_blend_ps(0xF0F0, a.values, b.values);
+      case 205:
+        return _mm512_mask_blend_ps(0xF0F3, a.values, b.values);
+      case 206:
+        return _mm512_mask_blend_ps(0xF0FC, a.values, b.values);
+      case 207:
+        return _mm512_mask_blend_ps(0xF0FF, a.values, b.values);
+      case 208:
+        return _mm512_mask_blend_ps(0XF300, a.values, b.values);
+      case 209:
+        return _mm512_mask_blend_ps(0xF303, a.values, b.values);
+      case 210:
+        return _mm512_mask_blend_ps(0xF30C, a.values, b.values);
+      case 211:
+        return _mm512_mask_blend_ps(0xF30F, a.values, b.values);
+      case 212:
+        return _mm512_mask_blend_ps(0xF330, a.values, b.values);
+      case 213:
+        return _mm512_mask_blend_ps(0xF333, a.values, b.values);
+      case 214:
+        return _mm512_mask_blend_ps(0XF33C, a.values, b.values);
+      case 215:
+        return _mm512_mask_blend_ps(0xF33F, a.values, b.values);
+      case 216:
+        return _mm512_mask_blend_ps(0xF3C0, a.values, b.values);
+      case 217:
+        return _mm512_mask_blend_ps(0xF3C3, a.values, b.values);
+      case 218:
+        return _mm512_mask_blend_ps(0xF3CC, a.values, b.values);
+      case 219:
+        return _mm512_mask_blend_ps(0xF3CF, a.values, b.values);
+      case 220:
+        return _mm512_mask_blend_ps(0xF3F0, a.values, b.values);
+      case 221:
+        return _mm512_mask_blend_ps(0xF3F3, a.values, b.values);
+      case 222:
+        return _mm512_mask_blend_ps(0xF3FC, a.values, b.values);
+      case 223:
+        return _mm512_mask_blend_ps(0XF3FF, a.values, b.values);
+      case 224:
+        return _mm512_mask_blend_ps(0xFC00, a.values, b.values);
+      case 225:
+        return _mm512_mask_blend_ps(0xFC03, a.values, b.values);
+      case 226:
+        return _mm512_mask_blend_ps(0xFC0C, a.values, b.values);
+      case 227:
+        return _mm512_mask_blend_ps(0xFC0F, a.values, b.values);
+      case 228:
+        return _mm512_mask_blend_ps(0xFC30, a.values, b.values);
+      case 229:
+        return _mm512_mask_blend_ps(0xFC33, a.values, b.values);
+      case 230:
+        return _mm512_mask_blend_ps(0xFC3C, a.values, b.values);
+      case 231:
+        return _mm512_mask_blend_ps(0xFC3F, a.values, b.values);
+      case 232:
+        return _mm512_mask_blend_ps(0xFCC0, a.values, b.values);
+      case 233:
+        return _mm512_mask_blend_ps(0xFCC3, a.values, b.values);
+      case 234:
+        return _mm512_mask_blend_ps(0xFCCC, a.values, b.values);
+      case 235:
+        return _mm512_mask_blend_ps(0xFCCF, a.values, b.values);
+      case 236:
+        return _mm512_mask_blend_ps(0xFCF0, a.values, b.values);
+      case 237:
+        return _mm512_mask_blend_ps(0xFCF3, a.values, b.values);
+      case 238:
+        return _mm512_mask_blend_ps(0xFCFC, a.values, b.values);
+      case 239:
+        return _mm512_mask_blend_ps(0xFCFF, a.values, b.values);
+      case 240:
+        return _mm512_mask_blend_ps(0xFF00, a.values, b.values);
+      case 241:
+        return _mm512_mask_blend_ps(0xFF03, a.values, b.values);
+      case 242:
+        return _mm512_mask_blend_ps(0xFF0C, a.values, b.values);
+      case 243:
+        return _mm512_mask_blend_ps(0xFF0F, a.values, b.values);
+      case 244:
+        return _mm512_mask_blend_ps(0xFF30, a.values, b.values);
+      case 245:
+        return _mm512_mask_blend_ps(0xFF33, a.values, b.values);
+      case 246:
+        return _mm512_mask_blend_ps(0xFF3C, a.values, b.values);
+      case 247:
+        return _mm512_mask_blend_ps(0xFF3F, a.values, b.values);
+      case 248:
+        return _mm512_mask_blend_ps(0xFFC0, a.values, b.values);
+      case 249:
+        return _mm512_mask_blend_ps(0xFFC3, a.values, b.values);
+      case 250:
+        return _mm512_mask_blend_ps(0xFFCC, a.values, b.values);
+      case 251:
+        return _mm512_mask_blend_ps(0xFFCF, a.values, b.values);
+      case 252:
+        return _mm512_mask_blend_ps(0xFFF0, a.values, b.values);
+      case 253:
+        return _mm512_mask_blend_ps(0xFFF3, a.values, b.values);
+      case 254:
+        return _mm512_mask_blend_ps(0xFFFC, a.values, b.values);
+      default:
+        break;
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<float>> blendv( // per-float select between a and b, driven by a vector mask
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b,
+      const Vectorized<c10::complex<float>>& mask) {
+    // convert c10::complex<V> index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm512_unpacklo_ps(mask.values, mask.values);
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    auto mmask = _mm512_cmp_epi32_mask( // bit i is set where float i of mask_ is the all-ones pattern
+        _mm512_castps_si512(mask_), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_ps(mmask, a.values, b.values); // set bit -> float from b, clear bit -> float from a
+  }
+  template <typename step_t> // step_t: type of the increment that is combined with c10::complex<float> below
+  static Vectorized<c10::complex<float>> arange(
+      c10::complex<float> base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<float>>( // element k is base + k * step, for k = 0..7
+        base,
+        base + step,
+        base + c10::complex<float>(2) * step,
+        base + c10::complex<float>(3) * step,
+        base + c10::complex<float>(4) * step,
+        base + c10::complex<float>(5) * step,
+        base + c10::complex<float>(6) * step,
+        base + c10::complex<float>(7) * step);
+  }
+  static Vectorized<c10::complex<float>> set( // first `count` complex elements from b, the remainder from a
+      const Vectorized<c10::complex<float>>& a,
+      const Vectorized<c10::complex<float>>& b,
+      int64_t count = size()) {
+    switch (count) { // each case passes blend<> a mask with the low `count` bits set, i.e. (1 << count) - 1
+      case 0:
+        return a; // nothing is taken from b
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b; // any count not listed above (including the default size()) yields all of b
+  }
+  static Vectorized<c10::complex<float>> loadu( // load `count` complex values from ptr; elements past `count` read as zero
+      const void* ptr,
+      int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_ps(reinterpret_cast<const float*>(ptr)); // full vector: a single unaligned load
+
+    __at_align__ float tmp_values[2 * size()]; // two floats (real, imag) per complex element
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2 * size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const float*>(ptr),
+        count * sizeof(c10::complex<float>)); // copy only the valid prefix over the zeroed buffer
+    return _mm512_load_ps(tmp_values); // aligned load; presumably satisfied by the __at_align__ declaration above
+  }
+  void store(void* ptr, int count = size()) const { // write the first `count` complex elements to ptr
+    if (count == size()) {
+      _mm512_storeu_ps(reinterpret_cast<float*>(ptr), values); // full vector: a single unaligned store
+    } else if (count > 0) {
+      float tmp_values[2 * size()]; // staging buffer so that only `count` elements reach ptr
+      _mm512_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<float>));
+    } // count <= 0 writes nothing
+  }
+  // AVX512 doesn't have horizontal add & horizontal sub instructions.
+  // TODO: hadd_ps() & hsub_ps() may have scope for improvement.
+  static inline __m512 hadd_ps(__m512 a, __m512 b) { // out[2k] = a[2k] + a[2k+1], out[2k+1] = b[2k] + b[2k+1]
+    __m512i idx1 = _mm512_set_epi32( // even-position floats; indices 0-15 pick from a, 16-31 pick from b (listed high lane first)
+        30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
+    __m512i idx2 = _mm512_set_epi32( // odd-position floats, with the same a/b interleaving
+        31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
+    return _mm512_add_ps(
+        _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), // 0xffff: every lane comes from the permutation
+        _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+  }
+  static inline __m512 hsub_ps(__m512 a, __m512 b) { // out[2k] = a[2k] - a[2k+1], out[2k+1] = b[2k] - b[2k+1]
+    __m512i idx1 = _mm512_set_epi32( // even-position floats; indices 0-15 pick from a, 16-31 pick from b (listed high lane first)
+        30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
+    __m512i idx2 = _mm512_set_epi32( // odd-position floats, with the same a/b interleaving
+        31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
+    return _mm512_sub_ps(
+        _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), // minuend: the even-position gather
+        _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); // subtrahend: the odd-position gather
+  }
+  const c10::complex<float>& operator[](int idx) const = delete;
+  c10::complex<float>& operator[](int idx) = delete;
+  Vectorized<c10::complex<float>> map(
+      c10::complex<float> (*const f)(const c10::complex<float>&)) const {
+    __at_align__ c10::complex<float> lanes[size()]; // scratch copy of this vector
+    store(lanes);
+    for (const auto k : c10::irange(size())) {
+      lanes[k] = f(lanes[k]); // apply the scalar function element by element
+    }
+    return loadu(lanes);
+  }
+  __m512 abs_2_() const {
+    // Squared magnitude of each element, written to both of its float slots.
+    const auto squared = _mm512_mul_ps(values, values); // a*a     b*b
+    return hadd_ps(squared, squared); // a*a+b*b a*a+b*b
+  }
+  __m512 abs_() const { // magnitude of each element, written to both of its float slots
+    auto real = _mm512_moveldup_ps(values); // real real
+    auto imag = _mm512_movehdup_ps(values); // imag imag
+    return Sleef_hypotf16_u05(real, imag); // abs  abs
+  }
+  Vectorized<c10::complex<float>> abs() const { // magnitude in the real slot, zero in the imag slot
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( // setr: arguments are listed from lane 0 upward
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    return _mm512_and_ps(abs_(), real_mask); // abs     0
+  }
+  __m512 angle_() const { // leaves the angle in the imag slot; the real slot holds its complement
+    // angle = atan2(b, a)
+    auto b_a = _mm512_permute_ps(values, 0xB1); // b        a   (0xB1 swaps each adjacent float pair)
+    return Sleef_atan2f16_u10(values, b_a); // 90-angle angle
+  }
+  Vectorized<c10::complex<float>> angle() const { // angle in the real slot, zero in the imag slot
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( // setr: arguments are listed from lane 0 upward
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle    90-angle
+    return _mm512_and_ps(angle, real_mask); // angle    0
+  }
+  Vectorized<c10::complex<float>> sgn() const {
+    const auto magnitude = abs_(); // |z| in both slots of each element
+    const auto zeros = _mm512_setzero_ps();
+    const auto is_zero = _mm512_cmp_ps_mask(magnitude, zeros, _CMP_EQ_OQ);
+    const auto unit = _mm512_div_ps(values, magnitude); // z / |z|
+    return _mm512_mask_blend_ps(is_zero, unit, zeros); // 0 wherever |z| == 0
+  }
+  __m512 real_() const { // keeps even-position (real) floats and zeroes odd-position (imag) floats
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( // setr: arguments are listed from lane 0 upward
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000));
+    return _mm512_and_ps(values, real_mask); // a        0
+  }
+  Vectorized<c10::complex<float>> real() const { // wraps real_(): real parts in place, imag slots zeroed
+    return real_();
+  }
+  __m512 imag_() const { // keeps odd-position (imag) floats and zeroes even-position (real) floats
+    const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32( // setr: arguments are listed from lane 0 upward
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF,
+        0x00000000,
+        0xFFFFFFFF));
+    return _mm512_and_ps(values, imag_mask); // 0        b
+  }
+  Vectorized<c10::complex<float>> imag() const { // imag part moved into the real slot, imag slot zeroed
+    return _mm512_permute_ps(imag_(), 0xB1); // b        0   (pair swap of imag_()'s 0 b)
+  }
+  __m512 conj_() const { // flips the sign bit of every odd-position (imag) float
+    const __m512 sign_mask = _mm512_setr_ps( // -0.0 carries only the sign bit; 0.0 leaves a lane untouched under XOR
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0,
+        0.0,
+        -0.0);
+    return _mm512_xor_ps(values, sign_mask); // a       -b
+  }
+  Vectorized<c10::complex<float>> conj() const { // wraps conj_(): complex conjugate of every element
+    return conj_();
+  }
+  Vectorized<c10::complex<float>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number
+    // performance.
+    return map(std::log); // scalar std::log applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> log2() const {
+    const Vectorized<c10::complex<float>> natural_log = log();
+    return _mm512_div_ps(natural_log, _mm512_set1_ps(std::log(2))); // ln(z) / ln(2), on both components
+  }
+  Vectorized<c10::complex<float>> log10() const {
+    const Vectorized<c10::complex<float>> natural_log = log();
+    return _mm512_div_ps(natural_log, _mm512_set1_ps(std::log(10))); // ln(z) / ln(10), on both components
+  }
+  Vectorized<c10::complex<float>> log1p() const {
+    return map(std::log1p); // scalar std::log1p applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> asin() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // // asin(x)
+    // // = -i*ln(iz + sqrt(1 -z^2))
+    // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    // const __m512 one = _mm512_set1_ps(1);
+
+    // auto conj = conj_();
+    // auto b_a = _mm512_permute_ps(conj, 0xB1);                         //-b a
+    // auto ab = _mm512_mul_ps(conj, b_a);                               //-ab
+    // -ab auto im = _mm512_add_ps(ab, ab); //-2ab      -2ab
+
+    // auto val_2 = _mm512_mul_ps(values, values);                       // a*a
+    // b*b auto re = hsub_ps(val_2, _mm512_permute_ps(val_2, 0xB1));  // a*a-b*b
+    // b*b-a*a re = _mm512_sub_ps(one, re);
+
+    // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt();
+    // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_ps(b_a, root)).log();
+    // //ln(iz + sqrt()) return Vectorized(_mm512_permute_ps(ln.values,
+    // 0xB1)).conj();         //-i*ln()
+    return map(std::asin); // active path: scalar std::asin per element; the vectorized draft above is disabled
+  }
+  Vectorized<c10::complex<float>> acos() const {
+    return map(std::acos); // scalar std::acos applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> atan() const;
+  Vectorized<c10::complex<float>> atanh() const {
+    return map(std::atanh); // scalar std::atanh applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> exp() const {
+    // TODO: The vectorized implementation requires special handling for the
+    // case where real number/imag number is 0/Inf/NaN.
+    // //exp(a + bi)
+    // // = exp(a)*(cos(b) + sin(b)i)
+    // auto exp = Sleef_expf16_u10(values); //exp(a)           exp(b) exp =
+    // _mm512_mask_blend_ps(0xAAAA, exp, _mm512_permute_ps(exp, 0xB1)); //exp(a)
+    // exp(a)
+
+    // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b),
+    // cos(b)] auto cos_sin = _mm512_mask_blend_ps(0xAAAA,
+    // _mm512_permute_ps(sin_cos.y, 0xB1),
+    //                                sin_cos.x);                  //cos(b)
+    //                                sin(b)
+    // return _mm512_mul_ps(exp, cos_sin);
+    return map(std::exp); // active path: scalar std::exp per element; the vectorized draft above is disabled
+  }
+  Vectorized<c10::complex<float>> exp2() const {
+    // 2**z == exp(z * ln 2); the real scale factor multiplies both components.
+    const Vectorized<c10::complex<float>> z_times_ln2 =
+        _mm512_mul_ps(values, _mm512_set1_ps(c10::ln_2<float>));
+    return z_times_ln2.exp();
+  }
+  Vectorized<c10::complex<float>> expm1() const {
+    return map(std::expm1); // scalar std::expm1 applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> sin() const {
+    return map(std::sin); // scalar std::sin applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> sinh() const {
+    return map(std::sinh); // scalar std::sinh applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> cos() const {
+    return map(std::cos); // scalar std::cos applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> cosh() const {
+    return map(std::cosh); // scalar std::cosh applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> ceil() const {
+    return _mm512_ceil_ps(values); // applied to every float, so real and imag parts are rounded up independently
+  }
+  Vectorized<c10::complex<float>> floor() const {
+    return _mm512_floor_ps(values); // applied to every float, so real and imag parts are rounded down independently
+  }
+  Vectorized<c10::complex<float>> neg() const {
+    // Negate both components of every element by subtracting from zero.
+    return _mm512_sub_ps(_mm512_setzero_ps(), values);
+  }
+  Vectorized<c10::complex<float>> round() const { // rounds real and imag parts independently
+    return _mm512_roundscale_ps(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // nearest-integer mode, floating-point exceptions suppressed
+  }
+  Vectorized<c10::complex<float>> tan() const {
+    return map(std::tan); // scalar std::tan applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> tanh() const {
+    return map(std::tanh); // scalar std::tanh applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> trunc() const { // truncates real and imag parts independently
+    return _mm512_roundscale_ps(
+        values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); // round-toward-zero mode, floating-point exceptions suppressed
+  }
+  Vectorized<c10::complex<float>> sqrt() const {
+    return map(std::sqrt); // scalar std::sqrt applied element by element through map()
+  }
+  Vectorized<c10::complex<float>> reciprocal() const;
+  Vectorized<c10::complex<float>> rsqrt() const {
+    return sqrt().reciprocal(); // 1 / sqrt(z), composed from the two member operations
+  }
+  Vectorized<c10::complex<float>> pow(
+      const Vectorized<c10::complex<float>>& exp) const {
+    __at_align__ c10::complex<float> bases[size()];
+    __at_align__ c10::complex<float> exponents[size()];
+    store(bases);
+    exp.store(exponents);
+    for (const auto k : c10::irange(size())) {
+      bases[k] = std::pow(bases[k], exponents[k]); // scalar pow, result written back in place
+    }
+    return loadu(bases);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<c10::complex<float>> operator==( // per-float compare: real and imag slots are tested independently
+      const Vectorized<c10::complex<float>>& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_ps( // expand each mask bit into an all-ones 32-bit pattern
+        _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); // zero_vector is declared outside this view; presumably all zeros
+  }
+  Vectorized<c10::complex<float>> operator!=( // per-float compare: real and imag slots are tested independently
+      const Vectorized<c10::complex<float>>& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); // UQ (unordered) predicate here, unlike the OQ used by operator==
+    return _mm512_castsi512_ps( // expand each mask bit into an all-ones 32-bit pattern
+        _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); // zero_vector is declared outside this view; presumably all zeros
+  }
+  Vectorized<c10::complex<float>> operator<( // ordering comparison is rejected for complex vectors
+      const Vectorized<c10::complex<float>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers"); // condition is the literal false, so the check always fails
+  }
+  Vectorized<c10::complex<float>> operator<=( // ordering comparison is rejected for complex vectors
+      const Vectorized<c10::complex<float>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers"); // condition is the literal false, so the check always fails
+  }
+  Vectorized<c10::complex<float>> operator>( // ordering comparison is rejected for complex vectors
+      const Vectorized<c10::complex<float>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers"); // condition is the literal false, so the check always fails
+  }
+  Vectorized<c10::complex<float>> operator>=( // ordering comparison is rejected for complex vectors
+      const Vectorized<c10::complex<float>>& other [[maybe_unused]]) const {
+    TORCH_CHECK(false, "not supported for complex numbers"); // condition is the literal false, so the check always fails
+  }
+
+  Vectorized<c10::complex<float>> eq(
+      const Vectorized<c10::complex<float>>& other) const;
+  Vectorized<c10::complex<float>> ne(
+      const Vectorized<c10::complex<float>>& other) const;
+};
+
+template <>
+Vectorized<c10::complex<float>> inline operator+( // complex addition is component-wise, so a plain float add suffices
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm512_add_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator-( // complex subtraction is component-wise, so a plain float subtract suffices
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm512_sub_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator*(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  // With a = (x + yi) and b = (u + vi): a * b = (xu - yv) + (xv + yu)i
+  const __m512 negate_odd = _mm512_setr_ps(
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0,
+      0.0,
+      -0.0);
+  const auto straight = _mm512_mul_ps(a, b); // xu       yv
+
+  const auto swapped = _mm512_permute_ps(b, 0xB1); // v        u
+  const auto swapped_neg = _mm512_xor_ps(negate_odd, swapped); // v       -u
+  const auto crossed = _mm512_mul_ps(a, swapped_neg); // xv      -yu
+
+  // Pairwise differences: (xu - yv) lands in the real slot and
+  // (xv - (-yu)) = (xv + yu) lands in the imag slot.
+  return Vectorized<c10::complex<float>>::hsub_ps(straight, crossed);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator/( // element-wise complex division through a scalar loop
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // auto mask = _mm512_set1_ps(-0.f);
+  // auto fabs_cd = _mm512_andnot_ps(mask, b);     // |c|    |d|
+  // auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc));  // 1/sc
+  // 1/sc auto a2 = _mm512_mul_ps(a, scale);         // a/sc     b/sc auto b2 =
+  // _mm512_mul_ps(b, scale);         // c/sc     d/sc auto acbd2 =
+  // _mm512_mul_ps(a2, b2);
+
+  // const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0,
+  // -0.0, 0.0,
+  //                                         -0.0, 0.0, -0.0, 0.0, -0.0, 0.0,
+  //                                         -0.0, 0.0);
+  // auto dc2 = _mm512_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  // dc2 = _mm512_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  // auto adbc2 = _mm512_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  // auto res2 = Vectorized<c10::complex<float>>::hadd_ps(acbd2, adbc2);
+  // //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // // get the denominator
+  // auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  //
+  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2 res2 = _mm512_div_ps(res2, denom2); return
+  // res2;
+  __at_align__ c10::complex<float>
+      tmp1[Vectorized<c10::complex<float>>::size()]; // scratch copy of the dividend
+  __at_align__ c10::complex<float>
+      tmp2[Vectorized<c10::complex<float>>::size()]; // scratch copy of the divisor
+  __at_align__ c10::complex<float> out[Vectorized<c10::complex<float>>::size()];
+  a.store(tmp1);
+  b.store(tmp2);
+  for (const auto i : c10::irange(Vectorized<c10::complex<float>>::size())) {
+    out[i] = tmp1[i] / tmp2[i]; // active path: c10::complex division per element; the vectorized draft above is disabled
+  }
+  return _mm512_loadu_ps(reinterpret_cast<const float*>(out));
+}
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<float>> Vectorized<
+    c10::complex<float>>::reciprocal() const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // //re + im*i = (a + bi)  / (c + di)
+  // //re = (ac + bd)/abs_2() = c/abs_2()
+  // //im = (bc - ad)/abs_2() = d/abs_2()
+  // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+  // 0.0, -0.0,
+  //                                         0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+  //                                         0.0, -0.0);
+  // auto c_d = _mm512_xor_ps(sign_mask, values);    //c       -d
+  // return _mm512_div_ps(c_d, abs_2_());
+  __at_align__ c10::complex<float> tmp[size()]; // scratch copy of this vector
+  store(tmp);
+  for (const auto i : c10::irange(size())) {
+    tmp[i] = c10::complex<float>(1) / tmp[i]; // active path: scalar 1 / z per element; the vectorized draft above is disabled
+  }
+  return loadu(tmp);
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::atan()
+    const {
+  // TODO: The vectorized implementation requires special handling for the case
+  // where real number/imag number is 0/Inf/NaN.
+  // // atan(x) = i/2 * ln((i + z)/(i - z))
+  // const __m512 i = _mm512_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+  //                                 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  // const Vectorized i_half = _mm512_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
+  // 0.5,
+  //                                         0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
+  //                                         0.5);
+
+  // auto sum = Vectorized(_mm512_add_ps(i, values));                      // a
+  // 1+b auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a       1-b auto
+  // ln = (sum/sub).log();                                        // ln((i +
+  // z)/(i - z)) return i_half*ln; // i/2*ln()
+  return map(std::atan); // active path: scalar std::atan per element; the vectorized draft above is disabled
+}
+
+template <>
+Vectorized<c10::complex<float>> inline maximum(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  const auto norm_a = a.abs_2_(); // squared magnitude, in both slots
+  const auto norm_b = b.abs_2_();
+  const auto pick_b = _mm512_cmp_ps_mask(norm_a, norm_b, _CMP_LT_OQ);
+  const auto larger = _mm512_mask_blend_ps(pick_b, a, b);
+  // All-ones bits are a NaN, so OR-ing them in marks unordered lanes.
+  const auto unordered = _mm512_cmp_ps_mask(norm_a, norm_b, _CMP_UNORD_Q);
+  const auto nan_bits =
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), unordered, 0xFFFFFFFF);
+  return _mm512_or_ps(larger, _mm512_castsi512_ps(nan_bits));
+}
+
+template <>
+Vectorized<c10::complex<float>> inline minimum(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  const auto norm_a = a.abs_2_(); // squared magnitude, in both slots
+  const auto norm_b = b.abs_2_();
+  const auto pick_b = _mm512_cmp_ps_mask(norm_a, norm_b, _CMP_GT_OQ);
+  const auto smaller = _mm512_mask_blend_ps(pick_b, a, b);
+  // All-ones bits are a NaN, so OR-ing them in marks unordered lanes.
+  const auto unordered = _mm512_cmp_ps_mask(norm_a, norm_b, _CMP_UNORD_Q);
+  const auto nan_bits =
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), unordered, 0xFFFFFFFF);
+  return _mm512_or_ps(smaller, _mm512_castsi512_ps(nan_bits));
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator&( // bitwise AND of the raw float bit patterns
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm512_and_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator|( // bitwise OR of the raw float bit patterns
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm512_or_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator^( // bitwise XOR of the raw float bit patterns
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  return _mm512_xor_ps(a, b);
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::eq(
+    const Vectorized<c10::complex<float>>& other) const {
+  // operator== yields an all-ones/zero bit pattern per float; AND-ing the
+  // real result with the imag result requires both components to match.
+  const auto lanewise = (*this == other);
+  const auto both_match = lanewise.real() & lanewise.imag();
+  const Vectorized<c10::complex<float>> one(_mm512_set1_ps(1.0f));
+  return both_match & one; // the 1.0f bits survive only where the mask is set
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::ne(
+    const Vectorized<c10::complex<float>>& other) const {
+  // operator!= yields an all-ones/zero bit pattern per float; OR-ing the
+  // real result with the imag result flags a mismatch in either component.
+  const auto lanewise = (*this != other);
+  const auto any_differs = lanewise.real() | lanewise.imag();
+  const Vectorized<c10::complex<float>> one(_mm512_set1_ps(1.0f));
+  return any_differs & one; // the 1.0f bits survive only where the mask is set
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..44d8b70fa3c512d3b30557631b7cfed674252df9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_convert.h
@@ -0,0 +1,345 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec512/vec512_bfloat16.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_convert.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <>
+struct VecConvert<float, 1, BFloat16, 1> { // one BFloat16 vector in, one float vector out
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 1> result;
+    __m512 value; // presumably filled by the helper below through its second argument
+    cvtbf16_fp32(_mm512_castsi512_si256(src[0]), value); // only the low 256 bits of src[0] are passed on
+    result[0] = value;
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 1, Half, 1> { // one Half vector in, one float vector out
+  static inline VectorizedN<float, 1> apply(const VectorizedN<Half, 1>& src) {
+    VectorizedN<float, 1> result;
+    __m512 value; // presumably filled by the helper below through its second argument
+    cvtfp16_fp32(_mm512_castsi512_si256(src[0]), value); // only the low 256 bits of src[0] are passed on
+    result[0] = value;
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<BFloat16, 1, float, 1> { // one float vector in, one BFloat16 vector out
+  static inline VectorizedN<BFloat16, 1> apply(
+      const VectorizedN<float, 1>& src) {
+    VectorizedN<BFloat16, 1> result;
+    result[0] = _mm512_castsi256_si512(cvtfp32_bf16(src[0])); // 256-bit helper result widened by a cast; the cast leaves the upper 256 bits undefined
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<BFloat16, 1, float, 2> { // two float vectors in, one BFloat16 vector out
+  static inline VectorizedN<BFloat16, 1> apply(
+      const VectorizedN<float, 2>& src) {
+    VectorizedN<BFloat16, 1> result;
+    result[0] = convert_float_bfloat16(src[0], src[1]); // delegates to the pairwise helper defined outside this view
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 2, BFloat16, 1> { // one BFloat16 vector in, two float vectors out
+  static inline VectorizedN<float, 2> apply(
+      const VectorizedN<BFloat16, 1>& src) {
+    VectorizedN<float, 2> result;
+    std::tie(result[0], result[1]) = convert_bfloat16_float(src[0]); // helper's two results are unpacked into the two output vectors
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<Half, 1, float, 1> { // one float vector in, one Half vector out
+  static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 1>& src) {
+    VectorizedN<Half, 1> result;
+    result[0] = _mm512_castsi256_si512(cvtfp32_fp16(src[0])); // 256-bit helper result widened by a cast; the cast leaves the upper 256 bits undefined
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<Half, 1, float, 2> { // two float vectors in, one Half vector out
+  static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 2>& src) {
+    VectorizedN<Half, 1> result;
+    result[0] = convert_float_half(src[0], src[1]); // delegates to the pairwise helper defined outside this view
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 2, Half, 1> { // one Half vector in, two float vectors out
+  static inline VectorizedN<float, 2> apply(const VectorizedN<Half, 1>& src) {
+    VectorizedN<float, 2> result;
+    std::tie(result[0], result[1]) = convert_half_float(src[0]); // helper's two results are unpacked into the two output vectors
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<float, 1, int64_t, 2> {
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    const auto lower_half = _mm512_cvtepi64_ps(src[0]); // 8 x int64 -> 8 x float
+    const auto upper_half = _mm512_cvtepi64_ps(src[1]);
+    const auto widened = _mm512_castps256_ps512(lower_half);
+    return Vectorized<float>(_mm512_insertf32x8(widened, upper_half, 1));
+  }
+};
+
+template <>
+struct VecConvert<int64_t, 2, float, 1> { // one float vector in, two int64 vectors out
+  static inline VectorizedN<int64_t, 2> apply(
+      const VectorizedN<float, 1>& src) {
+    at::vec::VectorizedN<int64_t, 2> result;
+    result[0] = _mm512_cvt_roundps_epi64( // low 8 floats, converted with round-toward-zero
+        _mm512_castps512_ps256(src[0]), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+    result[1] = _mm512_cvt_roundps_epi64( // high 8 floats, same rounding mode
+        _mm512_extractf32x8_ps(src[0], 1),
+        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, int64_t, 2> {
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    const auto lower_half = _mm512_cvtepi64_epi32(src[0]); // 8 x int64 -> 8 x int32
+    const auto upper_half = _mm512_cvtepi64_epi32(src[1]);
+    const auto widened = _mm512_castsi256_si512(lower_half);
+    return Vectorized<int32_t>(_mm512_inserti32x8(widened, upper_half, 1));
+  }
+};
+
+template <>
+struct VecConvert<int64_t, 2, int32_t, 1> { // one int32 vector in, two int64 vectors out
+  static inline VectorizedN<int64_t, 2> apply(
+      const VectorizedN<int32_t, 1>& src) {
+    at::vec::VectorizedN<int64_t, 2> result;
+    result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src[0])); // low 8 int32 widened to int64
+    result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src[0], 1)); // high 8 int32 widened to int64
+    return result;
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, int8_t, 1> { // widens the low 16 int8 values to int32
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<int8_t, 1>& src) {
+    auto src128 = _mm512_castsi512_si128(src[0]); // only the low 128 bits (16 bytes) are used
+    return Vectorized<int32_t>(_mm512_cvtepi8_epi32(src128)); // epi8: signed widening
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, uint8_t, 1> { // widens the low 16 uint8 values to int32
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<uint8_t, 1>& src) {
+    auto src128 = _mm512_castsi512_si128(src[0]); // only the low 128 bits (16 bytes) are used
+    return Vectorized<int32_t>(_mm512_cvtepu8_epi32(src128)); // epu8: unsigned widening
+  }
+};
+
+template <>
+struct VecConvert<int32_t, 1, float, 1> { // float -> int32, lane for lane
+  static inline VectorizedN<int32_t, 1> apply(
+      const VectorizedN<float, 1>& src) {
+    return Vectorized<int32_t>(_mm512_cvttps_epi32(src[0])); // cvtt: the truncating conversion variant
+  }
+};
+
+template <>
+struct VecConvert<float, 1, int32_t, 1> { // int32 -> float, lane for lane
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<int32_t, 1>& src) {
+    return Vectorized<float>(_mm512_cvtepi32_ps(src[0])); // epi32: source lanes are treated as signed
+  }
+};
+
+template <>
+struct VecConvert<int16_t, 1, uint8_t, 1> { // widens the low 32 uint8 values to int16
+  static inline VectorizedN<int16_t, 1> apply(
+      const VectorizedN<uint8_t, 1>& src) {
+    auto src256 = _mm512_castsi512_si256(src[0]); // only the low 256 bits (32 bytes) are used
+    return Vectorized<int16_t>(_mm512_cvtepu8_epi16(src256)); // epu8: unsigned widening
+  }
+};
+
+template <>
+struct VecConvert<int8_t, 1, int32_t, 1> { // narrows 16 int32 values to int8
+  static inline VectorizedN<int8_t, 1> apply(
+      const VectorizedN<int32_t, 1>& src) {
+    auto src128 = _mm512_cvtepi32_epi8(src[0]); // 16 narrowed bytes in a 128-bit register
+    return Vectorized<int8_t>(_mm512_castsi128_si512(src128)); // widened by a cast; the cast leaves the upper 384 bits undefined
+  }
+};
+
+template <>
+struct VecConvert<int8_t, 1, int16_t, 1> { // narrows 32 int16 values to int8
+  static inline VectorizedN<int8_t, 1> apply(
+      const VectorizedN<int16_t, 1>& src) {
+    auto src256 = _mm512_cvtepi16_epi8(src[0]); // 32 narrowed bytes in a 256-bit register
+    return Vectorized<int8_t>(_mm512_castsi256_si512(src256)); // widened by a cast; the cast leaves the upper 256 bits undefined
+  }
+};
+
+template <typename dst_t, typename src_t>
+struct VecConvert< // enabled when one side is a reduced floating-point type and the other an 8-bit integer
+    dst_t,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<
+        (is_reduced_floating_point_v<dst_t> && is_8bit_integer_v<src_t>) ||
+            (is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
+        void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
+    VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src); // step 1: source -> two float vectors
+    return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32); // step 2: two float vectors -> destination
+  }
+};
+
+template <typename dst_t>
+struct VecConvert< // two float vectors -> one vector of 8-bit integers
+    dst_t,
+    1,
+    float,
+    2,
+    typename std::enable_if_t<is_8bit_integer_v<dst_t>, void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
+    at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]); // presumably leaves its converted bytes in the low 128 bits; confirm against the helper
+    at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
+    __m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2)); // low 128 bits of the second conversion
+    __m512 result = _mm512_insertf32x4(
+        _mm512_castsi512_ps(vec1),
+        lane2,
+        1); // Insert lane2 into the second 128-bit lane
+    return at::vec::Vectorized<dst_t>(_mm512_castps_si512(result)); // float casts are bit reinterpretations only
+  }
+};
+
+template <typename dst_t>
+struct VecConvert<
+    dst_t,
+    1,
+    float,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<dst_t>, void>> {
+  // Converts a single float vector to an 8-bit integer vector by delegating
+  // to convert_float_to_int8.
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
+    auto narrowed = convert_float_to_int8<dst_t>(src[0]);
+    return narrowed;
+  }
+};
+
+// Converts one vector of an 8-bit integer type into two float vectors.
+template <typename src_t>
+struct VecConvert<
+    float,
+    2,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
+    // Move the second 128-bit lane of the source into the low 128 bits of a
+    // fresh register so it can be handed to the same helper as the first lane.
+    // The ps casts only reinterpret bits; the cast back up to 512 bits does
+    // not define the upper bits.
+    __m512i src2 =
+        _mm512_castsi128_si512(_mm_castps_si128(_mm512_extractf32x4_ps(
+            _mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane
+                                                ));
+    // NOTE(review): presumably convert_int8_to_float only reads the lowest
+    // 16 bytes of its argument -- confirm against the helper.
+    return VectorizedN<float, 2>(
+        convert_int8_to_float<src_t>(src[0]),
+        convert_int8_to_float<src_t>(src2));
+  }
+};
+
+template <typename src_t>
+struct VecConvert<
+    float,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>, void>> {
+  // Converts an 8-bit integer vector to a single float vector by delegating
+  // to convert_int8_to_float.
+  static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
+    auto widened = convert_int8_to_float<src_t>(src[0]);
+    return widened;
+  }
+};
+
+template <typename dst_t>
+struct VecConvert<
+    dst_t,
+    1,
+    int64_t,
+    2,
+    std::enable_if_t<
+        std::is_same_v<dst_t, int8_t> || std::is_same_v<dst_t, uint8_t>>> {
+  // Converts two int64 vectors to one int8/uint8 vector in two steps:
+  // int64 x2 -> int32 x1, then int32 -> dst_t.
+  static inline VectorizedN<dst_t, 1> apply(
+      const VectorizedN<int64_t, 2>& src) {
+    const auto as_int32 = VecConvert<int32_t, 1, int64_t, 2>::apply(src);
+    return VecConvert<dst_t, 1, int32_t, 1>::apply(as_int32);
+  }
+};
+
+// Converts one float vector to Float8_e4m3fn via cvtfp32_fp8e4m3.
+template <>
+struct VecConvert<Float8_e4m3fn, 1, float, 1> {
+  static inline VectorizedN<Float8_e4m3fn, 1> apply(
+      const VectorizedN<float, 1>& src_n) {
+    at::vec::Vectorized<float> src = src_n[0];
+    // The helper returns the converted values packed in 128 bits.
+    __m128i res128 = cvtfp32_fp8e4m3(src);
+    // Widen the register type only; the cast does not define the upper bits.
+    return at::vec::Vectorized<Float8_e4m3fn>(_mm512_castsi128_si512(res128));
+  }
+};
+
+// Converts the first 16 Float8_e4m3fn elements of the source to one float
+// vector via cvtfp8e4m3_fp32.
+template <>
+struct VecConvert<float, 1, Float8_e4m3fn, 1> {
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<Float8_e4m3fn, 1>& src_n) {
+    // cvt first 16x8 bits from Float8_e4m3fn to float
+    at::vec::Vectorized<Float8_e4m3fn> src = src_n[0];
+    // `result` is filled in by the helper through its second argument.
+    __m512 result;
+    cvtfp8e4m3_fp32(_mm512_castsi512_si128(src), result);
+    return at::vec::Vectorized<float>(result);
+  }
+};
+
+// Converts one float vector to Float8_e5m2 via cvtfp32_fp8e5m2.
+template <>
+struct VecConvert<Float8_e5m2, 1, float, 1> {
+  static inline VectorizedN<Float8_e5m2, 1> apply(
+      const VectorizedN<float, 1>& src_n) {
+    at::vec::Vectorized<float> src = src_n[0];
+    // The helper returns the converted values packed in 128 bits.
+    __m128i res128 = cvtfp32_fp8e5m2(src);
+    // Widen the register type only; the cast does not define the upper bits.
+    return at::vec::Vectorized<Float8_e5m2>(_mm512_castsi128_si512(res128));
+  }
+};
+
+// Converts the first 16 Float8_e5m2 elements of the source to one float
+// vector via cvtfp8e5m2_fp32.
+template <>
+struct VecConvert<float, 1, Float8_e5m2, 1> {
+  static inline VectorizedN<float, 1> apply(
+      const VectorizedN<Float8_e5m2, 1>& src_n) {
+    // cvt first 16x8 bits from Float8_e5m2 to float
+    at::vec::Vectorized<Float8_e5m2> src = src_n[0];
+    // `result` is filled in by the helper through its second argument.
+    __m512 result;
+    cvtfp8e5m2_fp32(_mm512_castsi512_si128(src), result);
+    return at::vec::Vectorized<float>(result);
+  }
+};
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1ca121d301df6c9fb71b0eef28a9efe8fd03f8b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h
@@ -0,0 +1,571 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if (defined(CPU_CAPABILITY_AVX512))
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+// Flags double as having a dedicated Vectorized specialization in this build.
+template <>
+struct is_vec_specialized_for<double> : std::true_type {};
+
+// AVX-512 specialization of Vectorized for double. It wraps a single __m512d
+// register holding 8 lanes; element-wise math is implemented with AVX-512
+// intrinsics, SLEEF vector routines, or a scalar fallback through map().
+template <>
+class Vectorized<double> {
+ private:
+  // All-zero integer vector, used as the base value when a lane mask is
+  // expanded into all-ones / all-zeros lanes.
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+
+ public:
+  // values needs to be public for compilation with clang
+  // as vec512.h uses it
+  __m512d values;
+  using value_type = double;
+  using size_type = int;
+  // Number of double lanes in one vector.
+  static constexpr size_type size() {
+    return 8;
+  }
+  // Default construction zero-initializes every lane.
+  Vectorized() {
+    values = _mm512_setzero_pd();
+  }
+  Vectorized(__m512d v) : values(v) {}
+  // Broadcasts val to every lane.
+  Vectorized(double val) {
+    values = _mm512_set1_pd(val);
+  }
+  // Builds a vector from 8 scalars; val1 goes to lane 0 (setr ordering).
+  Vectorized(
+      double val1,
+      double val2,
+      double val3,
+      double val4,
+      double val5,
+      double val6,
+      double val7,
+      double val8) {
+    values = _mm512_setr_pd(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  operator __m512d() const {
+    return values;
+  }
+  // Compile-time blend: lane i comes from b when bit i of `mask` is set,
+  // otherwise from a.
+  template <int64_t mask>
+  static Vectorized<double> blend(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b) {
+    return _mm512_mask_blend_pd(mask, a.values, b.values);
+  }
+  // Run-time blend: lane i comes from b only when all 64 bits of mask lane i
+  // are set (the lane is compared for equality with all-ones), else from a.
+  static Vectorized<double> blendv(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      const Vectorized<double>& mask) {
+    auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+    auto mmask = _mm512_cmp_epi64_mask(
+        _mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_pd(mmask, a.values, b.values);
+  }
+  // Returns {base, base + step, ..., base + 7 * step}.
+  template <typename step_t>
+  static Vectorized<double> arange(
+      double base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<double>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  // Returns a vector whose first `count` lanes come from b and whose
+  // remaining lanes come from a. Any count outside 0..7 returns b unchanged.
+  static Vectorized<double> set(
+      const Vectorized<double>& a,
+      const Vectorized<double>& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  // Unaligned load. With count == size() all 8 lanes are read; otherwise only
+  // the first `count` lanes are read through a masked load and the remaining
+  // lanes are zeroed.
+  // NOTE(review): count is not range-checked before the shift below; callers
+  // are presumably expected to pass 0 <= count <= size() -- confirm.
+  static Vectorized<double> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+    __mmask8 mask = (1ULL << count) - 1;
+    return _mm512_maskz_loadu_pd(mask, ptr);
+  }
+  // Unaligned store of the first `count` lanes. Nothing is written when
+  // count <= 0.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) {
+      __mmask8 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_pd(reinterpret_cast<double*>(ptr), mask, values);
+    }
+  }
+  // Per-lane indexing is intentionally unavailable.
+  const double& operator[](int idx) const = delete;
+  double& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    __mmask8 cmp = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_EQ_OQ);
+    return static_cast<int32_t>(cmp);
+  }
+  // Returns all-ones lanes where the value is NaN (unordered against 0.0) and
+  // all-zero lanes elsewhere.
+  Vectorized<double> isnan() const {
+    auto cmp_mask =
+        _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_UNORD_Q);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+  // True when any lane is +/-inf or NaN. x - x is zero for finite x and NaN
+  // otherwise. movepi8_mask gathers the top bit of every byte, and the 0x77
+  // pattern drops bits 3 and 7 of each 8-bit group, so the sign bit of each
+  // lane (bit 7 of its group) never triggers the test.
+  bool has_inf_nan() const {
+    __m512d self_sub = _mm512_sub_pd(values, values);
+    return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) &
+            0x7777777777777777) != 0;
+  }
+  // Scalar fallback: spills the lanes to a temporary array, applies f to each
+  // element and reloads the result.
+  Vectorized<double> map(double (*const f)(double)) const {
+    __at_align__ double tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  // Clears the sign bit of every lane (and-not with -0.0).
+  Vectorized<double> abs() const {
+    auto mask = _mm512_set1_pd(-0.f);
+    return _mm512_andnot_pd(mask, values);
+  }
+  // Angle of a real number: pi for lanes < 0, 0 for other ordered lanes, and
+  // NaN for NaN lanes.
+  Vectorized<double> angle() const {
+    const auto zero_vec = _mm512_castsi512_pd(zero_vector);
+    const auto nan_vec = _mm512_set1_pd(NAN);
+    // values == values is false only for NaN lanes.
+    const auto not_nan_mask = _mm512_cmp_pd_mask(values, values, _CMP_EQ_OQ);
+    const auto not_nan =
+        _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF);
+    // Lanes left at zero above are exactly the NaN lanes.
+    const auto nan_mask =
+        _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm512_set1_pd(c10::pi<double>);
+
+    const auto neg_mask = _mm512_cmp_pd_mask(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm512_mask_blend_pd(neg_mask, zero_vec, pi);
+    angle = _mm512_mask_blend_pd(nan_mask, angle, nan_vec);
+    return angle;
+  }
+  // Complex-number helpers for a real type: real() and conj() are the
+  // identity and imag() is zero.
+  Vectorized<double> real() const {
+    return *this;
+  }
+  Vectorized<double> imag() const {
+    return _mm512_set1_pd(0);
+  }
+  Vectorized<double> conj() const {
+    return *this;
+  }
+  // The transcendental functions below forward to the 8-lane double SLEEF
+  // routines, except where map() applies a scalar calc_* helper per lane.
+  Vectorized<double> acos() const {
+    return Vectorized<double>(Sleef_acosd8_u10(values));
+  }
+  Vectorized<double> acosh() const {
+    return Vectorized<double>(Sleef_acoshd8_u10(values));
+  }
+  Vectorized<double> asin() const {
+    return Vectorized<double>(Sleef_asind8_u10(values));
+  }
+  Vectorized<double> asinh() const {
+    return Vectorized<double>(Sleef_asinhd8_u10(values));
+  }
+  Vectorized<double> atan() const {
+    return Vectorized<double>(Sleef_atand8_u10(values));
+  }
+  Vectorized<double> atanh() const {
+    return Vectorized<double>(Sleef_atanhd8_u10(values));
+  }
+  Vectorized<double> atan2(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_atan2d8_u10(values, b));
+  }
+  Vectorized<double> copysign(const Vectorized<double>& sign) const {
+    return Vectorized<double>(Sleef_copysignd8(values, sign));
+  }
+  Vectorized<double> erf() const {
+    return Vectorized<double>(Sleef_erfd8_u10(values));
+  }
+  Vectorized<double> erfc() const {
+    return Vectorized<double>(Sleef_erfcd8_u15(values));
+  }
+  Vectorized<double> erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized<double> exp() const {
+    return Vectorized<double>(Sleef_expd8_u10(values));
+  }
+  Vectorized<double> exp2() const {
+    return Vectorized<double>(Sleef_exp2d8_u10(values));
+  }
+  Vectorized<double> expm1() const {
+    return Vectorized<double>(Sleef_expm1d8_u10(values));
+  }
+  // For double, exp_u20 and fexp_u20 simply reuse exp().
+  Vectorized<double> exp_u20() const {
+    return exp();
+  }
+  Vectorized<double> fexp_u20() const {
+    return exp();
+  }
+  Vectorized<double> fmod(const Vectorized<double>& q) const {
+    return Vectorized<double>(Sleef_fmodd8(values, q));
+  }
+  Vectorized<double> hypot(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_hypotd8_u05(values, b));
+  }
+  Vectorized<double> i0() const {
+    return map(calc_i0);
+  }
+  Vectorized<double> i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized<double> digamma() const {
+    return map(calc_digamma);
+  }
+  // igamma / igammac take two operands, so they cannot go through map():
+  // both vectors are spilled and calc_igamma / calc_igammac is applied to
+  // each (this, x) pair of lanes.
+  Vectorized<double> igamma(const Vectorized<double>& x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> igammac(const Vectorized<double>& x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<double> log() const {
+    return Vectorized<double>(Sleef_logd8_u10(values));
+  }
+  Vectorized<double> log2() const {
+    return Vectorized<double>(Sleef_log2d8_u10(values));
+  }
+  Vectorized<double> log10() const {
+    return Vectorized<double>(Sleef_log10d8_u10(values));
+  }
+  Vectorized<double> log1p() const {
+    return Vectorized<double>(Sleef_log1pd8_u10(values));
+  }
+  Vectorized<double> sin() const {
+    return Vectorized<double>(Sleef_sind8_u10(values));
+  }
+  Vectorized<double> sinh() const {
+    return Vectorized<double>(Sleef_sinhd8_u10(values));
+  }
+  Vectorized<double> cos() const {
+    return Vectorized<double>(Sleef_cosd8_u10(values));
+  }
+  Vectorized<double> cosh() const {
+    return Vectorized<double>(Sleef_coshd8_u10(values));
+  }
+  Vectorized<double> ceil() const {
+    return _mm512_ceil_pd(values);
+  }
+  Vectorized<double> floor() const {
+    return _mm512_floor_pd(values);
+  }
+  // Declared here and defined after the class, once operator- is available.
+  Vectorized<double> frac() const;
+  // Flips the sign bit of every lane (xor with -0.0).
+  Vectorized<double> neg() const {
+    return _mm512_xor_pd(_mm512_set1_pd(-0.), values);
+  }
+  Vectorized<double> nextafter(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_nextafterd8(values, b));
+  }
+  // Rounds to the nearest integer without raising floating point exceptions.
+  Vectorized<double> round() const {
+    return _mm512_roundscale_pd(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<double> tan() const {
+    return Vectorized<double>(Sleef_tand8_u10(values));
+  }
+  Vectorized<double> tanh() const {
+    return Vectorized<double>(Sleef_tanhd8_u10(values));
+  }
+  // Rounds toward zero without raising floating point exceptions.
+  Vectorized<double> trunc() const {
+    return _mm512_roundscale_pd(
+        values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<double> lgamma() const {
+    return Vectorized<double>(Sleef_lgammad8_u10(values));
+  }
+  Vectorized<double> sqrt() const {
+    return _mm512_sqrt_pd(values);
+  }
+  // reciprocal and rsqrt are computed with a real division (1 / x and
+  // 1 / sqrt(x)) rather than an approximate-reciprocal instruction.
+  Vectorized<double> reciprocal() const {
+    return _mm512_div_pd(_mm512_set1_pd(1), values);
+  }
+  Vectorized<double> rsqrt() const {
+    return _mm512_div_pd(_mm512_set1_pd(1), _mm512_sqrt_pd(values));
+  }
+  Vectorized<double> pow(const Vectorized<double>& b) const {
+    return Vectorized<double>(Sleef_powd8_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  // Each operator returns all-ones lanes where the comparison holds and
+  // all-zero lanes elsewhere. operator!= uses _CMP_NEQ_UQ instead, so it is
+  // true when an operand is NaN.
+  Vectorized<double> operator==(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized<double> operator!=(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized<double> operator<(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LT_OQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized<double> operator<=(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LE_OQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized<double> operator>(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GT_OQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized<double> operator>=(const Vectorized<double>& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GE_OQ);
+    return _mm512_castsi512_pd(
+        _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF));
+  }
+
+  // Named comparison variants; only declared here, defined out of class.
+  Vectorized<double> eq(const Vectorized<double>& other) const;
+  Vectorized<double> ne(const Vectorized<double>& other) const;
+  Vectorized<double> lt(const Vectorized<double>& other) const;
+  Vectorized<double> le(const Vectorized<double>& other) const;
+  Vectorized<double> gt(const Vectorized<double>& other) const;
+  Vectorized<double> ge(const Vectorized<double>& other) const;
+};
+
+// Lane-wise sum of two double vectors.
+template <>
+inline Vectorized<double> operator+(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d sum = _mm512_add_pd(a.values, b.values);
+  return Vectorized<double>(sum);
+}
+
+// Lane-wise difference a - b of two double vectors.
+template <>
+inline Vectorized<double> operator-(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d difference = _mm512_sub_pd(a.values, b.values);
+  return Vectorized<double>(difference);
+}
+
+// Lane-wise product of two double vectors.
+template <>
+inline Vectorized<double> operator*(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d product = _mm512_mul_pd(a.values, b.values);
+  return Vectorized<double>(product);
+}
+
+// Lane-wise quotient a / b of two double vectors.
+template <>
+inline Vectorized<double> operator/(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d quotient = _mm512_div_pd(a.values, b.values);
+  return Vectorized<double>(quotient);
+}
+
+// frac: the value minus its truncation toward zero. Defined out of class
+// because it needs the operator- specialization above.
+inline Vectorized<double> Vectorized<double>::frac() const {
+  const Vectorized<double> whole_part = trunc();
+  return *this - whole_part;
+}
+
+// IEEE 754 201X `maximum`: lane-wise max that yields NaN whenever either
+// input lane is NaN.
+template <>
+inline Vectorized<double> maximum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // Lanes in which a or b is NaN compare as unordered.
+  const __mmask8 unordered = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+  // Expand the lane mask into all-ones (unordered) / all-zero 64-bit lanes.
+  const __m512i nan_bits = _mm512_mask_set1_epi64(
+      _mm512_set1_epi64(0), unordered, 0xFFFFFFFFFFFFFFFF);
+  const __m512d larger = _mm512_max_pd(a, b);
+  // An all-ones bit pattern is a NaN, so OR-ing forces NaN into those lanes.
+  return _mm512_or_pd(larger, _mm512_castsi512_pd(nan_bits));
+}
+
+// IEEE 754 201X `minimum`: lane-wise min that yields NaN whenever either
+// input lane is NaN.
+template <>
+inline Vectorized<double> minimum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // Lanes in which a or b is NaN compare as unordered.
+  const __mmask8 unordered = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+  // Expand the lane mask into all-ones (unordered) / all-zero 64-bit lanes.
+  const __m512i nan_bits = _mm512_mask_set1_epi64(
+      _mm512_set1_epi64(0), unordered, 0xFFFFFFFFFFFFFFFF);
+  const __m512d smaller = _mm512_min_pd(a, b);
+  // An all-ones bit pattern is a NaN, so OR-ing forces NaN into those lanes.
+  return _mm512_or_pd(smaller, _mm512_castsi512_pd(nan_bits));
+}
+
+// Clamps every lane of a into [min, max]: first raise to at least min, then
+// lower to at most max. The operand order of the intrinsics is kept as is.
+template <>
+inline Vectorized<double> clamp(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min,
+    const Vectorized<double>& max) {
+  const __m512d raised = _mm512_max_pd(min.values, a.values);
+  return Vectorized<double>(_mm512_min_pd(max.values, raised));
+}
+
+// Raises every lane of a to at least the corresponding lane of min.
+template <>
+inline Vectorized<double> clamp_min(
+    const Vectorized<double>& a,
+    const Vectorized<double>& min) {
+  const __m512d raised = _mm512_max_pd(min.values, a.values);
+  return Vectorized<double>(raised);
+}
+
+// Lowers every lane of a to at most the corresponding lane of max.
+template <>
+inline Vectorized<double> clamp_max(
+    const Vectorized<double>& a,
+    const Vectorized<double>& max) {
+  const __m512d lowered = _mm512_min_pd(max.values, a.values);
+  return Vectorized<double>(lowered);
+}
+
+// Bitwise AND of the raw bit patterns of two double vectors.
+template <>
+inline Vectorized<double> operator&(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d bits = _mm512_and_pd(a.values, b.values);
+  return Vectorized<double>(bits);
+}
+
+// Bitwise OR of the raw bit patterns of two double vectors.
+template <>
+inline Vectorized<double> operator|(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d bits = _mm512_or_pd(a.values, b.values);
+  return Vectorized<double>(bits);
+}
+
+// Bitwise XOR of the raw bit patterns of two double vectors.
+template <>
+inline Vectorized<double> operator^(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  const __m512d bits = _mm512_xor_pd(a.values, b.values);
+  return Vectorized<double>(bits);
+}
+
+// Numeric form of operator==: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where the lanes are equal and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::eq(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this == other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Numeric form of operator!=: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where the lanes differ and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::ne(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this != other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Numeric form of operator>: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where this > other and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::gt(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this > other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Numeric form of operator>=: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where this >= other and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::ge(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this >= other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Numeric form of operator<: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where this < other and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::lt(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this < other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Numeric form of operator<=: the all-ones / all-zero comparison result is
+// AND-ed with 1.0, giving 1.0 where this <= other and 0.0 elsewhere.
+inline Vectorized<double> Vectorized<double>::le(
+    const Vectorized<double>& other) const {
+  const Vectorized<double> lane_mask = (*this <= other);
+  return lane_mask & Vectorized<double>(1.0);
+}
+
+// Copies n doubles from src to dst: whole vectors are moved with unaligned
+// AVX-512 loads/stores, then the remaining tail is copied element by element.
+template <>
+inline void convert(const double* src, double* dst, int64_t n) {
+  constexpr int64_t kLanes = Vectorized<double>::size();
+  int64_t i = 0;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; i <= (n - kLanes); i += kLanes) {
+    const __m512d chunk = _mm512_loadu_pd(src + i);
+    _mm512_storeu_pd(dst + i, chunk);
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; i < n; ++i) {
+    dst[i] = src[i];
+  }
+}
+
+// Fused multiply-add per lane: a * b + c.
+template <>
+inline Vectorized<double> fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  const __m512d fused = _mm512_fmadd_pd(a.values, b.values, c.values);
+  return Vectorized<double>(fused);
+}
+
+// Fused negated multiply-add per lane: -(a * b) + c.
+template <>
+inline Vectorized<double> fnmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  const __m512d fused = _mm512_fnmadd_pd(a.values, b.values, c.values);
+  return Vectorized<double>(fused);
+}
+
+// Fused multiply-subtract per lane: a * b - c.
+template <>
+inline Vectorized<double> fmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  const __m512d fused = _mm512_fmsub_pd(a.values, b.values, c.values);
+  return Vectorized<double>(fused);
+}
+
+// Fused negated multiply-subtract per lane: -(a * b) - c.
+template <>
+inline Vectorized<double> fnmsub(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  const __m512d fused = _mm512_fnmsub_pd(a.values, b.values, c.values);
+  return Vectorized<double>(fused);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..e390db15bfa62b8607ffa72e8bca018e8e1a9432
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h
@@ -0,0 +1,945 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX512)
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+// Flags float as having a dedicated Vectorized specialization in this build.
+template <>
+struct is_vec_specialized_for<float> : std::true_type {};
+
+template <>
+class Vectorized<float> {
+ private:
+  static constexpr __m512i zero_vec{0, 0, 0, 0, 0, 0, 0, 0};
+
+ public:
+  __m512 values;
+  using value_type = float;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized() {
+    values = _mm512_setzero_ps();
+  }
+  Vectorized(__m512 v) : values(v) {}
+  Vectorized(float val) {
+    values = _mm512_set1_ps(val);
+  }
+  Vectorized(
+      float val1,
+      float val2,
+      float val3,
+      float val4,
+      float val5,
+      float val6,
+      float val7,
+      float val8,
+      float val9,
+      float val10,
+      float val11,
+      float val12,
+      float val13,
+      float val14,
+      float val15,
+      float val16) {
+    values = _mm512_setr_ps(
+        val1,
+        val2,
+        val3,
+        val4,
+        val5,
+        val6,
+        val7,
+        val8,
+        val9,
+        val10,
+        val11,
+        val12,
+        val13,
+        val14,
+        val15,
+        val16);
+  }
+  Vectorized(const float (&arr)[16])
+      : Vectorized(
+            arr[0],
+            arr[1],
+            arr[2],
+            arr[3],
+            arr[4],
+            arr[5],
+            arr[6],
+            arr[7],
+            arr[8],
+            arr[9],
+            arr[10],
+            arr[11],
+            arr[12],
+            arr[13],
+            arr[14],
+            arr[15]) {}
+  operator __m512() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<float> blend(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b) {
+    return _mm512_mask_blend_ps(mask, a.values, b.values);
+  }
+  static Vectorized<float> blendv(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      const Vectorized<float>& mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    auto mmask = _mm512_cmp_epi32_mask(
+        _mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_ps(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<float> arange(
+      float base = 0.f,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<float>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  static Vectorized<float> set(
+      const Vectorized<float>& a,
+      const Vectorized<float>& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_ps(reinterpret_cast<const float*>(ptr));
+
+    __mmask16 mask = (1ULL << count) - 1;
+    return _mm512_maskz_loadu_ps(mask, ptr);
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) {
+      __mmask16 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_ps(reinterpret_cast<float*>(ptr), mask, values);
+    }
+  }
+  const float& operator[](int idx) const = delete;
+  float& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit
+    // and others are translated to 0-bit
+    __mmask16 cmp = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_EQ_OQ);
+    return static_cast<int32_t>(cmp);
+  }
+  Vectorized<float> isnan() const {
+    auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q);
+    return _mm512_castsi512_ps(
+        _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF));
+  }
+  bool has_inf_nan() const {
+    __m512 self_sub = _mm512_sub_ps(values, values);
+    return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) &
+            0x7777777777777777) != 0;
+  }
+  Vectorized<float> map(float (*const f)(float)) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<float> abs() const {
+    auto mask = _mm512_set1_ps(-0.f);
+    return _mm512_andnot_ps(mask, values);
+  }
+  Vectorized<float> angle() const {
+    __m512 zero_vec = _mm512_set1_ps(0.f);
+    const auto nan_vec = _mm512_set1_ps(NAN);
+    const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ);
+    const auto not_nan_vec = _mm512_mask_set1_epi32(
+        _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF);
+    const auto nan_mask = _mm512_cmp_ps_mask(
+        _mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm512_set1_ps(c10::pi<double>);
+
+    const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi);
+    angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec);
+    return angle;
+  }
+  Vectorized<float> real() const {
+    return *this;
+  }
+  Vectorized<float> imag() const {
+    return _mm512_set1_ps(0);
+  }
+  Vectorized<float> conj() const {
+    return *this;
+  }
+  Vectorized<float> acos() const {
+    return Vectorized<float>(Sleef_acosf16_u10(values));
+  }
+  Vectorized<float> acosh() const {
+    return Vectorized<float>(Sleef_acoshf16_u10(values));
+  }
+  Vectorized<float> asin() const {
+    return Vectorized<float>(Sleef_asinf16_u10(values));
+  }
+  Vectorized<float> asinh() const {
+    return Vectorized<float>(Sleef_asinhf16_u10(values));
+  }
+  Vectorized<float> atan() const {
+    return Vectorized<float>(Sleef_atanf16_u10(values));
+  }
+  Vectorized<float> atanh() const {
+    return Vectorized<float>(Sleef_atanhf16_u10(values));
+  }
+  Vectorized<float> atan2(const Vectorized<float>& b) const {
+    return Vectorized<float>(Sleef_atan2f16_u10(values, b));
+  }
+  Vectorized<float> copysign(const Vectorized<float>& sign) const {
+    return Vectorized<float>(Sleef_copysignf16(values, sign));
+  }
+  Vectorized<float> erf() const {
+    // constants
+    const auto neg_zero_vec = _mm512_set1_ps(-0.f);
+    const auto one_vec = _mm512_set1_ps(1.0f);
+    const auto p = _mm512_set1_ps(0.3275911f);
+    const auto p1 = _mm512_set1_ps(0.254829592f);
+    const auto p2 = _mm512_set1_ps(-0.284496736f);
+    const auto p3 = _mm512_set1_ps(1.421413741f);
+    const auto p4 = _mm512_set1_ps(-1.453152027f);
+    const auto p5 = _mm512_set1_ps(1.061405429f);
+    // sign(x)
+    auto sign_mask = _mm512_and_ps(neg_zero_vec, values);
+    auto abs_vec = _mm512_abs_ps(values);
+    // t = 1 / (p * abs(x) + 1)
+    auto tmp0 = _mm512_fmadd_ps(p, abs_vec, one_vec);
+    auto t = _mm512_div_ps(one_vec, tmp0);
+    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
+    auto tmp1 = _mm512_fmadd_ps(p5, t, p4);
+    auto tmp2 = _mm512_fmadd_ps(tmp1, t, p3);
+    auto tmp3 = _mm512_fmadd_ps(tmp2, t, p2);
+    auto r = _mm512_fmadd_ps(tmp3, t, p1);
+    // - exp(- x * x)
+    auto pow_2 = _mm512_mul_ps(values, values);
+    auto neg_pow_2 = _mm512_xor_ps(neg_zero_vec, pow_2);
+    // auto tmp4 = exp(neg_pow_2);
+    auto tmp4 = Vectorized<float>(Sleef_expf16_u10(neg_pow_2));
+    auto tmp5 = _mm512_xor_ps(neg_zero_vec, tmp4);
+    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
+    auto tmp6 = _mm512_mul_ps(tmp5, t);
+    auto tmp7 = _mm512_fmadd_ps(tmp6, r, one_vec);
+    return _mm512_xor_ps(sign_mask, tmp7);
+  }
+  // erfc of every lane, delegated to Sleef_erfcf16_u15.
+  Vectorized<float> erfc() const {
+    const auto result = Sleef_erfcf16_u15(values);
+    return Vectorized<float>(result);
+  }
+  // Applies the scalar calc_erfinv to every lane through map().
+  Vectorized<float> erfinv() const {
+    auto mapped = map(calc_erfinv);
+    return mapped;
+  }
+  // exp of every lane, delegated to Sleef_expf16_u10.
+  Vectorized<float> exp() const {
+    const auto result = Sleef_expf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // exp2 of every lane, delegated to Sleef_exp2f16_u10.
+  Vectorized<float> exp2() const {
+    const auto result = Sleef_exp2f16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // expm1 of every lane, delegated to Sleef_expm1f16_u10.
+  Vectorized<float> expm1() const {
+    const auto result = Sleef_expm1f16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Fast exp(x) that assembles the IEEE-754 bit pattern of the result with
+  // integer arithmetic. Lanes below vec_ln_flt_min produce 0 and lanes above
+  // vec_ln_flt_max produce the bit pattern 0x7F800000 (+inf).
+  Vectorized<float> fexp_u20() const {
+    // coefficients of the degree-3 correction polynomial (c0 + c1*f + ...)
+    const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
+    const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
+    const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
+    const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
+
+    const __m512 vec_exp_log2ef =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
+
+    // vec_a = 2^23 (std::log2(2) is 1); vec_b = 127 * 2^23
+    const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
+    const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
+
+    // input bounds outside of which the result is forced to 0 / +inf
+    const __m512 vec_ln_flt_min =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
+    const __m512 vec_ln_flt_max =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
+    __m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
+    __m512i vec_zero = _mm512_setzero_epi32();
+
+    // Reference: "Fast Exponential Computation on SIMD Architectures",
+    // A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro
+    // Curioni.
+    //   exp(x) = 2**(x * log2(e))
+    //          = 2**xi * 2**xf
+    // Trick: use the IEEE floating point representation, identifying xi with
+    // the exponent field and 2**xf with the mantissa. A degree-3 polynomial
+    // in the fractional part xf, evaluated with Horner's method, is the
+    // correction subtracted from x * log2(e) below.
+    // masks for the boundary conditions
+    auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
+    auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
+
+    // transformation with log2(e)
+    auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
+    auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
+
+    // compute polynomial using Horner Scheme, for superscalar processor
+    auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
+    vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
+    vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
+
+    vec_src = _mm512_sub_ps(vec_src, vec_res);
+    // The trick: scaling by 2^23 and adding 127 * 2^23 places the integer
+    // part in the exponent field and the rest in the mantissa field.
+    auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
+    // The truncating float -> int conversion loses precision, but the value
+    // "fits"; the original authors considered this acceptable when the result
+    // is later narrowed from f32 to f16.
+    __m512i casted_integer = _mm512_cvttps_epi32(tmp);
+    // boundary condition, lower than the min -> 0
+    casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
+    // boundary condition, larger than the max -> +inf
+    casted_integer =
+        _mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
+    // reinterpret the assembled integer bits as floats
+    return _mm512_castsi512_ps(casted_integer);
+  }
+  // A faster version of exp with ULP=20.
+  // Range-reduces x to n * ln(2) + r, evaluates a degree-5 polynomial for
+  // exp(r), and scales by 2^n built directly in the exponent field. Inputs
+  // are clamped to [vec_ln_flt_min, vec_ln_flt_max]; lanes that were below
+  // vec_ln_flt_min return 0.
+  Vectorized<float> exp_u20() const {
+    // Polynomial coefficients; each is close to (not exactly) 1/k!.
+    const __m512 vec_factorial_1 =
+        _mm512_set1_ps(0.999999701f); // 1/factorial(1)
+    const __m512 vec_factorial_2 =
+        _mm512_set1_ps(0.499991506f); // 1/factorial(2)
+    const __m512 vec_factorial_3 =
+        _mm512_set1_ps(0.166676521f); // 1/factorial(3)
+    const __m512 vec_factorial_4 =
+        _mm512_set1_ps(0.0418978221f); // 1/factorial(4)
+    const __m512 vec_factorial_5 =
+        _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
+    const __m512 vec_exp_log2ef =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
+    const __m512 vec_half = _mm512_set1_ps(0.5f);
+    const __m512 vec_one = _mm512_set1_ps(1.f);
+    const __m512 vec_zero = _mm512_set1_ps(0.f);
+    const __m512 vec_two = _mm512_set1_ps(2.f);
+    const __m512 vec_ln2f =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
+    const __m512 vec_ln_flt_min =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
+    const __m512 vec_ln_flt_max =
+        _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
+    const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
+    const int n_mantissa_bits = 23;
+
+    // exp(x) =
+    // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
+    // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression
+
+    // Remember which lanes underflow before clamping the input. The named
+    // predicate matches the one fexp_u20 uses for the same test.
+    auto less_ln_flt_min_mask =
+        _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
+    auto vec_src = _mm512_min_ps(values, vec_ln_flt_max);
+    vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min);
+
+    // fx = floorf(x * log2ef + 0.5)
+    auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);
+    auto vec_fx_i = _mm512_cvt_roundps_epi32(
+        vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    vec_fx = _mm512_cvtepi32_ps(vec_fx_i);
+
+    // x = x - fx * ln2
+    auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src);
+
+    // compute polynomial (Horner form, highest coefficient first)
+    auto vec_res =
+        _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one);
+
+    // compute 2^(n-1): add the bias 127 and shift into the exponent field.
+    // The factor of two dropped here is restored by the final multiply.
+    auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one);
+    auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
+    auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
+    vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
+    auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);
+    // lanes that underflowed get a scale of 0, so their result is 0
+    vec_two_pow_n =
+        _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);
+
+    // y = y * 2^n
+    vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n);
+    vec_res = _mm512_mul_ps(vec_res, vec_two);
+    return vec_res;
+  }
+  // Per-lane fmod via Sleef_fmodf16, with *this as the first argument and q
+  // as the second.
+  Vectorized<float> fmod(const Vectorized<float>& q) const {
+    const auto result = Sleef_fmodf16(values, q);
+    return Vectorized<float>(result);
+  }
+  // log of every lane, delegated to Sleef_logf16_u10.
+  Vectorized<float> log() const {
+    const auto result = Sleef_logf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // log2 of every lane, delegated to Sleef_log2f16_u10.
+  Vectorized<float> log2() const {
+    const auto result = Sleef_log2f16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // log10 of every lane, delegated to Sleef_log10f16_u10.
+  Vectorized<float> log10() const {
+    const auto result = Sleef_log10f16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // log1p of every lane, delegated to Sleef_log1pf16_u10.
+  Vectorized<float> log1p() const {
+    const auto result = Sleef_log1pf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  Vectorized<float> frac() const;
+  // sin of every lane, delegated to Sleef_sinf16_u35.
+  Vectorized<float> sin() const {
+    const auto result = Sleef_sinf16_u35(values);
+    return Vectorized<float>(result);
+  }
+  // sinh of every lane, delegated to Sleef_sinhf16_u10.
+  Vectorized<float> sinh() const {
+    const auto result = Sleef_sinhf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // cos of every lane, delegated to Sleef_cosf16_u35.
+  Vectorized<float> cos() const {
+    const auto result = Sleef_cosf16_u35(values);
+    return Vectorized<float>(result);
+  }
+  // cosh of every lane, delegated to Sleef_coshf16_u10.
+  Vectorized<float> cosh() const {
+    const auto result = Sleef_coshf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Rounds every lane with _mm512_ceil_ps.
+  Vectorized<float> ceil() const {
+    const auto rounded_up = _mm512_ceil_ps(values);
+    return rounded_up;
+  }
+  // Rounds every lane with _mm512_floor_ps.
+  Vectorized<float> floor() const {
+    const auto rounded_down = _mm512_floor_ps(values);
+    return rounded_down;
+  }
+  // Per-lane hypot of (*this, b), delegated to Sleef_hypotf16_u05.
+  Vectorized<float> hypot(const Vectorized<float>& b) const {
+    const auto result = Sleef_hypotf16_u05(values, b);
+    return Vectorized<float>(result);
+  }
+  // Applies the scalar calc_i0 to every lane through map().
+  Vectorized<float> i0() const {
+    auto mapped = map(calc_i0);
+    return mapped;
+  }
+  // Applies the scalar calc_i0e to every lane through map().
+  Vectorized<float> i0e() const {
+    auto mapped = map(calc_i0e);
+    return mapped;
+  }
+  // Applies the scalar calc_digamma to every lane through map().
+  Vectorized<float> digamma() const {
+    auto mapped = map(calc_digamma);
+    return mapped;
+  }
+  // Calls the scalar calc_igamma lane by lane, passing this vector's lane as
+  // the first argument and the matching lane of x as the second. Both vectors
+  // are spilled to aligned stack buffers and the result is reloaded.
+  Vectorized<float> igamma(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (const auto lane : c10::irange(size())) {
+      self_lanes[lane] = calc_igamma(self_lanes[lane], x_lanes[lane]);
+    }
+    return loadu(self_lanes);
+  }
+  // Calls the scalar calc_igammac lane by lane, passing this vector's lane as
+  // the first argument and the matching lane of x as the second. Both vectors
+  // are spilled to aligned stack buffers and the result is reloaded.
+  Vectorized<float> igammac(const Vectorized<float>& x) const {
+    __at_align__ float self_lanes[size()];
+    __at_align__ float x_lanes[size()];
+    store(self_lanes);
+    x.store(x_lanes);
+    for (const auto lane : c10::irange(size())) {
+      self_lanes[lane] = calc_igammac(self_lanes[lane], x_lanes[lane]);
+    }
+    return loadu(self_lanes);
+  }
+  // Negates every lane by xor-ing it with -0.0f, which flips the sign bit.
+  Vectorized<float> neg() const {
+    const __m512 sign_bit = _mm512_set1_ps(-0.f);
+    return _mm512_xor_ps(sign_bit, values);
+  }
+  // Per-lane nextafter via Sleef_nextafterf16, with *this as the first
+  // argument and b as the second.
+  Vectorized<float> nextafter(const Vectorized<float>& b) const {
+    const auto result = Sleef_nextafterf16(values, b);
+    return Vectorized<float>(result);
+  }
+  // Rounds every lane using the round-to-nearest mode of
+  // _mm512_roundscale_ps, with floating point exceptions suppressed.
+  Vectorized<float> round() const {
+    const auto rounded = _mm512_roundscale_ps(
+        values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return rounded;
+  }
+  // tan of every lane, delegated to Sleef_tanf16_u10.
+  Vectorized<float> tan() const {
+    const auto result = Sleef_tanf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // tanh of every lane, delegated to Sleef_tanhf16_u10.
+  Vectorized<float> tanh() const {
+    const auto result = Sleef_tanhf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Rounds every lane toward zero using _mm512_roundscale_ps, with floating
+  // point exceptions suppressed.
+  Vectorized<float> trunc() const {
+    const auto truncated = _mm512_roundscale_ps(
+        values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return truncated;
+  }
+  // lgamma of every lane, delegated to Sleef_lgammaf16_u10.
+  Vectorized<float> lgamma() const {
+    const auto result = Sleef_lgammaf16_u10(values);
+    return Vectorized<float>(result);
+  }
+  // Square root of every lane via _mm512_sqrt_ps.
+  Vectorized<float> sqrt() const {
+    const auto root = _mm512_sqrt_ps(values);
+    return root;
+  }
+  // 1 / x for every lane, computed with a full division.
+  Vectorized<float> reciprocal() const {
+    const __m512 ones = _mm512_set1_ps(1);
+    return _mm512_div_ps(ones, values);
+  }
+  // 1 / sqrt(x) for every lane, computed as a square root followed by a full
+  // division.
+  Vectorized<float> rsqrt() const {
+    const __m512 ones = _mm512_set1_ps(1);
+    const __m512 root = _mm512_sqrt_ps(values);
+    return _mm512_div_ps(ones, root);
+  }
+  // Per-lane pow via Sleef_powf16_u10, with *this as the base argument and b
+  // as the exponent argument.
+  Vectorized<float> pow(const Vectorized<float>& b) const {
+    const auto result = Sleef_powf16_u10(values, b);
+    return Vectorized<float>(result);
+  }
+  // Horizontal sum of all lanes via _mm512_reduce_add_ps.
+  float reduce_add() const {
+    const float total = _mm512_reduce_add_ps(values);
+    return total;
+  }
+  // Horizontal maximum over all lanes via _mm512_reduce_max_ps.
+  float reduce_max() const {
+    const float largest = _mm512_reduce_max_ps(values);
+    return largest;
+  }
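+  // Comparisons return a per-lane bit mask (0xFFFFFFFF where the predicate
+  // holds). Every operator except != uses an _CMP_**_OQ predicate:
+  //   `O`: ordered -- get false if an operand is NaN
+  //   `Q`: quiet -- do not raise if an operand is NaN
+  // operator!= uses _CMP_NEQ_UQ (unordered), so it is true if an operand is
+  // NaN.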
+  // Lanes where *this == other (ordered, quiet) become 0xFFFFFFFF; the other
+  // lanes take the value of zero_vec.
+  Vectorized<float> operator==(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  // Lanes where *this != other become 0xFFFFFFFF; the other lanes take the
+  // value of zero_vec. Uses the unordered predicate _CMP_NEQ_UQ.
+  Vectorized<float> operator!=(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  // Lanes where *this < other (ordered, quiet) become 0xFFFFFFFF; the other
+  // lanes take the value of zero_vec.
+  Vectorized<float> operator<(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_LT_OQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  // Lanes where *this <= other (ordered, quiet) become 0xFFFFFFFF; the other
+  // lanes take the value of zero_vec.
+  Vectorized<float> operator<=(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_LE_OQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  // Lanes where *this > other (ordered, quiet) become 0xFFFFFFFF; the other
+  // lanes take the value of zero_vec.
+  Vectorized<float> operator>(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_GT_OQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  // Lanes where *this >= other (ordered, quiet) become 0xFFFFFFFF; the other
+  // lanes take the value of zero_vec.
+  Vectorized<float> operator>=(const Vectorized<float>& other) const {
+    const auto lanes = _mm512_cmp_ps_mask(values, other.values, _CMP_GE_OQ);
+    const auto bits = _mm512_mask_set1_epi32(zero_vec, lanes, 0xFFFFFFFF);
+    return _mm512_castsi512_ps(bits);
+  }
+
+  Vectorized<float> eq(const Vectorized<float>& other) const;
+  Vectorized<float> ne(const Vectorized<float>& other) const;
+  Vectorized<float> gt(const Vectorized<float>& other) const;
+  Vectorized<float> ge(const Vectorized<float>& other) const;
+  Vectorized<float> lt(const Vectorized<float>& other) const;
+  Vectorized<float> le(const Vectorized<float>& other) const;
+};
+
+// Lane-wise a + b.
+template <>
+Vectorized<float> inline operator+(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto sum = _mm512_add_ps(a, b);
+  return sum;
+}
+
+// Lane-wise a - b.
+template <>
+Vectorized<float> inline operator-(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto difference = _mm512_sub_ps(a, b);
+  return difference;
+}
+
+// Lane-wise a * b.
+template <>
+Vectorized<float> inline operator*(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto product = _mm512_mul_ps(a, b);
+  return product;
+}
+
+// Lane-wise a / b.
+template <>
+Vectorized<float> inline operator/(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto quotient = _mm512_div_ps(a, b);
+  return quotient;
+}
+
+// frac: each lane minus its truncation toward zero. Defined out of line, after
+// operator-, because the body needs the subtraction operator.
+inline Vectorized<float> Vectorized<float>::frac() const {
+  const Vectorized<float> whole = this->trunc();
+  return *this - whole;
+}
+
+// IEEE 754 201X `maximum`: the larger of each lane pair, except that a lane
+// becomes NaN whenever either input lane is NaN.
+template <>
+Vectorized<float> inline maximum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Lanes where a and b compare unordered, i.e. at least one is NaN.
+  const auto unordered = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+  const auto nan_bits = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), unordered, 0xFFFFFFFF));
+  const auto larger = _mm512_max_ps(a, b);
+  // An all-ones bit pattern is a NaN, so or-ing it in forces NaN lanes.
+  return _mm512_or_ps(larger, nan_bits);
+}
+
+// IEEE 754 201X `minimum`: the smaller of each lane pair, except that a lane
+// becomes NaN whenever either input lane is NaN.
+template <>
+Vectorized<float> inline minimum(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // Lanes where a and b compare unordered, i.e. at least one is NaN.
+  const auto unordered = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+  const auto nan_bits = _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), unordered, 0xFFFFFFFF));
+  const auto smaller = _mm512_min_ps(a, b);
+  // An all-ones bit pattern is a NaN, so or-ing it in forces NaN lanes.
+  return _mm512_or_ps(smaller, nan_bits);
+}
+
+// Clamps each lane of a: first raised to at least `min`, then lowered to at
+// most `max`.
+template <>
+Vectorized<float> inline clamp(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min,
+    const Vectorized<float>& max) {
+  const auto raised = _mm512_max_ps(min, a);
+  return _mm512_min_ps(max, raised);
+}
+
+// Limits each lane of a from above using _mm512_min_ps(max, a).
+template <>
+Vectorized<float> inline clamp_max(
+    const Vectorized<float>& a,
+    const Vectorized<float>& max) {
+  const auto capped = _mm512_min_ps(max, a);
+  return capped;
+}
+
+// Limits each lane of a from below using _mm512_max_ps(min, a).
+template <>
+Vectorized<float> inline clamp_min(
+    const Vectorized<float>& a,
+    const Vectorized<float>& min) {
+  const auto raised = _mm512_max_ps(min, a);
+  return raised;
+}
+
+// Bitwise AND of the raw lane bits of a and b.
+template <>
+Vectorized<float> inline operator&(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto bits = _mm512_and_ps(a, b);
+  return bits;
+}
+
+// Bitwise OR of the raw lane bits of a and b.
+template <>
+Vectorized<float> inline operator|(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto bits = _mm512_or_ps(a, b);
+  return bits;
+}
+
+// Bitwise XOR of the raw lane bits of a and b.
+template <>
+Vectorized<float> inline operator^(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  const auto bits = _mm512_xor_ps(a, b);
+  return bits;
+}
+
+// Numeric form of operator==: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::eq(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this == other) & ones;
+}
+
+// Numeric form of operator!=: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::ne(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this != other) & ones;
+}
+
+// Numeric form of operator>: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::gt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this > other) & ones;
+}
+
+// Numeric form of operator>=: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::ge(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this >= other) & ones;
+}
+
+// Numeric form of operator<: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::lt(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this < other) & ones;
+}
+
+// Numeric form of operator<=: the comparison mask is and-ed with 1.0f, so
+// lanes where the comparison holds become 1.0f.
+inline Vectorized<float> Vectorized<float>::le(
+    const Vectorized<float>& other) const {
+  const Vectorized<float> ones(1.0f);
+  return (*this <= other) & ones;
+}
+
+// float -> float "conversion": copies n floats from src to dst. Whole vectors
+// are moved with unaligned 512-bit loads/stores; the tail is copied one
+// element at a time.
+template <>
+inline void convert(const float* src, float* dst, int64_t n) {
+  const int64_t step = Vectorized<float>::size();
+  const int64_t last_full_start = n - step;
+  int64_t pos = 0;
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos <= last_full_start; pos += step) {
+    _mm512_storeu_ps(dst + pos, _mm512_loadu_ps(src + pos));
+  }
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+  for (; pos < n; ++pos) {
+    dst[pos] = src[pos];
+  }
+}
+
+// Fused multiply-add per lane: a * b + c.
+template <>
+Vectorized<float> inline fmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const auto fused = _mm512_fmadd_ps(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-add per lane: -(a * b) + c.
+template <>
+Vectorized<float> inline fnmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const auto fused = _mm512_fnmadd_ps(a, b, c);
+  return fused;
+}
+
+// Fused multiply-subtract per lane: a * b - c.
+template <>
+Vectorized<float> inline fmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const auto fused = _mm512_fmsub_ps(a, b, c);
+  return fused;
+}
+
+// Fused negated multiply-subtract per lane: -(a * b) - c.
+template <>
+Vectorized<float> inline fnmsub(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  const auto fused = _mm512_fnmsub_ps(a, b, c);
+  return fused;
+}
+
+// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
+// Used by Inductor CPP codegen for micro gemm
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
+// kernel for transposing mxn where m, n <= 16
+// (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + N instructions
+// Transposes, in place, the M x N block held in the 16 vector registers of
+// `input` (one row per register). The work is done in four stages of
+// progressively wider interleaves: 32-bit unpack, 64-bit unpack, and two
+// rounds of 128-bit lane shuffles.
+//   input: rows of the block on entry; rows of the transpose on exit
+//          (only the first N registers are written by the last stage)
+//   M, N:  block dimensions; both must be <= 16 (enforced by TORCH_CHECK)
+inline void transpose_block(
+    at::vec::VectorizedN<float, 16>& input,
+    int M = 16,
+    int N = 16) {
+  TORCH_CHECK(M <= 16 && N <= 16, "transpose_block expects M, N <= 16.");
+  // unpacking and interleaving 32-bit elements
+  __m512 temp[16];
+  int i;
+  // Rows are consumed in pairs, so for odd M this also reads input[M];
+  // transpose_mxn_16x16 zero-fills the registers past M before calling.
+  for (i = 0; i < (M + 1) / 2; ++i) {
+    temp[2 * i] = _mm512_unpacklo_ps(input[2 * i], input[2 * i + 1]);
+    temp[2 * i + 1] = _mm512_unpackhi_ps(input[2 * i], input[2 * i + 1]);
+  }
+  // zero the temporaries that the loop above did not produce
+  for (i = i * 2; i < 16; ++i) {
+    temp[i] = _mm512_setzero_ps();
+  }
+
+  // unpacking and interleaving 64-bit elements
+  for (i = 0; i < (M + 3) / 4; ++i) {
+    input[4 * i] = _mm512_castpd_ps(_mm512_unpacklo_pd(
+        _mm512_castps_pd(temp[4 * i]), _mm512_castps_pd(temp[4 * i + 2])));
+    input[4 * i + 1] = _mm512_castpd_ps(_mm512_unpackhi_pd(
+        _mm512_castps_pd(temp[4 * i]), _mm512_castps_pd(temp[4 * i + 2])));
+    input[4 * i + 2] = _mm512_castpd_ps(_mm512_unpacklo_pd(
+        _mm512_castps_pd(temp[4 * i + 1]), _mm512_castps_pd(temp[4 * i + 3])));
+    input[4 * i + 3] = _mm512_castpd_ps(_mm512_unpackhi_pd(
+        _mm512_castps_pd(temp[4 * i + 1]), _mm512_castps_pd(temp[4 * i + 3])));
+  }
+
+  //  shuffle 128-bits (composed of 4 32-bit elements)
+  //  0x88 selects 128-bit lanes 0 and 2 of each source, 0xdd lanes 1 and 3
+  for (i = 0; i < (M + 7) / 8; ++i) {
+    temp[8 * i] = _mm512_shuffle_f32x4(input[8 * i], input[8 * i + 4], 0x88);
+    temp[8 * i + 1] =
+        _mm512_shuffle_f32x4(input[8 * i + 1], input[8 * i + 5], 0x88);
+    temp[8 * i + 2] =
+        _mm512_shuffle_f32x4(input[8 * i + 2], input[8 * i + 6], 0x88);
+    temp[8 * i + 3] =
+        _mm512_shuffle_f32x4(input[8 * i + 3], input[8 * i + 7], 0x88);
+    temp[8 * i + 4] =
+        _mm512_shuffle_f32x4(input[8 * i], input[8 * i + 4], 0xdd);
+    temp[8 * i + 5] =
+        _mm512_shuffle_f32x4(input[8 * i + 1], input[8 * i + 5], 0xdd);
+    temp[8 * i + 6] =
+        _mm512_shuffle_f32x4(input[8 * i + 2], input[8 * i + 6], 0xdd);
+    temp[8 * i + 7] =
+        _mm512_shuffle_f32x4(input[8 * i + 3], input[8 * i + 7], 0xdd);
+  }
+
+  // final 128-bit shuffle; only the N output rows that are needed are built
+  for (i = 0; i < N; ++i) {
+    if (i < 8) {
+      input[i] = _mm512_shuffle_f32x4(temp[i], temp[8 + i], 0x88);
+    } else {
+      input[i] = _mm512_shuffle_f32x4(temp[i - 8], temp[i], 0xdd);
+    }
+  }
+}
+
+// TODO(jgong5): rewrite with ATEN vectorized (need to add unpack and shuffle)
+// Used by Inductor CPP codegen
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
+// kernel for transposing mxn where m, n <= 16
+// M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions
+// Transposes one M x N tile (M, N <= 16) from src into dst.
+//   src / ld_src: source tile and the element stride between its rows
+//   dst / ld_dst: destination tile and the element stride between its rows
+// Rows are loaded into 16 registers (masked when N < 16), transposed with
+// transpose_block, then N rows of M elements are stored (masked when M < 16).
+inline void transpose_mxn_16x16(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst,
+    int M,
+    int N) {
+  TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn<float> expects M, N <= 16.");
+
+  // Load the M source rows.
+  at::vec::VectorizedN<float, 16> rows;
+  int r = 0;
+  if (N != 16) {
+    // Partial width: only the low N lanes are read, the rest become zero.
+    const __mmask16 col_mask = (1 << N) - 1;
+    for (; r < M; ++r) {
+      rows[r] = _mm512_maskz_loadu_ps(col_mask, &src[r * ld_src]);
+    }
+  } else {
+    for (; r < M; ++r) {
+      rows[r] = _mm512_loadu_ps(&src[r * ld_src]);
+    }
+  }
+  // Zero the unused registers. Not strictly required, but it avoids an
+  // uninitialized-variable warning, and the xor is cheap because it can run
+  // in parallel with other instructions.
+  while (r < 16) {
+    rows[r] = _mm512_setzero_ps();
+    ++r;
+  }
+
+  transpose_block(rows, M, N);
+
+  // Store the N transposed rows.
+  if (M != 16) {
+    // Partial height: only the low M lanes of each row are written.
+    const __mmask16 row_mask = (1 << M) - 1;
+    for (int c = 0; c < N; ++c) {
+      _mm512_mask_storeu_ps(&dst[c * ld_dst], row_mask, rows[c]);
+    }
+  } else {
+    for (int c = 0; c < N; ++c) {
+      _mm512_storeu_ps(&dst[c * ld_dst], rows[c]);
+    }
+  }
+}
+
+// Transposes an M x N float matrix from src (row stride ld_src) into dst (row
+// stride ld_dst) by walking it in tiles of at most 16 x 16 and handing each
+// tile to transpose_mxn_16x16. Full 16-row bands are processed first, then
+// the remaining M % 16 rows; within each band full 16-column tiles come
+// first, then the remaining N % 16 columns.
+template <>
+inline void transpose_mxn<float>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst,
+    int M,
+    int N) {
+  int64_t i = 0;
+  for (; i < M / 16 * 16; i += 16) {
+    int64_t j = 0;
+    for (; j < N / 16 * 16; j += 16) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16);
+    }
+    // handle remainder j
+    int nrem = N - j;
+    if (nrem > 0) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem);
+    }
+  }
+  // handle remainder i
+  int mrem = M - i;
+  if (mrem > 0) {
+    int64_t j = 0;
+    for (; j < N / 16 * 16; j += 16) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16);
+    }
+    // handle remainder j; skipped when N is a multiple of 16, matching the
+    // full-row band above, so no zero-width tile is loaded and transposed
+    int nrem = N - j;
+    if (nrem > 0) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem);
+    }
+  }
+}
+
+// Compile-time-sized overload (enabled only for T = float): forwards the
+// template dimensions M and N to the runtime-sized transpose_mxn<float>.
+template <
+    typename T,
+    int M,
+    int N,
+    typename std::enable_if_t<std::is_same_v<T, float>, int> = 0>
+inline void transpose_mxn(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  constexpr int rows = M;
+  constexpr int cols = N;
+  transpose_mxn<float>(src, ld_src, dst, ld_dst, rows, cols);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0aa8e3a05cd29529145415da9ba08f356e24d7e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float8.h
@@ -0,0 +1,666 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#if (defined(CPU_CAPABILITY_AVX512))
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+// Converts 16 fp8 e4m3 values (the 16 bytes of `a`) to 16 fp32 lanes in `o`.
+// Each byte is split into sign (bit 7), exponent (bits 6..3) and mantissa
+// (bits 2..0). Denormals, NaN (exp == 0xF && mant == 0x7) and zero are
+// handled as special cases; everything else goes through the shift-and-rebias
+// path in Step 2.
+static inline void cvtfp8e4m3_fp32(const __m128i& a, __m512& o) {
+  // Zero Extend
+  __m512i x = _mm512_cvtepu8_epi32(a);
+  // fp8 bits moved to the top byte of the lane, with the sign bit cleared
+  __m512i val = _mm512_and_epi32(
+      _mm512_slli_epi32(x, 24), _mm512_set1_epi32(0x7FFFFFFF)); // nonsign_val
+  __m512i mant =
+      _mm512_and_si512(x, _mm512_set1_epi32(0x07)); // mantissa = x & 0x07
+  __m512i exp = _mm512_and_si512(
+      _mm512_srli_epi32(x, 3),
+      _mm512_set1_epi32(0x0F)); // exp = (x >> 3) & 0x0F
+  __m512i sign =
+      _mm512_and_si512(x, _mm512_set1_epi32(0x80)); // sign = x & 0x80
+  __m512i _zeros = _mm512_setzero_si512();
+
+  // --- Step 1: Calculate the renorm_shift (0 for every non-denormal lane)
+  __m512i renorm_shift = _zeros;
+  // Denorm case (exp == 0 && mant != 0) ---
+  __mmask16 denormal_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) &
+      _mm512_cmpneq_epi32_mask(mant, _zeros);
+  if (denormal_mask) {
+    // An alternative is what the scalar code in
+    // pytorch/c10/util/Float8_e4m3fn.h does: count leading zeros. The
+    // unsigned denormal value has a zero sign and exponent (5 leading
+    // zeros), so only the leading zeros of the 3-bit mantissa need counting,
+    // e.g. with a lookup table:
+    //   const uint8_t lz_table[8] = {3, 2, 1, 1, 0, 0, 0, 0};
+    //   num_leading_zero = lz_table[mant] + 5;
+
+    __m512i _ones = _mm512_set1_epi32(1);
+    __m512i _twos = _mm512_set1_epi32(2);
+    __m512i _threes = _mm512_set1_epi32(3);
+
+    // Default shift for a denormal (mant 1xx, i.e. 4..7) is 1 = 5 - 4
+    __m512i denorm_renorm_shift = _ones;
+    // For mant 001 the shift is 3 = 7 - 4
+    __mmask16 leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _ones);
+    denorm_renorm_shift =
+        _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _threes);
+    // For mant 010 and 011 the shift is 2 = 6 - 4
+    leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _twos);
+    denorm_renorm_shift =
+        _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos);
+    leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _threes);
+    denorm_renorm_shift =
+        _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos);
+
+    // only denormal lanes receive a non-zero shift
+    renorm_shift =
+        _mm512_mask_mov_epi32(renorm_shift, denormal_mask, denorm_renorm_shift);
+  }
+
+  // --- Step 2: calculate norm and denorm ---
+  // shift left by the per-lane renorm_shift, then right by 4 into position
+  __m512i norm_shifted =
+      _mm512_srli_epi32(_mm512_sllv_epi32(val, renorm_shift), 4);
+  // exponent bias adjustment: (0x78 - renorm_shift) << 23
+  __m512i exp_bias = _mm512_slli_epi32(
+      _mm512_sub_epi32(_mm512_set1_epi32(0x78), renorm_shift), 23);
+  val = _mm512_add_epi32(norm_shifted, exp_bias);
+
+  // --- Step 3: Nan case (exp == 0xF && mant == 0x07) ---
+  __mmask16 nan_mask = _mm512_cmpeq_epi32_mask(exp, _mm512_set1_epi32(0xF)) &
+      _mm512_cmpeq_epi32_mask(mant, _mm512_set1_epi32(0x07));
+  if (nan_mask) {
+    // NaN lanes are overwritten with the bit pattern 0x7FC00000
+    const __m512i nan_values = _mm512_set1_epi32(0x7FC00000);
+    val = _mm512_mask_mov_epi32(val, nan_mask, nan_values);
+  }
+
+  // --- Step 4: Zero case (exp == 0x00 && mant == 0x00) ---
+  __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) &
+      _mm512_cmpeq_epi32_mask(mant, _zeros);
+  if (zero_mask) {
+    // undo the bias that Step 2 added to lanes that are exactly zero
+    val = _mm512_mask_mov_epi32(val, zero_mask, _zeros);
+  }
+
+  // --- Step 5: OR with sign (sign bit << 24 to get to bit 31) ---
+  val = _mm512_or_si512(val, _mm512_slli_epi32(sign, 24));
+
+  o = _mm512_castsi512_ps(val);
+}
+
+// Converts the 16 fp32 lanes of `src` to 16 fp8 e4m3 bytes, returned packed
+// in a __m128i. Works on the magnitude bits and classifies each lane as
+// overflow, small (below denorm_thresh) or normal; the sign is re-attached
+// at the end.
+static inline __m128i cvtfp32_fp8e4m3(const __m512& src) {
+  // cvt 16x32 from fp32 to fp8 e4m3
+  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
+  // magnitudes at or above this bit pattern are treated as overflow
+  const __m512i fp8_max = _mm512_set1_epi32(UINT32_C(1087) << 20);
+  // magnitudes below this bit pattern take the small-number path
+  const __m512i denorm_thresh = _mm512_set1_epi32(UINT32_C(121) << 23);
+  // float constant added to (then subtracted from) small inputs
+  const __m512i denorm_mask = _mm512_set1_epi32(UINT32_C(141) << 23);
+  // exponent re-bias (7 - 127), pre-shifted into the fp32 exponent field
+  const __m512i bias_part1 = _mm512_set1_epi32((uint32_t)(7 - 127) << 23);
+  const __m512i rounding_bias = _mm512_set1_epi32(0x7FFFF);
+  __m512i f_bits = _mm512_castps_si512(src);
+  // Extract and save sign
+  __m512i sign = _mm512_and_epi32(f_bits, sign_mask);
+  // clear the sign so f_bits holds the magnitude only
+  f_bits = _mm512_xor_epi32(f_bits, sign);
+
+  // Prepare result containers
+  __m512i result = _mm512_setzero_si512();
+
+  // Step 1: Handle case of overflow
+  // (f_bits >= fp8_max): set result = 0x7f
+  // 0x7f has exp == 0xF and mant == 0x7, the pattern cvtfp8e4m3_fp32 decodes
+  // as NaN
+  __mmask16 overflow_mask = _mm512_cmpge_epu32_mask(f_bits, fp8_max);
+  if (overflow_mask) {
+    result = _mm512_mask_set1_epi32(result, overflow_mask, 0x7f);
+  }
+
+  // Step 2: Handle small numbers (denormals)
+  // Small numbers (f_bits < denorm_thresh)
+  __mmask16 denorm_thresh_mask = _mm512_cmplt_epu32_mask(f_bits, denorm_thresh);
+
+  if (denorm_thresh_mask) {
+    // Add denorm_mask as a float, then subtract its bits as an integer; what
+    // remains is used directly as the fp8 encoding of the small value.
+    __m512 small_input = _mm512_castsi512_ps(f_bits);
+    __m512 small_denorm =
+        _mm512_add_ps(small_input, _mm512_castsi512_ps(denorm_mask));
+    __m512i small_denorm_bits = _mm512_castps_si512(small_denorm);
+    __m512i small_result = _mm512_sub_epi32(small_denorm_bits, denorm_mask);
+    result = _mm512_mask_mov_epi32(result, denorm_thresh_mask, small_result);
+  }
+
+  // Step 3: Handle normal numbers (every lane not claimed by Step 1 or 2)
+  __mmask16 normal_mask = ~(overflow_mask | denorm_thresh_mask);
+
+  if (normal_mask) {
+    // mant_odd = (f_bits >> 20) & 1
+    // (lowest mantissa bit that is kept; adding it on top of the 0x7FFFF bias
+    // breaks rounding ties toward an even result)
+    __m512i mant_odd =
+        _mm512_and_epi32(_mm512_srli_epi32(f_bits, 20), _mm512_set1_epi32(1));
+    // f_bits += bias_part1 + rounding_bias
+    __m512i rounded = _mm512_add_epi32(f_bits, bias_part1);
+    rounded = _mm512_add_epi32(rounded, rounding_bias);
+    // Add mant_odd
+    rounded = _mm512_add_epi32(rounded, mant_odd);
+    // Shift right by 20 bits
+    __m512i normal_result = _mm512_srli_epi32(rounded, 20);
+    result = _mm512_mask_mov_epi32(result, normal_mask, normal_result);
+  }
+
+  // Merge back the sign (bit 31 moved down to bit 7)
+  __m512i sign_shifted = _mm512_srli_epi32(sign, 24);
+  result = _mm512_or_epi32(result, sign_shifted);
+
+  // Now result is 16 x 32-bit integers, but we only need 8-bit for each
+  __m512i packed = _mm512_and_si512(result, _mm512_set1_epi32(0xFF));
+
+  // Narrow 32-bit integers to 8-bit
+  return _mm512_cvtepi32_epi8(packed);
+}
+
+// Scalar fp8 e4m3 -> fp32: broadcasts the byte, runs the vector converter on
+// the low 128 bits and returns lane 0 of the result.
+static inline float fp8e4m3_to_fp32_scalar(uint8_t val) {
+  const __m128i packed = _mm512_castsi512_si128(_mm512_set1_epi8(val));
+  __m512 widened;
+  cvtfp8e4m3_fp32(packed, widened);
+  return _mm512_cvtss_f32(widened);
+}
+
+// Scalar fp32 -> fp8 e4m3: broadcasts the float, runs the vector converter
+// and returns the lowest byte of the packed result.
+static inline uint8_t fp32_to_fp8e4m3_scalar(float val) {
+  const __m128i packed = cvtfp32_fp8e4m3(_mm512_set1_ps(val));
+  const int low_word = _mm_cvtsi128_si32(packed);
+  return static_cast<std::uint8_t>(low_word);
+}
+
+// Converts 16 fp8 e5m2 bytes in `a` to 16 fp32 lanes in `o`. Each byte is
+// zero-extended to 16 bits and shifted into the high byte, and the resulting
+// 16-bit values are handed to cvtfp16_fp32.
+static inline void cvtfp8e5m2_fp32(const __m128i& a, __m512& o) {
+  const __m512i widened = _mm512_cvtepu8_epi16(_mm256_castsi128_si256(a));
+  const __m512i high_byte = _mm512_slli_epi16(widened, 8);
+  __m256i half_bits = _mm512_castsi512_si256(high_byte);
+  cvtfp16_fp32(half_bits, o);
+}
+
+// Converts the 16 fp32 lanes of `src` to 16 fp8 e5m2 bytes, returned packed
+// in a __m128i. The normal-number result is computed for every lane first and
+// then overwritten for lanes classified as inf/NaN or as below the normal
+// range; the sign is re-attached last.
+static inline __m128i cvtfp32_fp8e5m2(const __m512& src) {
+  constexpr uint32_t fp32_inf = UINT32_C(255) << 23;
+  // magnitudes at or above this bit pattern are encoded as 0x7C
+  constexpr uint32_t fp8_max = UINT32_C(143) << 23;
+  // float constant added to (then subtracted from) small inputs
+  constexpr uint32_t denorm_mask = UINT32_C(134) << 23;
+
+  // Cvt to bits
+  __m512i input_bits = _mm512_castps_si512(src);
+  __m512i result = _mm512_setzero_si512();
+
+  // Get the sign
+  __m512i sign = _mm512_and_si512(input_bits, _mm512_set1_epi32(0x80000000));
+
+  // Get the unsigned input
+  input_bits = _mm512_xor_si512(input_bits, sign);
+
+  // Calculate the mask for inf, nan and denorm
+  // (signed compares are used; the sign bit has just been cleared)
+  __mmask16 greater_than_fp8_max =
+      _mm512_cmpge_epi32_mask(input_bits, _mm512_set1_epi32(fp8_max));
+  // bit patterns strictly above the fp32 infinity pattern, i.e. NaN inputs
+  __mmask16 greater_than_fp32_inf =
+      _mm512_cmpgt_epi32_mask(input_bits, _mm512_set1_epi32(fp32_inf));
+  __mmask16 less_than_normal = _mm512_cmpgt_epi32_mask(
+      _mm512_set1_epi32((UINT32_C(113) << 23)), input_bits);
+  __m512i temp_bits_for_denorm = _mm512_setzero_si512();
+  if (less_than_normal) {
+    // Add denorm_mask as a float, then subtract its bits as an integer; what
+    // remains is used directly as the fp8 encoding of the small value.
+    __m512i denorm_mask_512i = _mm512_set1_epi32(denorm_mask);
+    temp_bits_for_denorm = _mm512_castps_si512(_mm512_add_ps(
+        _mm512_castsi512_ps(input_bits),
+        _mm512_castsi512_ps(denorm_mask_512i)));
+    temp_bits_for_denorm =
+        _mm512_sub_epi32(temp_bits_for_denorm, denorm_mask_512i);
+  }
+
+  // Step 1: Norm Val
+  // re-bias the exponent by (15 - 127), add the rounding bias 0xFFFFF plus
+  // the lowest kept mantissa bit, then drop the 21 low mantissa bits
+  __m512i mant_odd_mask =
+      _mm512_and_epi32(_mm512_srli_epi32(input_bits, 21), _mm512_set1_epi32(1));
+  input_bits = _mm512_add_epi32(
+      input_bits, _mm512_set1_epi32(((uint32_t)(15 - 127) << 23) + 0xFFFFF));
+  input_bits = _mm512_add_epi32(input_bits, mant_odd_mask);
+  result = _mm512_srli_epi32(input_bits, 21);
+
+  // Step 2: INF and NAN
+  // NOTE(review): _mm512_set1_epi8 fills all four bytes of each 32-bit lane
+  // (0x7C7C7C7C / 0x7F7F7F7F). This relies on the final _mm512_cvtepi32_epi8
+  // keeping only the low byte of each lane.
+  if (greater_than_fp8_max) {
+    result = _mm512_mask_mov_epi32(
+        result, greater_than_fp8_max, _mm512_set1_epi8(0x7C));
+    if (greater_than_fp32_inf) {
+      result = _mm512_mask_mov_epi32(
+          result, greater_than_fp32_inf, _mm512_set1_epi8(0x7F));
+    }
+  }
+
+  // Step 3: Denorm val
+  if (less_than_normal) {
+    result =
+        _mm512_mask_mov_epi32(result, less_than_normal, temp_bits_for_denorm);
+  }
+
+  // Step 4: restore sign (bit 31 moved down to bit 7)
+  result = _mm512_or_si512(result, _mm512_srli_epi32(sign, 24));
+
+  // narrow each 32-bit lane to its low byte
+  return _mm512_cvtepi32_epi8(result);
+}
+
+static inline float fp8e5m2_to_fp32_scalar(uint8_t val) {
+  // Broadcast the byte, convert the low 128 bits, return fp32 lane 0.
+  __m512 widened;
+  __m128i packed = _mm512_castsi512_si128(_mm512_set1_epi8(val));
+  cvtfp8e5m2_fp32(packed, widened);
+  return _mm512_cvtss_f32(widened);
+}
+
+static inline uint8_t fp32_to_fp8e5m2_scalar(float val) {
+  // Convert a broadcast of `val`; the low byte of the result is returned.
+  __m128i packed = cvtfp32_fp8e5m2(_mm512_set1_ps(val));
+  return static_cast<std::uint8_t>(_mm_cvtsi128_si32(packed));
+}
+
+template <typename T>
+class Vectorizedf8 { // 64 fp8 lanes stored as raw bytes in one __m512i
+  // T must be one of the two fp8 formats the conversion branches below handle.
+  static_assert(
+      std::is_same_v<T, at::Float8_e4m3fn> ||
+          std::is_same_v<T, at::Float8_e5m2>,
+      "Vectorizedf8 supports only Float8_e4m3fn and Float8_e5m2.");
+
+ private:
+  __m512i values; // the 64 packed fp8 bytes
+  template <typename Op, typename VectorizedType> // shared body of the comparison operators
+  Vectorized<T> inline binary_compare(const VectorizedType& b, Op op) const {
+    __m512 a0, a1, a2, a3; // *this as fp32, one register per 128-bit quarter
+    __m512 b0, b1, b2, b3; // b, likewise
+    __m512 o0, o1, o2, o3; // results of op, still fp32
+    if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 0), a0);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 1), a1);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 2), a2);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 3), a3);
+      cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3);
+    } else {
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 0), a0);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 1), a1);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 2), a2);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 3), a3);
+      cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3);
+    }
+    // Run op on each pair of fp32 quarters, in quarter order 0..3.
+    o0 = op(a0, b0);
+    o1 = op(a1, b1);
+    o2 = op(a2, b2);
+    o3 = op(a3, b3);
+    __m128i o128_0, o128_1, o128_2, o128_3; // op results converted back to fp8
+    if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+      o128_0 = cvtfp32_fp8e4m3(o0);
+      o128_1 = cvtfp32_fp8e4m3(o1);
+      o128_2 = cvtfp32_fp8e4m3(o2);
+      o128_3 = cvtfp32_fp8e4m3(o3);
+    } else {
+      o128_0 = cvtfp32_fp8e5m2(o0);
+      o128_1 = cvtfp32_fp8e5m2(o1);
+      o128_2 = cvtfp32_fp8e5m2(o2);
+      o128_3 = cvtfp32_fp8e5m2(o3);
+    }
+    // Pack the four 128-bit fp8 quarters into one 512-bit result.
+    __m512i result = _mm512_setzero_si512();
+    result = _mm512_inserti32x4(result, o128_0, 0);
+    result = _mm512_inserti32x4(result, o128_1, 1);
+    result = _mm512_inserti32x4(result, o128_2, 2);
+    result = _mm512_inserti32x4(result, o128_3, 3);
+
+    return result;
+  }
+
+ public:
+  using value_type = uint8_t; // storage type of one lane
+  using size_type = int;
+  static constexpr size_type size() {
+    return 64; // lanes: 512 bits / 8 bits each
+  }
+  Vectorizedf8() {} // leaves `values` uninitialized
+  Vectorizedf8(__m512i v) : values(v) {}
+  Vectorizedf8(T val) { // broadcast val's raw byte (val.x) to all 64 lanes
+    value_type uw = val.x;
+    values = _mm512_set1_epi8(uw);
+  }
+  operator __m512i() const {
+    return values;
+  }
+  T& operator[](int idx) = delete; // per-lane indexing is deliberately unavailable
+  const T& operator[](int idx) const = delete;
+  static Vectorized<T> loadu(const void* ptr, int16_t count = size()) {
+    if (count == size()) { // full, unaligned 64-lane load
+      return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    } else if (count == 16) {
+      // Fast path: exactly 16 elements via one 128-bit load; the cast leaves the upper 384 bits undefined.
+      __m128i input_128 =
+          _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+      return _mm512_castsi128_si512(input_128);
+    } else {
+      __mmask64 mask = (1ULL << count) - 1; // low `count` bits set; assumes 0 <= count < 64
+      return _mm512_maskz_loadu_epi8(mask, ptr); // unselected lanes are zeroed
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) { // full, unaligned 64-lane store
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) { // count <= 0 writes nothing
+      if (count == 16) {
+        // Fast path: exactly 16 elements via one 128-bit store.
+        _mm_storeu_si128(
+            reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values));
+      } else {
+        __mmask64 mask = (1ULL << count) - 1; // low `count` bits set; assumes count < 64
+        _mm512_mask_storeu_epi8(ptr, mask, values);
+      }
+    }
+  }
+  // Clears the sign bit (0x80) of every lane.
+  Vectorized<T> abs() const {
+    return _mm512_andnot_si512(_mm512_set1_epi8(0x80), values);
+  }
+  // ==, ordered (_CMP_EQ_OQ): fp32 lanes are all-ones where true, else zero.
+  Vectorized<T> inline operator==(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  // !=, unordered (_CMP_NEQ_UQ): also true when either operand is NaN.
+  Vectorized<T> inline operator!=(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  // >, ordered (_CMP_GT_OQ).
+  Vectorized<T> inline operator>(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  // >=, ordered (_CMP_GE_OQ).
+  Vectorized<T> inline operator>=(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  // <, ordered (_CMP_LT_OQ).
+  Vectorized<T> inline operator<(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  // <=, ordered (_CMP_LE_OQ).
+  Vectorized<T> inline operator<=(const Vectorizedf8<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+};
+
+template <>
+class Vectorized<Float8_e4m3fn> : public Vectorizedf8<Float8_e4m3fn> {
+ public:
+  using Vectorizedf8::Vectorizedf8; // inherit the base-class constructors
+  // Hides the base's uint8_t value_type with the element type itself.
+  using value_type = Float8_e4m3fn;
+  // Out-of-line definitions AND a comparison-operator result with Vectorized<Float8_e4m3fn>(1.0f).
+  Vectorized<Float8_e4m3fn> eq(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ne(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> gt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ge(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> lt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> le(const Vectorized<Float8_e4m3fn>& other) const;
+};
+
+template <
+    typename T,
+    typename Op,
+    std::enable_if_t<
+        std::is_same_v<T, c10::Float8_e4m3fn> ||
+            std::is_same_v<T, c10::Float8_e5m2>,
+        int> = 0>
+static inline Vectorized<T> binary_fp8_op_as_fp32(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  // Runs the fp32 functor `op` over two fp8 vectors and returns fp8 again.
+  //
+  // Each input is handled as four 128-bit quarters. A quarter of `a` and the
+  // matching quarter of `b` are converted to fp32 registers, `op` is called
+  // on the pair, and its result is converted back to fp8. `op` is invoked
+  // four times, in quarter order 0..3. T picks the conversion routines at
+  // compile time: the e4m3 pair for c10::Float8_e4m3fn, else the e5m2 pair.
+  // fp8 quarter -> fp32 register.
+  const auto widen = [](__m128i quarter) {
+    __m512 wide;
+    if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+      cvtfp8e4m3_fp32(quarter, wide);
+    } else {
+      cvtfp8e5m2_fp32(quarter, wide);
+    }
+    return wide;
+  };
+  // fp32 register -> fp8 quarter.
+  const auto narrow = [](__m512 wide) {
+    if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+      return cvtfp32_fp8e4m3(wide);
+    } else {
+      return cvtfp32_fp8e5m2(wide);
+    }
+  };
+
+  const __m512i lhs_bits = a;
+  const __m512i rhs_bits = b;
+  __m512 x0 = widen(_mm512_extracti32x4_epi32(lhs_bits, 0));
+  __m512 y0 = widen(_mm512_extracti32x4_epi32(rhs_bits, 0));
+  __m512 x1 = widen(_mm512_extracti32x4_epi32(lhs_bits, 1));
+  __m512 y1 = widen(_mm512_extracti32x4_epi32(rhs_bits, 1));
+  __m512 x2 = widen(_mm512_extracti32x4_epi32(lhs_bits, 2));
+  __m512 y2 = widen(_mm512_extracti32x4_epi32(rhs_bits, 2));
+  __m512 x3 = widen(_mm512_extracti32x4_epi32(lhs_bits, 3));
+  __m512 y3 = widen(_mm512_extracti32x4_epi32(rhs_bits, 3));
+  // `op` runs on the quarters in order 0, 1, 2, 3.
+  const __m128i q0 = narrow(op(x0, y0));
+  const __m128i q1 = narrow(op(x1, y1));
+  const __m128i q2 = narrow(op(x2, y2));
+  const __m128i q3 = narrow(op(x3, y3));
+
+  __m512i packed = _mm512_inserti32x4(_mm512_setzero_si512(), q0, 0);
+  packed = _mm512_inserti32x4(packed, q1, 1);
+  packed = _mm512_inserti32x4(packed, q2, 2);
+  packed = _mm512_inserti32x4(packed, q3, 3);
+  return packed;
+}
+
+// See https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353
+// The FP8 arithmetic operators (+, -, *, /) below are provided only to keep
+// the compiler happy when generic code instantiates them; they are planned
+// to be deleted in the future.
+Vectorized<Float8_e4m3fn> inline operator+(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  // Lane-wise a + b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto add = [](const __m512& p, const __m512& q) { return _mm512_add_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, add);
+}
+
+Vectorized<Float8_e4m3fn> inline operator-(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  // Lane-wise a - b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto sub = [](const __m512& p, const __m512& q) { return _mm512_sub_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, sub);
+}
+
+Vectorized<Float8_e4m3fn> inline operator*(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  // Lane-wise a * b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto mul = [](const __m512& p, const __m512& q) { return _mm512_mul_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, mul);
+}
+
+Vectorized<Float8_e4m3fn> inline operator/(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  // Lane-wise a / b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto div = [](const __m512& p, const __m512& q) { return _mm512_div_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, div);
+}
+
+Vectorized<Float8_e4m3fn> inline operator&(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return Vectorized<Float8_e4m3fn>(_mm512_and_si512(a, b)); // AND of the raw bits
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::eq(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return Vectorized<Float8_e4m3fn>(1.0f) & (*this == other); // bits of 1.0 ANDed with the == mask
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ne(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this != other) & Vectorized<Float8_e4m3fn>(1.0f); // != mask (was ==, duplicating eq) ANDed with bits of 1.0
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::gt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return Vectorized<Float8_e4m3fn>(1.0f) & (*this > other); // bits of 1.0 ANDed with the > mask
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ge(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return Vectorized<Float8_e4m3fn>(1.0f) & (*this >= other); // bits of 1.0 ANDed with the >= mask
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::lt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return Vectorized<Float8_e4m3fn>(1.0f) & (*this < other); // bits of 1.0 ANDed with the < mask
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::le(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return Vectorized<Float8_e4m3fn>(1.0f) & (*this <= other); // bits of 1.0 ANDed with the <= mask
+}
+
+template <>
+class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {
+ public:
+  using Vectorizedf8::Vectorizedf8; // inherit the base-class constructors
+  // Hides the base's uint8_t value_type with the element type itself.
+  using value_type = Float8_e5m2;
+  // Out-of-line definitions AND a comparison-operator result with Vectorized<Float8_e5m2>(1.0f).
+  Vectorized<Float8_e5m2> eq(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ne(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> gt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ge(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> lt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> le(const Vectorized<Float8_e5m2>& other) const;
+};
+
+// See https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353
+// The FP8 arithmetic operators (+, -, *, /) below are provided only to keep
+// the compiler happy when generic code instantiates them; they are planned
+// to be deleted in the future.
+Vectorized<Float8_e5m2> inline operator+(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  // Lane-wise a + b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto add = [](const __m512& p, const __m512& q) { return _mm512_add_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, add);
+}
+
+Vectorized<Float8_e5m2> inline operator-(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  // Lane-wise a - b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto sub = [](const __m512& p, const __m512& q) { return _mm512_sub_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, sub);
+}
+
+Vectorized<Float8_e5m2> inline operator*(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  // Lane-wise a * b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto mul = [](const __m512& p, const __m512& q) { return _mm512_mul_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, mul);
+}
+
+Vectorized<Float8_e5m2> inline operator/(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  // Lane-wise a / b; binary_fp8_op_as_fp32 calls the lambda on fp32 registers.
+  auto div = [](const __m512& p, const __m512& q) { return _mm512_div_ps(p, q); };
+  return binary_fp8_op_as_fp32(a, b, div);
+}
+
+Vectorized<Float8_e5m2> inline operator&(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return Vectorized<Float8_e5m2>(_mm512_and_si512(a, b)); // AND of the raw bits
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::eq(
+    const Vectorized<Float8_e5m2>& other) const {
+  return Vectorized<Float8_e5m2>(1.0f) & (*this == other); // bits of 1.0 ANDed with the == mask
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ne(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this != other) & Vectorized<Float8_e5m2>(1.0f); // != mask (was ==, duplicating eq) ANDed with bits of 1.0
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::gt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return Vectorized<Float8_e5m2>(1.0f) & (*this > other); // bits of 1.0 ANDed with the > mask
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ge(
+    const Vectorized<Float8_e5m2>& other) const {
+  return Vectorized<Float8_e5m2>(1.0f) & (*this >= other); // bits of 1.0 ANDed with the >= mask
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::lt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return Vectorized<Float8_e5m2>(1.0f) & (*this < other); // bits of 1.0 ANDed with the < mask
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::le(
+    const Vectorized<Float8_e5m2>& other) const {
+  return Vectorized<Float8_e5m2>(1.0f) & (*this <= other); // bits of 1.0 ANDed with the <= mask
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h
new file mode 100644
index 0000000000000000000000000000000000000000..2044a199105a3dfe76e9fda09acc68251510651b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h
@@ -0,0 +1,2126 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#ifdef CPU_CAPABILITY_AVX512
+
+struct Vectorizedi {
+ protected:
+  __m512i values; // raw 512-bit payload used by the integer Vectorized<T> classes
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+  // Bitwise NOT of `v`, computed as XOR against an all-ones register.
+  static inline __m512i invert(const __m512i& v) {
+    return _mm512_xor_si512(_mm512_set1_epi64(-1), v);
+  }
+
+ public:
+  Vectorizedi() {} // leaves `values` uninitialized
+  Vectorizedi(__m512i v) : values(v) {}
+  operator __m512i() const {
+    return values;
+  }
+};
+
+#else
+
+struct Vectorizedi {}; // empty stand-in so the name Vectorizedi exists when CPU_CAPABILITY_AVX512 is not defined
+
+#endif // CPU_CAPABILITY_AVX512
+
+#ifdef CPU_CAPABILITY_AVX512
+
+template <>
+struct is_vec_specialized_for<int64_t> : std::bool_constant<true> {}; // trait is true for int64_t; Vectorized<int64_t> is specialized below
+
+template <>
+class Vectorized<int64_t> : public Vectorizedi {
+ private:
+  static const Vectorized<int64_t> ones; // declaration only; no definition appears in this class
+
+ public:
+  using value_type = int64_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8; // lanes: 512 bits / 64 bits each
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() { // default: all lanes zero
+    values = _mm512_setzero_si512();
+  }
+  Vectorized(int64_t v) { // broadcast v to every lane
+    values = _mm512_set1_epi64(v);
+  }
+  Vectorized( // one value per lane; setr order puts val1 in lane 0
+      int64_t val1,
+      int64_t val2,
+      int64_t val3,
+      int64_t val4,
+      int64_t val5,
+      int64_t val6,
+      int64_t val7,
+      int64_t val8) {
+    values = _mm512_setr_epi64(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  template <int64_t mask> // compile-time select: bit i set -> lane i from b, else from a
+  static Vectorized<int64_t> blend(
+      Vectorized<int64_t> a,
+      Vectorized<int64_t> b) {
+    return _mm512_mask_blend_epi64(mask, a.values, b.values);
+  }
+  static Vectorized<int64_t> blendv( // run-time select driven by a vector mask
+      const Vectorized<int64_t>& a,
+      const Vectorized<int64_t>& b,
+      const Vectorized<int64_t>& mask) {
+    auto msb_one = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); // every bit set, despite the name
+    auto mask_ = _mm512_cmp_epi64_mask(mask, msb_one, _MM_CMPINT_EQ); // lanes of `mask` that are exactly all-ones
+    return _mm512_mask_blend_epi64(mask_, a.values, b.values); // those lanes take b, the rest a
+  }
+  template <typename step_t> // lanes: base, base + step, ..., base + 7 * step
+  static Vectorized<int64_t> arange(
+      int64_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int64_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized<int64_t> set( // first `count` lanes from b, remaining lanes from a
+      Vectorized<int64_t> a,
+      Vectorized<int64_t> b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b; // any count outside 0..7, including the default size()
+  }
+  static Vectorized<int64_t> loadu(const void* ptr) { // unaligned load of all 8 lanes
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  static Vectorized<int64_t> loadu(const void* ptr, int64_t count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    } else {
+      __mmask8 mask = (1ULL << count) - 1; // low `count` bits set; presumably 0 <= count < 8 - TODO confirm callers
+      auto ones = _mm512_set1_epi64(1); // local fill value; shadows the static member `ones`
+      return _mm512_mask_loadu_epi64(ones, mask, ptr); // lanes not selected by mask hold 1, not 0
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) { // partial store; count <= 0 writes nothing
+      __mmask8 mask = (1ULL << count) - 1; // low `count` bits set
+      _mm512_mask_storeu_epi64(ptr, mask, values);
+    }
+  }
+  const int64_t& operator[](int idx) const = delete; // per-lane indexing is deliberately unavailable
+  int64_t& operator[](int idx) = delete;
+  Vectorized<int64_t> abs() const {
+    auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); // lanes where 0 > value (negative)
+    auto is_larger = // all-ones in negative lanes, zero elsewhere
+        _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF);
+    auto inverse = _mm512_xor_si512(values, is_larger); // ~value in negative lanes, value elsewhere
+    return _mm512_sub_epi64(inverse, is_larger); // (~v) - (-1) == -v in negative lanes
+  }
+  Vectorized<int64_t> real() const { // returns the vector unchanged
+    return *this;
+  }
+  Vectorized<int64_t> imag() const { // always all zeros
+    return _mm512_set1_epi64(0);
+  }
+  Vectorized<int64_t> conj() const { // returns the vector unchanged
+    return *this;
+  }
+  Vectorized<int64_t> neg() const; // declared only; no definition in this class body
+  Vectorized<int64_t> operator==(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmpeq_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF); // all-ones where true, 0 elsewhere (same for the operators below)
+  }
+  Vectorized<int64_t> operator!=(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmpneq_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized<int64_t> operator<(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmplt_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized<int64_t> operator<=(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmple_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized<int64_t> operator>(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmpgt_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized<int64_t> operator>=(const Vectorized<int64_t>& other) const {
+    auto mask = _mm512_cmpge_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  // Declarations only; the definitions are not part of this class body.
+  Vectorized<int64_t> eq(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> ne(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> gt(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> ge(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> lt(const Vectorized<int64_t>& other) const;
+  Vectorized<int64_t> le(const Vectorized<int64_t>& other) const;
+};
+
+template <>
+struct is_vec_specialized_for<int32_t> : std::bool_constant<true> {}; // trait is true for int32_t; Vectorized<int32_t> is specialized below
+template <>
+class Vectorized<int32_t> : public Vectorizedi {
+ private:
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; // same value as, and hides, Vectorizedi::zero_vector
+  static const Vectorized<int32_t> ones; // declaration only; no definition appears in this class
+
+ public:
+  using value_type = int32_t;
+  static constexpr int size() {
+    return 16; // lanes: 512 bits / 32 bits each
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {} // leaves `values` uninitialized (Vectorized<int64_t>() zeroes instead)
+  Vectorized(int32_t v) { // broadcast v to every lane
+    values = _mm512_set1_epi32(v);
+  }
+  Vectorized( // one value per lane; setr order puts val1 in lane 0
+      int32_t val1,
+      int32_t val2,
+      int32_t val3,
+      int32_t val4,
+      int32_t val5,
+      int32_t val6,
+      int32_t val7,
+      int32_t val8,
+      int32_t val9,
+      int32_t val10,
+      int32_t val11,
+      int32_t val12,
+      int32_t val13,
+      int32_t val14,
+      int32_t val15,
+      int32_t val16) {
+    values = _mm512_setr_epi32(
+        val1,
+        val2,
+        val3,
+        val4,
+        val5,
+        val6,
+        val7,
+        val8,
+        val9,
+        val10,
+        val11,
+        val12,
+        val13,
+        val14,
+        val15,
+        val16);
+  }
+  template <int64_t mask> // compile-time select: bit i set -> lane i from b, else from a
+  static Vectorized<int32_t> blend(
+      Vectorized<int32_t> a,
+      Vectorized<int32_t> b) {
+    return _mm512_mask_blend_epi32(mask, a.values, b.values);
+  }
+  static Vectorized<int32_t> blendv( // run-time select driven by a vector mask
+      const Vectorized<int32_t>& a,
+      const Vectorized<int32_t>& b,
+      const Vectorized<int32_t>& mask) {
+    auto msb_one = _mm512_set1_epi32(0xFFFFFFFF); // every bit set, despite the name
+    auto mask_ = _mm512_cmp_epi32_mask(mask, msb_one, _MM_CMPINT_EQ); // lanes of `mask` that are exactly all-ones
+    return _mm512_mask_blend_epi32(mask_, a.values, b.values); // those lanes take b, the rest a
+  }
+  template <typename step_t> // lanes: base, base + step, ..., base + 15 * step
+  static Vectorized<int32_t> arange(
+      int32_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int32_t>(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  static Vectorized<int32_t> set( // first `count` lanes from b, remaining lanes from a
+      Vectorized<int32_t> a,
+      Vectorized<int32_t> b,
+      int32_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b; // any count outside 0..15, including the default size()
+  }
+  static Vectorized<int32_t> loadu(const void* ptr) { // unaligned load of all 16 lanes
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  static Vectorized<int32_t> loadu(const void* ptr, int32_t count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    } else {
+      __mmask16 mask = (1ULL << count) - 1; // low `count` bits set; presumably 0 <= count < 16 - TODO confirm callers
+      auto ones = _mm512_set1_epi32(1); // local fill value; shadows the static member `ones`
+      return _mm512_mask_loadu_epi32(ones, mask, ptr); // lanes not selected by mask hold 1, not 0
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) { // partial store; count <= 0 writes nothing
+      __mmask16 mask = (1ULL << count) - 1; // low `count` bits set
+      _mm512_mask_storeu_epi32(ptr, mask, values);
+    }
+  }
+  const int32_t& operator[](int idx) const = delete; // per-lane indexing is deliberately unavailable
+  int32_t& operator[](int idx) = delete;
+  Vectorized<int32_t> abs() const { // lane-wise absolute value
+    return _mm512_abs_epi32(values);
+  }
+  Vectorized<int32_t> real() const { // returns the vector unchanged
+    return *this;
+  }
+  Vectorized<int32_t> imag() const { // always all zeros
+    return _mm512_set1_epi32(0);
+  }
+  Vectorized<int32_t> conj() const { // returns the vector unchanged
+    return *this;
+  }
+  Vectorized<int32_t> neg() const; // declared only; no definition in this class body
+  int32_t reduce_add() const { // horizontal sum of the 16 lanes
+    return _mm512_reduce_add_epi32(values);
+  }
+  int32_t reduce_max() const { // horizontal maximum of the 16 lanes
+    return _mm512_reduce_max_epi32(values);
+  }
+  Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmpeq_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF); // all-ones where true, 0 elsewhere (same for the operators below)
+  }
+  Vectorized<int32_t> operator!=(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmpneq_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized<int32_t> operator<(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmplt_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized<int32_t> operator<=(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmple_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized<int32_t> operator>(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmpgt_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized<int32_t> operator>=(const Vectorized<int32_t>& other) const {
+    auto mask = _mm512_cmpge_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized<int32_t> eq(const Vectorized<int32_t>& other) const; // eq..le: declarations only
+  Vectorized<int32_t> ne(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> gt(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> ge(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> lt(const Vectorized<int32_t>& other) const;
+  Vectorized<int32_t> le(const Vectorized<int32_t>& other) const;
+};
+
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  // Vector body: int32 and float have the same width, so one 512-bit load maps to one store.
+  const int64_t kStep = Vectorized<int32_t>::size();
+  int64_t i = 0;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; i <= n - kStep; i += kStep) {
+    const __m512i ints =
+        _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i));
+    _mm512_storeu_ps(dst + i, _mm512_cvtepi32_ps(ints));
+  }
+  // Scalar tail for whatever the vector loop did not cover.
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
+
+template <>
+inline void convert(const int32_t* src, double* dst, int64_t n) {
+  // Vector body: int32 is half the width of double, so a 256-bit load feeds a 512-bit store.
+  const int64_t kStep = Vectorized<double>::size();
+  int64_t i = 0;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; i <= n - kStep; i += kStep) {
+    const __m256i ints =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
+    _mm512_storeu_pd(dst + i, _mm512_cvtepi32_pd(ints));
+  }
+  // Scalar tail for whatever the vector loop did not cover.
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast<double>(src[i]);
+  }
+}
+
+template <>
+struct is_vec_specialized_for<int16_t> : std::bool_constant<true> {}; // trait is true for int16_t; Vectorized<int16_t> is specialized below
+
+// Vectorized<int16_t>: 32 lanes of int16_t stored in one __m512i (the
+// `values` member used throughout comes from Vectorizedi).
+template <>
+class Vectorized<int16_t> : public Vectorizedi {
+ private:
+  static const Vectorized<int16_t> ones;
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Expands a per-lane bitmask into a vector: lanes selected by `lanes` are
+  // set to 0xFFFF, all other lanes are copied from zero_vector.
+  static __m512i expand_mask(__mmask32 lanes) {
+    return _mm512_mask_set1_epi16(zero_vector, lanes, 0xFFFF);
+  }
+
+ public:
+  using value_type = int16_t;
+  // Number of int16_t lanes in one vector.
+  static constexpr int size() {
+    return 32;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  // Broadcasts `v` to every lane.
+  Vectorized(int16_t v) {
+    values = _mm512_set1_epi16(v);
+  }
+  // Builds a vector from 32 explicit values. _mm512_set_epi16 receives its
+  // arguments in the reverse order, so val1 is passed last.
+  Vectorized(
+      int16_t val1, int16_t val2, int16_t val3, int16_t val4,
+      int16_t val5, int16_t val6, int16_t val7, int16_t val8,
+      int16_t val9, int16_t val10, int16_t val11, int16_t val12,
+      int16_t val13, int16_t val14, int16_t val15, int16_t val16,
+      int16_t val17, int16_t val18, int16_t val19, int16_t val20,
+      int16_t val21, int16_t val22, int16_t val23, int16_t val24,
+      int16_t val25, int16_t val26, int16_t val27, int16_t val28,
+      int16_t val29, int16_t val30, int16_t val31, int16_t val32) {
+    values = _mm512_set_epi16(
+        val32, val31, val30, val29, val28, val27, val26, val25,
+        val24, val23, val22, val21, val20, val19, val18, val17,
+        val16, val15, val14, val13, val12, val11, val10, val9,
+        val8, val7, val6, val5, val4, val3, val2, val1);
+  }
+  // Compile-time blend: `mask` is forwarded as the lane mask of
+  // _mm512_mask_blend_epi16 applied to a and b.
+  template <int64_t mask>
+  static Vectorized<int16_t> blend(
+      Vectorized<int16_t> a,
+      Vectorized<int16_t> b) {
+    return _mm512_mask_blend_epi16(mask, a.values, b.values);
+  }
+  // Run-time blend: a lane is selected when the matching lane of `mask`
+  // compares equal to 0xFFFF.
+  static Vectorized<int16_t> blendv(
+      const Vectorized<int16_t>& a,
+      const Vectorized<int16_t>& b,
+      const Vectorized<int16_t>& mask) {
+    const auto all_bits = _mm512_set1_epi16(0xFFFF);
+    const auto selected = _mm512_cmp_epi16_mask(mask, all_bits, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi16(selected, a.values, b.values);
+  }
+  // Returns {base, base + step, base + 2 * step, ..., base + 31 * step}.
+  template <typename step_t>
+  static Vectorized<int16_t> arange(
+      int16_t base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    int16_t lanes[size()];
+    // Lane 0 is `base` itself; no arithmetic with `step` is performed on it.
+    lanes[0] = base;
+    for (int k = 1; k < size(); ++k) {
+      lanes[k] = base + k * step;
+    }
+    return loadu(lanes);
+  }
+  // Blends with a mask whose low `count` bits are set. count == 0 returns a
+  // unchanged; any count outside [0, size()) returns b unchanged.
+  static Vectorized<int16_t> set(
+      Vectorized<int16_t> a,
+      Vectorized<int16_t> b,
+      int16_t count = size()) {
+    if (count == 0) {
+      return a;
+    }
+    if (count > 0 && count < size()) {
+      const __mmask32 low_lanes = (1ULL << count) - 1;
+      return _mm512_mask_blend_epi16(low_lanes, a.values, b.values);
+    }
+    return b;
+  }
+  // Unaligned load of a full vector.
+  static Vectorized<int16_t> loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  // Unaligned load of the first `count` lanes; the masked load uses a vector
+  // of 1s as the source for the lanes that are not loaded.
+  static Vectorized<int16_t> loadu(const void* ptr, int16_t count) {
+    if (count != size()) {
+      const __mmask32 low_lanes = (1ULL << count) - 1;
+      const auto fill = _mm512_set1_epi16(1);
+      return _mm512_mask_loadu_epi16(fill, low_lanes, ptr);
+    }
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  // Stores the first `count` lanes to ptr; nothing is written for count <= 0.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr does not need to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+      return;
+    }
+    if (count <= 0) {
+      return;
+    }
+    const __mmask32 low_lanes = (1ULL << count) - 1;
+    _mm512_mask_storeu_epi16(ptr, low_lanes, values);
+  }
+  // Per-lane indexing is intentionally unavailable.
+  const int16_t& operator[](int idx) const = delete;
+  int16_t& operator[](int idx) = delete;
+  Vectorized<int16_t> abs() const {
+    return _mm512_abs_epi16(values);
+  }
+  // Complex-style accessors: the value is its own real part and conjugate,
+  // and the imaginary part is all zeros.
+  Vectorized<int16_t> real() const {
+    return *this;
+  }
+  Vectorized<int16_t> imag() const {
+    return _mm512_set1_epi16(0);
+  }
+  Vectorized<int16_t> conj() const {
+    return *this;
+  }
+  Vectorized<int16_t> neg() const;
+  // Comparison operators: lanes where the comparison holds become 0xFFFF,
+  // the remaining lanes come from zero_vector.
+  Vectorized<int16_t> operator==(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmpeq_epi16_mask(values, other.values));
+  }
+  Vectorized<int16_t> operator!=(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmpneq_epi16_mask(values, other.values));
+  }
+  Vectorized<int16_t> operator<(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmplt_epi16_mask(values, other.values));
+  }
+  Vectorized<int16_t> operator<=(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmple_epi16_mask(values, other.values));
+  }
+  Vectorized<int16_t> operator>(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmpgt_epi16_mask(values, other.values));
+  }
+  Vectorized<int16_t> operator>=(const Vectorized<int16_t>& other) const {
+    return expand_mask(_mm512_cmpge_epi16_mask(values, other.values));
+  }
+
+  // Named comparisons; declared here, defined outside the class.
+  Vectorized<int16_t> eq(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> ne(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> gt(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> ge(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> lt(const Vectorized<int16_t>& other) const;
+  Vectorized<int16_t> le(const Vectorized<int16_t>& other) const;
+};
+
+// Shared implementation for the two 8-bit specializations: 64 lanes of
+// int8_t or uint8_t stored in one __m512i (the `values` member comes from
+// Vectorizedi).
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+      "Only int8_t/uint8_t are supported");
+
+ protected:
+  static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0};
+  static const Vectorized<T> ones;
+
+ public:
+  using value_type = T;
+  // Number of 8-bit lanes in one vector.
+  static constexpr int size() {
+    return 64;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized8() {}
+  // Broadcasts `v` to every lane.
+  Vectorized8(T v) {
+    values = _mm512_set1_epi8(v);
+  }
+  // Builds a vector from 64 explicit values. _mm512_set_epi8 receives its
+  // arguments in the reverse order, so val1 is passed last.
+  Vectorized8(
+      T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8,
+      T val9, T val10, T val11, T val12, T val13, T val14, T val15, T val16,
+      T val17, T val18, T val19, T val20, T val21, T val22, T val23, T val24,
+      T val25, T val26, T val27, T val28, T val29, T val30, T val31, T val32,
+      T val33, T val34, T val35, T val36, T val37, T val38, T val39, T val40,
+      T val41, T val42, T val43, T val44, T val45, T val46, T val47, T val48,
+      T val49, T val50, T val51, T val52, T val53, T val54, T val55, T val56,
+      T val57, T val58, T val59, T val60, T val61, T val62, T val63, T val64) {
+    values = _mm512_set_epi8(
+        val64, val63, val62, val61, val60, val59, val58, val57,
+        val56, val55, val54, val53, val52, val51, val50, val49,
+        val48, val47, val46, val45, val44, val43, val42, val41,
+        val40, val39, val38, val37, val36, val35, val34, val33,
+        val32, val31, val30, val29, val28, val27, val26, val25,
+        val24, val23, val22, val21, val20, val19, val18, val17,
+        val16, val15, val14, val13, val12, val11, val10, val9,
+        val8, val7, val6, val5, val4, val3, val2, val1);
+  }
+  // Compile-time blend: `mask` is forwarded as the lane mask of
+  // _mm512_mask_blend_epi8 applied to a and b.
+  template <int64_t mask>
+  static Vectorized<T> blend(Vectorized<T> a, Vectorized<T> b) {
+    return _mm512_mask_blend_epi8(mask, a.values, b.values);
+  }
+  // Returns {base, base + step, base + 2 * step, ..., base + 63 * step}.
+  template <typename step_t>
+  static Vectorized<T> arange(
+      T base = 0,
+      step_t step = static_cast<step_t>(1)) {
+    T lanes[size()];
+    // Lane 0 is `base` itself; no arithmetic with `step` is performed on it.
+    lanes[0] = base;
+    for (int k = 1; k < size(); ++k) {
+      lanes[k] = base + k * step;
+    }
+    return loadu(lanes);
+  }
+  // Blends with a mask whose low `count` bits are set. count == 0 returns a
+  // unchanged; any count outside [0, size()) returns b unchanged.
+  static Vectorized<T> set(Vectorized<T> a, Vectorized<T> b, T count = size()) {
+    if (count == 0) {
+      return a;
+    }
+    if (count > 0 && count < size()) {
+      const __mmask64 low_lanes = (1ULL << count) - 1;
+      return _mm512_mask_blend_epi8(low_lanes, a.values, b.values);
+    }
+    return b;
+  }
+  // Unaligned load of a full vector.
+  static Vectorized<T> loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  // Fast path that loads exactly 16 elements (128 bits).
+  // It is kept separate from the general loadu(ptr, count) contract because
+  // _mm512_castsi128_si512 leaves the upper 384 bits of the result undefined,
+  // whereas a zero-initialized upper part would be required there.
+  // TODO<leslie>: switch to _mm512_zextsi128_si512 once it can be used; gcc 9.3
+  // does not support it.
+  static Vectorized<T> loadu_one_fourth(const void* ptr) {
+    const auto* src = reinterpret_cast<const __m128i*>(ptr);
+    const __m128i low_128 = _mm_loadu_si128(src);
+    return _mm512_castsi128_si512(low_128);
+  }
+  // Unaligned load of the first `count` lanes. count == 16 takes the 128-bit
+  // fast path; other partial counts use a masked load whose source for the
+  // lanes that are not loaded is a vector of 1s.
+  static Vectorized<T> loadu(const void* ptr, T count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    }
+    if (count == 16) {
+      return loadu_one_fourth(ptr);
+    }
+    const __mmask64 low_lanes = (1ULL << count) - 1;
+    const auto fill = _mm512_set1_epi8(1);
+    return _mm512_mask_loadu_epi8(fill, low_lanes, ptr);
+  }
+  // Stores the first `count` lanes to ptr; nothing is written for count <= 0.
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr does not need to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+      return;
+    }
+    if (count <= 0) {
+      return;
+    }
+    if (count == 16) {
+      // Fast path: exactly 16 elements fit one 128-bit store.
+      _mm_storeu_si128(
+          reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values));
+      return;
+    }
+    const __mmask64 low_lanes = (1ULL << count) - 1;
+    _mm512_mask_storeu_epi8(ptr, low_lanes, values);
+  }
+  // Per-lane indexing is intentionally unavailable.
+  const T& operator[](int idx) const = delete;
+  T& operator[](int idx) = delete;
+  // Complex-style accessors: the value is its own real part and conjugate,
+  // and the imaginary part is all zeros.
+  Vectorized<T> real() const {
+    return *this;
+  }
+  Vectorized<T> imag() const {
+    return _mm512_set1_epi8(0);
+  }
+  Vectorized<T> conj() const {
+    return *this;
+  }
+};
+
+// Flags int8_t as having a dedicated Vectorized<> specialization below.
+template <>
+struct is_vec_specialized_for<int8_t> : std::true_type {};
+
+// Signed 8-bit specialization; storage, construction, load/store and blend
+// are inherited from Vectorized8<int8_t>.
+template <>
+class Vectorized<int8_t> : public Vectorized8<int8_t> {
+ public:
+  using Vectorized8::Vectorized8;
+
+  // Run-time blend: a lane is selected when the matching lane of `mask`
+  // compares equal to 0xFF.
+  static Vectorized<int8_t> blendv(
+      const Vectorized<int8_t>& a,
+      const Vectorized<int8_t>& b,
+      const Vectorized<int8_t>& mask) {
+    const auto all_bits = _mm512_set1_epi8(0xFF);
+    const auto selected = _mm512_cmp_epi8_mask(mask, all_bits, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(selected, a.values, b.values);
+  }
+
+  Vectorized<int8_t> neg() const;
+
+  Vectorized<int8_t> abs() const {
+    return _mm512_abs_epi8(values);
+  }
+
+  // Comparison operators: lanes where the signed comparison holds become
+  // 0xFF, the remaining lanes come from zero_vector.
+  Vectorized<int8_t> operator==(const Vectorized<int8_t>& other) const {
+    const auto hits = _mm512_cmpeq_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<int8_t> operator!=(const Vectorized<int8_t>& other) const {
+    const auto hits = _mm512_cmpneq_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<int8_t> operator<(const Vectorized<int8_t>& other) const {
+    const auto hits = _mm512_cmplt_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<int8_t> operator<=(const Vectorized<int8_t>& other) const {
+    const auto hits = _mm512_cmple_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  // a > b is evaluated as b < a, and a >= b as b <= a.
+  Vectorized<int8_t> operator>(const Vectorized<int8_t>& other) const {
+    return other < *this;
+  }
+  Vectorized<int8_t> operator>=(const Vectorized<int8_t>& other) const {
+    return other <= *this;
+  }
+
+  // Named comparisons; declared here, defined outside the class.
+  Vectorized<int8_t> eq(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> ne(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> gt(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> ge(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> lt(const Vectorized<int8_t>& other) const;
+  Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
+};
+
+// Flags uint8_t as having a dedicated Vectorized<> specialization below.
+template <>
+struct is_vec_specialized_for<uint8_t> : std::true_type {};
+
+// Unsigned 8-bit specialization; storage, construction, load/store and blend
+// are inherited from Vectorized8<uint8_t>. Comparisons use the epu8
+// (unsigned) intrinsics.
+template <>
+class Vectorized<uint8_t> : public Vectorized8<uint8_t> {
+ public:
+  using Vectorized8::Vectorized8;
+
+  // Run-time blend: a lane is selected when the matching lane of `mask`
+  // compares equal to 0xFF.
+  static Vectorized<uint8_t> blendv(
+      const Vectorized<uint8_t>& a,
+      const Vectorized<uint8_t>& b,
+      const Vectorized<uint8_t>& mask) {
+    const auto all_bits = _mm512_set1_epi8(0xFF);
+    const auto selected = _mm512_cmp_epu8_mask(mask, all_bits, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(selected, a.values, b.values);
+  }
+
+  Vectorized<uint8_t> neg() const;
+
+  // abs() returns the vector unchanged.
+  Vectorized<uint8_t> abs() const {
+    return *this;
+  }
+
+  // Comparison operators: lanes where the unsigned comparison holds become
+  // 0xFF, the remaining lanes come from zero_vector.
+  Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
+    const auto hits = _mm512_cmpeq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<uint8_t> operator!=(const Vectorized<uint8_t>& other) const {
+    const auto hits = _mm512_cmpneq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<uint8_t> operator<(const Vectorized<uint8_t>& other) const {
+    const auto hits = _mm512_cmplt_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  Vectorized<uint8_t> operator<=(const Vectorized<uint8_t>& other) const {
+    const auto hits = _mm512_cmple_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, hits, 0xFF);
+  }
+  // a > b is evaluated as b < a, and a >= b as b <= a.
+  Vectorized<uint8_t> operator>(const Vectorized<uint8_t>& other) const {
+    return other < *this;
+  }
+  Vectorized<uint8_t> operator>=(const Vectorized<uint8_t>& other) const {
+    return other <= *this;
+  }
+
+  // Named comparisons; declared here, defined outside the class.
+  Vectorized<uint8_t> eq(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ne(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> gt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ge(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> lt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> le(const Vectorized<uint8_t>& other) const;
+};
+
+// Lane-wise addition for the integer specializations. Each overload forwards
+// to the _mm512_add_epi* intrinsic of matching lane width; uint8_t uses the
+// same epi8 intrinsic as int8_t.
+template <>
+inline Vectorized<int64_t> operator+(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m512i sum = _mm512_add_epi64(a, b);
+  return sum;
+}
+
+template <>
+inline Vectorized<int32_t> operator+(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m512i sum = _mm512_add_epi32(a, b);
+  return sum;
+}
+
+template <>
+inline Vectorized<int16_t> operator+(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m512i sum = _mm512_add_epi16(a, b);
+  return sum;
+}
+
+template <>
+inline Vectorized<int8_t> operator+(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m512i sum = _mm512_add_epi8(a, b);
+  return sum;
+}
+
+template <>
+inline Vectorized<uint8_t> operator+(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m512i sum = _mm512_add_epi8(a, b);
+  return sum;
+}
+
+// Lane-wise subtraction (a - b) for the integer specializations. Each
+// overload forwards to the _mm512_sub_epi* intrinsic of matching lane width;
+// uint8_t uses the same epi8 intrinsic as int8_t.
+template <>
+inline Vectorized<int64_t> operator-(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m512i diff = _mm512_sub_epi64(a, b);
+  return diff;
+}
+
+template <>
+inline Vectorized<int32_t> operator-(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m512i diff = _mm512_sub_epi32(a, b);
+  return diff;
+}
+
+template <>
+inline Vectorized<int16_t> operator-(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m512i diff = _mm512_sub_epi16(a, b);
+  return diff;
+}
+
+template <>
+inline Vectorized<int8_t> operator-(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m512i diff = _mm512_sub_epi8(a, b);
+  return diff;
+}
+
+template <>
+inline Vectorized<uint8_t> operator-(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m512i diff = _mm512_sub_epi8(a, b);
+  return diff;
+}
+
+// Negation, computed as (broadcast 0) - *this. These definitions live after
+// the operator- specializations so that they can use them.
+inline Vectorized<int64_t> Vectorized<int64_t>::neg() const {
+  const Vectorized<int64_t> zero(0);
+  return zero - *this;
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::neg() const {
+  const Vectorized<int32_t> zero(0);
+  return zero - *this;
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::neg() const {
+  const Vectorized<int16_t> zero(0);
+  return zero - *this;
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::neg() const {
+  const Vectorized<int8_t> zero(0);
+  return zero - *this;
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::neg() const {
+  const Vectorized<uint8_t> zero(0);
+  return zero - *this;
+}
+
+// Lane-wise multiplication for the 64/32/16-bit specializations, forwarded
+// to the _mm512_mullo_epi* intrinsic of matching lane width.
+template <>
+inline Vectorized<int64_t> operator*(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m512i product = _mm512_mullo_epi64(a, b);
+  return product;
+}
+
+template <>
+inline Vectorized<int32_t> operator*(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m512i product = _mm512_mullo_epi32(a, b);
+  return product;
+}
+
+template <>
+inline Vectorized<int16_t> operator*(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m512i product = _mm512_mullo_epi16(a, b);
+  return product;
+}
+
+// Scalar fallback for binary integer operations: both vectors are spilled to
+// stack arrays, `op` is applied to each pair of elements in lane order, and
+// the results are loaded back into a vector.
+template <typename T, typename Op>
+inline Vectorized<T> int_elementwise_binary_512(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  constexpr int kLanes = Vectorized<T>::size();
+  T lhs[kLanes];
+  T rhs[kLanes];
+  a.store(lhs);
+  b.store(rhs);
+  // The result of each lane overwrites the left-hand buffer in place.
+  for (int lane = 0; lane < kLanes; ++lane) {
+    lhs[lane] = op(lhs[lane], rhs[lane]);
+  }
+  return Vectorized<T>::loadu(lhs);
+}
+
+// int8_t multiplication. There is no instruction for multiplying int8_t
+// lanes, so with CPU_CAPABILITY_AVX512 the product is assembled from two
+// 16-bit multiplies; otherwise the scalar fallback is used.
+template <>
+inline Vectorized<int8_t> operator*(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+#ifdef CPU_CAPABILITY_AVX512
+  const __m512i low_byte_mask = _mm512_set1_epi16(0x00FF);
+  // Low byte of every 16-bit lane, moved up and shifted back arithmetically.
+  const __m512i a_low = _mm512_srai_epi16(_mm512_slli_epi16(a, 8), 8);
+  const __m512i b_low = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);
+  // High byte of every 16-bit lane, shifted down arithmetically.
+  const __m512i a_high = _mm512_srai_epi16(a, 8);
+  const __m512i b_high = _mm512_srai_epi16(b, 8);
+  // Keep only the low 8 bits of each product and put them back in place.
+  const __m512i prod_low =
+      _mm512_and_si512(_mm512_mullo_epi16(a_low, b_low), low_byte_mask);
+  const __m512i prod_high =
+      _mm512_slli_epi16(_mm512_mullo_epi16(a_high, b_high), 8);
+  return _mm512_or_si512(prod_high, prod_low);
+#else
+  return int_elementwise_binary_512(a, b, std::multiplies<int8_t>());
+#endif
+}
+
+// uint8_t multiplication. There is no instruction for multiplying uint8_t
+// lanes, so with CPU_CAPABILITY_AVX512 the product is assembled from two
+// 16-bit multiplies; otherwise the scalar fallback is used.
+template <>
+inline Vectorized<uint8_t> operator*(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+#ifdef CPU_CAPABILITY_AVX512
+  const __m512i low_byte_mask = _mm512_set1_epi16(0x00FF);
+  // Low byte of every 16-bit lane, isolated with the mask.
+  const __m512i a_low = _mm512_and_si512(a, low_byte_mask);
+  const __m512i b_low = _mm512_and_si512(b, low_byte_mask);
+  // High byte of every 16-bit lane, shifted down logically.
+  const __m512i a_high = _mm512_srli_epi16(a, 8);
+  const __m512i b_high = _mm512_srli_epi16(b, 8);
+  // Keep only the low 8 bits of each product and put them back in place.
+  const __m512i prod_low =
+      _mm512_and_si512(_mm512_mullo_epi16(a_low, b_low), low_byte_mask);
+  const __m512i prod_high =
+      _mm512_slli_epi16(_mm512_mullo_epi16(a_high, b_high), 8);
+  return _mm512_or_si512(prod_high, prod_low);
+#else
+  return int_elementwise_binary_512(a, b, std::multiplies<uint8_t>());
+#endif
+}
+
+// Lane-wise minimum for the integer specializations, forwarded to the
+// _mm512_min_* intrinsic of matching lane width (epu8 for uint8_t).
+template <>
+inline Vectorized<int64_t> minimum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m512i smaller = _mm512_min_epi64(a, b);
+  return smaller;
+}
+
+template <>
+inline Vectorized<int32_t> minimum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m512i smaller = _mm512_min_epi32(a, b);
+  return smaller;
+}
+
+template <>
+inline Vectorized<int16_t> minimum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m512i smaller = _mm512_min_epi16(a, b);
+  return smaller;
+}
+
+template <>
+inline Vectorized<int8_t> minimum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m512i smaller = _mm512_min_epi8(a, b);
+  return smaller;
+}
+
+template <>
+inline Vectorized<uint8_t> minimum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m512i smaller = _mm512_min_epu8(a, b);
+  return smaller;
+}
+
+// Lane-wise maximum for the integer specializations, forwarded to the
+// _mm512_max_* intrinsic of matching lane width (epu8 for uint8_t).
+template <>
+inline Vectorized<int64_t> maximum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  const __m512i larger = _mm512_max_epi64(a, b);
+  return larger;
+}
+
+template <>
+inline Vectorized<int32_t> maximum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  const __m512i larger = _mm512_max_epi32(a, b);
+  return larger;
+}
+
+template <>
+inline Vectorized<int16_t> maximum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  const __m512i larger = _mm512_max_epi16(a, b);
+  return larger;
+}
+
+template <>
+inline Vectorized<int8_t> maximum(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  const __m512i larger = _mm512_max_epi8(a, b);
+  return larger;
+}
+
+template <>
+inline Vectorized<uint8_t> maximum(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  const __m512i larger = _mm512_max_epu8(a, b);
+  return larger;
+}
+
+// Lane-wise clamp, computed as min(max_val, max(a, min_val)): the lower
+// bound is applied first, then the upper bound.
+template <>
+inline Vectorized<int64_t> clamp(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min_val,
+    const Vectorized<int64_t>& max_val) {
+  const __m512i floored = _mm512_max_epi64(a, min_val);
+  return _mm512_min_epi64(max_val, floored);
+}
+
+template <>
+inline Vectorized<int32_t> clamp(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min_val,
+    const Vectorized<int32_t>& max_val) {
+  const __m512i floored = _mm512_max_epi32(a, min_val);
+  return _mm512_min_epi32(max_val, floored);
+}
+
+template <>
+inline Vectorized<int16_t> clamp(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min_val,
+    const Vectorized<int16_t>& max_val) {
+  const __m512i floored = _mm512_max_epi16(a, min_val);
+  return _mm512_min_epi16(max_val, floored);
+}
+
+template <>
+inline Vectorized<int8_t> clamp(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min_val,
+    const Vectorized<int8_t>& max_val) {
+  const __m512i floored = _mm512_max_epi8(a, min_val);
+  return _mm512_min_epi8(max_val, floored);
+}
+
+template <>
+inline Vectorized<uint8_t> clamp(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min_val,
+    const Vectorized<uint8_t>& max_val) {
+  const __m512i floored = _mm512_max_epu8(a, min_val);
+  return _mm512_min_epu8(max_val, floored);
+}
+
+// Lane-wise upper clamp, computed as min(max_val, a).
+template <>
+inline Vectorized<int64_t> clamp_max(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& max_val) {
+  const __m512i capped = _mm512_min_epi64(max_val, a);
+  return capped;
+}
+
+template <>
+inline Vectorized<int32_t> clamp_max(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& max_val) {
+  const __m512i capped = _mm512_min_epi32(max_val, a);
+  return capped;
+}
+
+template <>
+inline Vectorized<int16_t> clamp_max(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& max_val) {
+  const __m512i capped = _mm512_min_epi16(max_val, a);
+  return capped;
+}
+
+template <>
+inline Vectorized<int8_t> clamp_max(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& max_val) {
+  const __m512i capped = _mm512_min_epi8(max_val, a);
+  return capped;
+}
+
+template <>
+inline Vectorized<uint8_t> clamp_max(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& max_val) {
+  const __m512i capped = _mm512_min_epu8(max_val, a);
+  return capped;
+}
+
+// Lane-wise lower clamp, computed as max(min_val, a).
+template <>
+inline Vectorized<int64_t> clamp_min(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& min_val) {
+  const __m512i floored = _mm512_max_epi64(min_val, a);
+  return floored;
+}
+
+template <>
+inline Vectorized<int32_t> clamp_min(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& min_val) {
+  const __m512i floored = _mm512_max_epi32(min_val, a);
+  return floored;
+}
+
+template <>
+inline Vectorized<int16_t> clamp_min(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& min_val) {
+  const __m512i floored = _mm512_max_epi16(min_val, a);
+  return floored;
+}
+
+template <>
+inline Vectorized<int8_t> clamp_min(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& min_val) {
+  const __m512i floored = _mm512_max_epi8(min_val, a);
+  return floored;
+}
+
+template <>
+inline Vectorized<uint8_t> clamp_min(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& min_val) {
+  const __m512i floored = _mm512_max_epu8(min_val, a);
+  return floored;
+}
+
+// Loads `count` elements from ptr into a Vectorized<int32_t>.
+// Generic overload (T is neither int8_t nor uint8_t): forwards directly to
+// Vectorized<int32_t>::loadu.
+template <typename T>
+inline std::enable_if_t<
+    !(std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>),
+    Vectorized<int32_t>>
+convert_to_int32(const T* ptr, int count = Vectorized<int32_t>::size()) {
+  return Vectorized<int32_t>::loadu(ptr, count);
+}
+
+// int8_t overload: the source bytes are widened with _mm512_cvtepi8_epi32.
+// A full-width request reads 128 bits straight from ptr; a partial request
+// goes through the counted Vectorized<int8_t>::loadu and widens its low
+// 128 bits.
+template <typename T>
+inline std::enable_if_t<std::is_same_v<T, int8_t>, Vectorized<int32_t>>
+convert_to_int32(const int8_t* ptr, int count = Vectorized<int32_t>::size()) {
+  if (count != Vectorized<int32_t>::size()) {
+    const auto partial = Vectorized<int8_t>::loadu(ptr, count);
+    return _mm512_cvtepi8_epi32(_mm512_castsi512_si128(partial));
+  }
+  const __m128i packed =
+      _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+  return _mm512_cvtepi8_epi32(packed);
+}
+
+// uint8_t overload: same structure as the int8_t one, widening with
+// _mm512_cvtepu8_epi32.
+template <typename T>
+inline std::enable_if_t<std::is_same_v<T, uint8_t>, Vectorized<int32_t>>
+convert_to_int32(const uint8_t* ptr, int count = Vectorized<int32_t>::size()) {
+  if (count != Vectorized<int32_t>::size()) {
+    const auto partial = Vectorized<uint8_t>::loadu(ptr, count);
+    return _mm512_cvtepu8_epi32(_mm512_castsi512_si128(partial));
+  }
+  const __m128i packed =
+      _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+  return _mm512_cvtepu8_epi32(packed);
+}
+
+// Lane-wise integer division. Every overload goes through the scalar helper
+// int_elementwise_binary_512 with std::divides of the lane type.
+template <>
+inline Vectorized<int64_t> operator/(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  std::divides<int64_t> divide;
+  return int_elementwise_binary_512(a, b, divide);
+}
+template <>
+inline Vectorized<int32_t> operator/(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  std::divides<int32_t> divide;
+  return int_elementwise_binary_512(a, b, divide);
+}
+template <>
+inline Vectorized<int16_t> operator/(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  std::divides<int16_t> divide;
+  return int_elementwise_binary_512(a, b, divide);
+}
+template <>
+inline Vectorized<int8_t> operator/(
+    const Vectorized<int8_t>& a,
+    const Vectorized<int8_t>& b) {
+  std::divides<int8_t> divide;
+  return int_elementwise_binary_512(a, b, divide);
+}
+template <>
+inline Vectorized<uint8_t> operator/(
+    const Vectorized<uint8_t>& a,
+    const Vectorized<uint8_t>& b) {
+  std::divides<uint8_t> divide;
+  return int_elementwise_binary_512(a, b, divide);
+}
+
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m512i both_set = _mm512_and_si512(a, b); // lane-width-agnostic 512-bit AND
+  return both_set;
+}
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m512i either_set = _mm512_or_si512(a, b); // lane-width-agnostic 512-bit OR
+  return either_set;
+}
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
+  const __m512i differing = _mm512_xor_si512(a, b); // lane-width-agnostic 512-bit XOR
+  return differing;
+}
+template <
+    class T,
+    typename std::enable_if_t<
+        std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator~(const Vectorized<T>& a) {
+  const __m512i all_ones = _mm512_set1_epi32(-1);
+  return _mm512_xor_si512(a, all_ones); // XOR against all-ones flips every bit
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::eq(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this == other); // keeps only bit 0 of the == result
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::ne(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this != other); // keeps only bit 0 of the != result
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::gt(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this > other); // keeps only bit 0 of the > result
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::ge(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this >= other); // keeps only bit 0 of the >= result
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::lt(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this < other); // keeps only bit 0 of the < result
+}
+
+inline Vectorized<int64_t> Vectorized<int64_t>::le(
+    const Vectorized<int64_t>& other) const {
+  return Vectorized<int64_t>(1) & (*this <= other); // keeps only bit 0 of the <= result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::eq(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this == other); // keeps only bit 0 of the == result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::ne(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this != other); // keeps only bit 0 of the != result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::gt(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this > other); // keeps only bit 0 of the > result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::ge(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this >= other); // keeps only bit 0 of the >= result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::lt(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this < other); // keeps only bit 0 of the < result
+}
+
+inline Vectorized<int32_t> Vectorized<int32_t>::le(
+    const Vectorized<int32_t>& other) const {
+  return Vectorized<int32_t>(1) & (*this <= other); // keeps only bit 0 of the <= result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::eq(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this == other); // keeps only bit 0 of the == result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::ne(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this != other); // keeps only bit 0 of the != result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::gt(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this > other); // keeps only bit 0 of the > result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::ge(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this >= other); // keeps only bit 0 of the >= result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::lt(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this < other); // keeps only bit 0 of the < result
+}
+
+inline Vectorized<int16_t> Vectorized<int16_t>::le(
+    const Vectorized<int16_t>& other) const {
+  return Vectorized<int16_t>(1) & (*this <= other); // keeps only bit 0 of the <= result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::eq(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this == other); // keeps only bit 0 of the == result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::ne(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this != other); // keeps only bit 0 of the != result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::gt(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this > other); // keeps only bit 0 of the > result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::ge(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this >= other); // keeps only bit 0 of the >= result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::lt(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this < other); // keeps only bit 0 of the < result
+}
+
+inline Vectorized<int8_t> Vectorized<int8_t>::le(
+    const Vectorized<int8_t>& other) const {
+  return Vectorized<int8_t>(1) & (*this <= other); // keeps only bit 0 of the <= result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::eq(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this == other); // keeps only bit 0 of the == result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ne(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this != other); // keeps only bit 0 of the != result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::gt(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this > other); // keeps only bit 0 of the > result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ge(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this >= other); // keeps only bit 0 of the >= result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::lt(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this < other); // keeps only bit 0 of the < result
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::le(
+    const Vectorized<uint8_t>& other) const {
+  return Vectorized<uint8_t>(1) & (*this <= other); // keeps only bit 0 of the <= result
+}
+
+template <
+    bool left_shift, // true: shift left; false: shift right
+    typename T,
+    typename std::enable_if_t<
+        std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+        int> = 0>
+Vectorized<T> inline shift_512_8( // per-byte variable shift built from 16-bit shifts
+    const Vectorized<T>& a, // values to shift
+    const Vectorized<T>& b) { // per-element shift counts
+  // No vector instruction for shifting int8_t/uint8_t, so emulating
+  // it instead.
+
+  // Control masks for shuffle operation, treating 512 bits as an
+  // array of 8-bit elements, and considering pairs of neighboring
+  // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  __m512i ctl_0_1 = _mm512_set_epi8(
+      62,
+      0x80,
+      60,
+      0x80,
+      58,
+      0x80,
+      56,
+      0x80,
+      54,
+      0x80,
+      52,
+      0x80,
+      50,
+      0x80,
+      48,
+      0x80,
+      46,
+      0x80,
+      44,
+      0x80,
+      42,
+      0x80,
+      40,
+      0x80,
+      38,
+      0x80,
+      36,
+      0x80,
+      34,
+      0x80,
+      32,
+      0x80,
+      30,
+      0x80,
+      28,
+      0x80,
+      26,
+      0x80,
+      24,
+      0x80,
+      22,
+      0x80,
+      20,
+      0x80,
+      18,
+      0x80,
+      16,
+      0x80,
+      14,
+      0x80,
+      12,
+      0x80,
+      10,
+      0x80,
+      8,
+      0x80,
+      6,
+      0x80,
+      4,
+      0x80,
+      2,
+      0x80,
+      0,
+      0x80);
+  __m512i ctl_1_0 = _mm512_set_epi8(
+      0x80,
+      63,
+      0x80,
+      61,
+      0x80,
+      59,
+      0x80,
+      57,
+      0x80,
+      55,
+      0x80,
+      53,
+      0x80,
+      51,
+      0x80,
+      49,
+      0x80,
+      47,
+      0x80,
+      45,
+      0x80,
+      43,
+      0x80,
+      41,
+      0x80,
+      39,
+      0x80,
+      37,
+      0x80,
+      35,
+      0x80,
+      33,
+      0x80,
+      31,
+      0x80,
+      29,
+      0x80,
+      27,
+      0x80,
+      25,
+      0x80,
+      23,
+      0x80,
+      21,
+      0x80,
+      19,
+      0x80,
+      17,
+      0x80,
+      15,
+      0x80,
+      13,
+      0x80,
+      11,
+      0x80,
+      9,
+      0x80,
+      7,
+      0x80,
+      5,
+      0x80,
+      3,
+      0x80,
+      1);
+
+  // Masks for bitwise and operation, treating 512 bits as an array of
+  // 8-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m512i keep_0 = _mm512_set1_epi16(0xFF); // low byte of every 16-bit pair
+  __m512i keep_1 = _mm512_set1_epi16(0xFF00); // high byte of every 16-bit pair
+
+  // Take each 8-bit element with idx%2==0 from input array to be
+  // shifted and extend it to 16 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 16-bit number.  Upper 8
+  // bits will be proper result of shifting original 8-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that the shift count is widened to 16 bits by adding 0s to
+  // the left, so a negative count is not properly sign-extended.
+  // However, the shift intrinsics treat the count as an unsigned
+  // integer anyway, so a negative count, with or without proper sign
+  // extension, is read as a value of at least 128.  That is beyond
+  // the 16-bit lane width, so the shifting result is the same either
+  // way.
+  __m512i a0 = _mm512_shuffle_epi8(a, ctl_0_1); // even bytes moved into the high half of each pair
+  __m512i b0 = _mm512_and_si512(b, keep_0); // even-index counts, zero-extended to 16 bits
+  __m512i c0;
+  if (left_shift) // left_shift is a template constant
+    c0 = _mm512_sllv_epi16(a0, b0);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c0 = _mm512_srav_epi16(a0, b0); // signed T: arithmetic right shift
+  else
+    c0 = _mm512_srlv_epi16(a0, b0); // unsigned T: logical right shift
+  c0 = _mm512_shuffle_epi8(c0, ctl_1_0); // high byte of each result back into the even slot
+
+  // Perform shifting the same way for input array elements with
+  // idx%2==1.
+  __m512i a1 = _mm512_and_si512(a, keep_1); // odd bytes already sit in the high half
+  __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); // odd-index counts moved down and zero-extended
+  __m512i c1;
+  if (left_shift)
+    c1 = _mm512_sllv_epi16(a1, b1);
+  else if constexpr (std::is_same_v<T, int8_t>)
+    c1 = _mm512_srav_epi16(a1, b1);
+  else
+    c1 = _mm512_srlv_epi16(a1, b1);
+  c1 = _mm512_and_si512(c1, keep_1); // drop bits that spilled into the low byte
+
+  // Merge partial results into the final result.
+  __m512i c = _mm512_or_si512(c0, c1);
+
+  return c;
+}
+
+template <>
+Vectorized<int64_t> inline operator<<(
+    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const __m512i shifted = _mm512_sllv_epi64(a, b); // each lane uses its own count
+  return shifted;
+}
+
+template <>
+Vectorized<int32_t> inline operator<<(
+    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  const __m512i shifted = _mm512_sllv_epi32(a, b); // each lane uses its own count
+  return shifted;
+}
+
+template <>
+Vectorized<int16_t> inline operator<<(
+    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  const __m512i shifted = _mm512_sllv_epi16(a, b); // each lane uses its own count
+  return shifted;
+}
+
+template <>
+Vectorized<int8_t> inline operator<<(
+    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  constexpr bool kShiftLeft = true; // byte lanes go through the emulation helper
+  return shift_512_8<kShiftLeft>(a, b);
+}
+
+template <>
+Vectorized<uint8_t> inline operator<<(
+    const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  constexpr bool kShiftLeft = true; // byte lanes go through the emulation helper
+  return shift_512_8<kShiftLeft>(a, b);
+}
+
+template <>
+Vectorized<int64_t> inline operator>>(
+    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  const __m512i shifted = _mm512_srav_epi64(a, b); // arithmetic, per-lane counts
+  return shifted;
+}
+
+template <>
+Vectorized<int32_t> inline operator>>(
+    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  const __m512i shifted = _mm512_srav_epi32(a, b); // arithmetic, per-lane counts
+  return shifted;
+}
+
+template <>
+Vectorized<int16_t> inline operator>>(
+    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  const __m512i shifted = _mm512_srav_epi16(a, b); // arithmetic, per-lane counts
+  return shifted;
+}
+
+template <>
+Vectorized<int8_t> inline operator>>(
+    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  constexpr bool kShiftLeft = false; // byte lanes go through the emulation helper
+  return shift_512_8<kShiftLeft>(a, b);
+}
+
+template <>
+Vectorized<uint8_t> inline operator>>(
+    const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  constexpr bool kShiftLeft = false; // byte lanes go through the emulation helper
+  return shift_512_8<kShiftLeft>(a, b);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ad0997df7d03d19214f50c9fa81b8d1f03ab02c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_mask.h
@@ -0,0 +1,395 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec_mask.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <typename T, int dst_n, typename mask_t, int mask_n>
+struct VecMaskLoad< // float/int32_t masked load where the mask spans twice as many vectors as the data
+    T,
+    dst_n,
+    mask_t,
+    mask_n,
+    typename std::enable_if_t<
+        (mask_n == dst_n * 2 && dst_n >= 1) &&
+            (std::is_same_v<T, float> || std::is_same_v<T, int32_t>),
+        void>> {
+  static inline VectorizedN<T, dst_n> apply(
+      const T* ptr,
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    at::vec::Vectorized<T> zero_vec(0); // src operand: lanes the mask skips are copied from here
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    VectorizedN<mask_t, 2> tmp_vec;
+    VectorizedN<T, dst_n> result;
+    for (int i = 0; i < dst_n; i++) {
+      tmp_vec[0] = vec_mask[2 * i]; // mask vectors 2i and 2i+1 govern data vector i
+      tmp_vec[1] = vec_mask[2 * i + 1];
+      auto int64_mask = VecMask<mask_t, 2>(tmp_vec).template cast<int64_t, 2>();
+      auto int_mask = int64_mask.template cast<int, 1>()[0]; // collapse the pair into one int vector
+      auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); // bit set only for lanes equal to 0xFFFFFFFF
+      if constexpr (std::is_same_v<T, float>) {
+        result[i] = Vectorized<T>(_mm512_mask_loadu_ps(
+            zero_vec, mmask, ptr + i * Vectorized<T>::size()));
+      } else {
+        result[i] = Vectorized<T>(_mm512_mask_loadu_epi32(
+            zero_vec, mmask, ptr + i * Vectorized<T>::size()));
+      }
+    }
+    return result;
+  }
+};
+
+template <typename T, int dst_n, typename mask_t>
+struct VecMaskLoad< // float/int32_t masked load where mask and data use the same vector count
+    T,
+    dst_n,
+    mask_t,
+    dst_n,
+    typename std::enable_if_t<
+        std::is_same_v<T, float> || std::is_same_v<T, int32_t>,
+        void>> {
+  static inline VectorizedN<T, dst_n> apply(
+      const T* ptr,
+      const VecMask<mask_t, dst_n>& vec_mask) {
+    at::vec::Vectorized<T> zero_vec(0); // src operand: lanes the mask skips are copied from here
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    VectorizedN<T, dst_n> result;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int i = 0; i < dst_n; i++) {
+      auto tmp_mask = VecMask<mask_t, 1>(vec_mask[i]);
+      auto int_mask = tmp_mask.template cast<int, 1>()[0]; // view mask vector i as 32-bit integer lanes
+      auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); // bit set only for lanes equal to 0xFFFFFFFF
+      if constexpr (std::is_same_v<T, float>) {
+        result[i] = Vectorized<T>(_mm512_mask_loadu_ps(
+            zero_vec, mmask, ptr + i * Vectorized<T>::size()));
+      } else {
+        result[i] = Vectorized<T>(_mm512_mask_loadu_epi32(
+            zero_vec, mmask, ptr + i * Vectorized<T>::size()));
+      }
+    }
+    return result;
+  }
+};
+
+template <typename data_t, int dst_n, typename mask_t>
+struct VecMaskLoad< // BFloat16/Half masked load where mask and data use the same vector count
+    data_t,
+    dst_n,
+    mask_t,
+    dst_n,
+    std::enable_if_t<
+        std::is_same_v<data_t, BFloat16> || std::is_same_v<data_t, Half>>> {
+  static inline VectorizedN<data_t, dst_n> apply(
+      const data_t* ptr,
+      const VecMask<mask_t, dst_n>& vec_mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    VectorizedN<data_t, dst_n> result;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int i = 0; i < dst_n; i++) {
+      auto tmp_mask = VecMask<mask_t, 1>(vec_mask[i]);
+      auto int_mask = tmp_mask.template cast<int, 2>(); // one mask vector becomes two int vectors
+      auto mmask0 = _mm512_cmp_epi32_mask(int_mask[0], all_ones, _MM_CMPINT_EQ);
+      auto mmask1 = _mm512_cmp_epi32_mask(int_mask[1], all_ones, _MM_CMPINT_EQ);
+      auto zero = _mm256_set1_epi16(0); // src operand for the 16-bit lanes the mask skips
+      auto temp0 = _mm256_mask_loadu_epi16(
+          zero, mmask0, ptr + (2 * i) * Vectorized<int>::size());
+      auto temp1 = _mm256_mask_loadu_epi16(
+          zero, mmask1, ptr + (2 * i + 1) * Vectorized<int>::size());
+      result[i] = Vectorized<data_t>(
+          _mm512_inserti32x8(_mm512_castsi256_si512(temp0), temp1, 1)); // temp0 -> low 256 bits, temp1 -> high 256 bits
+    }
+    return result;
+  }
+};
+
+template <typename data_t, int dst_n, typename mask_t, int mask_n>
+struct VecMaskLoad< // BFloat16/Half masked load where the mask spans twice as many vectors as the data
+    data_t,
+    dst_n,
+    mask_t,
+    mask_n,
+    typename std::enable_if_t<
+        (mask_n == 2 * dst_n && dst_n >= 1) &&
+        (std::is_same_v<data_t, BFloat16> || std::is_same_v<data_t, Half>)>> {
+  static inline VectorizedN<data_t, dst_n> apply(
+      const data_t* ptr,
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    VectorizedN<data_t, dst_n> result;
+    VectorizedN<mask_t, 2> tmp_vec;
+    for (int i = 0; i < dst_n; i++) {
+      tmp_vec[0] = vec_mask[2 * i]; // mask vectors 2i and 2i+1 govern data vector i
+      tmp_vec[1] = vec_mask[2 * i + 1];
+      auto int_mask = VecMask<mask_t, 2>(tmp_vec).template cast<int, 2>();
+      auto mmask0 = _mm512_cmp_epi32_mask(int_mask[0], all_ones, _MM_CMPINT_EQ);
+      auto mmask1 = _mm512_cmp_epi32_mask(int_mask[1], all_ones, _MM_CMPINT_EQ);
+      auto zero = _mm256_set1_epi16(0); // src operand for the 16-bit lanes the mask skips
+      auto temp0 = _mm256_mask_loadu_epi16(
+          zero, mmask0, ptr + (2 * i) * Vectorized<int>::size());
+      auto temp1 = _mm256_mask_loadu_epi16(
+          zero, mmask1, ptr + (2 * i + 1) * Vectorized<int>::size());
+      result[i] = Vectorized<data_t>(
+          _mm512_inserti32x8(_mm512_castsi256_si512(temp0), temp1, 1)); // temp0 -> low 256 bits, temp1 -> high 256 bits
+    }
+    return result;
+  }
+};
+
+template <typename data_t, typename mask_t>
+struct VecMaskLoad< // int8_t/uint8_t masked load driven by a single mask vector
+    data_t,
+    1,
+    mask_t,
+    1,
+    std::enable_if_t<
+        std::is_same_v<data_t, int8_t> || std::is_same_v<data_t, uint8_t>>> {
+  static inline VectorizedN<data_t, 1> apply(
+      const data_t* ptr,
+      const VecMask<mask_t, 1>& vec_mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    auto int_mask = vec_mask.template cast<int, 1>()[0];
+    auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); // 16 mask bits, one per byte loaded below
+    auto zero = _mm_set1_epi8(0); // src operand for the bytes the mask skips
+    auto temp = _mm_mask_loadu_epi8(zero, mmask, ptr); // touches at most 16 bytes at ptr
+    return Vectorized<data_t>(
+        _mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0)); // loaded bytes in the low 128 bits, zeros above
+  }
+};
+
+template <typename data_t, typename mask_t>
+struct VecMaskLoad< // int64_t/double masked load: one mask vector drives two data vectors
+    data_t,
+    2,
+    mask_t,
+    1,
+    std::enable_if_t<
+        std::is_same_v<data_t, int64_t> || std::is_same_v<data_t, double>>> {
+  static inline VectorizedN<data_t, 2> apply(
+      const data_t* ptr,
+      const VecMask<mask_t, 1>& vec_mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    at::vec::Vectorized<data_t> zero_vec(0); // src operand: lanes the mask skips are copied from here
+    auto int_mask = vec_mask.template cast<int, 1>()[0];
+    auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); // 16 bits: low 8 for result[0], high 8 for result[1]
+    at::vec::VectorizedN<data_t, 2> result;
+    if constexpr (std::is_same_v<data_t, double>) {
+      result[0] = _mm512_mask_loadu_pd(zero_vec, (__mmask8)mmask, ptr);
+      result[1] =
+          _mm512_mask_loadu_pd(zero_vec, (__mmask8)(mmask >> 8), ptr + 8); // second group of 8 elements
+    } else {
+      result[0] = _mm512_mask_loadu_epi64(zero_vec, (__mmask8)mmask, ptr);
+      result[1] =
+          _mm512_mask_loadu_epi64(zero_vec, (__mmask8)(mmask >> 8), ptr + 8); // second group of 8 elements
+    }
+    return result;
+  }
+};
+
+template <int N>
+struct VecMaskCast<float, N, int, N> {
+  static inline VecMask<float, N> apply(const VecMask<int, N>& vec_mask) {
+    VectorizedN<float, N> as_float; // same bits, float-typed registers
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int reg = 0; reg < N; ++reg) {
+      as_float[reg] = _mm512_castsi512_ps(vec_mask[reg]);
+    }
+    return as_float;
+  }
+};
+
+template <int N>
+struct VecMaskCast<int, N, float, N> {
+  static inline VecMask<int, N> apply(const VecMask<float, N>& vec_mask) {
+    VectorizedN<int, N> as_int; // same bits, integer-typed registers
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int reg = 0; reg < N; ++reg) {
+      as_int[reg] = _mm512_castps_si512(vec_mask[reg]);
+    }
+    return as_int;
+  }
+};
+
+template <int N>
+struct VecMaskCast<int64_t, N, double, N> {
+  static inline VecMask<int64_t, N> apply(const VecMask<double, N>& vec_mask) {
+    VectorizedN<int64_t, N> as_int64; // same bits, integer-typed registers
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int reg = 0; reg < N; ++reg) {
+      as_int64[reg] = _mm512_castpd_si512(vec_mask[reg]);
+    }
+    return as_int64;
+  }
+};
+
+template <int N>
+struct VecMaskCast<double, N, int64_t, N> {
+  static inline VecMask<double, N> apply(const VecMask<int64_t, N>& vec_mask) {
+    VectorizedN<double, N> as_double; // same bits, double-typed registers
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int reg = 0; reg < N; ++reg) {
+      as_double[reg] = _mm512_castsi512_pd(vec_mask[reg]);
+    }
+    return as_double;
+  }
+};
+
+// Widens a float/int mask into an int64_t mask that needs twice the registers.
+template <int dst_n, typename mask_t, int mask_n>
+struct VecMaskCast<
+    int64_t, dst_n, mask_t, mask_n,
+    typename std::enable_if_t<
+        (dst_n == 2 * mask_n) &&
+            (std::is_same_v<mask_t, float> || std::is_same_v<mask_t, int>),
+        void>> {
+  static inline VecMask<int64_t, dst_n> apply(
+      const VecMask<mask_t, mask_n>& vec_mask) {
+    // Bring the source to int lanes first via its own cast.
+    auto as_int32 = vec_mask.template cast<int, mask_n>();
+    VectorizedN<int64_t, dst_n> widened;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int src = 0; src < mask_n; ++src) {
+      // Each source register yields two destination registers.
+      auto halves =
+          convert<int64_t, 2, int, 1>(VectorizedN<int, 1>(as_int32[src]));
+      widened[2 * src] = halves[0];
+      widened[2 * src + 1] = halves[1];
+    }
+    return VecMask<int64_t, dst_n>(widened);
+  }
+};
+
+// Narrows an int64_t mask to a float/int mask held in half as many registers.
+template <typename dst_t, int dst_n, int mask_n>
+struct VecMaskCast<
+    dst_t, dst_n, int64_t, mask_n,
+    typename std::enable_if_t<
+        (mask_n == 2 * dst_n) &&
+            (std::is_same_v<dst_t, float> || std::is_same_v<dst_t, int>),
+        void>> {
+  static inline VecMask<dst_t, dst_n> apply(
+      const VecMask<int64_t, mask_n>& vec_mask) {
+    VectorizedN<int64_t, 2> pair;
+    VectorizedN<int, dst_n> narrowed;
+    for (int dst = 0; dst < dst_n; ++dst) {
+      // Source registers 2*dst and 2*dst+1 fold into destination register dst.
+      pair[0] = vec_mask[2 * dst];
+      pair[1] = vec_mask[2 * dst + 1];
+      narrowed[dst] = convert<int, 1, int64_t, 2>(pair);
+    }
+    // Go through VecMask<int> and let its cast produce the requested dst_t.
+    return VecMask<int, dst_n>(narrowed).template cast<dst_t, dst_n>();
+  }
+};
+
+template <> // float -> int64_t first, then int64_t -> double
+struct VecMaskCast<double, 2, float, 1> {
+  static inline VecMask<double, 2> apply(const VecMask<float, 1>& vec_mask) {
+    return VecMaskCast<double, 2, int64_t, 2>::apply(
+        VecMaskCast<int64_t, 2, float, 1>::apply(vec_mask));
+  }
+};
+
+template <> // double -> int64_t first, then int64_t -> float
+struct VecMaskCast<float, 1, double, 2> {
+  static inline VecMask<float, 1> apply(const VecMask<double, 2>& vec_mask) {
+    return VecMaskCast<float, 1, int64_t, 2>::apply(
+        VecMaskCast<int64_t, 2, double, 2>::apply(vec_mask));
+  }
+};
+
+template <>
+inline bool VecMask<int, 1>::all_zero() const {
+  // test(x, x) flags every 32-bit lane that has at least one bit set.
+  return _mm512_test_epi32_mask(mask_[0], mask_[0]) == 0;
+}
+
+template <>
+inline bool VecMask<int, 1>::is_masked(int i) const {
+  return (_mm512_movepi32_mask(mask_[0]) & (1 << i)) != 0; // top bit of lane i
+}
+
+template <>
+inline bool VecMask<int, 1>::all_masked() const {
+  // movepi32_mask gathers the top bit of each of the 16 lanes.
+  return _mm512_movepi32_mask(mask_[0]) == 0xffff;
+}
+
+template <int N>
+struct VecMaskCheck<int64_t, N> {
+  // True when no bit is set in any of the N registers.
+  static inline bool all_zero(const VectorizedN<int64_t, N>& vec_mask) {
+    for (int reg = 0; reg < N; ++reg) {
+      // test(x, x) reports every 64-bit lane that holds a non-zero value.
+      if (_mm512_test_epi64_mask(vec_mask[reg], vec_mask[reg]) != 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Reports whether element i (counted across all N registers) is set.
+  static inline bool is_masked(const VectorizedN<int64_t, N>& vec_mask, int i) {
+    for (int reg = 0; reg < N; ++reg) {
+      const int next_reg_start = (reg + 1) * 8; // 8 int64_t lanes per register
+      if (i < next_reg_start) {
+        return (_mm512_movepi64_mask(vec_mask[reg]) & (1 << (i - reg * 8))) != 0;
+      }
+    }
+    return false; // i lies past the last register
+  }
+
+  // True when the top bit of all 8 lanes is set in every register.
+  static inline bool all_masked(const VectorizedN<int64_t, N>& vec_mask) {
+    for (int reg = 0; reg < N; ++reg) {
+      if (_mm512_movepi64_mask(vec_mask[reg]) != 0xff) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+#define VEC_MASK_METHOD_WITH_CAST_TO_INT(                   \
+    T, N, return_type, method, args_def, args)              \
+  template <>                                               \
+  inline return_type VecMask<T, N>::method args_def const { \
+    return cast<int, 1>().method args;                      \
+  }
+// Each use below specialises one VecMask<T, N> query by casting to VecMask<int, 1>.
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i))
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i))
+VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ())
+VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ())
+// Undefine the helper under the exact name it was defined with so it cannot leak.
+#undef VEC_MASK_METHOD_WITH_CAST_TO_INT
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h
new file mode 100644
index 0000000000000000000000000000000000000000..270b96bac433b52d68329bf0a452381d0c8170a3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h
@@ -0,0 +1,1552 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+#include <c10/util/irange.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+#include <cmath>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over
+// Vectorized<T>::float_num_vecs iterations.
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+#ifdef _MSC_VER
+__declspec(align(64)) struct Vectorizedqi { // MSVC spelling of the 64-byte alignment
+ protected:
+  __m512i vals;
+#else
+struct Vectorizedqi { // holder of one raw 512-bit integer register
+ protected:
+  __m512i vals __attribute__((aligned(64)));
+#endif
+
+ public:
+  Vectorizedqi() { // default state: all bits zero
+    vals = _mm512_setzero_si512();
+  }
+  Vectorizedqi(__m512i v) : vals(v) {} // non-explicit, so a raw __m512i converts implicitly
+  operator __m512i() const { // hands the raw register back, e.g. to intrinsics
+    return vals;
+  }
+};
+
+template <typename T> // primary template: declaration only, see the explicit specialisations that follow
+__m512i pack_saturate_and_clamp(
+    __m512i first, // first register to pack
+    __m512i second, // second register to pack
+    T min_val, // lower clamp bound
+    T max_val); // upper clamp bound
+
+template <> // int32_t stub: exists so the symbol resolves, always fails if reached
+inline __m512i pack_saturate_and_clamp<int32_t>(
+    __m512i first [[maybe_unused]],
+    __m512i second [[maybe_unused]],
+    int32_t min_val [[maybe_unused]],
+    int32_t max_val [[maybe_unused]]) {
+  // This function is for linkage only, will not be used
+  TORCH_CHECK(false, "pack_saturate_and_clamp<int32_t> is not supported");
+  return __m512i{}; // placed after the always-failing check; gives the function a return value
+}
+
+// int8_t flavour: narrow 16-bit lanes with signed saturation, then clamp
+// every resulting byte into [min_val, max_val] using signed comparisons.
+template <>
+inline __m512i pack_saturate_and_clamp<int8_t>(
+    __m512i first, __m512i second, int8_t min_val, int8_t max_val) {
+  const __m512i narrowed = _mm512_packs_epi16(first, second);
+  const __m512i upper_bound = _mm512_set1_epi8(max_val);
+  const __m512i lower_bound = _mm512_set1_epi8(min_val);
+  const __m512i capped = _mm512_min_epi8(narrowed, upper_bound);
+  return _mm512_max_epi8(lower_bound, capped);
+}
+
+// uint8_t flavour: narrow 16-bit lanes with unsigned saturation, then clamp
+// every resulting byte into [min_val, max_val] using unsigned comparisons.
+template <>
+inline __m512i pack_saturate_and_clamp<uint8_t>(
+    __m512i first, __m512i second, uint8_t min_val, uint8_t max_val) {
+  const __m512i narrowed = _mm512_packus_epi16(first, second);
+  const __m512i upper_bound = _mm512_set1_epi8(max_val);
+  const __m512i lower_bound = _mm512_set1_epi8(min_val);
+  const __m512i capped = _mm512_min_epu8(narrowed, upper_bound);
+  return _mm512_max_epu8(lower_bound, capped);
+}
+
+// Converts only as many elements as at::vec::Vectorized<float> holds: the
+// first 16 bytes of src (its low 128 bits) become 16 floats, and the rest of
+// the input register is ignored.
+template <typename T>
+typename std::enable_if_t<
+    std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+    at::vec::Vectorized<float>> inline
+convert_int8_to_float(at::vec::Vectorized<T> src) {
+  const __m128i low_bytes = _mm512_castsi512_si128(src);
+  if constexpr (std::is_same_v<T, uint8_t>) {
+    // 16 x uint8 -> 16 x int32 by zero extension, then -> 16 x float32.
+    return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(low_bytes));
+  } else {
+    // 16 x int8 -> 16 x int32 by sign extension, then -> 16 x float32.
+    return _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(low_bytes));
+  }
+}
+
+template <typename T> // primary template: declaration only, specialised for int8_t and uint8_t below
+at::vec::Vectorized<T> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src); // float lanes to convert
+
+template <>
+at::vec::Vectorized<int8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
+  // Convert from float32 to int32 with truncation
+  __m512i x_values_int32 = _mm512_cvttps_epi32(src);
+
+  // Convert from int32 to int16 using signed saturation
+  __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
+
+  constexpr auto min_val = std::numeric_limits<int8_t>::min();
+  constexpr auto max_val = std::numeric_limits<int8_t>::max();
+
+  // Convert from int16 to int8 using signed saturation, then clamp to [min_val, max_val]
+  __m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
+      xy_packed_v, xy_packed_v, min_val, max_val);
+  __m512i permute_mask_v = _mm512_set_epi32( // arguments run from dword 15 down to dword 0
+      0x0f,
+      0x0b,
+      0x07,
+      0x03,
+      0x0e,
+      0x0a,
+      0x06,
+      0x02,
+      0x0d,
+      0x09,
+      0x05,
+      0x01,
+      0x0c,
+      0x08,
+      0x04,
+      0x00);
+  return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); // output dwords 0-3 take input dwords 0, 4, 8, 12
+}
+
+template <>
+at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
+    at::vec::Vectorized<float> src) {
+  // Bounds are the int32_t limits on purpose, to keep the clamp correct.
+  using int32_limits = std::numeric_limits<int32_t>;
+  const __m512 lower = _mm512_set1_ps(static_cast<float>(int32_limits::min()));
+  const __m512 upper = _mm512_set1_ps(static_cast<float>(int32_limits::max()));
+  // Clamp in float, truncate to int32, then keep the low byte of each lane.
+  const __m512 clamped = _mm512_min_ps(_mm512_max_ps(src, lower), upper);
+  const __m512i as_int32 = _mm512_cvttps_epi32(clamped);
+  const __m128i low_bytes = _mm512_cvtepi32_epi8(as_int32);
+  // Only the low 128 bits carry results; the cast leaves the rest unspecified.
+  return _mm512_castsi128_si512(low_bytes);
+}
+
+// Quantizes `len` floats from `src` into `dst`. Conceptually each output is
+//   clamp(round(src[i] * inverse_scale) + zero_point, min(T), max(T)).
+//
+// The work is split into three loops: 64 elements per iteration, then 16
+// elements per iteration, then a scalar tail for the remainder.
+//
+// NOTE: both vector loops read `src` with _mm512_load_ps, which is the
+// aligned load, so `src` is expected to be 64-byte aligned. `dst` is written
+// with unaligned stores.
+template <typename T>
+__FORCE_INLINE void QuantizeAvx512(
+    const float* src,
+    T* dst,
+    int len,
+    float inverse_scale,
+    int64_t zero_point) {
+  // Number of floats in one 512-bit vector.
+  constexpr int VLEN = 16;
+  constexpr auto min_val = std::numeric_limits<T>::min();
+  constexpr auto max_val = std::numeric_limits<T>::max();
+  // Broadcast clamp bounds, used by the 16-element loop.
+  const __m512i min_v = _mm512_set1_epi32(min_val);
+  const __m512i max_v = _mm512_set1_epi32(max_val);
+  // This is the largest int32 value < int32_max exactly representable in float
+  constexpr int32_t int32_float_max_val =
+      std::numeric_limits<int32_t>::max() - 127;
+  int i = 0;
+  __m512 inverse_scale_v = _mm512_set1_ps(inverse_scale);
+  // Byte shuffle used by the 16-element loop: within every 128-bit lane, move
+  // bytes 0, 4, 8 and 12 (the low byte of each int32) into the low 4 bytes
+  // and zero the rest (0xff has the high bit set, which selects zero).
+  // clang-format off
+  static const __m512i shuffle_mask_v = _mm512_set_epi8(
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00);
+  // clang-format on
+  // Dword permutation for the 64-element loop (arguments run from the highest
+  // dword to the lowest): output dword k takes input dword 4*(k%4) + k/4,
+  // which regroups the lane-interleaved pack result.
+  __m512i permute_mask_v = _mm512_set_epi32(
+      0x0f,
+      0x0b,
+      0x07,
+      0x03,
+      0x0e,
+      0x0a,
+      0x06,
+      0x02,
+      0x0d,
+      0x09,
+      0x05,
+      0x01,
+      0x0c,
+      0x08,
+      0x04,
+      0x00);
+  // Dword permutation for the 16-element loop: gathers dword 0 of each
+  // 128-bit lane into the low 128 bits; the upper dwords are not stored.
+  __m512i permute_mask_l8_v = _mm512_set_epi32(
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x00,
+      0x0c,
+      0x08,
+      0x04,
+      0x00);
+  // Largest multiple of 64 that does not exceed len.
+  int len_aligned = len / (VLEN * 4) * (VLEN * 4);
+  for (; i < len_aligned; i += 4 * VLEN) {
+    // x
+    __m512 x_vals = _mm512_load_ps(src + i);
+    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
+    // If the floating point value is greater than int32_max,
+    // _mm512_cvtps_epi32 converts them to -ve.
+    // Clip at int32_float_max_val to avoid this.
+    x_transformed_v =
+        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // y
+    __m512 y_vals = _mm512_load_ps(src + i + VLEN);
+    __m512 y_transformed_v = _mm512_mul_ps(y_vals, inverse_scale_v);
+    y_transformed_v =
+        _mm512_min_ps(y_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // z
+    __m512 z_vals = _mm512_load_ps(src + i + 2 * VLEN);
+    __m512 z_transformed_v = _mm512_mul_ps(z_vals, inverse_scale_v);
+    z_transformed_v =
+        _mm512_min_ps(z_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // w
+    __m512 w_vals = _mm512_load_ps(src + i + 3 * VLEN);
+    __m512 w_transformed_v = _mm512_mul_ps(w_vals, inverse_scale_v);
+    w_transformed_v =
+        _mm512_min_ps(w_transformed_v, _mm512_set1_ps(int32_float_max_val));
+
+    // float -> int32 using the current rounding mode
+    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
+    __m512i y_rounded_v = _mm512_cvtps_epi32(y_transformed_v);
+    __m512i z_rounded_v = _mm512_cvtps_epi32(z_transformed_v);
+    __m512i w_rounded_v = _mm512_cvtps_epi32(w_transformed_v);
+
+    // add zero point
+    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
+    y_rounded_v = _mm512_add_epi32(y_rounded_v, _mm512_set1_epi32(zero_point));
+    z_rounded_v = _mm512_add_epi32(z_rounded_v, _mm512_set1_epi32(zero_point));
+    w_rounded_v = _mm512_add_epi32(w_rounded_v, _mm512_set1_epi32(zero_point));
+
+    // int32 -> int16 with signed saturation, then int16 -> T with clamping
+    // (pack_saturate_and_clamp is defined elsewhere in this file).
+    __m512i xy_packed_v = _mm512_packs_epi32(x_rounded_v, y_rounded_v);
+    __m512i zw_packed_v = _mm512_packs_epi32(z_rounded_v, w_rounded_v);
+    __m512i xyzw_clamped_v =
+        pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);
+
+    // The packs operate per 128-bit lane, so restore element order before
+    // storing all 64 results.
+    xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v);
+  }
+
+  // Additional single-vector version (one 512-bit vector, i.e. VLEN = 16
+  // floats per iteration) to take advantage when len is smaller
+  // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM)
+  for (; i < len / VLEN * VLEN; i += VLEN) {
+    __m512 x_vals = _mm512_load_ps(src + i);
+    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
+    x_transformed_v =
+        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
+    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
+    // Clamp to [min_val, max_val] while still in 32-bit lanes.
+    __m512i x_clipped_v =
+        _mm512_max_epi32(min_v, _mm512_min_epi32(max_v, x_rounded_v));
+
+    // Keep the low byte of every int32, then gather the four 4-byte groups
+    // into the low 128 bits and store those 16 bytes.
+    x_clipped_v = _mm512_shuffle_epi8(x_clipped_v, shuffle_mask_v);
+    x_clipped_v = _mm512_permutexvar_epi32(permute_mask_l8_v, x_clipped_v);
+    _mm_storeu_si128(
+        reinterpret_cast<__m128i*>(dst + i),
+        _mm512_castsi512_si128(x_clipped_v));
+  }
+
+  // Scalar tail for the remaining (fewer than VLEN) elements.
+  for (; i < len; ++i) {
+    float transformed = src[i] * inverse_scale;
+
+    // Not exactly the same behavior as the vectorized code.
+    // The vectorized code above always rounds to even in halfway cases
+    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+    // does the same only when the current rounding mode is FE_TONEAREST.
+    // However, in practice, this should not be a problem because most cases
+    // use the default rounding mode FE_TONEAREST.
+    // Note that we cannot implement the same behavior as the vectorized code
+    // using std::round because it does rounding away from zero in halfway
+    // cases.
+    transformed = zero_point + std::nearbyint(transformed);
+    float clipped =
+        std::min(std::max(transformed, float(min_val)), float(max_val));
+    dst[i] = clipped;
+  }
+}
+
+template <>
+struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};
+
+// AVX512 vector of 16 quantized 32-bit integers (c10::qint32), stored in the
+// `vals` register inherited from Vectorizedqi.
+template <>
+struct Vectorized<c10::qint32> : public Vectorizedqi {
+  using size_type = int;
+  // Number of qint32 elements held by one vector.
+  static constexpr size_type size() {
+    return 16;
+  }
+
+  // Number of Vectorized<float> produced by dequantize().
+  static constexpr int float_num_vecs() {
+    return 1;
+  }
+
+  // Number of Vectorized<c10::qint32> produced by widening_subtract().
+  static constexpr int int_num_vecs() {
+    return 1;
+  }
+
+  using float_vec_return_type = std::array<Vectorized<float>, 1>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
+  using value_type = c10::qint32::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+  // Leaves `vals` uninitialized.
+  Vectorized() {}
+
+  Vectorized(__m512i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::qint32& val) {
+    value_type uw = val.val_;
+    vals = _mm512_set1_epi32(uw);
+  }
+
+  // Writes the first `count` elements to `ptr` (all 16 by default). A partial
+  // store goes through memcpy so that no bytes past `count` are touched.
+  void store(void* ptr, int count = size()) const {
+    if (count != size()) {
+      std::memcpy(ptr, &vals, count * sizeof(value_type));
+    } else {
+      _mm512_storeu_si512((__m512i*)ptr, vals);
+    }
+  }
+
+  // Loads 16 elements from `ptr` (no alignment requirement).
+  static Vectorized<c10::qint32> loadu(const void* ptr) {
+    return Vectorized<c10::qint32>(ptr);
+  }
+
+  // Loads the first `count` elements from `ptr`; the remaining lanes are
+  // zero. `count` is not range-checked here and must not exceed size().
+  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+  // Dequantizes as fmadd(scale, float(vals), scale_zp_premul). `zero_point`
+  // is not read by this overload; the caller supplies its contribution
+  // through `scale_zp_premul`.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    __m512 float_vals = _mm512_cvtepi32_ps(vals);
+    return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
+  }
+
+  // Dequantizes as (float(vals) - zero_point) * scale.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    __m512 float_vals = _mm512_cvtepi32_ps(vals);
+    return {(Vectorized<float>(float_vals) - zero_point) * scale};
+  }
+
+  // Quantizes the 16 floats in rhs[0] by delegating to
+  // at::native::quantize_vec; `inverse_scale` is unused here.
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale [[maybe_unused]]) {
+    Vectorized<c10::qint32> retval;
+    auto rhs_data = (__m512)rhs[0];
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16);
+    return retval;
+  }
+
+  // Element-wise signed maximum.
+  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
+    return _mm512_max_epi32(vals, b.vals);
+  }
+
+  // Element-wise signed minimum.
+  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
+    return _mm512_min_epi32(vals, b.vals);
+  }
+
+  // ReLU in the quantized domain: max(vals, zero_point).
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // ReLU6 in the quantized domain: min(max(vals, zero_point), q_six).
+  // Declared const, like relu(), so it can be called on const vectors.
+  Vectorized<c10::qint32> relu6(
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) const {
+    return _mm512_min_epi32(
+        _mm512_max_epi32(vals, zero_point.vals), q_six.vals);
+  }
+
+  // Element-wise vals - b. The elements are already 32 bits wide, so no
+  // widening takes place for this type.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    return {_mm512_sub_epi32(vals, b)};
+  }
+
+  // Computes round(float(inp[0]) * multiplier) + zero_point per element.
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m512 multiplier_v = _mm512_set1_ps(multiplier);
+    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+
+    __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v);
+    __m512i rounded = _mm512_cvtps_epi32(scaled);
+    return _mm512_add_epi32(rounded, zero_point_v);
+  }
+
+ private:
+  // Load from memory constructor
+  Vectorized(const void* ptr) {
+    vals = _mm512_loadu_si512((const __m512i*)ptr);
+  }
+};
+
+// Free-function form of the element-wise maximum for qint32 vectors;
+// forwards to the member function.
+template <>
+Vectorized<c10::qint32> inline maximum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+// Element-wise product of two qint32 vectors. _mm512_mullo_epi32 keeps the
+// low 32 bits of each 64-bit product.
+template <>
+Vectorized<c10::qint32> inline operator*(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return _mm512_mullo_epi32(a, b);
+}
+
+// Element-wise 32-bit integer sum of two qint32 vectors.
+template <>
+Vectorized<c10::qint32> inline operator+(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return _mm512_add_epi32(a, b);
+}
+
+/*
+ * Convert values from int32 back to int8/uint8.
+ *
+ * Each of the four input vectors is converted to float, multiplied by
+ * `multiplier`, rounded back to int32 and offset by `zp`. The four results
+ * are then narrowed to T and returned as a single vector of 64 elements.
+ */
+template <typename T>
+__m512i RequantizeAvx512(
+    const std::array<Vectorized<c10::qint32>, 4>& inp,
+    __m512 multiplier,
+    __m512i zp) {
+  static_assert(
+      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
+      "Only int8_t/uint8_t are supported");
+
+  // int32 -> float, scale, round to int32, add the zero point.
+  const auto rescale = [&](const Vectorized<c10::qint32>& q) {
+    const __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(q), multiplier);
+    return _mm512_add_epi32(_mm512_cvtps_epi32(scaled), zp);
+  };
+
+  const __m512i x = rescale(inp[0]);
+  const __m512i y = rescale(inp[1]);
+  const __m512i z = rescale(inp[2]);
+  const __m512i w = rescale(inp[3]);
+
+  /* Narrow to int16_t with signed saturation */
+  const __m512i xy = _mm512_packs_epi32(x, y);
+  const __m512i zw = _mm512_packs_epi32(z, w);
+
+  /* Narrow to T and clamp to T's range */
+  const __m512i packed = pack_saturate_and_clamp<T>(
+      xy, zw, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+
+  /*
+   * The pack instructions work per 128-bit lane, so `packed` is laid out as
+   * x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11
+   * x12-15 y12-15 z12-15 w12-15. Regroup the dwords so that all of x comes
+   * first, followed by y, z and w.
+   */
+  const __m512i regroup = _mm512_set_epi32(
+      0x0f,
+      0x0b,
+      0x07,
+      0x03,
+      0x0e,
+      0x0a,
+      0x06,
+      0x02,
+      0x0d,
+      0x09,
+      0x05,
+      0x01,
+      0x0c,
+      0x08,
+      0x04,
+      0x00);
+  return _mm512_permutexvar_epi32(regroup, packed);
+}
+
+template <>
+struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};
+
+// AVX512 vector of 64 quantized signed 8-bit integers (c10::qint8), stored in
+// the `vals` register inherited from Vectorizedqi.
+template <>
+struct Vectorized<c10::qint8> : public Vectorizedqi {
+  // Number of qint8 elements held by one vector.
+  static constexpr int size() {
+    return 64;
+  }
+
+  // Number of Vectorized<float> produced by dequantize().
+  static constexpr int float_num_vecs() {
+    return 4;
+  }
+
+  // Number of Vectorized<c10::qint32> produced by widening_subtract().
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = c10::qint8::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+
+  // Leaves `vals` uninitialized.
+  Vectorized() {}
+  Vectorized(__m512i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::qint8& val) {
+    value_type uw = val.val_;
+    vals = _mm512_set1_epi8(uw);
+  }
+
+  // This is needed because the compiler emits awful code for the default
+  // constructor for moving the enum
+  Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) {}
+
+  // This is added to avoid error: definition of implicit copy assignment
+  // operator for 'Vectorized<c10::qint8>' is deprecated because it has a
+  // user-declared copy constructor [-Werror,-Wdeprecated-copy]
+  Vectorized& operator=(const Vectorized<c10::qint8>&) = default;
+
+  // Writes the first `count` elements to `ptr` (all 64 by default). A partial
+  // store goes through memcpy so that no bytes past `count` are touched.
+  void store(void* ptr, int count = size()) const {
+    if (count != size()) {
+      std::memcpy(ptr, &vals, count * sizeof(value_type));
+    } else {
+      _mm512_storeu_si512((__m512i*)ptr, vals);
+    }
+  }
+
+  // Loads 64 elements from `ptr` (no alignment requirement).
+  static Vectorized<c10::qint8> loadu(const void* ptr) {
+    return Vectorized<c10::qint8>(ptr);
+  }
+
+  // Loads the first `count` elements from `ptr`; the remaining lanes are
+  // zero. `count` is not range-checked here and must not exceed size().
+  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+ private:
+  // Sign-extends the Quarter-th group of 16 int8 elements of `v` (elements
+  // 16*Quarter .. 16*Quarter+15) to 16 int32 lanes. Quarter is a template
+  // parameter because the extract intrinsic requires an immediate. Using the
+  // intrinsic avoids compiler-specific access to the register's 64-bit
+  // halves.
+  template <int Quarter>
+  static __m512i widen_quarter(__m512i v) {
+    static_assert(Quarter >= 0 && Quarter < 4, "Quarter must be in [0, 3]");
+    return _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(v, Quarter));
+  }
+
+  // Same as widen_quarter, followed by an int32 -> float conversion.
+  template <int Quarter>
+  static Vectorized<float> quarter_as_float(__m512i v) {
+    return Vectorized<float>(_mm512_cvtepi32_ps(widen_quarter<Quarter>(v)));
+  }
+
+ public:
+  // Dequantizes each group of 16 elements as
+  // fmadd(scale, float(vals), scale_neg_zp_premul). `zero_point` is not read
+  // by this overload; the caller supplies its contribution through
+  // `scale_neg_zp_premul`.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_neg_zp_premul) const {
+    auto val0 = vec::fmadd(scale, quarter_as_float<0>(vals), scale_neg_zp_premul);
+    auto val1 = vec::fmadd(scale, quarter_as_float<1>(vals), scale_neg_zp_premul);
+    auto val2 = vec::fmadd(scale, quarter_as_float<2>(vals), scale_neg_zp_premul);
+    auto val3 = vec::fmadd(scale, quarter_as_float<3>(vals), scale_neg_zp_premul);
+    return {val0, val1, val2, val3};
+  }
+
+  // Dequantizes each group of 16 elements as
+  // (float(vals) - zero_point) * scale.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    auto val0 = (quarter_as_float<0>(vals) - zero_point) * scale;
+    auto val1 = (quarter_as_float<1>(vals) - zero_point) * scale;
+    auto val2 = (quarter_as_float<2>(vals) - zero_point) * scale;
+    auto val3 = (quarter_as_float<3>(vals) - zero_point) * scale;
+    return {val0, val1, val2, val3};
+  }
+
+  // Quantizes the 64 floats held in `rhs` via QuantizeAvx512. `scale` is
+  // unused; only `inverse_scale` and `zero_point` are forwarded.
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    // View the array of float vectors as a flat run of floats, keeping const.
+    const auto* rhs_data = reinterpret_cast<const float*>(rhs.data());
+    value_type quantized_values[size()];
+    QuantizeAvx512<value_type>(
+        rhs_data, quantized_values, size(), inverse_scale, zero_point);
+    return Vectorized<c10::qint8>::loadu(quantized_values);
+  }
+
+  // Element-wise signed maximum.
+  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
+    return _mm512_max_epi8(vals, b.vals);
+  }
+
+  // Element-wise signed minimum.
+  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
+    return _mm512_min_epi8(vals, b.vals);
+  }
+
+  // ReLU in the quantized domain: max(vals, zero_point).
+  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // ReLU6 in the quantized domain: min(max(vals, zero_point), q_six).
+  // Declared const, like relu(), so it can be called on const vectors.
+  Vectorized<c10::qint8> relu6(
+      Vectorized<c10::qint8> zero_point,
+      Vectorized<c10::qint8> q_six) const {
+    return _mm512_min_epi8(_mm512_max_epi8(vals, zero_point.vals), q_six.vals);
+  }
+
+  // Sign-extends both operands to int32 and returns vals - b as four vectors
+  // of 16 int32 elements each.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    __m512i res_0 =
+        _mm512_sub_epi32(widen_quarter<0>(vals), widen_quarter<0>(b.vals));
+    __m512i res_1 =
+        _mm512_sub_epi32(widen_quarter<1>(vals), widen_quarter<1>(b.vals));
+    __m512i res_2 =
+        _mm512_sub_epi32(widen_quarter<2>(vals), widen_quarter<2>(b.vals));
+    __m512i res_3 =
+        _mm512_sub_epi32(widen_quarter<3>(vals), widen_quarter<3>(b.vals));
+
+    return {
+        Vectorized<c10::qint32>(res_0),
+        Vectorized<c10::qint32>(res_1),
+        Vectorized<c10::qint32>(res_2),
+        Vectorized<c10::qint32>(res_3)};
+  }
+
+  // Scales four int32 vectors by `multiplier`, adds `zero_point` and narrows
+  // the result to 64 int8 elements (see RequantizeAvx512).
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m512 multiplier_v = _mm512_set1_ps(multiplier);
+    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+    return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
+  }
+
+ private:
+  // Load from memory constructor
+  Vectorized(const void* ptr) {
+    vals = _mm512_loadu_si512((const __m512i*)ptr);
+  }
+};
+
+// Free-function form of the element-wise maximum for qint8 vectors;
+// forwards to the member function.
+template <>
+Vectorized<c10::qint8> inline maximum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+template <>
+struct is_vec_specialized_for<c10::quint8> : std::bool_constant<true> {};
+
+// AVX512 vector of 64 quantized unsigned 8-bit integers (c10::quint8), stored
+// in the `vals` register inherited from Vectorizedqi.
+template <>
+struct Vectorized<c10::quint8> : public Vectorizedqi {
+  // Number of quint8 elements held by one vector.
+  static constexpr int size() {
+    return 64;
+  }
+
+  // Number of Vectorized<float> produced by dequantize().
+  static constexpr int float_num_vecs() {
+    return 4;
+  }
+
+  // Number of Vectorized<c10::qint32> produced by widening_subtract().
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = c10::quint8::underlying;
+
+ public:
+  using Vectorizedqi::Vectorizedqi;
+  // Leaves `vals` uninitialized.
+  Vectorized() {}
+
+  Vectorized(__m512i vals_) {
+    vals = vals_;
+  }
+
+  // Broadcast constructor
+  Vectorized(const c10::quint8& val) {
+    value_type uw = val.val_;
+    vals = _mm512_set1_epi8(uw);
+  }
+
+  // User-provided copy constructor that copies the underlying register.
+  Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) {}
+
+  // This is added to avoid error: definition of implicit copy assignment
+  // operator for 'Vectorized<c10::quint8>' is deprecated because it has a
+  // user-declared copy constructor [-Werror,-Wdeprecated-copy]
+  Vectorized& operator=(const Vectorized<c10::quint8>&) = default;
+
+  // Writes the first `count` elements to `ptr` (all 64 by default). A partial
+  // store goes through memcpy so that no bytes past `count` are touched.
+  void store(void* ptr, int count = size()) const {
+    if (count != size()) {
+      std::memcpy(ptr, &vals, count * sizeof(value_type));
+    } else {
+      _mm512_storeu_si512((__m512i*)ptr, vals);
+    }
+  }
+
+  // Loads 64 elements from `ptr` (no alignment requirement).
+  static Vectorized<c10::quint8> loadu(const void* ptr) {
+    return Vectorized<c10::quint8>(ptr);
+  }
+
+  // Loads the first `count` elements from `ptr`; the remaining lanes are
+  // zero. `count` is not range-checked here and must not exceed size().
+  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const value_type*>(ptr),
+        count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+ private:
+  // Zero-extends the Quarter-th group of 16 uint8 elements of `v` (elements
+  // 16*Quarter .. 16*Quarter+15) to 16 int32 lanes. Quarter is a template
+  // parameter because the extract intrinsic requires an immediate. Using the
+  // intrinsic avoids compiler-specific access to the register's 64-bit
+  // halves.
+  template <int Quarter>
+  static __m512i widen_quarter(__m512i v) {
+    static_assert(Quarter >= 0 && Quarter < 4, "Quarter must be in [0, 3]");
+    return _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(v, Quarter));
+  }
+
+  // Same as widen_quarter, followed by an int32 -> float conversion.
+  template <int Quarter>
+  static Vectorized<float> quarter_as_float(__m512i v) {
+    return Vectorized<float>(_mm512_cvtepi32_ps(widen_quarter<Quarter>(v)));
+  }
+
+ public:
+  // Dequantizes each group of 16 elements as
+  // fmadd(scale, float(vals), scale_zp_premul). `zero_point` is not read by
+  // this overload; the caller supplies its contribution through
+  // `scale_zp_premul`.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    auto val0 = vec::fmadd(scale, quarter_as_float<0>(vals), scale_zp_premul);
+    auto val1 = vec::fmadd(scale, quarter_as_float<1>(vals), scale_zp_premul);
+    auto val2 = vec::fmadd(scale, quarter_as_float<2>(vals), scale_zp_premul);
+    auto val3 = vec::fmadd(scale, quarter_as_float<3>(vals), scale_zp_premul);
+
+    return {val0, val1, val2, val3};
+  }
+
+  // Dequantizes each group of 16 elements as
+  // (float(vals) - zero_point) * scale.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    auto val0 = (quarter_as_float<0>(vals) - zero_point) * scale;
+    auto val1 = (quarter_as_float<1>(vals) - zero_point) * scale;
+    auto val2 = (quarter_as_float<2>(vals) - zero_point) * scale;
+    auto val3 = (quarter_as_float<3>(vals) - zero_point) * scale;
+
+    return {val0, val1, val2, val3};
+  }
+
+  // Quantizes the 64 floats held in `rhs` via QuantizeAvx512. `scale` is
+  // unused; only `inverse_scale` and `zero_point` are forwarded.
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    // View the array of float vectors as a flat run of floats, keeping const.
+    const auto* rhs_data = reinterpret_cast<const float*>(rhs.data());
+    value_type quantized_values[size()];
+    QuantizeAvx512<value_type>(
+        rhs_data, quantized_values, size(), inverse_scale, zero_point);
+    return Vectorized<c10::quint8>::loadu(quantized_values);
+  }
+
+  // Element-wise unsigned maximum.
+  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
+    return _mm512_max_epu8(vals, b.vals);
+  }
+
+  // Element-wise unsigned minimum.
+  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
+    return _mm512_min_epu8(vals, b.vals);
+  }
+
+  // ReLU in the quantized domain: max(vals, zero_point).
+  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // ReLU6 in the quantized domain: min(max(vals, zero_point), q_six).
+  // Declared const, like relu(), so it can be called on const vectors.
+  Vectorized<c10::quint8> relu6(
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) const {
+    return _mm512_min_epu8(_mm512_max_epu8(vals, zero_point.vals), q_six.vals);
+  }
+
+  // Zero-extends both operands to int32 and returns vals - b as four vectors
+  // of 16 int32 elements each.
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    __m512i res_0 =
+        _mm512_sub_epi32(widen_quarter<0>(vals), widen_quarter<0>(b.vals));
+    __m512i res_1 =
+        _mm512_sub_epi32(widen_quarter<1>(vals), widen_quarter<1>(b.vals));
+    __m512i res_2 =
+        _mm512_sub_epi32(widen_quarter<2>(vals), widen_quarter<2>(b.vals));
+    __m512i res_3 =
+        _mm512_sub_epi32(widen_quarter<3>(vals), widen_quarter<3>(b.vals));
+    return {
+        Vectorized<c10::qint32>(res_0),
+        Vectorized<c10::qint32>(res_1),
+        Vectorized<c10::qint32>(res_2),
+        Vectorized<c10::qint32>(res_3)};
+  }
+
+  // Scales four int32 vectors by `multiplier`, adds `zero_point` and narrows
+  // the result to 64 uint8 elements (see RequantizeAvx512).
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    __m512 multiplier_v = _mm512_set1_ps(multiplier);
+    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+    return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
+  }
+
+ private:
+  // Load from memory constructor
+  Vectorized(const void* ptr) {
+    vals = _mm512_loadu_si512((const __m512i*)ptr);
+  }
+};
+
+// Free-function form of the element-wise maximum for quint8 vectors;
+// forwards to the member function.
+template <>
+Vectorized<c10::quint8> inline maximum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#else
+
+// NOTE: These are low-performance implementations that we fall back on.
+
+// Scalar fallback base for quantized vectors: stores `size_` underlying
+// values in a plain array and converts them element by element.
+//
+// Template parameters:
+//   T                       quantized element type (exposes `underlying` and
+//                           `val_`)
+//   float_vec_return_type_  array type returned by dequantize()
+//   int_vec_return_type_    array type used for int32 intermediate results
+//   size_                   number of quantized elements per vector
+template <
+    typename T,
+    typename float_vec_return_type_,
+    typename int_vec_return_type_,
+    int size_>
+struct VectorizedQuantizedConverter {
+  // Lanes in one float / qint32 vector of this (512-bit) family. dequantize()
+  // builds each Vectorized<float> from exactly this many values.
+  static constexpr int kLanesPerVec = 16;
+
+  static constexpr int size() {
+    return size_;
+  }
+
+  // Number of float vectors needed to hold size() dequantized values. This
+  // must divide by the 16 lanes per vector: dividing by 8 made dequantize()
+  // iterate past the end of both `vals` and the returned array.
+  static constexpr int float_num_vecs() {
+    return size() / kLanesPerVec;
+  }
+
+  // Number of int32 vectors needed to hold size() widened values.
+  static constexpr int int_num_vecs() {
+    return size() / kLanesPerVec;
+  }
+
+  using float_vec_return_type = float_vec_return_type_;
+  using int_vec_return_type = int_vec_return_type_;
+
+  using value_type = typename T::underlying;
+  std::array<value_type, size_> vals;
+
+  // Broadcast constructor: every element is set to val.val_.
+  VectorizedQuantizedConverter(T val) {
+    for (const auto i : c10::irange(size())) {
+      vals[i] = val.val_;
+    }
+  }
+
+  // Load constructor: copies size() elements from `ptr`.
+  VectorizedQuantizedConverter(const void* ptr) {
+    memcpy(vals.data(), ptr, sizeof(value_type) * size());
+  }
+
+  // Writes the first `count` elements to `ptr` (all of them by default).
+  void store(void* ptr, int count = size()) const {
+    memcpy(ptr, vals.data(), count * sizeof(value_type));
+  }
+
+  // Dequantizes all elements with at::native::dequantize_val, using lane j of
+  // `scale` and `zero_point` for the j-th element of every output vector.
+  // `scale_zp_premul` is accepted for interface parity and ignored.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul [[maybe_unused]]) const {
+    float_vec_return_type rv;
+    for (const auto i : c10::irange(float_num_vecs())) {
+      float tmp_vals[kLanesPerVec];
+      for (const auto j : c10::irange(kLanesPerVec)) {
+        tmp_vals[j] = at::native::dequantize_val<T>(
+            scale[j], zero_point[j], T(vals[kLanesPerVec * i + j]));
+      }
+      rv[i] = Vectorized<float>(
+          tmp_vals[0],
+          tmp_vals[1],
+          tmp_vals[2],
+          tmp_vals[3],
+          tmp_vals[4],
+          tmp_vals[5],
+          tmp_vals[6],
+          tmp_vals[7],
+          tmp_vals[8],
+          tmp_vals[9],
+          tmp_vals[10],
+          tmp_vals[11],
+          tmp_vals[12],
+          tmp_vals[13],
+          tmp_vals[14],
+          tmp_vals[15]);
+    }
+    return rv;
+  }
+
+  // Two-argument form; forwards to the overload above with a placeholder for
+  // the ignored third argument.
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    Vectorized<float> scale_zp_premul;
+    return dequantize(scale, zero_point, scale_zp_premul);
+  }
+
+ protected:
+  // Leaves `vals` uninitialized; available to derived classes only.
+  VectorizedQuantizedConverter() {}
+};
+
+// Fallback (scalar loop) specialization for c10::qint32: 16 int32 elements
+// that dequantize into a single Vectorized<float>. Every operation below is a
+// plain per-element loop over `vals`.
+template <>
+struct is_vec_specialized_for<c10::qint32> : std::bool_constant<true> {};
+
+template <>
+struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
+                                     c10::qint32,
+                                     std::array<Vectorized<float>, 1>,
+                                     std::array<Vectorized<c10::qint32>, 1>,
+                                     16> {
+  // The base specialization is named through its injected-class-name, which
+  // denotes exactly the base listed above.
+  Vectorized() : VectorizedQuantizedConverter() {}
+  Vectorized(c10::qint32 val) : VectorizedQuantizedConverter(val) {}
+  Vectorized(const void* ptr) : VectorizedQuantizedConverter(ptr) {}
+
+  // Loads size() elements starting at ptr.
+  static Vectorized<c10::qint32> loadu(const void* ptr) {
+    return Vectorized<c10::qint32>(ptr);
+  }
+
+  // Loads the first `count` elements from ptr; the remaining ones are zero.
+  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type padded[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (int i = 0; i < size(); ++i) {
+      padded[i] = 0;
+    }
+    const auto* src = reinterpret_cast<const value_type*>(ptr);
+    std::memcpy(padded, src, count * sizeof(value_type));
+    return loadu(padded);
+  }
+
+  // Quantizes the float vectors in rhs with at::native::quantize_vec at
+  // 32-bit precision. inverse_scale is accepted but not used.
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale [[maybe_unused]]) {
+    constexpr int num_floats = 16 * float_num_vecs();
+    std::array<value_type, size()> qvals;
+    std::array<float, num_floats> float_vals;
+
+    // Spill each input vector into one contiguous float buffer.
+    for (const auto v : c10::irange(float_num_vecs())) {
+      rhs[v].store(&float_vals[v * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        reinterpret_cast<c10::qint32*>(qvals.data()),
+        num_floats);
+
+    return Vectorized<c10::qint32>::loadu(qvals.data());
+  }
+
+  // Element-wise maximum of *this and b.
+  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
+    Vectorized<c10::qint32> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise minimum of *this and b.
+  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
+    Vectorized<c10::qint32> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise maximum with zero_point.
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // Element-wise maximum with zero_point followed by minimum with q_six.
+  Vectorized<c10::qint32> relu6(
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) {
+    Vectorized<c10::qint32> out;
+    for (int i = 0; i < size(); ++i) {
+      const value_type lo = std::max<value_type>(vals[i], zero_point.vals[i]);
+      out.vals[i] = std::min<value_type>(lo, q_six.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise *this - b, stored in element 0 of the returned array.
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    int_vec_return_type diff;
+    for (int i = 0; i < size(); ++i) {
+      diff[0].vals[i] = vals[i] - b.vals[i];
+    }
+    return diff;
+  }
+
+  // Scales each element of inp[0] by multiplier, rounds with std::nearbyint
+  // and adds zero_point.
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized<c10::qint32> out;
+    for (int i = 0; i < size(); ++i) {
+      const float scaled = static_cast<float>(inp[0].vals[i]) * multiplier;
+      out.vals[i] = std::nearbyint(scaled) + zero_point;
+    }
+    return out;
+  }
+};
+
+// Free-function form that delegates to Vectorized<c10::qint32>::maximum.
+template <>
+inline Vectorized<c10::qint32> maximum(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+// Element-wise product of the stored int32 values of a and b.
+template <>
+inline Vectorized<c10::qint32> operator*(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> product;
+  for (int i = 0; i < Vectorized<c10::qint32>::size(); ++i) {
+    product.vals[i] = a.vals[i] * b.vals[i];
+  }
+  return product;
+}
+
+// Element-wise sum of the stored int32 values of a and b.
+template <>
+inline Vectorized<c10::qint32> operator+(
+    const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> sum;
+  for (int i = 0; i < Vectorized<c10::qint32>::size(); ++i) {
+    sum.vals[i] = a.vals[i] + b.vals[i];
+  }
+  return sum;
+}
+
+// Fallback (scalar loop) specialization for c10::qint8: 64 int8 elements
+// whose float / int32 results are returned as arrays of four vectors.
+template <>
+struct is_vec_specialized_for<c10::qint8> : std::bool_constant<true> {};
+
+template <>
+struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
+                                    c10::qint8,
+                                    std::array<Vectorized<float>, 4>,
+                                    std::array<Vectorized<c10::qint32>, 4>,
+                                    64> {
+  // The base specialization is named through its injected-class-name, which
+  // denotes exactly the base listed above.
+  Vectorized() : VectorizedQuantizedConverter() {}
+  Vectorized(c10::qint8 val) : VectorizedQuantizedConverter(val) {}
+  Vectorized(const void* ptr) : VectorizedQuantizedConverter(ptr) {}
+
+  // Loads size() elements starting at ptr.
+  static Vectorized<c10::qint8> loadu(const void* ptr) {
+    return Vectorized<c10::qint8>(ptr);
+  }
+
+  // Loads the first `count` elements from ptr; the remaining ones are zero.
+  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type padded[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (int i = 0; i < size(); ++i) {
+      padded[i] = 0;
+    }
+    const auto* src = reinterpret_cast<const value_type*>(ptr);
+    std::memcpy(padded, src, count * sizeof(value_type));
+    return loadu(padded);
+  }
+
+  // Quantizes the float vectors in rhs with at::native::quantize_vec.
+  // inverse_scale is accepted but not used.
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale [[maybe_unused]]) {
+    constexpr int num_floats = 16 * float_num_vecs();
+    std::array<value_type, size()> qvals;
+    std::array<float, num_floats> float_vals;
+
+    for (const auto v : c10::irange(float_num_vecs())) {
+      rhs[v].store(&float_vals[v * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::qint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        reinterpret_cast<c10::qint8*>(qvals.data()),
+        num_floats);
+
+    return Vectorized<c10::qint8>::loadu(qvals.data());
+  }
+
+  // Element-wise maximum of *this and b.
+  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
+    Vectorized<c10::qint8> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise minimum of *this and b.
+  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
+    Vectorized<c10::qint8> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise maximum with zero_point.
+  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // Element-wise maximum with zero_point followed by minimum with q_six.
+  Vectorized<c10::qint8> relu6(
+      Vectorized<c10::qint8> zero_point,
+      Vectorized<c10::qint8> q_six) {
+    Vectorized<c10::qint8> out;
+    for (int i = 0; i < size(); ++i) {
+      const value_type lo = std::max<value_type>(vals[i], zero_point.vals[i]);
+      out.vals[i] = std::min<value_type>(lo, q_six.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise *this - b computed in int32; result vector k holds the
+  // differences of source elements [k * per_vec, (k + 1) * per_vec).
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    constexpr int per_vec = size() / int_num_vecs();
+    int_vec_return_type diff;
+    for (int k = 0; k < int_num_vecs(); ++k) {
+      for (int j = 0; j < per_vec; ++j) {
+        const int src = k * per_vec + j;
+        diff[k].vals[j] =
+            static_cast<int32_t>(vals[src]) - static_cast<int32_t>(b.vals[src]);
+      }
+    }
+    return diff;
+  }
+  // Scales each int32 element by multiplier, rounds with std::nearbyint, adds
+  // zero_point and clamps the result to the range of value_type.
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int per_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::qint8> out;
+    for (int k = 0; k < int_num_vecs(); ++k) {
+      for (int j = 0; j < per_vec; ++j) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[k].vals[j]) * multiplier) +
+            zero_point;
+        const int32_t floored = std::max<int32_t>(rounded, min_val);
+        out.vals[k * per_vec + j] = std::min<int32_t>(floored, max_val);
+      }
+    }
+    return out;
+  }
+};
+
+// Free-function form that delegates to Vectorized<c10::qint8>::maximum.
+template <>
+inline Vectorized<c10::qint8> maximum(
+    const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+// Fallback (scalar loop) specialization for c10::quint8: 64 uint8 elements
+// whose float / int32 results are returned as arrays of four vectors.
+template <>
+struct is_vec_specialized_for<c10::quint8> : std::bool_constant<true> {};
+
+template <>
+struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
+                                     c10::quint8,
+                                     std::array<Vectorized<float>, 4>,
+                                     std::array<Vectorized<c10::qint32>, 4>,
+                                     64> {
+  // The base specialization is named through its injected-class-name, which
+  // denotes exactly the base listed above.
+  Vectorized() : VectorizedQuantizedConverter() {}
+  Vectorized(c10::quint8 val) : VectorizedQuantizedConverter(val) {}
+  Vectorized(const void* ptr) : VectorizedQuantizedConverter(ptr) {}
+
+  // Loads size() elements starting at ptr.
+  static Vectorized<c10::quint8> loadu(const void* ptr) {
+    return Vectorized<c10::quint8>(ptr);
+  }
+
+  // Loads the first `count` elements from ptr; the remaining ones are zero.
+  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type padded[size()];
+    // Ensure uninitialized memory does not change the output value. See
+    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
+    // not initialize arrays to zero using "={0}" because gcc would compile it
+    // to two instructions while a loop would be compiled to one instruction.
+    for (int i = 0; i < size(); ++i) {
+      padded[i] = 0;
+    }
+    const auto* src = reinterpret_cast<const value_type*>(ptr);
+    std::memcpy(padded, src, count * sizeof(value_type));
+    return loadu(padded);
+  }
+
+  // Quantizes the float vectors in rhs with at::native::quantize_vec.
+  // inverse_scale is accepted but not used.
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale [[maybe_unused]]) {
+    constexpr int num_floats = 16 * float_num_vecs();
+    std::array<value_type, size()> qvals;
+    std::array<float, num_floats> float_vals;
+
+    for (const auto v : c10::irange(float_num_vecs())) {
+      rhs[v].store(&float_vals[v * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::quint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        reinterpret_cast<c10::quint8*>(qvals.data()),
+        num_floats);
+
+    return Vectorized<c10::quint8>::loadu(qvals.data());
+  }
+
+  // Element-wise maximum of *this and b.
+  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
+    Vectorized<c10::quint8> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise minimum of *this and b.
+  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
+    Vectorized<c10::quint8> out;
+    for (int i = 0; i < size(); ++i) {
+      out.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise maximum with zero_point.
+  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
+    return maximum(zero_point);
+  }
+
+  // Element-wise maximum with zero_point followed by minimum with q_six.
+  Vectorized<c10::quint8> relu6(
+      Vectorized<c10::quint8> zero_point,
+      Vectorized<c10::quint8> q_six) {
+    Vectorized<c10::quint8> out;
+    for (int i = 0; i < size(); ++i) {
+      const value_type lo = std::max<value_type>(vals[i], zero_point.vals[i]);
+      out.vals[i] = std::min<value_type>(lo, q_six.vals[i]);
+    }
+    return out;
+  }
+
+  // Element-wise *this - b computed in int32; result vector k holds the
+  // differences of source elements [k * per_vec, (k + 1) * per_vec).
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    constexpr int per_vec = size() / int_num_vecs();
+    int_vec_return_type diff;
+    for (int k = 0; k < int_num_vecs(); ++k) {
+      for (int j = 0; j < per_vec; ++j) {
+        const int src = k * per_vec + j;
+        diff[k].vals[j] =
+            static_cast<int32_t>(vals[src]) - static_cast<int32_t>(b.vals[src]);
+      }
+    }
+    return diff;
+  }
+  // Scales each int32 element by multiplier, rounds with std::nearbyint, adds
+  // zero_point and clamps the result to the range of value_type.
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int per_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized<c10::quint8> out;
+    for (int k = 0; k < int_num_vecs(); ++k) {
+      for (int j = 0; j < per_vec; ++j) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[k].vals[j]) * multiplier) +
+            zero_point;
+        const int32_t floored = std::max<int32_t>(rounded, min_val);
+        out.vals[k * per_vec + j] = std::min<int32_t>(floored, max_val);
+      }
+    }
+    return out;
+  }
+};
+
+// Free-function form that delegates to Vectorized<c10::quint8>::maximum.
+template <>
+inline Vectorized<c10::quint8> maximum(
+    const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC)
+
+} // namespace CPU_CAPABILITY
+} // namespace vec
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..05f78c9cfff2acaf1c35bd66684a429f83f7c6ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h
@@ -0,0 +1,76 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/detail/CUDAHooksInterface.h>
+
+#include <ATen/Generator.h>
+
+// TODO: No need to have this whole header, we can just put it all in
+// the cpp file
+
+namespace at::cuda::detail {
+
+// Set the callback to initialize Magma, which is set by
+// torch_cuda_cu. This indirection is required so magma_init is called
+// in the same library where Magma will be used.
+TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
+
+
+// The real implementation of CUDAHooksInterface. Only the constructor, isBuilt() and isAvailable() are defined inline here.
+struct CUDAHooks : public at::CUDAHooksInterface {
+  CUDAHooks(at::CUDAHooksArgs /*unused*/) {} // the argument is ignored
+  void init() const override;
+  Device getDeviceFromPtr(void* data) const override;
+  bool isPinnedPtr(const void* data) const override;
+  const Generator& getDefaultGenerator(
+      DeviceIndex device_index = -1) const override;
+  Generator getNewGenerator(
+      DeviceIndex device_index = -1) const override;
+  bool hasCUDA() const override;
+  bool hasMAGMA() const override;
+  bool hasCuDNN() const override;
+  bool hasCuSOLVER() const override;
+  bool hasCuBLASLt() const override;
+  bool hasROCM() const override;
+  bool hasCKSDPA() const override;
+  bool hasCKGEMM() const override;
+  const at::cuda::NVRTC& nvrtc() const override;
+  DeviceIndex current_device() const override;
+  bool isBuilt() const override {return true;} // unconditionally true for this implementation
+  bool isAvailable() const override {return hasCUDA();} // forwards to hasCUDA()
+  bool hasPrimaryContext(DeviceIndex device_index) const override;
+  Allocator* getCUDADeviceAllocator() const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+  bool compiledWithCuDNN() const override;
+  bool compiledWithMIOpen() const override;
+  bool supportsDilatedConvolutionWithCuDNN() const override;
+  bool supportsDepthwiseConvolutionWithCuDNN() const override;
+  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
+  bool supportsBFloat16RNNWithCuDNN() const override;
+  bool hasCUDART() const override;
+  long versionCUDART() const override;
+  long versionCuDNN() const override;
+  long versionRuntimeCuDNN() const override;
+  long versionCuDNNFrontend() const override;
+  long versionMIOpen() const override;
+  std::string showConfig() const override;
+  double batchnormMinEpsilonCuDNN() const override;
+  int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override;
+  void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override;
+  int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override;
+  void cuFFTClearPlanCache(DeviceIndex device_index) const override;
+  int getNumGPUs() const override;
+  DeviceIndex deviceCount() const override;
+  DeviceIndex getCurrentDevice() const override;
+  // isGPUArch is declared only when USE_ROCM is defined.
+#ifdef USE_ROCM
+  bool isGPUArch(const std::vector<std::string>& archs, DeviceIndex device_index = -1) const override;
+#endif
+  void deviceSynchronize(DeviceIndex device_index) const override;
+};
+
+} // at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c5e22a6f2642c79fbbbd37495cd2195fe262738
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h
@@ -0,0 +1,156 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states.
+// These handles are tied to a device, and these libraries require/recommend not
+// sharing handles across host threads.
+//
+// These libraries recommend using one handle per host thread. We may not want to do
+// this because threads are relatively light-weight, but creating and destroying
+// handles is expensive (destroying the handle causes synchronizations). DataParallel,
+// for example, creates new threads for each forward pass.
+//
+// This file implements a handle pool mechanism. The handle pool returns handles on
+// demand as threads request them. If all existing handles in the pool are in use,
+// it creates a new one. As threads terminate, they release handles back into the pool.
+// In this way, the handle pool never creates more handles than the high-water mark of
+// active threads, so it's efficient with DataParallel.
+
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+#include <utility>
+#include <mutex>
+#include <memory>
+
+#include <c10/util/Exception.h>
+
+namespace at::cuda { namespace {
+
+template <typename Handle_t, void Create(Handle_t *), void Destroy(Handle_t)>
+struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThreadHandlePool<Handle_t, Create, Destroy>> {
+    // Pool of Handle_t objects keyed by an int device id; Create and Destroy are the functions used to make and free one handle.
+    struct Handle {
+    Handle_t handle; // nullptr when this object owns nothing
+    Handle(bool create = false) : handle(nullptr)
+    {
+        if(create) Create(&handle);
+    }
+    // std::vector.emplace() and push_back() may route through temporaries and call
+    // copy/move constructors along the way.  If this is the case, we don't want
+    // the destructors of temporaries to call Destroy on the handle.
+    // We can achieve safety (for the narrow case of stashing within std::vectors)
+    // by making Handle moveable but not copyable, and transferring handle ownership
+    // to the latest constructed object.  This is not a substitute for full-blown
+    // reference counting, but reference counting may be overkill here.
+    // Another alternative is to wrap the saved Handles in unique_ptrs, i.e.,
+    // unordered_map<int, vector<unique_ptr<Handle>>> created_handles;
+    Handle(const Handle& rhs) = delete;
+    // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom
+    Handle(Handle&& rhs) noexcept : Handle() { std::swap(handle, rhs.handle); }
+    // operator= takes its argument by value, so the old handle is destroyed with rhs
+    Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; }
+    ~Handle() {
+        if(handle) Destroy(handle);
+    }
+    };
+
+    std::mutex mutex; // locked by PoolWindow::reserve() and release() around the two maps below
+
+    // Handles are lazily created as different threads request them,
+    // but are never destroyed until the end of the process.
+    // The maximum number of handles this process will create for each device is equal
+    // to the high-water mark of the number of concurrently active threads that request
+    // handles for that device.
+    // When threads terminate, they release their handles back into the pool for reuse.
+    // Otherwise, new handles would be created every time new threads were spawned,
+    // resulting in poor performance for Python modules that repeatedly or frequently
+    // spawned new sets of threads (like DataParallel, which creates a new set of threads
+    // for each forward pass).
+    //
+    // To prevent potential deadlocks, we explicitly choose not to cap the number
+    // of handles that are created per device.
+    // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device,
+    // only 4 can make forward progress at any time. Those 4 will not release their
+    // handles until they exit, so the fifth cannot make progress until then.  This is
+    // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an
+    // intermediate point (i.e., before any of them have exited).  We have no way to anticipate
+    // or enforce that user threads will not attempt such intermediate synchronization.
+    // The only way to ensure safety is to avoid imposing a cap on the number of handles.
+    std::unordered_map<int, std::vector<Handle>> created_handles;
+    std::unordered_map<int, std::vector<Handle_t>> available_handles;
+
+    // PoolWindow lazily creates and caches the handles that a particular thread is using,
+    // so in the common case handle access doesn't incur either handle creation or a mutex lock.
+    class PoolWindow
+    {
+    public:
+    PoolWindow(std::shared_ptr<DeviceThreadHandlePool> parent): weak_parent(std::move(parent)) {}
+    ~PoolWindow(){ release(); }
+
+    Handle_t reserve(int device)
+    {
+        // If this window already holds a handle for this device, return it (no lock taken)
+        if(my_handles.find(device) != my_handles.end())
+        return my_handles[device];
+
+        // otherwise, either grab a handle from the pool if one is available,
+        // or if not, create a new one.
+        auto parent = weak_parent.lock();
+        TORCH_CHECK(parent, "Cannot create handle during program termination");
+        std::lock_guard<std::mutex> guard(parent->mutex);
+
+        if(parent->available_handles[device].size() > 0)
+        {
+        my_handles[device] = parent->available_handles[device].back();
+        parent->available_handles[device].pop_back();
+        }
+        else
+        {
+        // In local testing, I do observe that emplace_back sometimes routes through temporaries
+        // that incur move-constructor and destructor calls.  See comments in Handle above.
+        parent->created_handles[device].emplace_back(true /*create*/);
+        my_handles[device] = parent->created_handles[device].back().handle;
+        }
+
+        return my_handles[device];
+    }
+
+    private:
+    // Stores the per-device handles currently held by this window
+    std::unordered_map<int, Handle_t> my_handles;
+
+    std::weak_ptr<DeviceThreadHandlePool> weak_parent;
+
+    // Called by the destructor.  Returns this window's handles to the pool's available list.
+    void release() {
+        if(!my_handles.empty()) {
+            auto parent = weak_parent.lock();
+            if (!parent) {
+                // If this thread exits after atexit handlers have completed, the
+                // cuda context itself may be invalid, so we must leak the handles.
+                return;
+            }
+
+            std::lock_guard<std::mutex> guard(parent->mutex);
+            for(auto d_h : my_handles)
+                parent->available_handles[d_h.first].push_back(d_h.second);
+        }
+    }
+    };
+
+    // Warning:
+    // If you want to change this function, be aware that this function will be called
+    // by multiple threads and there is no mutex guarding the call of this function, so
+    // make sure your implementation is thread-safe.
+    PoolWindow *newPoolWindow() {
+        // The returned pointer will be owned by a thread local variable
+        // so that different threads do not share the same PoolWindow.
+        return new PoolWindow(this->shared_from_this());
+    }
+};
+
+}}  // namespace at::cuda::<anonymous>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6a09141b9b31c739fdbb50834397f9a92f1ca7f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/TensorBase.h>
+#include <ATen/cuda/detail/TensorInfo.cuh>
+#include <ATen/native/CanUse32BitIndexMath.h>
+
+namespace at::cuda::detail {
+
+TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t);
+using at::native::canUse32BitIndexMath;
+
+// Packs the sizes, strides and data pointer of t into a TensorInfo.
+template <typename scalar, typename IndexType>
+TensorInfo<scalar, IndexType> getTensorInfo(const at::TensorBase& t) {
+  IndexType sizes[MAX_TENSORINFO_DIMS];
+  IndexType strides[MAX_TENSORINFO_DIMS];
+
+  // NOTE(review): nothing visible here bounds ndim by MAX_TENSORINFO_DIMS
+  // before the arrays are filled -- presumably enforced elsewhere; confirm.
+  const int ndim = t.dim();
+  for (int d = 0; d < ndim; ++d) {
+    sizes[d] = t.size(d);
+    strides[d] = t.stride(d);
+  }
+
+  // Const element types use the read-only accessor, others the mutable one.
+  scalar* data = nullptr;
+  if constexpr (std::is_const_v<scalar>) {
+    data = t.const_data_ptr<scalar>();
+  } else {
+    data = t.mutable_data_ptr<scalar>();
+  }
+  return TensorInfo<scalar, IndexType>(data, ndim, sizes, strides);
+}
+
+} // namespace at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..432117f154c419067542cf2e3f5f51059b2068ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh
@@ -0,0 +1,129 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <assert.h>
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+#include <cuda_runtime.h>
+#endif
+
+namespace at::cuda::detail {
+
+// A utility class to implement integer division by multiplication, given a fixed
+// divisor.
+//
+// WARNING: The fast divider algorithm is only implemented for unsigned int;
+//          otherwise we default to plain integer division.  For unsigned int,
+//          we further assume that the dividend is at most INT32_MAX.  Thus,
+//          IntDivider must NOT be used for general integer division.
+//
+//          This reduced range is enough for our purpose, and it allows us to
+//          slightly simplify the computation.
+//
+// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
+//
+// For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
+// <= m < 2^(N+1)) and shift s such that:
+//
+//    \floor(n / d) = \floor((m * n) / 2^(N+s)).
+//
+// Given such m and s, the integer division can be then implemented as:
+//
+//    let m' = m - 2^N  // 0 <= m' < 2^N
+//
+//    fast_integer_division(n):
+//      // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
+//      // integer.  Then take the higher N bits.
+//      t = (m' * n) >> N
+//
+//      // Here we use the fact that n is less than 2^(N-1): otherwise the value
+//      // of (t + n) may not fit in an N-bit integer.
+//      return (t + n) >> s
+//
+// Finding such a magic number is surprisingly easy:
+//
+//    s  = \ceil(\log_2 d)
+//    m' = \floor(2^N * (2^s - d) / d) + 1  // Need 2N-bit integer arithmetic.
+//
+// See also:
+//    - Division by Invariant Integers Using Multiplication,
+//      Torbjörn Granlund and Peter L. Montgomery, 1994.
+//
+//    - http://www.hackersdelight.org/magic.htm
+//
+//    - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+
+// Quotient and remainder of one division, returned together.
+template <typename Value>
+struct DivMod {
+  Value div;  // quotient
+  Value mod;  // remainder
+  C10_HOST_DEVICE DivMod(Value quotient, Value remainder) : div(quotient), mod(remainder) { }
+};
+
+// Generic fallback: plain `/` and `%` by a stored divisor.  Only the
+// unsigned int specialization implements the multiply-and-shift fast path.
+template <typename Value>
+struct IntDivider {
+  Value divisor;
+
+  IntDivider() = default;
+  IntDivider(Value d) : divisor(d) { }
+
+  C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
+  C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
+  C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
+    return DivMod<Value>(div(n), mod(n));
+  }
+};
+
+// Implement fast integer division.
+template <>
+struct IntDivider<unsigned int> {
+  static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
+
+  IntDivider() = default;
+  // Precomputes `shift` and the magic number `m1` for divisor `d`.
+  IntDivider(unsigned int d) : divisor(d) {
+    assert(divisor >= 1 && divisor <= INT32_MAX);
+    // shift = smallest s in [0, 32) with (1U << s) >= divisor.
+    // TODO: gcc/clang has __builtin_clz() but it's not portable.
+    for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
+    // magic = floor(2^32 * (2^shift - divisor) / divisor) + 1, in 64-bit math.
+    uint64_t one = 1;
+    uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+    m1 = magic;
+    assert(m1 > 0 && m1 == magic);  // m1 must fit in 32 bits.
+  }
+  // Quotient computed as (((n * m1) >> 32) + n) >> shift.
+  C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+    // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
+    // 'm1'.
+    unsigned int t = __umulhi(n, m1);
+    return (t + n) >> shift;
+#else
+    // Using uint64_t so that the addition does not overflow.
+    uint64_t t = ((uint64_t) n * m1) >> 32;
+    return (t + n) >> shift;
+#endif
+  }
+  // Remainder derived from the quotient: n - div(n) * divisor.
+  C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
+    return n - div(n) * divisor;
+  }
+  // Quotient and remainder from a single call to div().
+  C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
+    unsigned int q = div(n);
+    return DivMod<unsigned int>(q, n - q * divisor);
+  }
+
+  unsigned int divisor;  // d above.
+  unsigned int m1;  // Magic number: m' above.
+  unsigned int shift;  // Shift amounts.
+};
+
+}  // namespace at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..99562629fe531d9468fd8ec51bd98b2a492d4c35
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h
@@ -0,0 +1,42 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <limits>
+#include <c10/util/Exception.h>
+
+namespace at::cuda::detail {
+
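+// CUDA: grid stride looping.  `i` (of type index_type) is the loop variable
+// seen by the loop body; `n` is the exclusive upper bound.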
+// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment.
+// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final
+// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be
+// greater than INT_MAX.  But in that case _i_n_d_e_x >= n, so there are no
+// further iterations and the overflowed value in i=_i_n_d_e_x is not used.
+#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)                         \
+  int64_t _i_n_d_e_x = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;           \
+  for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
+
+#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
+
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+constexpr int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks needed to cover N items at max_threads_per_block each.
+inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
+  TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
+
+  // Rounded-up division written as (N - 1) / d + 1: the numerator never
+  // exceeds N, so the computation itself cannot overflow.
+  constexpr int64_t int_limit = std::numeric_limits<int>::max();
+  const int64_t blocks_needed = (N - 1) / max_threads_per_block + 1;
+  TORCH_INTERNAL_ASSERT(blocks_needed <= int_limit, "Can't schedule too many blocks on CUDA device");
+  return static_cast<int>(blocks_needed);
+}
+
+}  // namespace at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h
new file mode 100644
index 0000000000000000000000000000000000000000..bab1495dda3989f4a491d3545ee23f8eec4c3773
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h
@@ -0,0 +1,16 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/detail/CUDAHooksInterface.h>
+namespace at::cuda {
+// Forward-declares at::cuda::NVRTC
+struct NVRTC;
+
+namespace detail {
+extern NVRTC lazyNVRTC;
+} // namespace detail
+
+}  // namespace at::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5bd215318125ff9f0d9846b2adc2e3c9cb1c2e48
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh
@@ -0,0 +1,141 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <type_traits>
+#include <c10/macros/Macros.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/cuda/detail/IntegerDivider.cuh>
+
+// OffsetCalculator maps a linear index to per-operand offsets for operands
+// that share the same shape but may have different strides.  It iterates the
+// tensor in column-major order.  If element_sizes is nullptr, the strides are
+// in bytes; otherwise they are in # of elements.
+
+#if defined(USE_ROCM)
+constexpr int MAX_DIMS = 16;
+#else
+constexpr int MAX_DIMS = 25;
+#endif
+
+template <int NARGS, typename index_t = uint32_t, bool signed_strides = false>
+struct OffsetCalculator {
+  // We allow having negative strides to implement some operations like torch.flip
+  using stride_t = std::conditional_t<signed_strides,
+                                      std::make_signed_t<index_t>,
+                                      index_t>;
+  // The offset for each argument. Wrapper around fixed-size array.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = std::array<stride_t, std::max<int>(NARGS, 1)>;
+
+  // if element_sizes is nullptr, then the strides will be in bytes, otherwise
+  // the strides will be in # of elements.
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) {
+    TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
+    for (int i=0; i < dims; i++){
+      sizes_[i] = at::cuda::detail::IntDivider<index_t>(sizes[i]);
+      for (int arg = 0; arg < NARGS; arg++) {
+        int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
+        strides_[i][arg] = strides[arg][i] / element_size;
+      }
+    }
+  }
+  // Returns the per-operand offsets for linear_idx (same unit as strides_).
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type offsets;
+    // ROCm builds only: dedicated path taken when dims is 1 or 2.
+#if defined(USE_ROCM)
+    if ((dims > 0) && (dims <= 2)) {
+      auto divmod = sizes_[0].divmod(linear_idx);
+#pragma unroll
+      for (int arg = 0; arg < NARGS; arg++)
+        offsets[arg] = divmod.mod * strides_[0][arg];
+      if (dims >= 2) {
+        divmod = sizes_[1].divmod(divmod.div);
+#pragma unroll
+        for (int arg = 0; arg < NARGS; arg++)
+          offsets[arg] += divmod.mod * strides_[1][arg];
+      }
+      // [...]
+      return offsets;
+    }
+#endif
+    // Generic path: start every operand's offset at zero.
+    #pragma unroll
+    for (int arg = 0; arg < NARGS; arg++) {
+      offsets[arg] = 0;
+    }
+    // Peel one coordinate per dimension off linear_idx, starting at dim 0.
+    #pragma unroll
+    for (int dim = 0; dim < MAX_DIMS; ++dim) {
+      if (dim == dims) {
+        break;
+      }
+      auto divmod = sizes_[dim].divmod(linear_idx);
+      linear_idx = divmod.div;
+      // Accumulate coordinate * stride for every operand.
+      #pragma unroll
+      for (int arg = 0; arg < NARGS; arg++) {
+        offsets[arg] += divmod.mod * strides_[dim][arg];
+      }
+
+    }
+    return offsets;
+  }
+  // The constructor only writes the first `dims` entries of sizes_/strides_.
+  int dims;
+  at::cuda::detail::IntDivider<index_t> sizes_[MAX_DIMS];
+  stride_t strides_[MAX_DIMS][std::max<int>(NARGS, 1)];
+};
+
+template <int NARGS, typename index_t = uint32_t>
+struct TrivialOffsetCalculator {
+  // One offset per argument, held in a fixed-size array; offsets are counted
+  // in # of elements, not in bytes.
+  // A zero sized array is not allowed on CUDA, so nullary operators still get
+  // one slot.  That slot is only a placeholder and is not used.
+  using offset_type = std::array<index_t, std::max<int>(NARGS, 1)>;
+
+  // Every argument's offset is linear_idx itself.
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type result;
+    #pragma unroll
+    for (int i = 0; i < NARGS; ++i) {
+      result[i] = linear_idx;
+    }
+    return result;
+  }
+};
+
+// Make an OffsetCalculator with byte offsets over the first N operands of iter
+template<int N, bool signed_strides = false>
+static OffsetCalculator<N, uint32_t, signed_strides> make_offset_calculator(const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  std::array<const int64_t*, N> stride_ptrs;
+  for (int operand = 0; operand < N; ++operand) {
+    stride_ptrs[operand] = iter.strides(operand).data();
+  }
+  return OffsetCalculator<N, uint32_t, signed_strides>(iter.ndim(), iter.shape().data(), stride_ptrs.data());
+}
+
+// Make an OffsetCalculator with element offsets over the first N operands
+template<int N, bool signed_strides = false>
+static OffsetCalculator<N, uint32_t, signed_strides> make_element_offset_calculator(
+    const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  std::array<const int64_t*, N> stride_ptrs;
+  std::array<int64_t, N> elem_bytes;
+  for (int operand = 0; operand < N; ++operand) {
+    stride_ptrs[operand] = iter.strides(operand).data();
+    elem_bytes[operand] = iter.element_size(operand);
+  }
+  return OffsetCalculator<N, uint32_t, signed_strides>(
+      iter.ndim(), iter.shape().data(), stride_ptrs.data(), elem_bytes.data());
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e50519eb6a4fc842293e766f162ed26c7a028bd5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
@@ -0,0 +1,48 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
+// Eager mode clients should not include this file directly, instead,
+// they should #include <ATen/cuda/PhiloxCudaState.h>, which has a #pragma once.
+
+// Stores RNG state values. Passed as a kernel argument.
+// See Note [CUDA Graph-safe RNG states].
+//
+// The raw definition lives in its own file so jit codegen can easily copy it.
+namespace at {
+
+struct PhiloxCudaState {
+  PhiloxCudaState() = default;
+  // Called if graph capture is not underway
+  PhiloxCudaState(uint64_t seed,
+                  uint64_t offset) {
+    seed_.val = seed;
+    offset_.val = offset;
+  }
+  // Called if graph capture is underway
+  PhiloxCudaState(int64_t* seed,
+                  int64_t* offset_extragraph,
+                  uint64_t offset_intragraph) {
+    seed_.ptr = seed;
+    offset_.ptr = offset_extragraph;
+    offset_intragraph_ = offset_intragraph;
+    captured_ = true;
+  }
+
+  // Public members, directly accessible by at::cuda::philox::unpack.
+  // If we made them private with getters/setters, the getters/setters
+  // would have to be __device__, and we can't declare __device__ in ATen.
+  union Payload {
+    uint64_t val;
+    int64_t* ptr;
+  };
+  // The constructors write .val when captured_ is false and .ptr when it is true.
+  Payload seed_{};
+  Payload offset_{};
+  uint64_t offset_intragraph_ = 0;  // only assigned by the graph-capture constructor
+  bool captured_ = false;  // set to true only by the graph-capture constructor
+};
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2d372718a4e786d676fff76c50da662e370be6ee
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh
@@ -0,0 +1,121 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/CollapseDims.h>
+
+namespace at::cuda::detail {
+
+#define MAX_TENSORINFO_DIMS 25
+
+// CUDA kernel argument that defines tensor layout
+template <typename T, typename IndexType>
+struct TensorInfo {
+  TensorInfo();  // null data pointer, zero dims
+  TensorInfo(T* p,
+             int dim,
+             IndexType sz[MAX_TENSORINFO_DIMS],
+             IndexType st[MAX_TENSORINFO_DIMS]);
+
+  // Set the size of the given dimension to 1, as if it were a
+  // reduction dim (allows you to calculate offsets of the reduction
+  // slice)
+  void reduceDim(int dim);
+
+  // See note on [collapse dims].
+  int collapseDims(const int excludeDim = -1);
+
+  // Contiguous tensors of more than one dimension are expected to have been
+  // collapsed down to one dimension: only dims == 1 with stride 1 qualifies.
+  __host__ __device__ inline bool isContiguous() const {
+    return (dims == 1 && strides[0] == 1);
+  }
+
+  T* data;
+  IndexType sizes[MAX_TENSORINFO_DIMS];  // only the first `dims` entries are set
+  IndexType strides[MAX_TENSORINFO_DIMS];  // only the first `dims` entries are set
+  int dims;
+};
+
+// Empty layout: null data pointer and zero dims; sizes/strides are left unset.
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo()
+    : data(nullptr), dims(0) {
+}
+
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo(T* p,
+                                     int dim,
+                                     IndexType sz[MAX_TENSORINFO_DIMS],
+                                     IndexType st[MAX_TENSORINFO_DIMS]) {
+  data = p;
+  dims = dim;
+  // sizes/strides hold MAX_TENSORINFO_DIMS entries, so exactly that many dims
+  // still fit; only the first `dim` entries are copied below.
+  TORCH_CHECK(dims <= MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions");
+  for (int i = 0; i < dim; ++i) {
+    sizes[i] = sz[i];
+    strides[i] = st[i];
+  }
+}
+
+// Sets sizes[dim] to 1 after checking that dim lies in [0, dims).
+template <typename T, typename IndexType>
+void TensorInfo<T, IndexType>::reduceDim(int dim) {
+  TORCH_CHECK(0 <= dim && dim < dims, "expected dim between 0 and dims - 1");
+  sizes[dim] = 1;
+}
+
+// Calls at::collapse_dims; element 1 of its result becomes dims, element 0 is returned.
+template <typename T, typename IndexType>
+int TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {
+  auto [first, remaining_dims] = at::collapse_dims(sizes, strides, dims, excludeDim);
+  dims = remaining_dims;
+  return first;
+}
+
+// Translate a linear index for the apply to a T* offset;
+// specialized on `Dims` (compile-time dimension count) to reduce nvcc compilation time
+template <typename T, typename IndexType, int Dims>
+struct IndexToOffset {
+  static __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+    IndexType result = 0;
+
+    // Peel coordinates off from the last dimension down to dimension 1.
+    for (int d = Dims - 1; d > 0; --d) {
+      const IndexType extent = info.sizes[d];
+      const IndexType coord = linearId % extent;
+      result += coord * info.strides[d];
+      linearId /= extent;
+    }
+
+    // What remains of linearId is used as the coordinate along dimension 0.
+    return result + linearId * info.strides[0];
+  }
+};
+
+// Dims == -1: the dimension count is read from info.dims at run time
+// instead of being a compile-time constant.
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -1> {
+  static inline __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+    IndexType result = 0;
+
+    for (int d = info.dims - 1; d > 0; --d) {
+      const IndexType extent = info.sizes[d];
+      const IndexType coord = linearId % extent;
+      result += coord * info.strides[d];
+      linearId /= extent;
+    }
+
+    return result + linearId * info.strides[0];
+  }
+};
+
+} // namespace at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..797a857504ddcf336f0119f265c7a6d7e2e802a5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h
@@ -0,0 +1,705 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include <string>
+#include <c10/core/ScalarType.h>
+
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/Tunable.h>
+#include <ATen/cuda/CUDABlas.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/util/StringUtil.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/allclose.h>
+#include <ATen/ops/from_blob.h>
+#endif
+#include <ATen/OpMathType.h>
+#include <fmt/printf.h>
+
+namespace at::cuda::tunable {
+
+using at::blas::ScalingType;
+
+enum class BlasOp {
+  N = 0,  // rendered as 'N' by BlasOpToString; presumably "no transpose" -- confirm
+  T = 1  // rendered as 'T' by BlasOpToString; presumably "transpose" -- confirm
+};
+
+// Returns the single-character code for op: 'N' or 'T'.
+inline char BlasOpToString(BlasOp op) {
+  if (op == BlasOp::N) {
+    return 'N';
+  } else if (op == BlasOp::T) {
+    return 'T';
+  }
+  TORCH_CHECK(false, "unrecognized BlasOp");
+  return 'N';
+}
+
+template <typename T>
+inline const char* BLASTypeName(T /*unused*/) {
+  return "unknown";  // fallback for element types without a specialization
+}
+// Specializations: one BLAS type-name string per element type.
+template <>
+inline const char* BLASTypeName(float /*unused*/) {
+  return "f32_r";
+}
+
+template <>
+inline const char* BLASTypeName(double /*unused*/) {
+  return "f64_r";
+}
+
+template <>
+inline const char* BLASTypeName(BFloat16 /*unused*/) {
+  return "bf16_r";
+}
+
+template <>
+inline const char* BLASTypeName(Half /*unused*/) {
+  return "f16_r";
+}
+
+//https://github.com/ROCm/hipBLASLt/blob/develop/library/src/include/auxiliary.hpp#L175
+template <>
+inline const char* BLASTypeName(Float8_e4m3fn /*unused*/) {
+  return "f8_r";
+}
+
+template <>
+inline const char* BLASTypeName(Float8_e5m2 /*unused*/) {
+  return "bf8_r";
+}
+
+template <>
+inline const char* BLASTypeName(Float8_e4m3fnuz /*unused*/) {
+  return "f8_fnuz_r";
+}
+
+template <>
+inline const char* BLASTypeName(Float8_e5m2fnuz /*unused*/) {
+  return "bf8_fnuz_r";
+}
+
+template <>
+inline const char* BLASTypeName(c10::complex<double> v) {
+  return "f64_c";  // complex suffix, as in ScalarTypeToBLASType / ComputeTypeFor
+}
+
+template <>
+inline const char* BLASTypeName(c10::complex<float> v) {
+  return "f32_c";  // complex suffix, as in ScalarTypeToBLASType / ComputeTypeFor
+}
+
+// Returns the BLAS type-name string for scalar_type, or "unknown" if unmapped.
+inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) {
+  const char* blas_name = "unknown";
+  switch (scalar_type) {
+    case c10::ScalarType::Float:
+      blas_name = "f32_r";
+      break;
+
+    case c10::ScalarType::Double:
+      blas_name = "f64_r";
+      break;
+
+    case c10::ScalarType::BFloat16:
+      blas_name = "bf16_r";
+      break;
+
+    case c10::ScalarType::Half:
+      blas_name = "f16_r";
+      break;
+
+    case c10::ScalarType::Float8_e4m3fn:
+      blas_name = "f8_r";
+      break;
+
+    case c10::ScalarType::Float8_e5m2:
+      blas_name = "bf8_r";
+      break;
+
+    case c10::ScalarType::Float8_e4m3fnuz:
+      blas_name = "f8_fnuz_r";
+      break;
+
+    case c10::ScalarType::Float8_e5m2fnuz:
+      blas_name = "bf8_fnuz_r";
+      break;
+
+    case c10::ScalarType::ComplexFloat:
+      blas_name = "f32_c";
+      break;
+
+    case c10::ScalarType::ComplexDouble:
+      blas_name = "f64_c";
+      break;
+
+    default:
+      break;
+  }
+  return std::string(blas_name);
+}
+
+// Similar to Compute Type in GemmRocblas.h
+template <typename T>
+inline std::string ComputeTypeFor() {
+  return std::string("Unknown ComputeType");
+}
+
+// This is a union of the compute types for
+// ROCBLAS and hipBLASLt.
+template <>
+inline std::string ComputeTypeFor<float>() {
+  // TF32 matmul precision selects "xf32_r"; any other setting gives "f32_r".
+  const bool use_tf32 =
+      at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
+      at::Float32Precision::TF32;
+  return std::string(use_tf32 ? "xf32_r" : "f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<double>() {
+  return std::string("f64_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<Half>() {
+  return std::string("f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<BFloat16>() {
+  return std::string("f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<c10::complex<float>>() {
+  return std::string("f32_c");
+}
+
+template <>
+inline std::string ComputeTypeFor<c10::complex<double>>() {
+  return std::string("f64_c");
+}
+
+template <>
+inline std::string ComputeTypeFor<Float8_e4m3fn>() {
+  return std::string("f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<Float8_e5m2>() {
+  return std::string("f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<Float8_e4m3fnuz>() {
+  return std::string("f32_r");
+}
+
+template <>
+inline std::string ComputeTypeFor<Float8_e5m2fnuz>() {
+  return std::string("f32_r");
+}
+
+// Convert opmath_type<T> to string: four decimals; complex values as "(re, im)"
+template <typename T>
+inline std::string to_string_opmath(const at::opmath_type<T>& value) {
+  if constexpr (!std::is_same_v<at::opmath_type<T>, c10::complex<float>> &&
+                !std::is_same_v<at::opmath_type<T>, c10::complex<double>>) {
+    return fmt::format("{:.4f}", value);
+  } else {
+    return fmt::format("({:.4f}, {:.4f})", value.real(), value.imag());
+  }
+}
+
+// convert activation epilogue to string
+// ("None", "RELU" or "GELU"; any other value yields "unknown")
+inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivationEpilogue& value) {
+  using Epilogue = at::cuda::blas::GEMMAndBiasActivationEpilogue;
+  if (value == Epilogue::None) {
+    return std::string("None");
+  }
+  if (value == Epilogue::RELU) {
+    return std::string("RELU");
+  }
+  if (value == Epilogue::GELU) {
+    return std::string("GELU");
+  }
+  // Values without a dedicated name.
+  return std::string("unknown");
+}
+
+namespace detail {
+
+// Compares two CUDA buffers of `size` elements of `dtype`, converted to float,
+// with at::allclose using config.rtol/config.atol; logs PASSED or FAILED.
+static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) {
+  // A disabled check always reports success.
+  if (!config.enabled) {
+    return true;
+  }
+
+  const auto blob_options = at::TensorOptions().dtype(dtype).device(at::kCUDA);
+  const at::Tensor lhs = at::from_blob(c, {size}, blob_options).to(at::kFloat);
+  const at::Tensor rhs = at::from_blob(other_c, {size}, blob_options).to(at::kFloat);
+  const bool matches = at::allclose(lhs, rhs, config.rtol, config.atol);
+
+  if (!matches) {
+    TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol);
+  } else {
+    TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol);
+  }
+  return matches;
+}
+
+}
+
+// Note on GetSizeA et al.
+// Tensors can be dense or arbitrarily strided. We only need our copies to be large enough.
+// Our copies must be at least as large as the m n k shapes dictate, but could be larger
+// depending on the lda ldb ldc values. Similarly for the batched case.
+
+template <typename T>
+struct GemmParams : OpParams {
+  // Arguments of one GEMM call (shapes, scalars, a/b/c pointers and leading dims).
+  GemmParams() = default;
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string beta_str = to_string_opmath<T>(beta);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, bias_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, alpha_str, beta_str, transa, transb,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+  // Compact string built from the transpose flags, m/n/k and leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc);
+  }
+  // Bytes for A: the larger of the lda-strided extent and the dense m * k.
+  size_t GetSizeA() const {
+    size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
+    size_t size_dense = m * k;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Bytes for B: the larger of the ldb-strided extent and the dense k * n.
+  size_t GetSizeB() const {
+    size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k);
+    size_t size_dense = k * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Bytes for C: the larger of ldc * n and the dense m * n.
+  size_t GetSizeC() const {
+    size_t size_stride = ldc * n;
+    size_t size_dense = m * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Bytes for C, plus A and B when duplicate_inputs is set.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+  // New object with its own copy of c; with duplicate_inputs, a/b get fresh buffers whose contents are NOT copied.
+  GemmParams* DeepCopy(bool duplicate_inputs) const {
+    GemmParams* copy = new GemmParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;  // tells Delete() to free a and b as well
+    }
+    return copy;
+  }
+  // Frees the buffers allocated by DeepCopy; does not delete the object itself.
+  // only call on object returned by DeepCopy
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+  // Compares this->c with other->c using the tuning context's numerical-check config.
+  TuningStatus NumericalCheck(GemmParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<T>::value;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
+  }
+  // All members are value-initialized so a default-constructed object holds no indeterminate values.
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  const T* b{};
+  int64_t ldb{};
+  at::opmath_type<T> beta{};
+  T* c{};
+  int64_t ldc{};
+private:
+  bool duplicate_inputs_{false};  // true when DeepCopy allocated a and b
+};
+
+// Parameter bundle for a single (non-batched) GEMM with a bias vector and an
+// activation epilogue. All matrix pointers and the bias pointer use element type T.
+template <typename T>
+struct GemmAndBiasParams : OpParams {
+  // Returns a one-line, YAML-style description of this problem.
+  // After transB the format string has exactly eight %s slots (a/b/c/d type,
+  // activation, bias_type, scale_type, compute_type), so exactly eight string
+  // arguments are supplied.
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string activation_str = to_string_epilogue(activation);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "alpha: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, activation: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, alpha_str, transa, transb,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), activation_str, BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+
+  // Returns a compact key built from the transpose flags, m/n/k and the three
+  // leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc);
+  }
+
+  // Size of A in bytes: sizeof(T) times the larger of the strided extent
+  // (lda * k when not transposed, lda * m otherwise) and the dense extent m * k.
+  size_t GetSizeA() const {
+    size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
+    size_t size_dense = m * k;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Size of B in bytes: sizeof(T) times the larger of the strided extent
+  // (ldb * n when not transposed, ldb * k otherwise) and the dense extent k * n.
+  size_t GetSizeB() const {
+    size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k);
+    size_t size_dense = k * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Size of C in bytes: sizeof(T) times the larger of ldc * n and m * n.
+  size_t GetSizeC() const {
+    size_t size_stride = ldc * n;
+    size_t size_dense = m * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C always, plus A and B
+  // when duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+
+  // Returns a heap-allocated copy of these parameters that owns a fresh C
+  // buffer; the contents of `c` are copied into it asynchronously on the
+  // current CUDA stream of the current device.
+  // When duplicate_inputs is true, fresh A and B buffers are allocated as well,
+  // but their contents are NOT copied from the originals.
+  // NOTE(review): presumably the duplicated inputs only need distinct addresses
+  // rather than matching data -- confirm against the caller.
+  // The bias pointer is shared with the original. Release the device buffers
+  // with Delete().
+  GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
+    GemmAndBiasParams* copy = new GemmAndBiasParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;
+    }
+    return copy;
+  }
+
+  // only call on object returned by DeepCopy
+  // Frees the C buffer and, if DeepCopy duplicated the inputs, A and B too.
+  // The parameter object itself is not deleted here.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+
+  // Compares this->c with other->c through detail::NumericalCheck using the
+  // tuning context's numerical-check config; returns OK on success, else FAIL.
+  TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<T>::value;
+    // GetSizeC() is in bytes; the helper is given a count of T elements.
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
+  }
+
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  const T* b{};
+  int64_t ldb{};
+  T* c{};
+  int64_t ldc{};
+  const T* bias{};
+  at::cuda::blas::GEMMAndBiasActivationEpilogue activation{};
+private:
+  // True only on objects produced by DeepCopy(true); tells Delete() to free a and b.
+  bool duplicate_inputs_{false};
+};
+
+// Parameter bundle for a strided-batched GEMM. A and B hold elements of type T;
+// the output C holds elements of type C_Dtype (defaults to T).
+template <typename T, typename C_Dtype = T>
+struct GemmStridedBatchedParams : OpParams {
+  // Returns a one-line, YAML-style description of this problem, including the
+  // per-batch strides and the batch count.
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string beta_str = to_string_opmath<T>(beta);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, "
+      "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<C_Dtype>(C_Dtype{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+
+  // Returns a compact key built from the transpose flags, m/n/k, the batch
+  // count and the three leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_B_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, batch, lda, ldb, ldc);
+  }
+
+  // Size of A in bytes: sizeof(T) times the larger of stride_a * batch and
+  // m * k * batch.
+  size_t GetSizeA() const {
+    size_t size_stride = stride_a * batch;
+    size_t size_dense = m * k * batch;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Size of B in bytes: sizeof(T) times the larger of stride_b * batch and
+  // k * n * batch.
+  size_t GetSizeB() const {
+    size_t size_stride = stride_b * batch;
+    size_t size_dense = k * n * batch;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Size of C in bytes. C stores C_Dtype elements, so the element size must be
+  // sizeof(C_Dtype); using sizeof(T) would under-size the buffer DeepCopy
+  // allocates whenever C_Dtype is wider than T.
+  size_t GetSizeC() const {
+    size_t size_stride = stride_c * batch;
+    size_t size_dense = m * n * batch;
+    return sizeof(C_Dtype) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C always, plus A and B
+  // when duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+
+  // Returns a heap-allocated copy of these parameters that owns a fresh C
+  // buffer; the contents of `c` are copied into it asynchronously on the
+  // current CUDA stream of the current device.
+  // When duplicate_inputs is true, fresh A and B buffers are allocated as well,
+  // but their contents are NOT copied from the originals.
+  // NOTE(review): presumably the duplicated inputs only need distinct addresses
+  // rather than matching data -- confirm against the caller.
+  // Release the device buffers with Delete().
+  GemmStridedBatchedParams* DeepCopy(bool duplicate_inputs) const {
+    GemmStridedBatchedParams* copy = new GemmStridedBatchedParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<C_Dtype*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      // NOLINTNEXTLINE(*const-cast*)
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      // NOLINTNEXTLINE(*const-cast*)
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;
+    }
+    return copy;
+  }
+
+  // only call on object returned by DeepCopy
+  // Frees the C buffer and, if DeepCopy duplicated the inputs, A and B too.
+  // The parameter object itself is not deleted here.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+
+  // Compares this->c with other->c through detail::NumericalCheck using the
+  // tuning context's numerical-check config; returns OK on success, else FAIL.
+  // `other` is the same specialization as *this (identical to
+  // GemmStridedBatchedParams<T>* whenever C_Dtype == T), so the two C buffers
+  // always share an element type.
+  TuningStatus NumericalCheck(GemmStridedBatchedParams *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
+    // GetSizeC() is in bytes of C_Dtype; the helper is given an element count.
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(C_Dtype), cfg) ? OK : FAIL;
+  }
+
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  int64_t stride_a{};
+  const T* b{};
+  int64_t ldb{};
+  int64_t stride_b{};
+  // Value-initialized like alpha so a default-constructed object is fully defined.
+  at::opmath_type<T> beta{};
+  C_Dtype* c{};
+  int64_t ldc{};
+  int64_t stride_c{};
+  int64_t batch{};
+private:
+  // True only on objects produced by DeepCopy(true); tells Delete() to free a and b.
+  bool duplicate_inputs_{false};
+};
+
+// Parameter bundle for a scaled GEMM. Matrix, scale and bias pointers are
+// untyped; their element types are carried in the accompanying ScalarType
+// fields. Buffer sizes are computed with sizeof(T).
+template <typename T>
+struct ScaledGemmParams : OpParams {
+  ScaledGemmParams() = default;
+
+  // Returns a one-line, YAML-style description of this problem. The bias_type
+  // field is emitted only when a bias pointer is set.
+  std::string BLASSignature() const override {
+    // The use_fast_accum and row-wise scaling booleans are left out for now.
+    if (bias_ptr != nullptr) {
+      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+        "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+        m, n, k, lda, ldb, ldc, ldc, transa, transb,
+        ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
+        ComputeTypeFor<T>(), ComputeTypeFor<T>());
+    }
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, transa, transb,
+      ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype),
+      ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+
+  // Returns a compact key built from the transpose flags, m/n/k, the leading
+  // dimensions, whether both operands use row-wise scaling, and the bias dtype.
+  std::string Signature() const override {
+    // Blas.cpp fills in a bias_dtype (Half for float8 outputs) even when no
+    // bias vector is present; look for:
+    // params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_;
+    //
+    // The signature therefore keys on bias_ptr, so that the with-bias and
+    // without-bias cases never share an entry.
+    const bool both_rowwise =
+        a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise;
+    const auto bias_tag = bias_ptr == nullptr ? "None" : at::toString(bias_dtype);
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s",
+      transa, transb, m, n, k, lda, ldb, ldc,
+      both_rowwise,
+      bias_tag);
+  }
+
+  // Size of A in bytes: sizeof(T) times the larger of the strided extent
+  // (lda * k when not transposed, lda * m otherwise) and the dense extent m * k.
+  size_t GetSizeA() const {
+    const bool not_transposed = (transa == 'n' || transa == 'N');
+    size_t strided_elems = lda * (not_transposed ? k : m);
+    size_t dense_elems = m * k;
+    if (strided_elems > dense_elems) {
+      return sizeof(T) * strided_elems;
+    }
+    return sizeof(T) * dense_elems;
+  }
+
+  // Size of B in bytes: sizeof(T) times the larger of the strided extent
+  // (ldb * n when not transposed, ldb * k otherwise) and the dense extent k * n.
+  size_t GetSizeB() const {
+    const bool not_transposed = (transb == 'n' || transb == 'N');
+    size_t strided_elems = ldb * (not_transposed ? n : k);
+    size_t dense_elems = k * n;
+    if (strided_elems > dense_elems) {
+      return sizeof(T) * strided_elems;
+    }
+    return sizeof(T) * dense_elems;
+  }
+
+  // Size of C in bytes: sizeof(T) times the larger of ldc * n and m * n.
+  size_t GetSizeC() const {
+    size_t strided_elems = ldc * n;
+    size_t dense_elems = m * n;
+    if (strided_elems > dense_elems) {
+      return sizeof(T) * strided_elems;
+    }
+    return sizeof(T) * dense_elems;
+  }
+
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C always, plus A and B
+  // when duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t total = GetSizeC();
+    if (!duplicate_inputs) {
+      return total;
+    }
+    total += GetSizeA();
+    total += GetSizeB();
+    return total;
+  }
+
+  // Returns a heap-allocated copy of these parameters that owns a fresh C
+  // buffer; the contents of `c` are copied into it asynchronously on the
+  // current CUDA stream of the current device. When duplicate_inputs is true,
+  // fresh A and B buffers are allocated too (their contents are not copied).
+  // Scale, bias and amax pointers are shared with the original.
+  ScaledGemmParams* DeepCopy(bool duplicate_inputs) const {
+    ScaledGemmParams* clone = new ScaledGemmParams;
+    *clone = *this;
+    c10::DeviceIndex current_device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&current_device));
+    size_t bytes_c = GetSizeC();
+    clone->c = c10::cuda::CUDACachingAllocator::raw_alloc(bytes_c);
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        clone->c, current_device, c, current_device, bytes_c, getCurrentCUDAStream(current_device), true));
+    if (duplicate_inputs) {
+      size_t bytes_a = GetSizeA();
+      size_t bytes_b = GetSizeB();
+      clone->a = c10::cuda::CUDACachingAllocator::raw_alloc(bytes_a);
+      clone->b = c10::cuda::CUDACachingAllocator::raw_alloc(bytes_b);
+      clone->duplicate_inputs_ = true;
+    }
+    return clone;
+  }
+
+  // only call on object returned by DeepCopy
+  // Frees the C buffer and, if DeepCopy duplicated the inputs, A and B too.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (!duplicate_inputs_) {
+      return;
+    }
+    // NOLINTNEXTLINE(*const-cast*)
+    c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(a));
+    // NOLINTNEXTLINE(*const-cast*)
+    c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(b));
+  }
+
+  // Compares this->c with other->c through detail::NumericalCheck using the
+  // tuning context's numerical-check config; returns OK on success, else FAIL.
+  TuningStatus NumericalCheck(ScaledGemmParams<T> *other) {
+    auto cfg = getTuningContext()->GetNumericalCheckConfig();
+    // GetSizeC() is in bytes; the helper is given a count of T elements.
+    const auto num_elements = GetSizeC() / sizeof(T);
+    if (detail::NumericalCheck(c_dtype, c, other->c, num_elements, cfg)) {
+      return OK;
+    }
+    return FAIL;
+  }
+
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  const void* a{};
+  const void* a_scale_ptr{};
+  int64_t lda{};
+  ScalarType a_dtype{};
+  ScalarType a_scale_dtype{};
+  ScalingType a_scaling_type{};
+  const void* b{};
+  const void* b_scale_ptr{};
+  int64_t ldb{};
+  ScalarType b_dtype{};
+  ScalarType b_scale_dtype{};
+  ScalingType b_scaling_type{};
+  const void* bias_ptr{};
+  ScalarType bias_dtype{};
+  void* c{};
+  const void* c_scale_ptr{};
+  int64_t ldc{};
+  ScalarType c_dtype{};
+  void* amax_ptr{};
+  bool use_fast_accum{};
+private:
+  // True only on objects produced by DeepCopy(true); tells Delete() to free a and b.
+  bool duplicate_inputs_{false};
+};
+
+} // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h
new file mode 100644
index 0000000000000000000000000000000000000000..13d0bf23bff74af65cd296a413da661df8f9e183
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h
@@ -0,0 +1,692 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDADataType.h>
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/GemmCommon.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+
+#include <hipblaslt/hipblaslt.h>
+#include <hipblaslt/hipblaslt-ext.hpp>
+
+// Evaluates EXPR once as a hipblasStatus_t and raises through TORCH_CHECK when
+// it is not HIPBLAS_STATUS_SUCCESS. The message contains the status string and
+// the stringified expression. The macro is #undef'd at the end of this header.
+#define TORCH_HIPBLASLT_CHECK(EXPR)               \
+  do {                                            \
+    hipblasStatus_t __err = EXPR;                 \
+    TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS,  \
+                "hipblaslt error: ",              \
+                hipblasStatusToString(__err),     \
+                " when calling `" #EXPR "`");     \
+  } while (0)
+
+namespace at::cuda::tunable {
+
+// Compile-time mapping from a C++ element type to the matching hipDataType.
+// Only the explicit specializations below are defined; the primary template is
+// declared without a body.
+template <typename T>
+constexpr hipDataType HipDataTypeFor();
+
+template <>
+constexpr hipDataType HipDataTypeFor<float>() {
+  return HIP_R_32F;
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<Half>() {
+  return HIP_R_16F;
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<BFloat16>() {
+  return HIP_R_16BF;
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<double>() {
+  return HIP_R_64F;
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fnuz>() {
+  return HIP_R_8F_E4M3_FNUZ;
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2fnuz>() {
+  return HIP_R_8F_E5M2_FNUZ;
+}
+
+// This code is instantiated regardless of ROCm version.
+// Prior to ROCm 6.3, we hard-code the known enum values.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fn>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E4M3;
+#else
+  return static_cast<hipDataType>(28);
+#endif
+}
+
+// Same scheme as the e4m3fn specialization: named enumerator on ROCm >= 6.3,
+// hard-coded numeric value otherwise.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E5M2;
+#else
+  return static_cast<hipDataType>(29);
+#endif
+}
+
+// This type is not intended for matrix types but rather a scale factor.
+// Return a dummy value to satisfy linker.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
+  return static_cast<hipDataType>(500);
+}
+
+// Named enumerator on ROCm >= 7.0, hard-coded numeric value otherwise.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
+#if ROCM_VERSION >= 70000
+  return HIP_R_4F_E2M1;
+#else
+  return static_cast<hipDataType>(33);
+#endif
+}
+
+// Batch count for each parameter kind: 1 for every non-batched parameter type,
+// params->batch for GemmStridedBatchedParams.
+template <typename T>
+int GetBatchFromParams(const GemmParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int GetBatchFromParams(const GemmAndBiasParams<T>* params) {
+  return 1;
+}
+
+// NOTE(review): params->batch is int64_t and is narrowed to int here.
+template <typename T>
+int GetBatchFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->batch;
+}
+
+template <typename T>
+int GetBatchFromParams(const ScaledGemmParams<T>* params) {
+  return 1;
+}
+
+// Batch stride of matrix A for each parameter kind: 1 for every non-batched
+// parameter type, params->stride_a for GemmStridedBatchedParams.
+// The result is int64_t, matching the stride_a member, so that a stride larger
+// than INT_MAX is not truncated on the way to the caller.
+template <typename T>
+int64_t GetStrideAFromParams(const GemmParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideAFromParams(const GemmAndBiasParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideAFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_a;
+}
+
+template <typename T>
+int64_t GetStrideAFromParams(const ScaledGemmParams<T>* params) {
+  return 1;
+}
+
+// Batch stride of matrix B for each parameter kind: 1 for every non-batched
+// parameter type, params->stride_b for GemmStridedBatchedParams.
+// The result is int64_t, matching the stride_b member, so that a stride larger
+// than INT_MAX is not truncated on the way to the caller.
+template <typename T>
+int64_t GetStrideBFromParams(const GemmParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideBFromParams(const GemmAndBiasParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideBFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_b;
+}
+
+template <typename T>
+int64_t GetStrideBFromParams(const ScaledGemmParams<T>* params) {
+  return 1;
+}
+
+// Batch stride of matrix C for each parameter kind: 1 for every non-batched
+// parameter type, params->stride_c for GemmStridedBatchedParams.
+// The result is int64_t, matching the stride_c member, so that a stride larger
+// than INT_MAX is not truncated on the way to the caller.
+template <typename T>
+int64_t GetStrideCFromParams(const GemmParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideCFromParams(const GemmAndBiasParams<T>* params) {
+  return 1;
+}
+
+template <typename T>
+int64_t GetStrideCFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_c;
+}
+
+template <typename T>
+int64_t GetStrideCFromParams(const ScaledGemmParams<T>* params) {
+  return 1;
+}
+
+// Alpha scaling factor for each parameter kind, returned as float:
+// params->alpha where the parameter type has one, 1.0 for ScaledGemmParams.
+// NOTE(review): alpha is stored as at::opmath_type<T>; the conversion to float
+// is implicit here.
+template <typename T>
+float GetAlphaFromParams(const GemmParams<T>* params) {
+  return params->alpha;
+}
+
+template <typename T>
+float GetAlphaFromParams(const GemmAndBiasParams<T>* params) {
+  return params->alpha;
+}
+
+template <typename T>
+float GetAlphaFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->alpha;
+}
+
+template <typename T>
+float GetAlphaFromParams(const ScaledGemmParams<T>* params) {
+  return 1.0;
+}
+
+// Beta scaling factor for each parameter kind, returned as float:
+// params->beta for GemmParams and GemmStridedBatchedParams, 0.0 for the
+// parameter types that have no beta member.
+template <typename T>
+float GetBetaFromParams(const GemmParams<T>* params) {
+  return params->beta;
+}
+
+template <typename T>
+float GetBetaFromParams(const GemmAndBiasParams<T>* params) {
+  return 0.0;
+}
+
+template <typename T>
+float GetBetaFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->beta;
+}
+
+template <typename T>
+float GetBetaFromParams(const ScaledGemmParams<T>* params) {
+  return 0.0;
+}
+
+// Scaling mode of operands A and B for each parameter kind. Only
+// ScaledGemmParams carries a scaling type; every other parameter type reports
+// ScalingType::TensorWise.
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
+  return ScalingType::TensorWise;
+}
+
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const ScaledGemmParams<T>* params) {
+  return params->a_scaling_type;
+}
+
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const ScaledGemmParams<T>* params) {
+  return params->b_scaling_type;
+}
+
+// Pointer to the scale of operand A: params->a_scale_ptr for ScaledGemmParams,
+// nullptr for every parameter type without scales.
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmAndBiasParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetAScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->a_scale_ptr;
+}
+
+// Pointer to the scale of operand B: params->b_scale_ptr for ScaledGemmParams,
+// nullptr for every parameter type without scales.
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmAndBiasParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetBScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->b_scale_ptr;
+}
+
+// Pointer to the scale of the result: params->c_scale_ptr for
+// ScaledGemmParams, nullptr for every parameter type without scales.
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmAndBiasParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetDScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->c_scale_ptr;
+}
+
+// Pointer to the bias vector: params->bias for GemmAndBiasParams,
+// params->bias_ptr for ScaledGemmParams, nullptr for the parameter types that
+// have no bias.
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmAndBiasParams<T>* params) {
+  return params->bias;
+}
+
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmStridedBatchedParams<T>* params) {
+  return nullptr;
+}
+
+template <typename T>
+const void* GetBiasPointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->bias_ptr;
+}
+
+// Data type of the bias vector for each parameter kind. GemmAndBiasParams uses
+// the element type T; ScaledGemmParams converts its runtime bias_dtype; the
+// parameter types without a bias return HIP_R_32F as a placeholder.
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmParams<T>* params) {
+  return HIP_R_32F;
+}
+
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmAndBiasParams<T>* params) {
+  return HipDataTypeFor<T>();
+}
+
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams<T>* params) {
+  return HIP_R_32F;
+}
+
+template <typename T>
+hipDataType GetBiasTypeFromParams(const ScaledGemmParams<T>* params) {
+  return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype);
+}
+
+// Activation epilogue for each parameter kind: params->activation for
+// GemmAndBiasParams, GEMMAndBiasActivationEpilogue::None for every other
+// parameter type.
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmParams<T>* params) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmAndBiasParams<T>* params) {
+  return params->activation;
+}
+
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmStridedBatchedParams<T>* params) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const ScaledGemmParams<T>* params) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+
+// Translates a BLAS transpose character (case-insensitive 'n', 't' or 'c')
+// into the matching hipblasOperation_t. Any other character fails a
+// TORCH_CHECK.
+static hipblasOperation_t _hipblasOpFromChar(char op) {
+  if (op == 'n' || op == 'N') {
+    return HIPBLAS_OP_N;
+  }
+  if (op == 't' || op == 'T') {
+    return HIPBLAS_OP_T;
+  }
+  if (op == 'c' || op == 'C') {
+    return HIPBLAS_OP_C;
+  }
+  TORCH_CHECK(false,
+      "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+
+// Translates a hipblasOperation_t into its upper-case BLAS transpose character
+// ('N', 'T' or 'C'). Any other value fails a TORCH_CHECK.
+static char _charFromhipblasOp(hipblasOperation_t op) {
+  if (op == HIPBLAS_OP_N) {
+    return 'N';
+  }
+  if (op == HIPBLAS_OP_T) {
+    return 'T';
+  }
+  if (op == HIPBLAS_OP_C) {
+    return 'C';
+  }
+  TORCH_CHECK(false,
+      "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`");
+}
+
+// Maps a compile-time BlasOp layout to a hipBLASLt operation: BlasOp::N becomes
+// HIPBLAS_OP_N and every other layout becomes HIPBLAS_OP_T.
+static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
+  return layout == BlasOp::N ? HIPBLAS_OP_N : HIPBLAS_OP_T;
+}
+
+// Deleter functor for std::unique_ptr: calls `destructor` on a non-null
+// pointer and checks the returned status with TORCH_CUDABLAS_CHECK.
+template <typename T, cublasStatus_t (*destructor)(T*)>
+struct HipBlasLtDeleter {
+  void operator()(T* x) {
+    if (x == nullptr) {
+      return;
+    }
+    TORCH_CUDABLAS_CHECK(destructor(x));
+  }
+};
+
+// Base class that owns a hipBLASLt descriptor of opaque type T through a
+// std::unique_ptr whose deleter calls `destructor`. Derived classes are
+// expected to fill descriptor_.
+template <typename T, hipblasStatus_t (*destructor)(T*)>
+class HipBlasLtDescriptor {
+ public:
+  // Returns the raw descriptor pointer (nullptr until a derived class sets it).
+  T* descriptor() const {
+    return descriptor_.get();
+  }
+  T* descriptor() {
+    return descriptor_.get();
+  }
+
+ protected:
+  std::unique_ptr<T, HipBlasLtDeleter<T, destructor>> descriptor_;
+};
+
+// Owning wrapper around a hipblasLtMatmulDesc_t. The descriptor is created in
+// the constructor and destroyed with hipblasLtMatmulDescDestroy by the base
+// class's unique_ptr.
+class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor<
+                                     hipblasLtMatmulDescOpaque_t,
+                                     &hipblasLtMatmulDescDestroy> {
+ public:
+  // Creates the matmul descriptor for the given compute and scale types;
+  // raises through TORCH_HIPBLASLT_CHECK if creation fails.
+  HipBlasLtMatmulDescriptor(
+      hipblasComputeType_t compute_type,
+      hipDataType scale_type) {
+    hipblasLtMatmulDesc_t created = nullptr;
+    TORCH_HIPBLASLT_CHECK(
+        hipblasLtMatmulDescCreate(&created, compute_type, scale_type));
+    descriptor_.reset(created);
+  }
+
+  // Sets one descriptor attribute from `value`, passing sizeof(T) as its size.
+  template <typename T>
+  inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) {
+    TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  }
+};
+
+// Callable that runs one specific hipBLASLt matmul algorithm on a ParamsT
+// problem. AT/BT/CT are the element types of A, B and C; ALayout/BLayout are
+// the transpose layouts this instantiation was built for.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
+class HipblasltGemmOp : public Callable<ParamsT> {
+    // Scope guard owning one hipblasLtMatrixLayout_t. It guarantees the layout
+    // is destroyed on every exit from Call(), including the early
+    // `return FAIL` paths and exceptions raised by the checked calls.
+    struct MatrixLayoutGuard {
+      MatrixLayoutGuard() = default;
+      MatrixLayoutGuard(const MatrixLayoutGuard&) = delete;
+      MatrixLayoutGuard& operator=(const MatrixLayoutGuard&) = delete;
+      ~MatrixLayoutGuard() {
+        if (layout != nullptr) {
+          // A destructor must not throw, so the status is deliberately ignored.
+          (void)hipblasLtMatrixLayoutDestroy(layout);
+        }
+      }
+      // Checked destroy for the success path; ownership is dropped first so the
+      // destructor never destroys the same layout twice.
+      void destroy() {
+        if (layout != nullptr) {
+          hipblasLtMatrixLayout_t doomed = layout;
+          layout = nullptr;
+          TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(doomed));
+        }
+      }
+      hipblasLtMatrixLayout_t layout{nullptr};
+    };
+
+  public:
+    HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {}
+
+    // Runs the stored algorithm on `params`.
+    // Returns FAIL (without launching anything) when hipBLASLt reports the
+    // algorithm as unsupported for this problem, or when its reported workspace
+    // requirement is >= the configured workspace size. Returns OK after the
+    // matmul has been enqueued on the current CUDA stream. Any other hipBLASLt
+    // failure raises through TORCH_HIPBLASLT_CHECK.
+    TuningStatus Call(const ParamsT* params) override {
+      hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+      hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+      auto a_datatype = HipDataTypeFor<AT>();
+      auto b_datatype = HipDataTypeFor<BT>();
+      auto in_out_datatype = HipDataTypeFor<CT>();
+      auto opa = _hipblasOpFromChar(params->transa);
+      auto opb = _hipblasOpFromChar(params->transb);
+
+      TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen");
+
+      float alpha = GetAlphaFromParams<CT>(params);
+      float beta = GetBetaFromParams<CT>(params);
+
+      // The guards own the layouts; the references keep the rest of the body
+      // reading like plain hipblasLtMatrixLayout_t variables.
+      MatrixLayoutGuard guard_a, guard_b, guard_c;
+      hipblasLtMatrixLayout_t& mat_a = guard_a.layout;
+      hipblasLtMatrixLayout_t& mat_b = guard_b.layout;
+      hipblasLtMatrixLayout_t& mat_c = guard_c.layout;
+      if (opa == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda));
+      }
+      if (opb == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb));
+      }
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc));
+
+      // specific to batched gemm
+      int batch = GetBatchFromParams<CT>(params);
+      if (batch > 1) {
+        int64_t stride_a = GetStrideAFromParams<CT>(params);
+        int64_t stride_b = GetStrideBFromParams<CT>(params);
+        int64_t stride_c = GetStrideCFromParams<CT>(params);
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
+      }
+
+      hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+      if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
+        computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+      }
+      // `matmul` owns its descriptor and destroys it when it goes out of scope.
+      HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
+      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
+      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
+
+      // specific to scaled gemm
+      const void* mat1_scale_ptr = GetAScalePointerFromParams<CT>(params);
+      const void* mat2_scale_ptr = GetBScalePointerFromParams<CT>(params);
+      const void* result_scale_ptr = GetDScalePointerFromParams<CT>(params);
+      if (mat1_scale_ptr && mat2_scale_ptr) {
+        hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER;
+        hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER;
+        // Row-wise scaling is selected either through a scale-mode attribute or
+        // through a dedicated pointer attribute, depending on which feature
+        // macro the build defines.
+        if (GetAScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
+#if defined(HIPBLASLT_OUTER_VEC)
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
+#elif defined(HIPBLASLT_VEC_EXT)
+          a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
+#endif
+        }
+        if (GetBScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
+#if defined(HIPBLASLT_OUTER_VEC)
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
+#elif defined(HIPBLASLT_VEC_EXT)
+          b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
+#endif
+        }
+        matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr);
+        matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr);
+      }
+      if (result_scale_ptr) {
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
+      }
+
+      // Bias and activation epilogue, only when a bias pointer is present.
+      const void* bias_ptr = GetBiasPointerFromParams<CT>(params);
+      auto bias_datatype = GetBiasTypeFromParams<CT>(params);
+      if (bias_ptr) {
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype);
+        auto activation = GetActivationFromParams<CT>(params);
+        if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU) {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_RELU_BIAS);
+        }
+        else if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::GELU) {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_GELU_BIAS);
+        }
+        else {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS);
+        }
+      }
+
+      size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize();
+
+      auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
+
+      size_t ret_workspace_size = 0;
+      auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle,
+          matmul.descriptor(),
+          &alpha,
+          mat_a,
+          mat_b,
+          &beta,
+          mat_c,
+          mat_c,
+          algo_,
+          ret_workspace_size);
+
+      // The layout guards release mat_a/mat_b/mat_c on these early exits.
+      if (status == HIPBLAS_STATUS_SUCCESS) {
+        if (ret_workspace_size >= workspace_size) {
+          return FAIL;
+        }
+      }
+      else {
+        return FAIL;
+      }
+
+      void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace();
+
+      // C is passed as both the input C and the output D operand.
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
+            matmul.descriptor(),
+            &alpha,
+            params->a,
+            mat_a,
+            params->b,
+            mat_b,
+            &beta,
+            params->c,
+            mat_c,
+            params->c,
+            mat_c,
+            &algo_,
+            workspace_buffer,
+            workspace_size,
+            at::cuda::getCurrentCUDAStream()));
+
+      // Success path: destroy the layouts with status checking, as before.
+      guard_a.destroy();
+      guard_b.destroy();
+      guard_c.destroy();
+      return OK;
+    }
+
+  private:
+    hipblasLtMatmulAlgo_t algo_;
+};
+
+// Enumerates every hipBLASLt GEMM algorithm available for the given element
+// types, layouts and compute type, and wraps each one in a HipblasltGemmOp.
+// Returns a vector of (name, callable) pairs where the name is
+// "Gemm_Hipblaslt_<algo index>". On ROCm 6.4 the vector is empty for
+// transposed/transposed problems that involve fp32.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
+auto GetHipBlasLtTypeStringAndOps() {
+  hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+  hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+  auto a_datatype = HipDataTypeFor<AT>();
+  auto b_datatype = HipDataTypeFor<BT>();
+  auto in_out_datatype = HipDataTypeFor<CT>();
+  std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
+#if ROCM_VERSION == 60400
+  // hipblaslt TT fp32 regression on ROCm 6.4, cannot use
+  if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F)
+          && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) {
+    std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ignore;
+    return ignore;
+  }
+#endif
+
+  hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+  if (at::globalContext().allowTF32CuBLAS()) {
+    computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+  }
+
+  hipblasLtHandle_t handle;
+  TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
+  // Record the query status instead of checking it inline: a throwing check
+  // here would skip hipblasLtDestroy and leak the temporary handle.
+  const hipblasStatus_t query_status = hipblaslt_ext::getAllAlgos(handle,
+        hipblaslt_ext::GemmType::HIPBLASLT_GEMM,
+        transa_outer,
+        transb_outer,
+        a_datatype,
+        b_datatype,
+        in_out_datatype,
+        in_out_datatype,
+        computeType,
+        heuristic_result);
+  const hipblasStatus_t destroy_status = hipblasLtDestroy(handle);
+  TORCH_CHECK(query_status == HIPBLAS_STATUS_SUCCESS,
+      "hipblaslt error: ",
+      hipblasStatusToString(query_status),
+      " when calling `hipblaslt_ext::getAllAlgos`");
+  TORCH_CHECK(destroy_status == HIPBLAS_STATUS_SUCCESS,
+      "hipblaslt error: ",
+      hipblasStatusToString(destroy_status),
+      " when calling `hipblasLtDestroy(handle)`");
+
+  std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ret;
+  for (const auto& result : heuristic_result) {
+    // Work on a copy of the algo so helpers that take it by non-const
+    // reference do not touch the heuristic result.
+    auto algo = result.algo;
+    int algo_index = hipblaslt_ext::getIndexFromAlgo(algo);
+    auto callable = std::make_unique<HipblasltGemmOp<AT, BT, CT, ALayout, BLayout, ParamsT>>(algo);
+    std::string type_string = fmt::sprintf("Gemm_Hipblaslt_%d", algo_index);
+    ret.emplace_back(type_string, std::move(callable));
+  }
+
+  return ret;
+}
+
+// hipBLASLt candidates for a plain GEMM where A, B and C all use element type T.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+auto GetHipBlasLtGemmTypeStringAndOps() {
+  using Params = GemmParams<T>;
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, Params>();
+}
+
+// hipBLASLt candidates for a GEMM with bias where A, B and C all use element type T.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+auto GetHipBlasLtGemmAndBiasTypeStringAndOps() {
+  using Params = GemmAndBiasParams<T>;
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, Params>();
+}
+
+// hipBLASLt candidates for a strided-batched GEMM where A, B and C all use
+// element type T.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() {
+  using Params = GemmStridedBatchedParams<T>;
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, Params>();
+}
+
+// hipBLASLt candidates for a scaled GEMM with independent A, B and C element
+// types; the parameter bundle is keyed on the output type CT.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
+auto GetHipBlasLtScaledGemmTypeStringAndOps() {
+  using Params = ScaledGemmParams<CT>;
+  return GetHipBlasLtTypeStringAndOps<AT, BT, CT, ALayout, BLayout, Params>();
+}
+
+#undef TORCH_HIPBLASLT_CHECK
+
+}  // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h
new file mode 100644
index 0000000000000000000000000000000000000000..8734d42b01a9a8603532f3284b14904471543a2e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h
@@ -0,0 +1,282 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/GemmCommon.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+
+#define ROCBLAS_BETA_FEATURES_API
+#include <rocblas/rocblas.h>
+
+// Evaluates EXPR (an expression yielding a rocblas_status) exactly once and
+// raises through TORCH_CHECK unless the result is rocblas_status_success.
+// The error message contains the rocBLAS status string and the stringified
+// source text of EXPR, so the spelling of the argument is part of the message.
+#define TORCH_ROCBLAS_CHECK(EXPR)                 \
+  do {                                            \
+    rocblas_status __err = EXPR;                  \
+    TORCH_CHECK(__err == rocblas_status_success,  \
+                "rocblas error: ",                \
+                rocblas_status_to_string(__err),  \
+                " when calling `" #EXPR "`");     \
+  } while (0)
+
+namespace at::cuda::tunable {
+
+// Maps a C++ element type to the rocblas_datatype enumerator that describes it.
+// The primary template is only declared; a definition exists solely for the
+// explicitly specialized types below (float, double, Half, BFloat16 and the
+// two c10::complex types).
+template <typename T>
+constexpr rocblas_datatype RocBlasDataTypeFor();
+
+// float -> 32-bit real.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+
+// double -> 64-bit real.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+
+// Half -> 16-bit real.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<Half>() {
+  return rocblas_datatype_f16_r;
+}
+
+// BFloat16 -> bfloat16 real.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<BFloat16>() {
+  return rocblas_datatype_bf16_r;
+}
+
+// complex<float> -> 32-bit complex.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<float>>() {
+  return rocblas_datatype_f32_c;
+}
+
+// complex<double> -> 64-bit complex.
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+
+// Maps a C++ element type to the rocblas_datatype used as the *compute*
+// (accumulation) type for that element type. For float, double and the complex
+// types this equals the storage type; Half and BFloat16 compute in FP32.
+// The primary template is only declared; only the specializations below exist.
+template <typename T>
+constexpr rocblas_datatype RocBlasComputeTypeFor();
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<Half>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // FP16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<BFloat16>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // BF16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<float>>() {
+  return rocblas_datatype_f32_c;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+
+// Converts a scalar (used below for alpha/beta) to the type expected by the
+// compute type selected in RocBlasComputeTypeFor. The generic version returns
+// the value unchanged; the Half and BFloat16 specializations widen to float,
+// matching their FP32 compute type.
+template <typename T>
+auto DoCastForHalfOrBfloat16(const T fp) {
+  return fp;
+}
+
+template <>
+inline auto DoCastForHalfOrBfloat16<Half>(const Half fp) {
+  // alpha and beta should be the same as compute_type, in Half case it is float.
+  float h = fp;
+  return h;
+}
+
+template <>
+inline auto DoCastForHalfOrBfloat16<BFloat16>(const BFloat16 fp) {
+  // alpha and beta should be the same as compute_type, in bfloat16 case it is float.
+  float h = fp;
+  return h;
+}
+
+// Translates a BLAS-style transpose flag character into the rocBLAS enum:
+// 'n'/'N' -> none, 't'/'T' -> transpose, 'c'/'C' -> conjugate transpose.
+// Any other character raises through TORCH_CHECK.
+static rocblas_operation _rocblasOpFromChar(char op) {
+  if (op == 'n' || op == 'N') {
+    return rocblas_operation_none;
+  }
+  if (op == 't' || op == 'T') {
+    return rocblas_operation_transpose;
+  }
+  if (op == 'c' || op == 'C') {
+    return rocblas_operation_conjugate_transpose;
+  }
+  TORCH_CHECK(false,
+      "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+
+// Tunable candidate that runs a GEMM through rocblas_gemm_ex using one fixed
+// rocBLAS solution index, supplied at construction.
+template <typename T>
+class RocblasGemmOp : public Callable<GemmParams<T>> {
+  public:
+    // solution: rocBLAS solution index forwarded to rocblas_gemm_ex.
+    RocblasGemmOp(int solution) : solution_{solution} {}
+
+    // Runs the GEMM described by params with this candidate's solution.
+    // Returns FAIL when TF32 matmul precision is selected and T maps to
+    // rocblas_datatype_f32_r, or when rocBLAS reports a non-success status;
+    // returns OK otherwise. Does not throw on a rocBLAS error status.
+    TuningStatus Call(const GemmParams<T>* params) override {
+      auto input_output_type = RocBlasDataTypeFor<T>();
+      if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
+        return FAIL;  // no support for TF32 in rocBLAS
+      auto compute_type = RocBlasComputeTypeFor<T>();
+      // alpha/beta are passed by pointer and must have the compute type.
+      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
+      auto h_b = DoCastForHalfOrBfloat16(params->beta);
+      auto status = rocblas_gemm_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &h_a,
+          params->a, input_output_type, params->lda,
+          params->b, input_output_type, params->ldb,
+          &h_b,
+          // params->c is passed twice in a row: the same buffer and leading
+          // dimension fill both consecutive matrix slots (the C and D operands
+          // per the rocBLAS reference).
+          params->c, input_output_type, params->ldc,
+          params->c, input_output_type, params->ldc,
+          compute_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      if (status != rocblas_status_success) {
+        return FAIL;
+      }
+      return OK;
+    }
+
+  private:
+    // rocBLAS solution index this candidate is bound to.
+    int solution_;
+};
+
+// Enumerates every rocBLAS GEMM solution available for element type T and
+// wraps each one as a tunable candidate.
+//
+// rocblas_gemm_ex_get_solutions_by_type is called twice: first with a null
+// list to learn how many solutions exist, then again to fill a vector of
+// solution indices. Each index becomes a RocblasGemmOp<T> named
+// "Gemm_Rocblas_<index>".
+//
+// Returns a vector of (name, callable) pairs in ascending solution-index
+// order. Raises through TORCH_ROCBLAS_CHECK if either query fails.
+template <typename T>
+auto GetRocBlasGemmTypeStringAndOps() {
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  int solution_size = 0;
+  auto input_output_type = RocBlasDataTypeFor<T>();
+  auto compute_type = RocBlasComputeTypeFor<T>();
+  // Get the number of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector<int> solutions(solution_size);
+  // Get the list of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Sort the solutions in ascending order to make the solution vector deterministic across runs
+  // (the strided-batched enumerator in this file does the same).
+  std::sort(solutions.begin(), solutions.end());
+
+  std::vector<std::pair<std::string, std::unique_ptr<Callable<GemmParams<T>>>>> ret;
+  for (size_t i = 0; i < solutions.size(); ++i) {
+    auto callable = std::make_unique<RocblasGemmOp<T>>(solutions[i]);
+    ret.emplace_back(std::make_pair(fmt::sprintf("Gemm_Rocblas_%d", solutions[i]), std::move(callable)));
+  }
+  return ret;
+}
+
+// Tunable candidate that runs a strided-batched GEMM through
+// rocblas_gemm_strided_batched_ex using one fixed rocBLAS solution index.
+template <typename T>
+class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    // solution: rocBLAS solution index forwarded to the rocBLAS call.
+    RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {}
+
+    // Runs the batched GEMM described by params with this candidate's solution.
+    // Returns FAIL when TF32 matmul precision is selected and T maps to
+    // rocblas_datatype_f32_r, or when rocBLAS reports a non-success status;
+    // returns OK otherwise. Does not throw on a rocBLAS error status.
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      auto input_output_type = RocBlasDataTypeFor<T>();
+      if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
+        return FAIL;  // no support for TF32 in rocBLAS
+      auto compute_type = RocBlasComputeTypeFor<T>();
+      // alpha/beta are passed by pointer and must have the compute type.
+      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
+      auto h_b = DoCastForHalfOrBfloat16(params->beta);
+      auto status = rocblas_gemm_strided_batched_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &h_a,
+          params->a, input_output_type, params->lda, params->stride_a,
+          params->b, input_output_type, params->ldb, params->stride_b,
+          &h_b,
+          // params->c (with its ld and stride) is passed twice in a row,
+          // filling both consecutive matrix slots (the C and D operands per
+          // the rocBLAS reference).
+          params->c, input_output_type, params->ldc, params->stride_c,
+          params->c, input_output_type, params->ldc, params->stride_c,
+          params->batch,
+          compute_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      if (status != rocblas_status_success) {
+        return FAIL;
+      }
+      return OK;
+    }
+
+  private:
+    // rocBLAS solution index this candidate is bound to.
+    int solution_;
+};
+
+// Enumerates every rocBLAS GEMM solution available for element type T and
+// wraps each one as a strided-batched tunable candidate.
+//
+// rocblas_gemm_ex_get_solutions_by_type is called twice: first with a null
+// list to learn how many solutions exist, then again to fill a vector of
+// solution indices. Returns a vector of (name, callable) pairs named
+// "Gemm_Rocblas_<index>", in ascending index order. Raises through
+// TORCH_ROCBLAS_CHECK if either query fails.
+template <typename T>
+auto GetRocBlasGemmStridedBatchedTypeStringAndOps() {
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  int solution_size;
+  auto input_output_type = RocBlasDataTypeFor<T>();
+  auto compute_type = RocBlasComputeTypeFor<T>();
+  // Get the number of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector<int> solutions(solution_size);
+  // Get the list of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Sort the solutions in ascending order to make the solution vector deterministic across runs
+  std::sort(solutions.begin(), solutions.end());
+
+  std::vector<std::pair<std::string, std::unique_ptr<Callable<GemmStridedBatchedParams<T>>>>> ret;
+  for (size_t i = 0; i < solutions.size(); ++i) {
+    // One candidate per solution index; the name embeds the index.
+    auto callable = std::make_unique<RocblasGemmStridedBatchedOp<T>>(solutions[i]);
+    ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable)));
+  }
+  return ret;
+}
+
+}  // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..14f1f089ad4fc04b28c6c1c1d36fe64056725fd9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h
@@ -0,0 +1,55 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <ATen/cuda/tunable/Tunable.h>
+
+namespace at::cuda::tunable {
+
+// ITimer implementation backed by a pair of CUDA events (start_/end_).
+// Only declarations appear in this header; the member definitions live
+// elsewhere.
+class StreamTimer : public ITimer {
+  public:
+    StreamTimer();
+    ~StreamTimer() override;
+
+    // Marks the beginning of the timed region.
+    void Start() override;
+
+    // Marks the end of the timed region.
+    void End() override;
+
+    // Elapsed time between Start() and End(); ITimer documents the unit as
+    // milliseconds.
+    float Duration() override;
+
+  private:
+    // Value-initialized event handles for the two ends of the interval.
+    cudaEvent_t start_{};
+    cudaEvent_t end_{};
+};
+
+// Second ITimer implementation with the same interface and data members as
+// StreamTimer. NOTE(review): the name suggests it skips a synchronization
+// step that StreamTimer performs; the definitions are not in this header, so
+// confirm against the implementation file.
+class StreamTimerNoSync : public ITimer {
+  public:
+    StreamTimerNoSync();
+    ~StreamTimerNoSync() override;
+
+    // Marks the beginning of the timed region.
+    void Start() override;
+
+    // Marks the end of the timed region.
+    void End() override;
+
+    // Elapsed time between Start() and End(); ITimer documents the unit as
+    // milliseconds.
+    float Duration() override;
+
+  private:
+    // Value-initialized event handles for the two ends of the interval.
+    cudaEvent_t start_{};
+    cudaEvent_t end_{};
+};
+
+} // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h
new file mode 100644
index 0000000000000000000000000000000000000000..c055f6e72989c3c6e66a35671d10839f3bb354c8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h
@@ -0,0 +1,270 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include <c10/util/CallOnce.h>
+#include <c10/util/StringUtil.h>
+#include <c10/util/env.h>
+
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+// Logging helpers. TUNABLE_LOGV forwards its arguments to
+// getTuningContext()->Log(LEVEL, ...), which writes only when logging is
+// available and the configured log level is >= LEVEL. TUNABLE_LOG1/2/3 fix
+// LEVEL to 1, 2 and 3 respectively.
+#define TUNABLE_LOGV(LEVEL, ...) getTuningContext()->Log(LEVEL, __VA_ARGS__)
+#define TUNABLE_LOG1(...) TUNABLE_LOGV(1, __VA_ARGS__)
+#define TUNABLE_LOG2(...) TUNABLE_LOGV(2, __VA_ARGS__)
+#define TUNABLE_LOG3(...) TUNABLE_LOGV(3, __VA_ARGS__)
+
+namespace at::cuda::tunable {
+
+// Result code returned by tunable callables and by validation/loading
+// routines in this subsystem.
+enum TORCH_CUDA_CPP_API TuningStatus {
+  OK = 0,           // operation succeeded
+  FAIL = 1,         // operation failed (e.g. the backend call returned an error)
+  UNSUPPORTED = 2,  // presumably: candidate cannot handle the request - confirm at use sites
+};
+
+// One tuning result: the key (name/id) of the selected kernel, the time
+// recorded for it, and an optional BLAS signature string.
+// Equality and inequality compare the key only; time and BLAS signature are
+// ignored.
+class TORCH_CUDA_CPP_API ResultEntry {
+  public:
+    // Two-argument form leaves the BLAS signature empty.
+    explicit ResultEntry(std::string  key, double time) : key_(std::move(key)), time_(time) {}
+    explicit ResultEntry(std::string  key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {}
+    bool operator==(const ResultEntry& other) const { return key_ == other.key_; }
+    bool operator!=(const ResultEntry& other) const { return key_ != other.key_; }
+    // Implicit conversion to the key string (non-const member; returns a copy).
+    operator std::string () { return key_; }
+    // Returns a copy of the key.
+    std::string GetKey() const { return key_; }
+    double GetTime() const { return time_; }
+    // Stream output; defined outside this header.
+    friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry);
+    // Sentinel entries with keys "Null" and "Default" and a time of 0.0.
+    static ResultEntry Null() { return ResultEntry("Null", 0.0); }
+    static ResultEntry Default() { return ResultEntry("Default", 0.0); }
+
+  private:
+    std::string key_;
+    double time_;
+    std::string blas_sig_;
+};
+
+// KernelMap:  string key -> ResultEntry (the two-argument Lookup below keys
+//             this level by params signature).
+// ResultsMap: string key -> KernelMap (keyed by op signature in the Lookup API).
+// UntunedMap: string key -> set of strings; presumably op signature -> params
+//             signatures seen but not tuned - confirm in the implementation.
+typedef std::unordered_map<std::string, ResultEntry> KernelMap;
+typedef std::unordered_map<std::string, KernelMap> ResultsMap;
+typedef std::unordered_map<std::string, std::unordered_set<std::string>> UntunedMap;
+
+// Plain aggregate bundling a set of tuning results with the validator
+// key/value pairs that describe the environment they were produced in.
+struct TORCH_CUDA_CPP_API TuningResults {
+  // Validates if these results are compatible with the libraries
+  // (validator name -> recorded value).
+  std::unordered_map<std::string, std::string> validators;
+
+  // Mapping from Callable signature to Callable's tuning result
+  ResultsMap results;
+};
+
+// Owns the in-memory table of tuning results (results_), the record of
+// untuned signatures (untuned_results_), and an optional output stream used
+// for appending results as they are produced. Only declarations appear here;
+// the locking and file-format details are in the implementation file.
+class TORCH_CUDA_CPP_API TuningResultsManager {
+  public:
+    TuningResultsManager() = default;
+    ~TuningResultsManager() = default;
+
+    // Returns the KernelMap stored for op_signature (by value).
+    KernelMap Lookup(const std::string& op_signature);
+
+    // Returns the entry stored for (op_signature, params_signature).
+    ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature);
+
+    // Helper that records `best` into the supplied kernel_map.
+    void AddImpl(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best,
+        KernelMap& kernel_map);
+
+    // Records `best` as the result for (op_signature, params_signature).
+    void Add(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best);
+
+    // Removes the result stored for (op_signature, params_signature).
+    void Delete(const std::string& op_signature, const std::string& params_signature);
+
+    // Merges kernel_map into `results` (an output parameter) under op_signature.
+    void DisjointMergeImpl(
+        const std::string& op_signature,
+        const KernelMap& kernel_map,
+        /*out*/ ResultsMap& results);
+
+    // Loads a full results table into this manager.
+    void Load(const ResultsMap& results_to_load);
+
+    // Returns the stored results table by value.
+    ResultsMap Dump();
+
+    // Merges kernel_map into this manager's table under op_signature.
+    void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map);
+
+    // Number of stored results; the exact counting rule is in the implementation.
+    size_t GetSize();
+
+    // Writes a record for an untuned (op, params, blas) signature to untuned_file.
+    void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
+      const std::string& params_signature, const std::string& blas_signature);
+
+    // Prepares `filename` for incremental appends, given the validator pairs.
+    void InitRealtimeAppend(
+        const std::string& filename,
+        const std::unordered_map<std::string, std::string>& validators);
+
+    // Appends one result line to the realtime output.
+    void AppendResultLine(const std::string& op_sig,
+                         const std::string& param_sig,
+                         const ResultEntry& result);
+
+    void CloseRealtimeAppend();  // For clean shutdown
+  private:
+    std::mutex lock_;                              // presumably guards results_ - confirm
+    std::mutex realtime_file_mutex_;               // presumably guards realtime_out_ - confirm
+    std::unique_ptr<std::ofstream> realtime_out_;  // realtime append stream, if any
+    std::string realtime_filename_;
+    ResultsMap results_;
+    UntunedMap untuned_results_;
+    bool validators_written_ = false;
+
+};
+
+// Registry of named validators. Each validator is a pair of functions: a
+// getter producing the current value as a string, and a checker that judges a
+// previously recorded string and returns a TuningStatus.
+class TORCH_CUDA_CPP_API TuningResultsValidator {
+  public:
+    using GetFunc = std::function<std::string()>;
+    using ValidateFunc = std::function<TuningStatus(const std::string&)>;
+    // validator name -> (getter, checker)
+    using GetValidateFuncs = std::unordered_map<std::string, std::pair<GetFunc, ValidateFunc>>;
+
+    TuningResultsValidator();
+    ~TuningResultsValidator() = default;
+
+    // Returns validator name -> current value for the registered validators.
+    std::unordered_map<std::string, std::string> GetAllValidators() const;
+    // Checks the recorded name -> value pairs in to_validate.
+    TuningStatus ValidateAll(const std::unordered_map<std::string, std::string>& to_validate) const;
+    // Registers a getter/checker pair under `key`.
+    void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf);
+
+  protected:
+    static std::string GetPyTorchVersion() ;
+    TuningStatus ValidatePyTorchVersion(const std::string& value) const;
+
+  public:
+    // Validator names that must always be present; currently only "PT_VERSION".
+    static constexpr const std::array mandatory_keys{"PT_VERSION"};
+
+  private:
+    GetValidateFuncs validators_;
+};
+
+// Settings for the optional numerical check: an on/off flag plus absolute and
+// relative tolerances. Defaults: disabled, atol = rtol = 1e-5.
+struct NumericalCheckConfig {
+  bool   enabled{false};
+  double atol{1e-5};
+  double rtol{1e-5};
+
+  NumericalCheckConfig() = default;
+  // e: enabled flag, a: absolute tolerance, r: relative tolerance.
+  NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {}
+};
+
+
+// Central configuration and state object for TunableOp: feature switches,
+// tuning/warmup limits, the results manager and validator, result-file
+// handling and logging. Copy and move are deleted. Code in this subsystem
+// reaches the instance through getTuningContext().
+class TORCH_CUDA_CPP_API TuningContext {
+  public:
+    TuningContext();
+    ~TuningContext();
+    TuningContext(TuningContext &) = delete;
+    TuningContext(TuningContext &&) = delete;
+    TuningContext &operator=(TuningContext &) = delete;
+    TuningContext &operator=(TuningContext &&) = delete;
+
+    // Master switch for TunableOp.
+    void EnableTunableOp(bool value);
+    bool IsTunableOpEnabled() const;
+
+    // Switch for tuning itself.
+    void EnableTuning(bool value);
+    bool IsTuningEnabled() const;
+
+    // Switch for recording untuned signatures, and the file they are written to.
+    void EnableRecordUntuned(bool value);
+    bool IsRecordUntunedEnabled() const;
+    std::ofstream& GetUntunedFile();
+
+    // Numerical-check switch and tolerances (see NumericalCheckConfig).
+    void EnableNumericsCheck(bool value);
+    bool IsNumericsCheckEnabled() const;
+    void SetNumericalCheckConfig(bool enabled, double atol, double rtol);
+    NumericalCheckConfig GetNumericalCheckConfig() const;
+
+    // Tuning limits: duration in milliseconds and iteration count.
+    void SetMaxTuningDurationMs(int max_duration_ms);
+    int GetMaxTuningDurationMs() const;
+
+    void SetMaxTuningIterations(int max_iter);
+    int GetMaxTuningIterations() const;
+
+    // Warmup limits: duration in milliseconds and iteration count.
+    void SetMaxWarmupDurationMs(int max_duration_ms);
+    int GetMaxWarmupDurationMs() const;
+
+    void SetMaxWarmupIterations(int max_iter);
+    int GetMaxWarmupIterations() const;
+
+    // Instruction-cache flush switch.
+    void EnableICacheFlush(bool value);
+    bool IsICacheFlushEnabled() const;
+
+    // Rotating buffer size; units are defined by the implementation.
+    void SetRotatingBufferSize(int size);
+    int GetRotatingBufferSize() const;
+
+    TuningResultsManager& GetTuningResultsManager();
+
+    TuningResultsValidator& GetTuningResultsValidator();
+
+    // Snapshot of the current results plus validator values.
+    TuningResults GetTuningResults();
+
+    // Loads previously produced results; returns a TuningStatus.
+    TuningStatus LoadTuningResults(const TuningResults& tr);
+
+    // Results file name. insert_device_ordinal presumably adds the device
+    // index to the name - confirm in the implementation.
+    void SetFilename(const std::string& filename, bool insert_device_ordinal=false);
+    std::string GetFilename() const;
+
+    // Reads results from `filename`; the default argument is an empty string.
+    bool ReadFile(const std::string& filename={});
+
+    // Writes one line built with c10::str(args...) to the log stream, but
+    // only when logging is available and the configured level is >= `level`.
+    // Arguments are taken by value and the line is flushed via std::endl.
+    template<class... Types>
+    void Log(int level, Types... args) {
+      if (GetLogOkay() && GetLogLevel() >= level) {
+        GetLog() << c10::str(args...) << std::endl;
+      }
+    }
+
+  private:
+    std::string GetLogFilename() const;
+    int GetLogLevel() const;
+    bool GetLogOkay() const;
+    std::ostream& GetLog() const;
+
+    // These members have no in-class initializers; presumably set by the
+    // constructor defined outside this header.
+    bool enable_;
+    bool tuning_enable_;
+    bool record_untuned_enable_;
+    bool manager_initialized_;
+    bool numerics_check_enable_;
+    int max_tuning_duration_ms_;
+    int max_tuning_iterations_;
+    int max_warmup_duration_ms_;
+    int max_warmup_iterations_;
+    bool icache_flush_;
+    int rotating_buffer_size_;
+    mutable TuningResultsManager manager_;
+    mutable c10::once_flag manager_init_once_;
+    TuningResultsValidator validator_;
+    std::string filename_;
+    std::ofstream untuned_file_;
+    size_t results_count_from_input_file_;
+    bool is_shutting_down_;
+
+    NumericalCheckConfig numerics_cfg_{};
+};
+
+// Accessor for the TuningContext used by the TUNABLE_LOG* macros and the rest
+// of this subsystem. Returns a raw, non-owning pointer.
+TORCH_CUDA_CPP_API TuningContext* getTuningContext();
+
+// Abstract timing interface: bracket the measured work with Start() and
+// End(), then read the elapsed time from Duration().
+class ITimer {
+  public:
+    ITimer() = default;
+    virtual ~ITimer() = default;
+
+    /// Marks the beginning of the timed region.
+    virtual void Start() = 0;
+    /// Marks the end of the timed region.
+    virtual void End() = 0;
+
+    /// Computes the elapsed time in milliseconds between Start() and End()
+    virtual float Duration() = 0;
+};
+
+} // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h
new file mode 100644
index 0000000000000000000000000000000000000000..b377374967ee2f224983c993145eb427d2cc57bd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h
@@ -0,0 +1,334 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include <ATen/cuda/tunable/GemmCommon.h>
+#ifdef USE_ROCM
+#include <ATen/cuda/tunable/GemmHipblaslt.h>
+#include <ATen/cuda/tunable/GemmRocblas.h>
+#endif
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+
+namespace at::cuda::tunable {
+
+// Baseline GEMM candidate: forwards the parameters unchanged to
+// at::cuda::blas::gemm_internal<T> and always reports OK (failures, if any,
+// surface through whatever gemm_internal itself does).
+template <typename T>
+class DefaultGemmOp : public Callable<GemmParams<T>> {
+  public:
+    TuningStatus Call(const GemmParams<T>* params) override {
+      at::cuda::blas::gemm_internal<T>(
+          params->transa, params->transb,
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda,
+          params->b, params->ldb,
+          params->beta,
+          params->c, params->ldc);
+      return OK;
+    }
+};
+
+// Returns true only for the transpose flag characters 't' and 'T'; every
+// other character (including 'n', 'N', 'c' and 'C') yields false.
+static bool _transposeBoolFromChar(char op) {
+  switch (op) {
+    case 't':
+    case 'T':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Baseline GEMM-with-bias candidate: forwards to
+// at::cuda::blas::gemm_and_bias<T> and always reports OK.
+template <typename T>
+class DefaultGemmAndBiasOp : public Callable<GemmAndBiasParams<T>> {
+  public:
+    TuningStatus Call(const GemmAndBiasParams<T>* params) override {
+      at::cuda::blas::gemm_and_bias<T>(
+          // The transpose flags are passed as booleans here, so the char
+          // flags are reduced to "is it 't'/'T'?".
+          _transposeBoolFromChar(params->transa),
+          _transposeBoolFromChar(params->transb),
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda,
+          params->b, params->ldb,
+          params->bias,
+          params->c, params->ldc,
+          params->activation);
+      return OK;
+    }
+};
+
+// Baseline strided-batched GEMM candidate: forwards the parameters unchanged
+// to at::cuda::blas::bgemm_internal<T> and always reports OK.
+template <typename T>
+class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      at::cuda::blas::bgemm_internal<T>(
+          params->transa, params->transb,
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda, params->stride_a,
+          params->b, params->ldb, params->stride_b,
+          params->beta,
+          params->c, params->ldc, params->stride_c,
+          params->batch);
+      return OK;
+    }
+};
+
+// Baseline scaled-GEMM candidate: forwards every field of ScaledGemmParams<T>
+// positionally to at::cuda::blas::scaled_gemm and always reports OK.
+// No alpha is supplied (std::nullopt is passed for it).
+template <typename T>
+class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
+  public:
+    TuningStatus Call(const ScaledGemmParams<T>* params) override {
+      at::cuda::blas::scaled_gemm(
+          params->transa,
+          params->transb,
+          params->m,
+          params->n,
+          params->k,
+          // A operand: data, scale pointer, leading dimension, dtypes, scaling type
+          params->a,
+          params->a_scale_ptr,
+          params->lda,
+          params->a_dtype,
+          params->a_scale_dtype,
+          params->a_scaling_type,
+          // B operand: same group of fields as A
+          params->b,
+          params->b_scale_ptr,
+          params->ldb,
+          params->b_dtype,
+          params->b_scale_dtype,
+          params->b_scaling_type,
+          // bias
+          params->bias_ptr,
+          params->bias_dtype,
+          // C operand / output
+          params->c,
+          params->c_scale_ptr,
+          params->ldc,
+          params->c_dtype,
+          params->use_fast_accum,
+          std::nullopt /* alpha */);
+      return OK;
+    }
+};
+
+// Returns whether a scalar equals zero. The generic version compares against
+// 0.0f; the specializations below handle types that need an explicit route.
+template <typename T>
+inline bool IsZero(T v) {
+  return v == 0.0f;
+}
+
+// BFloat16: compares the `x` member against 0 instead of converting to float.
+// NOTE(review): if `x` holds the raw bit pattern, negative zero would not be
+// reported as zero here, unlike the Half specialization - confirm intent.
+template <>
+inline bool IsZero(BFloat16 v) {
+  return v.x == 0;
+}
+
+// Half: converts to float before comparing.
+template <>
+inline bool IsZero(Half v) {
+  return float(v) == 0.0f;
+}
+
+// complex<double>: compared against the real literal 0.0.
+template <>
+inline bool IsZero(c10::complex<double> v) {
+  return v == 0.0;
+}
+
+// complex<float>: compared against the real literal 0.0f.
+template <>
+inline bool IsZero(c10::complex<float> v) {
+  return v == 0.0f;
+}
+
+// Returns a short, human-readable name for an element type. The names are
+// embedded in the Signature() strings of the tunable ops below. The argument
+// value is never read; it exists only to select the specialization.
+// Types without a specialization yield "unknown".
+template <typename T>
+inline const char* TypeName(T v) {
+  return "unknown";
+}
+
+// float is reported as "tf32" while allowTF32CuBLAS() is true, so the result
+// depends on global state at the time of the call, not on the type alone.
+template <>
+inline const char* TypeName(float v) {
+  if (at::globalContext().allowTF32CuBLAS()) {
+    return "tf32";
+  } else {
+    return "float";
+  }
+}
+
+template <>
+inline const char* TypeName(double v) {
+  return "double";
+}
+
+template <>
+inline const char* TypeName(BFloat16 v) {
+  return "BFloat16";
+}
+
+template <>
+inline const char* TypeName(Half v) {
+  return "Half";
+}
+
+template <>
+inline const char* TypeName(Float8_e4m3fn v) {
+  return "Float8_e4m3fn";
+}
+
+template <>
+inline const char* TypeName(Float8_e5m2 v) {
+  return "Float8_e5m2";
+}
+
+template <>
+inline const char* TypeName(Float8_e4m3fnuz v) {
+  return "Float8_e4m3fnuz";
+}
+
+template <>
+inline const char* TypeName(Float8_e5m2fnuz v) {
+  return "Float8_e5m2fnuz";
+}
+
+template <>
+inline const char* TypeName(Float8_e8m0fnu v) {
+  return "Float8_e8m0fnu";
+}
+
+template <>
+inline const char* TypeName(c10::complex<double> v) {
+  return "c10::complex<double>";
+}
+
+template <>
+inline const char* TypeName(c10::complex<float> v) {
+  return "c10::complex<float>";
+}
+
+// Tunable GEMM for element type T and fixed A/B layouts. The constructor
+// registers the candidate implementations; Signature() identifies this op
+// (type name plus the two layout characters).
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmTunableOp : public TunableOp<GemmParams<T>> {
+ public:
+  GemmTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());
+
+#ifdef USE_ROCM
+    // Each backend is enabled unless its environment variable is set to a
+    // false value. The static locals mean the variable is read once per
+    // template instantiation.
+    static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
+    if (!env_rocblas.has_value() || env_rocblas.value()) {
+      for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps<T>()) {
+        this->RegisterOp(std::move(name), std::move(op));
+      }
+    }
+
+    static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
+      // disallow tuning of hipblaslt with c10::complex
+      if constexpr (
+          !std::is_same_v<T, c10::complex<float>> &&
+          !std::is_same_v<T, c10::complex<double>>) {
+        for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(name), std::move(op));
+        }
+      }
+    }
+#endif
+
+    // NOTE(review): "Default" is registered a second time here, after the
+    // backend candidates. The effect depends on RegisterOp, which is not
+    // visible in this header - confirm this is intentional.
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());
+  }
+
+  // e.g. "GemmTunableOp_<TypeName>_<A layout char><B layout char>".
+  std::string Signature() override {
+    return fmt::sprintf("GemmTunableOp_%s_%c%c", TypeName<T>(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+// Tunable GEMM-with-bias for element type T and fixed A/B layouts. Besides the
+// default candidate, only hipBLASLt candidates are registered (under
+// USE_ROCM); no rocBLAS candidates are added for this op.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmAndBiasTunableOp : public TunableOp<GemmAndBiasParams<T>> {
+ public:
+  GemmAndBiasTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());
+
+#ifdef USE_ROCM
+    // Enabled unless the environment variable is set to a false value; read
+    // once per template instantiation (static local).
+    static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
+      // disallow tuning of hipblaslt with c10::complex
+      if constexpr (
+          !std::is_same_v<T, c10::complex<float>> &&
+          !std::is_same_v<T, c10::complex<double>>) {
+        for (auto&& [name, op] : GetHipBlasLtGemmAndBiasTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(name), std::move(op));
+        }
+      }
+    }
+#endif
+
+    // NOTE(review): second registration of "Default"; its effect depends on
+    // RegisterOp, which is not visible in this header - confirm intent.
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());
+  }
+
+  // e.g. "GemmAndBiasTunableOp_<TypeName>_<A layout char><B layout char>".
+  std::string Signature() override {
+    return fmt::sprintf("GemmAndBiasTunableOp_%s_%c%c", TypeName<T>(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+// Tunable strided-batched GEMM for element type T and fixed A/B layouts.
+// Registers the default candidate plus, under USE_ROCM, the rocBLAS and
+// hipBLASLt candidates (each gated by its own environment variable).
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>> {
+ public:
+  GemmStridedBatchedTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());
+
+#ifdef USE_ROCM
+    // Each backend is enabled unless its environment variable is set to a
+    // false value. The static locals mean the variable is read once per
+    // template instantiation.
+    static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
+    if (!env_rocblas.has_value() || env_rocblas.value()) {
+      for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps<T>()) {
+        this->RegisterOp(std::move(name), std::move(op));
+      }
+    }
+
+    static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
+      // disallow tuning of hipblaslt with c10::complex
+      if constexpr (
+          !std::is_same_v<T, c10::complex<float>> &&
+          !std::is_same_v<T, c10::complex<double>>) {
+        for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(name), std::move(op));
+        }
+      }
+    }
+#endif
+
+    // NOTE(review): second registration of "Default"; its effect depends on
+    // RegisterOp, which is not visible in this header - confirm intent.
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());
+  }
+
+  // e.g. "GemmStridedBatchedTunableOp_<TypeName>_<A layout char><B layout char>".
+  std::string Signature() override {
+    return fmt::sprintf("GemmStridedBatchedTunableOp_%s_%c%c", TypeName<T>(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+// Tunable scaled GEMM with independent element types for A, B and C. The
+// parameter struct and the default candidate are keyed on the output type CT.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
+class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>> {
+ public:
+  ScaledGemmTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultScaledGemmOp<CT>>());
+
+#ifdef USE_ROCM
+    // Unlike the other tunable ops in this file, the hipBLASLt candidates are
+    // registered here without consulting an environment variable.
+    for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps<AT, BT, CT, ALayout, BLayout>()) {
+      this->RegisterOp(std::move(name), std::move(op));
+    }
+#endif
+
+    // NOTE(review): second registration of "Default"; its effect depends on
+    // RegisterOp, which is not visible in this header - confirm intent.
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultScaledGemmOp<CT>>());
+  }
+
+  // e.g. "ScaledGemmTunableOp_<A type>_<B type>_<C type>_<A layout char><B layout char>".
+  std::string Signature() override {
+    return fmt::sprintf("ScaledGemmTunableOp_%s_%s_%s_%c%c",
+      TypeName<AT>(AT{}),
+      TypeName<BT>(BT{}),
+      TypeName<CT>(CT{}),
+      BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+} // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a59c1aebc7f01340384c0bbc3cbdc1c3299a6dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h
@@ -0,0 +1,434 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include <ATen/cuda/tunable/Tunable.h>
+#include <ATen/cuda/tunable/StreamTimer.h>
+#include <ATen/cuda/Sleep.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+
+#ifndef _WIN32
+#include <cxxabi.h>
+#endif
+#include <cstdlib>
+#include <deque>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace at::cuda::tunable {
+
+template <typename ParamsT>
+class Callable {  // Base interface for one candidate implementation that is invoked with a ParamsT.
+  public:
+    virtual ~Callable() = default;
+    virtual TuningStatus Call(const ParamsT* /*unused*/) {  // Base version does no work and reports FAIL; virtual so subclasses can supply the real call.
+      return FAIL;
+    }
+    virtual TuningStatus IsSupported(const ParamsT* params) {  // Base version answers by actually invoking Call(params).
+      return Call(params);
+    }
+};
+
+namespace {
+
+/** http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */
+
+class Stats {  // Running accumulator: count, sum, min, max, mean and variance, updated one sample at a time.
+  public:
+    Stats() {
+      _n = 0UL;
+      _mean = 0.0;
+      _M2 = 0.0;
+      _sum = 0.0;
+      _min = 0.0;
+      _max = 0.0;
+    }
+
+    void sample_value(const double x) {  // Fold one sample x into every running statistic.
+      double delta = 0;
+      _sum = _sum + x;
+      if (0UL == _n) {
+          _min = x;  // the first sample seeds both extremes
+          _max = x;
+      }
+      else {
+          _min = _min < x ? _min : x;
+          _max = _max > x ? _max : x;
+      }
+      _n = _n + 1UL;
+      delta = x - _mean;
+      _mean = _mean + delta/_n;
+      _M2 = _M2 + delta * (x - _mean);  // product of the deviations from the old mean and from the updated mean
+    }
+
+    double variance() const {  // Sample variance (divides by n-1); defined as 0.0 when fewer than two samples were recorded.
+      return _n > 1UL ? _M2/(_n-1) : 0.0;  // guard: n == 1 used to evaluate 0/0 (NaN) and n == 0 wrapped the unsigned n-1
+    }
+
+    double stddev() const {  // Square root of variance(); 0.0 when fewer than two samples were recorded.
+      return std::sqrt(variance());
+    }
+
+    unsigned long _n;  // number of samples recorded
+    double _mean;  // running mean
+    double _M2;  // running sum of squared deviations used by variance()
+    double _sum;  // plain sum of all samples
+    double _min;  // smallest sample seen (0.0 before any sample)
+    double _max;  // largest sample seen (0.0 before any sample)
+};
+
+class FixedSizeStack {  // Bounded history of strings: holds at most max_size entries and drops the oldest when full.
+  private:
+      std::deque<std::string> stack;  // oldest entry at the front, newest at the back
+      const size_t max_size;  // capacity, fixed at construction
+
+  public:
+      FixedSizeStack(size_t size) : max_size(size) {}
+
+      void push(const std::string& value) {
+          // Append value; when already at capacity the oldest entry is evicted first.
+          if (max_size == 0) { return; } // zero capacity: keep nothing, and never pop_front() an empty deque
+          if (stack.size() >= max_size) { stack.pop_front(); } // Remove the oldest entry
+          stack.push_back(value); // Add new entry
+      }
+
+      auto rbegin() { return stack.rbegin(); }  // reverse iteration starts at the newest entry
+      auto rend() { return stack.rend(); }  // reverse iteration ends past the oldest entry
+};
+
+} // anonymous namespace
+
+template <typename ParamsT>
+class TunableOp {
+  public:
+    virtual ~TunableOp() = default;
+
+    TuningStatus operator()(const ParamsT* params) {  // Choose an op for params (looked-up result, fresh tuning, or Default) and call it.
+      ResultEntry result = ResultEntry::Null();
+      TuningContext* ctx = getTuningContext();
+      if (ctx->IsTunableOpEnabled()) {
+        auto& mgr = ctx->GetTuningResultsManager();
+        auto op_sig = Signature();
+        auto params_sig = params->Signature();
+        auto blas_sig = params->BLASSignature();
+        result = mgr.Lookup(op_sig, params_sig);
+        // If no previous tuning result was found, tune now iff tuning is enabled
+        if (result == ResultEntry::Null()) {
+          if (ctx->IsTuningEnabled()) {
+            result = FindFastest(params);
+            mgr.Add(op_sig, params_sig, result);
+          }
+          else if (ctx->IsRecordUntunedEnabled()) {
+            // otherwise, when recording is enabled, write this untuned gemm to the untuned file
+            mgr.RecordUntuned(ctx->GetUntunedFile(), op_sig, params_sig, blas_sig);
+          }
+        }
+      }
+      else {
+        result = ResultEntry::Default();
+      }
+      if (result == ResultEntry::Null()) {
+        TUNABLE_LOG2("no result, using default");
+        result = ResultEntry::Default();
+      }
+      auto iter = ops_.find(result);
+      TORCH_CHECK(iter != ops_.end());  // the selected entry must name a registered op
+      return iter->second->Call(params);
+    }
+
+    virtual std::string Signature() {
+      // According to C++17 standard https://wg21.link/n4659 section 15.7.4
+      // > if the operand of typeid refers to the
+      // > object under construction or destruction, typeid yields the std::type_info object representing the constructor
+      // > or destructor's class.
+      // So the op signature is generated lazily, on the first call to this method.
+      c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); });  // guarded by the once-flag; result is cached in signature_
+      return signature_;
+    }
+
+  protected:
+    void RegisterOp(const std::string& name, std::unique_ptr<Callable<ParamsT>> op) {  // Record a candidate op under name.
+      this->op_names_.emplace_back(name);  // always appended, so a name registered twice appears twice in op_names_
+      this->ops_.emplace(name, std::move(op));  // map emplace leaves an existing entry for name untouched
+    }
+
+  private:
+    static void WarmUp(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {  // Run op num_iter times untimed, cycling through the copies in param; offset is advanced once per call.
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      for (size_t i = 0; i < num_iter; i++) {
+        if (do_flush) {
+          at::cuda::flush_icache();
+        }
+        TORCH_CHECK(op->Call(param[(offset++)%param.size()]) == OK);  // offset alone is the cursor; adding i as well stepped two slots per call and skipped half the copies when param.size() is even
+      }
+    }
+
+    static double ProfileSimple(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {  // Time num_iter calls of op with one timer around the whole loop and return the duration divided by num_iter.
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      StreamTimerNoSync timer{};
+
+      // Small mandatory warmup: two untimed calls
+      // Reduces outliers
+      for (size_t i = 0; i < 2; i++) {
+        TORCH_CHECK(op->Call(param[(offset++)%param.size()]) == OK);  // offset alone is the cursor (see the timed loop below)
+      }
+
+      timer.Start();
+      for (size_t i = 0; i < num_iter; i++) {
+        if (do_flush) {
+          at::cuda::flush_icache();
+        }
+        TORCH_CHECK(op->Call(param[(offset++)%param.size()]) == OK);  // adding i to offset stepped two slots per call and skipped half the copies when param.size() is even
+      }
+      timer.End();
+      return timer.Duration() / num_iter;
+    }
+
+    static Stats ProfileStats(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {  // Time each of num_iter calls of op with its own timer and return the collected Stats.
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      std::vector<StreamTimerNoSync> timer(num_iter);
+
+      // Small mandatory warmup: two untimed calls
+      // Reduces outliers
+      for (size_t i = 0; i < 2; i++) {
+        TORCH_CHECK(op->Call(param[(offset++)%param.size()]) == OK);  // offset alone is the cursor (see the timed loop below)
+      }
+
+      for (size_t i = 0; i < num_iter; i++) {
+        timer[i].Start();
+        TORCH_CHECK(op->Call(param[(offset++)%param.size()]) == OK);  // adding i to offset stepped two slots per call and skipped half the copies when param.size() is even
+        timer[i].End();
+        if (do_flush) {
+          at::cuda::flush_icache();  // issued after End(), i.e. outside this iteration's timed span
+        }
+      }
+      Stats s;
+      for (size_t i = 0; i < num_iter; i++) {
+        s.sample_value(timer[i].Duration());  // durations are read only after every call has been issued
+      }
+      return s;
+    }
+
+  protected:
+    virtual ResultEntry FindFastest(const ParamsT* params) {  // Profile every registered op on copies of params and return a ResultEntry naming the one with the lowest mean time.
+      TuningContext* ctx = getTuningContext();
+      auto op_sig = Signature();
+      auto params_sig = params->Signature();
+      auto blas_sig = params->BLASSignature();
+      TUNABLE_LOG2("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates");
+      auto min_duration_ms = std::numeric_limits<double>::infinity();  // infinity, so the first surviving candidate can never be skipped as "too slow"
+      std::string id_name = "Default";  // returned unchanged if no candidate beats the initial infinity
+      ParamsT* reference_params = nullptr;
+      auto top_solns = FixedSizeStack(5);
+
+      // numeric check option is controlled by non-static env var, so check it once per tuned operator
+      bool do_numerics_check = ctx->IsNumericsCheckEnabled();
+
+      // calculate a reference answer for numerical check
+      if (do_numerics_check) {
+        reference_params = params->DeepCopy(false);
+        TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);  // the reference answer comes from the Default op
+      }
+
+      // need copies of params to reuse
+      // make as many copies as will fill the requested rotating buffer size, if requested
+      // rotating_size guaranteed to be >= 0 even though GetRotatingBufferSize() returns int
+      size_t rotating_size = ctx->GetRotatingBufferSize();
+      bool use_buffer_rotation = (rotating_size > 0);
+      size_t param_size = params->GetSize(use_buffer_rotation);
+      size_t param_count = (rotating_size / param_size) + 1;  // NOTE(review): divides by param_size -- presumably GetSize() never returns 0; confirm
+      constexpr size_t MB = 1024ull*1024;
+      if (use_buffer_rotation) {
+        TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ",
+            "Needed Size: ", param_size/MB, " MiB. ",
+            "Needed number of param copies: ", param_count);
+      }
+      TORCH_CHECK(param_count > 0);
+
+      std::vector<ParamsT*> reusable_params(param_count);
+      for (size_t i = 0; i < param_count; i++) {
+        reusable_params[i] = params->DeepCopy(use_buffer_rotation);
+      }
+
+      // for rotating buffer
+      size_t offset = 0;  // shared cursor into reusable_params, advanced by the profiling helpers
+
+      for (size_t i = 0; i < op_names_.size(); i++) {
+        auto* candidate = ops_[op_names_[i]].get(); // borrow pointer
+
+        auto status = candidate->Call(reusable_params[0]);  // probe call: a candidate that does not return OK is skipped as unsupported
+        if (status != OK) {
+          TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+
+        // collect a small profile
+        int approx_num_iter = 3;
+        auto s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+        double approx_duration = s._mean;
+        // bail if too slow
+        if (approx_duration > 1.5 * min_duration_ms) {  // 1st phase: more than 1.5x the best mean so far
+          TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+
+        // 2nd phase skip, more aggressive
+        approx_num_iter = 10;
+        s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+        approx_duration = s._mean;
+        // bail if too slow
+        if (approx_duration > 1.15 * min_duration_ms) {  // 2nd phase: more than 1.15x the best mean so far
+          TUNABLE_LOG3("├──2nd skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+
+        if (do_numerics_check) {
+          ParamsT* numerical_params = params->DeepCopy(false);
+          auto status = candidate->Call(numerical_params);
+          if (status != OK) {
+            numerical_params->Delete();
+            TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+          status = reference_params->NumericalCheck(numerical_params);  // compare this candidate's output against the Default op's reference
+          numerical_params->Delete();
+          if (status != OK) {
+            TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+        }
+
+        // for warmup does user set max duration, max iters, or both?
+        // warmup is skipped by default, i.e. warmup_iter = 0
+        // warmup will be set to the non-zero value of max_warmup_duration
+        // or max_warmup_iter
+        // if both are non-zero, we take the smaller of the two.
+        double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
+        int max_warmup_iter = ctx->GetMaxWarmupIterations();
+        int warmup_iter = 0; // default
+        if (max_warmup_duration > 0) {
+          int duration_iters = max_warmup_duration / approx_duration;  // double quotient truncated to int; NOTE(review): assumes approx_duration > 0 -- confirm
+          if (max_warmup_iter > 0) {
+            warmup_iter = std::min(max_warmup_iter, duration_iters);
+          }
+          else {
+            warmup_iter = duration_iters;
+          }
+        }
+        else if (max_warmup_iter > 0) {
+          warmup_iter = max_warmup_iter;
+        }
+
+        // for tuning does user set max duration, max iters, or both?
+        double max_tuning_duration = ctx->GetMaxTuningDurationMs();
+        int max_tuning_iter = ctx->GetMaxTuningIterations();
+        int tuning_iter = 100; // default
+        if (max_tuning_duration > 0) {
+          int duration_iters = max_tuning_duration / approx_duration;  // double quotient truncated to int; NOTE(review): assumes approx_duration > 0 -- confirm
+          if (max_tuning_iter > 0) {
+            tuning_iter = std::min(max_tuning_iter, duration_iters);
+          }
+          else {
+            tuning_iter = duration_iters;
+          }
+        }
+        else if (max_tuning_iter > 0) {
+          tuning_iter = max_tuning_iter;
+        }
+        // tuning must run at least 1 iteration
+        tuning_iter = std::max(1, tuning_iter);
+
+        // do the full warmup followed by tuning
+        double warmup_ms = warmup_iter * approx_duration;  // estimates, used only in the log line below
+        double tuning_ms = tuning_iter * approx_duration;
+        TUNABLE_LOG3("├──tuning using "
+            "warmup iters ", warmup_iter, " [", warmup_ms, " ms] "
+            "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ",
+            "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]);
+        TUNABLE_LOG3("├──offset at ", offset);
+        WarmUp(candidate, reusable_params, warmup_iter, offset);
+        s = ProfileStats(candidate, reusable_params, tuning_iter, offset);
+        auto s_stddev = s.stddev();  // used only in the log lines below
+        // Assume normal distribution.
+        // Solution with smallest mean + 2*sigma will be a better solution?
+        // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) {
+        if (s._mean < min_duration_ms) {
+          TUNABLE_LOG3("├──found better instance id=", i, ". " , s._mean, "ms. ", op_names_[i],
+                " min ", s._min,
+                " max ", s._max,
+                " mean ", s._mean,
+                " std ", s_stddev);
+          min_duration_ms = s._mean;
+          id_name = op_names_[i];
+          std::string current_soln = std::to_string(s._mean) + " " + op_names_[i];
+          top_solns.push(current_soln);  // only new-best candidates are pushed, so the list holds the last five improvements
+        }
+        else {
+          TUNABLE_LOG3("├──found slower instance id=", i, ". " , s._mean, "ms. ", op_names_[i],
+                " min ", s._min,
+                " max ", s._max,
+                " mean ", s._mean,
+                " std ", s_stddev);
+        }
+      }
+
+      for (size_t i = 0; i < reusable_params.size(); i++) {  // NOTE(review): copies are released only here; if TORCH_CHECK throws during profiling this cleanup is presumably skipped -- confirm
+        reusable_params[i]->Delete();
+      }
+      if (reference_params) {
+        reference_params->Delete();
+      }
+
+      TUNABLE_LOG2("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name);
+      TUNABLE_LOG2("└──top five solutions for ", op_sig, '(', params_sig, ") ");
+      for (auto it = top_solns.rbegin(); it != top_solns.rend(); ++it) {  // newest (fastest found) first
+        TUNABLE_LOG2("   ", *it);
+      }
+      return ResultEntry(id_name, min_duration_ms, blas_sig);
+    }
+
+  private:
+    std::string CreateSignature() {  // Name of the dynamic type of *this: demangled off Windows, raw typeid name on Windows or if demangling fails.
+#ifndef _WIN32
+      const auto* name = typeid(*this).name();
+      // Let __cxa_demangle allocate: a caller buffer must come from malloc (it may be realloc'd/freed), so a stack array is invalid.
+      int status = -1;  // set to 0 by __cxa_demangle on success
+      char* demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
+      std::string sig = (status == 0 && demangled != nullptr) ? demangled : name;  // fall back to the mangled name on failure
+      std::free(demangled);  // the result is malloc-owned; free(nullptr) is a no-op
+      return sig;
+#else
+      return typeid(*this).name();
+#endif
+    }
+
+    mutable c10::once_flag signature_init_once_;
+    std::string signature_;
+
+    std::unordered_map<std::string, std::unique_ptr<Callable<ParamsT>>> ops_;
+    std::vector<std::string> op_names_;
+};
+
+struct OpParams {  // Abstract base for parameter objects: both signature methods are pure virtual.
+  virtual ~OpParams() = default;
+  virtual std::string Signature() const = 0;  // string signature of these parameters
+  virtual std::string BLASSignature() const = 0;  // second, BLAS-specific string signature of these parameters
+};
+
+} // namespace at::cuda::tunable
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..005f4b0a55c787c61c76fbe4acbdc870e8dd9fb5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
@@ -0,0 +1,248 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/hip/HIPCachingAllocator.h>
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10::hip {
+
+// Takes a valid HIPAllocator (of any sort) and turns it into
+// an allocator pretending to be a CUDA allocator.  See
+// Note [Masquerading as CUDA]
+class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator {  // Wraps another HIPAllocator: every call is forwarded to it, and returned DataPtrs are relabelled as CUDA devices.
+  HIPCachingAllocator::HIPAllocator* allocator_;  // wrapped allocator; stored as a raw pointer and never deleted by this class
+public:
+  explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator)
+    : allocator_(allocator) {}
+
+  virtual ~HIPAllocatorMasqueradingAsCUDA() = default;
+
+  // From c10::Allocator
+
+  DataPtr allocate(size_t size) override {  // Allocate through the wrapped allocator, then relabel the result's device type as CUDA.
+    DataPtr r = allocator_->allocate(size);
+    r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));  // device index is kept; only the type changes
+    return r;
+  }
+
+  bool is_simple_data_ptr(const DataPtr& data_ptr) const override {
+    return allocator_->is_simple_data_ptr(data_ptr);
+  }
+
+  DeleterFnPtr raw_deleter() const override {
+    return allocator_->raw_deleter();
+  }
+
+  void copy_data(void* dest, const void* src, std::size_t count) const final {
+    allocator_->copy_data(dest, src, count);
+  }
+
+  // From DeviceAllocator
+
+  bool initialized() override {
+    return allocator_->initialized();
+  }
+
+  void emptyCache(MempoolId_t mempool_id = {0, 0}) override {
+    allocator_->emptyCache(mempool_id);
+  }
+
+  void recordStream(const DataPtr& ptr, c10::Stream stream) override {  // Generic-stream overload: converts to HIPStream and defers to the HIPStream overload of this class.
+    HIPStream hip_stream = HIPStream(stream);
+    recordStream(ptr, hip_stream);
+  }
+
+  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override {
+    return allocator_->getDeviceStats(device);
+  }
+
+  void resetAccumulatedStats(c10::DeviceIndex device) override {
+    allocator_->resetAccumulatedStats(device);
+  }
+
+  void resetPeakStats(c10::DeviceIndex device) override {
+    allocator_->resetPeakStats(device);
+  }
+
+  // From CUDAAllocator
+
+  void* raw_alloc(size_t nbytes) override {
+    return allocator_->raw_alloc(nbytes);
+  }
+
+  void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override {
+    return allocator_->raw_alloc_with_stream(nbytes, stream);
+  }
+
+  void raw_delete(void* ptr) override {
+    allocator_->raw_delete(ptr);
+  }
+
+  void init(int device_count) override {
+    allocator_->init(device_count);
+  }
+
+  double getMemoryFraction(c10::DeviceIndex device) override {
+    return allocator_->getMemoryFraction(device);
+  }
+
+  void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
+    allocator_->setMemoryFraction(fraction, device);
+  }
+
+  std::vector<HIPCachingAllocator::StreamSegmentSize> getExpandableSegmentSizes(c10::DeviceIndex device) override {
+    return allocator_->getExpandableSegmentSizes(device);
+  }
+
+  void enable(bool value) override {
+    allocator_->enable(value);
+  }
+
+  bool isEnabled() const override {
+    return allocator_->isEnabled();
+  }
+
+  void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
+    allocator_->cacheInfo(device, largestBlock);
+  }
+
+  void* getBaseAllocation(void* ptr, size_t* size) override {
+    return allocator_->getBaseAllocation(ptr, size);
+  }
+
+  void recordStream(const DataPtr& ptr, HIPStream stream) override {  // HIPStream overload: plain forward to the wrapped allocator.
+    allocator_->recordStream(ptr, stream);
+  }
+
+  HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override {
+    return allocator_->snapshot(mempool_id);
+  }
+
+  void beginAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      std::function<bool(hipStream_t)> filter) override {
+    allocator_->beginAllocateToPool(device, mempool_id, filter);
+  }
+
+  void endAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id) override {
+    allocator_->endAllocateToPool(device, mempool_id);
+  }
+
+  void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->releasePool(device, mempool_id);
+  }
+
+  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    return allocator_->getPoolUseCount(device, mempool_id);
+  }
+
+  void createOrIncrefPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      HIPAllocator* allocator = nullptr) override {
+    allocator_->createOrIncrefPool(device, mempool_id, allocator);
+  }
+
+  void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->setUseOnOOM(device, mempool_id);
+  }
+
+  void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->setNoSplit(device, mempool_id);
+  }
+
+  bool checkPoolLiveAllocations(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      const std::unordered_set<void*>& expected_live_allocations) override {
+    return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations);
+  }
+
+  HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override {
+    return allocator_->shareIpcHandle(ptr);
+  }
+
+  std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
+    return allocator_->getIpcDevPtr(handle);
+  }
+
+  bool isHistoryEnabled() override {
+    return allocator_->isHistoryEnabled();
+  }
+
+  void recordHistory(
+      bool enabled,
+      HIPCachingAllocator::CreateContextFn context_recorder,
+      size_t alloc_trace_max_entries,
+      HIPCachingAllocator::RecordContext when,
+      bool clearHistory) override {
+    allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
+  }
+
+  void recordAnnotation(
+      const std::vector<std::pair<std::string, std::string>>& md) override {
+    allocator_->recordAnnotation(md);
+  }
+
+  void pushCompileContext(std::string& md) override {
+    allocator_->pushCompileContext(md);
+  }
+
+  void popCompileContext() override {
+    allocator_->popCompileContext();
+  }
+
+  void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override {
+    allocator_->attachOutOfMemoryObserver(observer);
+  }
+
+  void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override {
+    allocator_->attachAllocatorTraceTracker(tracker);
+  }
+
+  void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override {
+    allocator_->enablePeerAccess(dev, dev_to_access);
+  }
+
+  hipError_t memcpyAsync(
+      void* dst,
+      int dstDevice,
+      const void* src,
+      int srcDevice,
+      size_t count,
+      hipStream_t stream,
+      bool p2p_enabled) override {
+    return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
+  }
+
+  std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
+      c10::DeviceIndex device,
+      MempoolId_t id) override {
+    return allocator_->getCheckpointState(device, id);
+  }
+
+  HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
+      c10::DeviceIndex device,
+      std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) override {  // Forward, then relabel each DataPtr in dataptrs_allocd as CUDA.
+    auto cpd = allocator_->setCheckpointPoolState(device, pps);
+    for (auto& ptr : cpd.dataptrs_allocd) {
+      ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index()));  // same relabelling as allocate(): index kept, type set to CUDA
+    }
+    return cpd;
+  }
+
+  std::string name() override {
+    return allocator_->name();
+  }
+
+};
+
+} // namespace c10::hip
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f0214ee3c8c9d23a07aa2070f92e78bdbc326a8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
@@ -0,0 +1,203 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/hip/HIPCachingAllocator.h>
+#include <ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+
+namespace c10 {
+// forward declaration
+class DataPtr;
+namespace hip {
+namespace HIPCachingAllocatorMasqueradingAsCUDA {  // Free-function facade: each inline wrapper below forwards to the allocator returned by get().
+
+C10_HIP_API HIPCachingAllocator::HIPAllocator* get();  // declared here, defined elsewhere; every wrapper below calls it
+C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream);  // declared here, defined elsewhere
+
+inline void* raw_alloc(size_t nbytes) {
+  return get()->raw_alloc(nbytes);
+}
+
+inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) {
+  return get()->raw_alloc_with_stream(nbytes, stream);
+}
+
+inline void raw_delete(void* ptr) {
+  return get()->raw_delete(ptr);
+}
+
+inline void init(int device_count) {
+  return get()->init(device_count);
+}
+
+inline double getMemoryFraction(c10::DeviceIndex device) {
+  return get()->getMemoryFraction(device);
+}
+
+inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
+  return get()->setMemoryFraction(fraction, device);
+}
+
+inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
+  return get()->emptyCache(mempool_id);
+}
+
+inline void enable(bool value) {
+  return get()->enable(value);
+}
+
+inline bool isEnabled() {
+  return get()->isEnabled();
+}
+
+inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
+  return get()->cacheInfo(device, largestBlock);
+}
+
+inline void* getBaseAllocation(void* ptr, size_t* size) {
+  return get()->getBaseAllocation(ptr, size);
+}
+
+inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
+    c10::DeviceIndex device) {
+  return get()->getDeviceStats(device);
+}
+
+inline void resetAccumulatedStats(c10::DeviceIndex device) {
+  return get()->resetAccumulatedStats(device);
+}
+
+inline void resetPeakStats(c10::DeviceIndex device) {
+  return get()->resetPeakStats(device);
+}
+
+inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) {
+  return get()->snapshot(mempool_id);
+}
+
+inline std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
+    c10::DeviceIndex device,
+    MempoolId_t id) {
+  return get()->getCheckpointState(device, id);
+}
+
+inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
+    c10::DeviceIndex device,
+    std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) {
+  return get()->setCheckpointPoolState(device, std::move(pps));  // by-value argument is moved into the call
+}
+
+inline void beginAllocateToPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    std::function<bool(hipStream_t)> filter) {
+  get()->beginAllocateToPool(device, mempool_id, std::move(filter));  // by-value argument is moved into the call
+}
+
+inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->endAllocateToPool(device, mempool_id);
+}
+
+inline void recordHistory(
+    bool enabled,
+    HIPCachingAllocator::CreateContextFn context_recorder,
+    size_t alloc_trace_max_entries,
+    HIPCachingAllocator::RecordContext when,
+    bool clearHistory) {
+  return get()->recordHistory(
+      enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
+}
+
+inline void recordAnnotation(
+    const std::vector<std::pair<std::string, std::string>>& md) {
+  return get()->recordAnnotation(md);
+}
+
+inline void pushCompileContext(std::string& md) {
+  return get()->pushCompileContext(md);
+}
+
+inline void popCompileContext() {
+  return get()->popCompileContext();
+}
+
+inline bool isHistoryEnabled() {
+  return get()->isHistoryEnabled();
+}
+
+inline bool checkPoolLiveAllocations(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    const std::unordered_set<void*>& expected_live_allocations) {
+  return get()->checkPoolLiveAllocations(
+      device, mempool_id, expected_live_allocations);
+}
+
+inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) {
+  return get()->attachOutOfMemoryObserver(std::move(observer));  // by-value argument is moved into the call
+}
+
+inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) {
+  return get()->attachAllocatorTraceTracker(std::move(tracker));  // by-value argument is moved into the call
+}
+
+inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return get()->releasePool(device, mempool_id);
+}
+
+inline void createOrIncrefPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) {
+  get()->createOrIncrefPool(device, mempool_id, allocator_ptr);
+}
+
+inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->setUseOnOOM(device, mempool_id);
+}
+
+inline void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->setNoSplit(device, mempool_id);
+}
+
+inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return get()->getPoolUseCount(device, mempool_id);
+}
+
+inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
+  return get()->getIpcDevPtr(std::move(handle));  // by-value argument is moved into the call
+}
+
+inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) {
+  return get()->shareIpcHandle(ptr);
+}
+
+inline std::string name() {
+  return get()->name();
+}
+
+inline hipError_t memcpyAsync(
+    void* dst,
+    int dstDevice,
+    const void* src,
+    int srcDevice,
+    size_t count,
+    hipStream_t stream,
+    bool p2p_enabled) {
+  return get()->memcpyAsync(
+      dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
+}
+
+inline void enablePeerAccess(
+    c10::DeviceIndex dev,
+    c10::DeviceIndex dev_to_access) {
+  return get()->enablePeerAccess(dev, dev_to_access);
+}
+
+} // namespace HIPCachingAllocatorMasqueradingAsCUDA
+} // namespace hip
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..be82a7c22e3f62e77ef9b9f232fe1f1ce864efa8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
@@ -0,0 +1,388 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/hip/HIPConfig.h>
+
+// The includes of HIPGuard.h
+#include <c10/hip/impl/HIPGuardImpl.h>
+#include <c10/hip/HIPMacros.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/core/impl/InlineStreamGuard.h>
+#include <c10/util/Exception.h>
+
+#include <c10/hip/impl/HIPGuardImpl.h>
+
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10 { namespace hip {
+
+// Note [Masquerading as CUDA]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// c10_hip is very easy to understand: it is HIPified from c10_cuda,
+// and anywhere you said CUDA, the source code now says HIP.  HIPified
+// PyTorch is much harder to understand: it is HIPified from regular
+// PyTorch, yes, but NO source-to-source translation from CUDA to
+// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
+// For example, when you use HIPified PyTorch, you say x.cuda() to
+// move a tensor onto ROCm device.  We call this situation "HIP
+// masquerading as CUDA".
+//
+// This leads to a very awkward situation when we want to call c10_hip
+// code from PyTorch, since c10_hip is expecting things to be called
+// HIP, but PyTorch is calling them CUDA (HIP masquerading as CUDA).  To
+// fix this impedance mismatch, we have MasqueradingAsCUDA variants
+// for all c10_hip classes.  These translate between the "HIP" and "HIP
+// masquerading as CUDA" worlds.  For example,
+// HIPGuardImplMasqueradingAsCUDA (this file) provides something like a
+// HIPGuardImpl, but it reports its DeviceType as CUDA (e.g., type()
+// returns CUDA, getDevice() reports the current HIP device as a CUDA
+// device.)
+//
+// We should be able to delete all of these classes entirely once
+// we switch PyTorch to calling a HIP a HIP.
+//
+// When you add a new MasqueradingAsCUDA class/function, you need to
+// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py
+//
+//
+//
+// By the way, note that the cpp file associated with this also
+// *overwrites* the entry in the DeviceGuardImpl registry for CUDA with
+// this HIP implementation.
+
+// DeviceGuardImplInterface implementation that drives the HIP runtime while
+// reporting every device and stream as DeviceType::CUDA.
+struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplInterface {
+  // Device type this implementation presents to callers.
+  static constexpr c10::DeviceType static_type = c10::DeviceType::CUDA;
+  HIPGuardImplMasqueradingAsCUDA() {}
+  // Asserts that the requested device type is CUDA.
+  HIPGuardImplMasqueradingAsCUDA(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::CUDA);
+  }
+  c10::DeviceType type() const override {
+    return c10::DeviceType::CUDA;
+  }
+  // Makes `d` the current HIP device and returns the previously current one.
+  // hipSetDevice is skipped when the index is already current.
+  Device exchangeDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_cuda());
+    Device old_device = getDevice();
+    if (old_device.index() != d.index()) {
+      C10_HIP_CHECK(hipSetDevice(d.index()));
+    }
+    return old_device;
+  }
+  // Returns the current HIP device, labelled as a CUDA device.
+  Device getDevice() const override {
+    int device;
+    C10_HIP_CHECK(hipGetDevice(&device));
+    return Device(c10::DeviceType::CUDA, device);
+  }
+  // Makes `d` the current HIP device; `d` must be a CUDA-typed device.
+  void setDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_cuda());
+    C10_HIP_CHECK(hipSetDevice(d.index()));
+  }
+  // noexcept variant of setDevice: failures go through C10_HIP_CHECK_WARN
+  // instead of the throwing check, and the device type is not asserted.
+  void uncheckedSetDevice(Device d) const noexcept override {
+    C10_HIP_CHECK_WARN(hipSetDevice(d.index()));
+  }
+  // Current stream for device `d`, unwrapped to a CUDA-typed c10::Stream.
+  Stream getStream(Device d) const override {
+    return getCurrentHIPStreamMasqueradingAsCUDA(d.index()).unwrap();
+  }
+  Stream getDefaultStream(Device d) const override {
+    return getDefaultHIPStreamMasqueradingAsCUDA(d.index());
+  }
+  // `priority` is an int, so this selects the int-priority pool overload.
+  Stream getNewStream(Device d, int priority = 0) const override {
+    return getStreamFromPoolMasqueradingAsCUDA(priority, d.index());
+  }
+  // `isHighPriority` is a bool, so this selects the bool pool overload.
+  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override {
+    return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index());
+  }
+  // Installs `s` as the current stream and returns the stream that was
+  // current on s's device beforehand.
+  Stream exchangeStream(Stream s) const override {
+    HIPStreamMasqueradingAsCUDA cs(s);
+    auto old_stream = getCurrentHIPStreamMasqueradingAsCUDA(s.device().index());
+    setCurrentHIPStreamMasqueradingAsCUDA(cs);
+    return old_stream.unwrap();
+  }
+  // Number of HIP devices, or 0 if the query fails.
+  DeviceIndex deviceCount() const noexcept override {
+    // Start from zero so a failed query never yields an indeterminate value.
+    int deviceCnt = 0;
+    const hipError_t _err = hipGetDeviceCount(&deviceCnt);
+    if (_err == hipSuccess) {
+      return static_cast<DeviceIndex>(deviceCnt);
+    }
+    if (_err != hipErrorNoDevice) {
+      // This method is noexcept, so a throwing check would terminate the
+      // process.  Warn instead, as the other noexcept methods here do.
+      C10_HIP_CHECK_WARN(_err);
+    }
+    return 0;
+  }
+
+  // Event-related functions
+  // Note: hipEventCreateWithFlags should be called on the same device as
+  //  the recording stream's device.
+  // Creates a HIP event in *hip_event, translating the PyTorch event flag:
+  // PYTORCH_DEFAULT -> hipEventDisableTiming, BACKEND_DEFAULT -> hipEventDefault.
+  void createEvent(
+    hipEvent_t* hip_event,
+    const EventFlag flag) const {
+    // Maps PyTorch's Event::Flag to HIP flag
+    auto hip_flag = hipEventDefault;
+    switch (flag) {
+      case EventFlag::PYTORCH_DEFAULT:
+        hip_flag = hipEventDisableTiming;
+        break;
+      case EventFlag::BACKEND_DEFAULT:
+        hip_flag = hipEventDefault;
+        break;
+      default:
+        TORCH_CHECK(false, "HIP event received unknown flag");
+    }
+
+    C10_HIP_CHECK(hipEventCreateWithFlags(hip_event, hip_flag));
+  }
+
+  // Destroys `event` (no-op for null) with `device_index` made current for
+  // the call, then switches back.  All HIP failures are warnings only.
+  void destroyEvent(
+    void* event,
+    const DeviceIndex device_index) const noexcept override {
+    if (!event) return;
+    auto hip_event = static_cast<hipEvent_t>(event);
+    int orig_device;
+    C10_HIP_CHECK_WARN(hipGetDevice(&orig_device));
+    C10_HIP_CHECK_WARN(hipSetDevice(device_index));
+    C10_HIP_CHECK_WARN(hipEventDestroy(hip_event));
+    C10_HIP_CHECK_WARN(hipSetDevice(orig_device));
+  }
+
+  // Records *event on `stream`, creating the event lazily if *event is null.
+  // `device_index` must be -1 or equal to the stream's device index.
+  // NOTE(review): if event creation or recording throws, the current device
+  // is left set to the stream's device.
+  void record(void** event,
+    const Stream& stream,
+    const DeviceIndex device_index,
+    const EventFlag flag) const override {
+    TORCH_CHECK(device_index == -1 || device_index == stream.device_index(),
+      "Event device index ",
+      device_index,
+      " does not match recording stream's device index ",
+      stream.device_index(),
+      ".");
+
+    hipEvent_t hip_event = static_cast<hipEvent_t>(*event);
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+
+    // Moves to stream's device to record
+    const auto orig_device = getDevice();
+    setDevice(stream.device());
+
+    // Creates the event (lazily)
+    if (!hip_event) createEvent(&hip_event, flag);
+    C10_HIP_CHECK(hipEventRecord(hip_event, hip_stream));
+    // Makes the void* point to the (possibly just allocated) HIP event
+    *event = hip_event;
+
+    // Resets device
+    setDevice(orig_device);
+  }
+
+  // Makes `stream` wait on `event` (no-op for a null event), with the
+  // stream's device made current for the duration of the call.
+  void block(
+    void* event,
+    const Stream& stream) const override {
+    if (!event) return;
+    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    const auto orig_device = getDevice();
+    setDevice(stream.device());
+    C10_HIP_CHECK(hipStreamWaitEvent(
+      hip_stream,
+      hip_event,
+      /*flags (must be zero)=*/ 0));
+    setDevice(orig_device);
+  }
+
+  // Returns true for a null event or when hipEventQuery reports success;
+  // hipErrorNotReady yields false, any other error goes through C10_HIP_CHECK.
+  bool queryEvent(void* event) const override {
+    if (!event) return true;
+    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
+    const hipError_t err = hipEventQuery(hip_event);
+    if (err != hipErrorNotReady) C10_HIP_CHECK(err);
+    else {
+      // ignore and clear the error if not ready
+      (void)hipGetLastError();
+    }
+    return (err == hipSuccess);
+  }
+
+  // Stream-related functions
+  bool queryStream(const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    return hip_stream.query();
+  }
+
+  void synchronizeStream(const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    hip_stream.synchronize();
+  }
+
+  // Blocks until `event` completes; a null event returns immediately.
+  void synchronizeEvent(void* event) const override {
+    if (!event)
+      return;
+    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
+    C10_HIP_CHECK(hipEventSynchronize(hip_event));
+  }
+
+  // Note: synchronizeDevice can be safely called from any device
+  void synchronizeDevice(const c10::DeviceIndex device_index) const override {
+    int orig_device{-1};
+    C10_HIP_CHECK(hipGetDevice(&orig_device));
+    C10_HIP_CHECK(hipSetDevice(device_index));
+    C10_HIP_CHECK(hipDeviceSynchronize());
+    C10_HIP_CHECK(hipSetDevice(orig_device));
+  }
+
+  // Hands `data_ptr` and the HIP view of `stream` to the caching allocator's
+  // recordStreamMasqueradingAsCUDA.
+  void recordDataPtrOnStream(
+    const c10::DataPtr& data_ptr,
+    const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream);
+  }
+
+  // Returns the time in milliseconds between two recorded events, measured
+  // with `device_index` made current.  Both events must be non-null.
+  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
+      const override {
+    TORCH_CHECK(
+        event1 && event2,
+        "Both events must be recorded before calculating elapsed time.");
+    int orig_device;
+    C10_HIP_CHECK(hipGetDevice(&orig_device));
+    C10_HIP_CHECK(hipSetDevice(device_index));
+    hipEvent_t hip_event1 = static_cast<hipEvent_t>(event1);
+    hipEvent_t hip_event2 = static_cast<hipEvent_t>(event2);
+    float time_ms = 0;
+    // hipErrorNotReady is reported if either event is recorded but not yet
+    // completed.  Capture the status instead of checking it immediately so
+    // the caller's device is restored before any exception is raised.
+    const hipError_t elapsed_err =
+        hipEventElapsedTime(&time_ms, hip_event1, hip_event2);
+    C10_HIP_CHECK(hipSetDevice(orig_device));
+    C10_HIP_CHECK(elapsed_err);
+    return static_cast<double>(time_ms);
+  }
+};
+
+// All of the guards which have HIPGuardImpl burned in need to also have
+// variants using HIPGuardImplMasqueradingAsCUDA.
+
+/// This code is all a direct copy of the CUDA guard classes (originally
+/// c10/cuda/CUDAGuard.h; the file name was mangled by hipification), but with
+/// the correct InlineDeviceGuard burned in.  Sorry about the copy-pasting.
+
+// Device guard built on InlineDeviceGuard<HIPGuardImplMasqueradingAsCUDA>.
+// Every member forwards to the wrapped guard; the type can be neither
+// default-constructed, copied nor moved.
+struct HIPGuardMasqueradingAsCUDA {
+  explicit HIPGuardMasqueradingAsCUDA() = delete;
+  explicit HIPGuardMasqueradingAsCUDA(DeviceIndex device_index) : guard_(device_index) {}
+  explicit HIPGuardMasqueradingAsCUDA(Device device) : guard_(device) {}
+
+  // Copying and moving are disabled.
+  HIPGuardMasqueradingAsCUDA(const HIPGuardMasqueradingAsCUDA&) = delete;
+  HIPGuardMasqueradingAsCUDA& operator=(const HIPGuardMasqueradingAsCUDA&) = delete;
+  HIPGuardMasqueradingAsCUDA(HIPGuardMasqueradingAsCUDA&& other) = delete;
+  HIPGuardMasqueradingAsCUDA& operator=(HIPGuardMasqueradingAsCUDA&& other) = delete;
+
+  // Straight pass-throughs to the wrapped InlineDeviceGuard.
+  void set_device(Device device) { guard_.set_device(device); }
+  void reset_device(Device device) { guard_.reset_device(device); }
+  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
+  Device original_device() const { return guard_.original_device(); }
+  Device current_device() const { return guard_.current_device(); }
+
+ private:
+  // The wrapped guard that does the actual work.
+  c10::impl::InlineDeviceGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+// Optional device guard built on
+// InlineOptionalDeviceGuard<HIPGuardImplMasqueradingAsCUDA>.  It may be
+// default-constructed or constructed from an optional device / device index;
+// every member forwards to the wrapped guard.
+struct OptionalHIPGuardMasqueradingAsCUDA {
+  explicit OptionalHIPGuardMasqueradingAsCUDA() : guard_() {}
+  explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional<Device> device_opt) : guard_(device_opt) {}
+  explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional<DeviceIndex> device_index_opt) : guard_(device_index_opt) {}
+
+  // Copying and moving are disabled.
+  OptionalHIPGuardMasqueradingAsCUDA(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA& operator=(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA& operator=(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
+
+  // Straight pass-throughs to the wrapped InlineOptionalDeviceGuard.
+  void set_device(Device device) { guard_.set_device(device); }
+  void reset_device(Device device) { guard_.reset_device(device); }
+  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
+  std::optional<Device> original_device() const { return guard_.original_device(); }
+  std::optional<Device> current_device() const { return guard_.current_device(); }
+  void reset() { guard_.reset(); }
+
+private:
+  // The wrapped guard that does the actual work.
+  c10::impl::InlineOptionalDeviceGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+// Stream guard built on InlineStreamGuard<HIPGuardImplMasqueradingAsCUDA>.
+// It must be constructed from a Stream and can be neither copied nor moved.
+struct HIPStreamGuardMasqueradingAsCUDA {
+  explicit HIPStreamGuardMasqueradingAsCUDA() = delete;
+  explicit HIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
+  HIPStreamGuardMasqueradingAsCUDA(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPStreamGuardMasqueradingAsCUDA& operator=(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPStreamGuardMasqueradingAsCUDA(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+  HIPStreamGuardMasqueradingAsCUDA& operator=(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+  void reset_stream(Stream stream) { guard_.reset_stream(stream); }
+
+  // The accessors below re-wrap the guard's c10::Stream using the UNCHECKED
+  // constructor, i.e. without re-validating the stream's device type.
+  HIPStreamMasqueradingAsCUDA original_stream() const {
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.original_stream());
+  }
+  HIPStreamMasqueradingAsCUDA current_stream() const {
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.current_stream());
+  }
+
+  Device current_device() const { return guard_.current_device(); }
+  Device original_device() const { return guard_.original_device(); }
+
+private:
+  // The wrapped guard that does the actual work.
+  c10::impl::InlineStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+// Optional stream guard built on
+// InlineOptionalStreamGuard<HIPGuardImplMasqueradingAsCUDA>.  It may start
+// out empty; the stream accessors return std::nullopt while the wrapped guard
+// reports no stream.
+struct OptionalHIPStreamGuardMasqueradingAsCUDA {
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA() : guard_() {}
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(std::optional<Stream> stream_opt) : guard_(stream_opt) {}
+
+  // Copying and moving are disabled.
+  OptionalHIPStreamGuardMasqueradingAsCUDA(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+  void reset_stream(Stream stream) {
+    guard_.reset_stream(stream);
+  }
+
+  // Stream reported by the wrapped guard as the original one, re-wrapped
+  // without validation; std::nullopt when the guard holds none.
+  std::optional<HIPStreamMasqueradingAsCUDA> original_stream() const {
+    auto maybe_stream = guard_.original_stream();
+    if (!maybe_stream.has_value()) {
+      return std::nullopt;
+    }
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, maybe_stream.value());
+  }
+
+  // Stream reported by the wrapped guard as the current one, re-wrapped
+  // without validation; std::nullopt when the guard holds none.
+  std::optional<HIPStreamMasqueradingAsCUDA> current_stream() const {
+    auto maybe_stream = guard_.current_stream();
+    if (!maybe_stream.has_value()) {
+      return std::nullopt;
+    }
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, maybe_stream.value());
+  }
+
+  void reset() {
+    guard_.reset();
+  }
+
+private:
+  // The wrapped guard that does the actual work.
+  c10::impl::InlineOptionalStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+// Multi-stream guard built on
+// InlineMultiStreamGuard<HIPGuardImplMasqueradingAsCUDA>.  The masquerading
+// streams are converted to plain c10::Stream values before being handed to
+// the wrapped guard.
+struct HIPMultiStreamGuardMasqueradingAsCUDA {
+  explicit HIPMultiStreamGuardMasqueradingAsCUDA(ArrayRef<HIPStreamMasqueradingAsCUDA> streams)
+    : guard_(unwrapStreams(streams)) {}
+
+  // Copying and moving are disabled.
+  HIPMultiStreamGuardMasqueradingAsCUDA(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+private:
+  // The wrapped guard that does the actual work.
+  c10::impl::InlineMultiStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+
+  // Converts each masquerading stream to a c10::Stream through its
+  // conversion operator, preserving order.
+  static std::vector<Stream> unwrapStreams(ArrayRef<HIPStreamMasqueradingAsCUDA> hipStreams) {
+    std::vector<Stream> unwrapped;
+    unwrapped.reserve(hipStreams.size());
+    for (const HIPStreamMasqueradingAsCUDA& masked : hipStreams) {
+      unwrapped.push_back(static_cast<Stream>(masked));
+    }
+    return unwrapped;
+  }
+};
+
+}} // namespace c10::hip
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..48f1459396b82283290e457a16f9ec66ee500601
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
@@ -0,0 +1,140 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/hip/HIPStream.h>
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10 { namespace hip {
+
+// See Note [Masquerading as CUDA] for motivation
+
+// Wraps a HIPStream and presents it to callers as a CUDA stream: device()
+// and device_type() report CUDA, while the stored stream is a real HIPStream.
+class HIPStreamMasqueradingAsCUDA {
+public:
+
+  // Tag selecting the constructor that skips the device-type check.
+  enum Unchecked { UNCHECKED };
+
+  // Builds from a "CUDA" c10::Stream and verifies that its device really is
+  // CUDA-typed.
+  explicit HIPStreamMasqueradingAsCUDA(Stream stream)
+    : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) {
+    // We did the coercion unchecked; check that it was right.
+    TORCH_CHECK(stream.device().is_cuda() /* !!! */);
+  }
+
+  // Builds from a c10::Stream without validation: the stream is rebuilt with
+  // DeviceType::HIP (same device index and stream id) and wrapped as a
+  // HIPStream.
+  explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream)
+    // Unsafely coerce the "CUDA" stream into a HIP stream
+    : stream_(
+        HIPStream(
+          Stream(
+            Stream::UNSAFE,
+            Device(c10::DeviceType::HIP, stream.device_index()),
+            stream.id())
+        )
+      ) {}
+
+  // New constructor, just for this.  Does NOT coerce.
+  explicit HIPStreamMasqueradingAsCUDA(HIPStream stream) : stream_(stream) {}
+
+  bool operator==(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
+    return stream_ == other.stream_;
+  }
+
+  bool operator!=(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
+    return stream_ != other.stream_;
+  }
+
+  // Implicit conversion to the raw HIP stream handle.
+  operator hipStream_t() const {
+    return stream();
+  }
+
+  // Implicit conversion to a "CUDA" c10::Stream; same result as unwrap().
+  operator Stream() const {
+    return unwrap();
+  }
+
+  DeviceIndex device_index() const {
+    return stream_.device_index();
+  }
+
+  // Unsafely coerce HIP device into CUDA device
+  c10::DeviceType device_type() const {
+    return c10::DeviceType::CUDA;
+  }
+
+  // The stream's device, reported with DeviceType::CUDA.
+  Device device() const {
+    return Device(device_type(), device_index());
+  }
+
+  // Pass-throughs to the wrapped HIPStream.
+  StreamId id() const {
+    return stream_.id();
+  }
+  bool query() const {
+    return stream_.query();
+  }
+  void synchronize() const {
+    stream_.synchronize();
+  }
+  int priority() const {
+    return stream_.priority();
+  }
+  hipStream_t stream() const {
+    return stream_.stream();
+  }
+
+  // Rebuilds a c10::Stream carrying the CUDA-typed device and this stream's id.
+  Stream unwrap() const {
+    // Unsafely coerce HIP stream into "CUDA" stream
+    return Stream(Stream::UNSAFE, device(), id());
+  }
+
+  c10::StreamData3 pack3() const noexcept {
+    // Unsafely coerce HIP stream into "CUDA" stream before packing
+    return unwrap().pack3();
+  }
+
+  // Inverse of pack3(): goes through the checked Stream constructor.
+  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
+                                             DeviceIndex device_index,
+                                             c10::DeviceType device_type) {
+    // NB: constructor manages CUDA->HIP translation for us
+    return HIPStreamMasqueradingAsCUDA(
+        Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+  static std::tuple<int, int> priority_range() {
+    return HIPStream::priority_range();
+  }
+
+  // New method, gets the underlying HIPStream
+  HIPStream hip_stream() const {
+    return stream_;
+  }
+
+private:
+  // The real HIP stream being wrapped.
+  HIPStream stream_;
+};
+
+// Wraps the result of getStreamFromPool(isHighPriority, device) in a
+// HIPStreamMasqueradingAsCUDA.
+inline HIPStreamMasqueradingAsCUDA getStreamFromPoolMasqueradingAsCUDA(
+    const bool isHighPriority = false,
+    DeviceIndex device = -1) {
+  auto pooled = getStreamFromPool(isHighPriority, device);
+  return HIPStreamMasqueradingAsCUDA(pooled);
+}
+
+// Wraps the result of getStreamFromPool(priority, device) (integer-priority
+// overload) in a HIPStreamMasqueradingAsCUDA.
+inline HIPStreamMasqueradingAsCUDA getStreamFromPoolMasqueradingAsCUDA(
+    const int priority,
+    DeviceIndex device = -1) {
+  auto pooled = getStreamFromPool(priority, device);
+  return HIPStreamMasqueradingAsCUDA(pooled);
+}
+
+// Wraps the result of getStreamFromExternal(ext_stream, device) in a
+// HIPStreamMasqueradingAsCUDA.
+inline HIPStreamMasqueradingAsCUDA getStreamFromExternalMasqueradingAsCUDA(
+    hipStream_t ext_stream,
+    DeviceIndex device) {
+  auto external = getStreamFromExternal(ext_stream, device);
+  return HIPStreamMasqueradingAsCUDA(external);
+}
+
+// Wraps getDefaultHIPStream(device_index) in a HIPStreamMasqueradingAsCUDA.
+// The default index of -1 is passed through unchanged.
+inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
+  return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index));
+}
+
+// Wraps getCurrentHIPStream(device_index) in a HIPStreamMasqueradingAsCUDA.
+// The default index of -1 is passed through unchanged.
+inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
+  return HIPStreamMasqueradingAsCUDA(getCurrentHIPStream(device_index));
+}
+
+// Passes the underlying HIPStream of `stream` to setCurrentHIPStream().
+inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) {
+  setCurrentHIPStream(stream.hip_stream());
+}
+
+// Streams the underlying HIPStream followed by the literal suffix
+// " (masquerading as CUDA)", and returns the output stream for chaining.
+inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) {
+  stream << s.hip_stream() << " (masquerading as CUDA)";
+  return stream;
+}
+
+}} // namespace c10::hip
+
+namespace std {
+  // Hash support for HIPStreamMasqueradingAsCUDA: hashes the unwrapped
+  // ("CUDA"-typed) c10::Stream using std::hash<c10::Stream>.
+  template <>
+  struct hash<c10::hip::HIPStreamMasqueradingAsCUDA> {
+    size_t operator()(c10::hip::HIPStreamMasqueradingAsCUDA s) const noexcept {
+      return std::hash<c10::Stream>{}(s.unwrap());
+    }
+  };
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1d088a57f4a74af873ff240661852c76af3e144
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h
@@ -0,0 +1,17 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/core/IListRef.h>
+
+namespace at::native {
+
+// Function-pointer type for the serial cat kernel, plus the dispatch stub
+// declared for it.
+// NOTE(review): the arguments are presumably (result, input tensors, dim) —
+// confirm against the kernel definition.
+using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t);
+DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub)
+
+} // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..314bc2c06d7acdd436d11bc9d20eb4553efee103
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h
@@ -0,0 +1,19 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/native/TensorIterator.h>
+
+namespace at {
+// Forward declaration used by direct_copy_kernel's signature.
+struct TensorIteratorBase;
+
+namespace native {
+// Inline namespace named by the CPU_CAPABILITY macro.
+inline namespace CPU_CAPABILITY {
+
+// Declarations only; the definitions are not part of this header.
+void direct_copy_kernel(TensorIteratorBase &iter);
+// The second (bool) parameter is unnamed here; its comment labels it
+// "non_blocking".
+void copy_kernel(TensorIterator& iter, bool /*non_blocking*/);
+
+}}}  // namespace at::native::CPU_CAPABILITY
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h
new file mode 100644
index 0000000000000000000000000000000000000000..86cf48ff2a6823ccb27987121fd797d75e2b70e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h
@@ -0,0 +1,430 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/CPUApplyUtils.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
+#include <ATen/ExpandBase.h>
+#include <ATen/core/DistributionsHelper.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cpu/Loops.h>
+#include <mutex>
+
+#ifdef CPU_CAPABILITY_AVX2
+#include <ATen/native/cpu/avx_mathfun.h>
+#include <c10/util/irange.h>
+#endif
+
+
+
+
+namespace at::native::templates::cpu {
+namespace {
+
+// ==================================================== Random ========================================================
+
+// Fills every element of `iter` with a draw from
+// uniform_int_from_to_distribution<scalar_t>(range, base).
+// The generator's mutex is held for the whole serial fill.
+template<typename RNG>
+void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) {
+  AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cpu", AT_WRAP([&] {
+    std::lock_guard<std::mutex> guard(generator->mutex_);
+    cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t {
+      uniform_int_from_to_distribution<scalar_t> dist(range, base);
+      return dist(generator);
+    });
+  }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+}
+
+// Special-case kernel for exactly one range:
+//   from (inclusive) = std::numeric_limits<int64_t>::lowest()
+//   to   (exclusive) = None (= std::numeric_limits<int64_t>::max() + 1)
+// Only int64, double, float and bfloat16 are handled; any other dispatched
+// dtype fails the TORCH_CHECK below.
+template<typename RNG>
+void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] {
+    constexpr bool is_supported_dtype =
+        std::is_same_v<scalar_t, int64_t> ||
+        std::is_same_v<scalar_t, double> ||
+        std::is_same_v<scalar_t, float> ||
+        std::is_same_v<scalar_t, at::BFloat16>;
+    if constexpr (!is_supported_dtype) {
+      TORCH_CHECK(false, "random_full_64_bits_range_kernel_cpu handles only int64, double, float and bfloat16");
+    } else {
+      // The generator's mutex is held for the whole serial fill.
+      std::lock_guard<std::mutex> guard(generator->mutex_);
+      cpu_serial_kernel(iter, [generator]() -> scalar_t {
+        uniform_int_full_range_distribution<scalar_t> dist;
+        return dist(generator);
+      });
+    }
+  });
+}
+
+// Functor exposing both "random from/to" kernels.  The optional generator is
+// resolved through check_generator<RNG> before being handed to the kernel.
+template<typename RNG>
+struct RandomFromToKernel {
+  // Bounded case: forwards to random_from_to_kernel with `range` and `base`.
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional<Generator> gen) {
+    random_from_to_kernel(iter, range, base, check_generator<RNG>(gen));
+  }
+  // Overload without range/base: forwards to random_full_64_bits_range_kernel.
+  void operator()(TensorIteratorBase& iter, std::optional<Generator> gen) {
+    random_full_64_bits_range_kernel(iter, check_generator<RNG>(gen));
+  }
+};
+
+// Fills every element of `iter` with a draw from
+// uniform_int_distribution<scalar_t>.  The generator's mutex is taken before
+// dtype dispatch and held until the fill completes.
+template<typename RNG>
+void random_kernel(TensorIteratorBase& iter, RNG generator) {
+  std::lock_guard<std::mutex> guard(generator->mutex_);
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] {
+    cpu_serial_kernel(iter, [generator]() -> scalar_t {
+      uniform_int_distribution<scalar_t> dist;
+      return dist(generator);
+    });
+  });
+}
+
+// Functor wrapper around random_kernel; the optional generator is resolved
+// through check_generator<RNG> first.
+template<typename RNG>
+struct RandomKernel {
+  void operator()(TensorIteratorBase& iter, std::optional<Generator> gen) {
+    random_kernel(iter, check_generator<RNG>(gen));
+  }
+};
+
+// ==================================================== Normal ========================================================
+
+#ifdef CPU_CAPABILITY_AVX2
+// Transforms 16 floats in place with AVX2.  data[0..7] and data[8..15] are
+// read as two vectors of uniform samples and overwritten with
+//   radius * cos(theta) * std + mean   and   radius * sin(theta) * std + mean
+// where radius = sqrt(-2 * log(1 - u1)) and theta = 2*pi * u2 (Box-Muller).
+// All constants are passed in as pre-broadcast vectors; unaligned
+// loads/stores are used, so `data` needs no particular alignment.
+void normal_fill_16_AVX2(float *data,
+                         const __m256* two_pi,
+                         const __m256* one,
+                         const __m256* minus_two,
+                         const __m256* mean,
+                         const __m256* std_v) {
+  // u1 = 1 - data[0..7], u2 = data[8..15]
+  const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data));
+  const __m256 u2 = _mm256_loadu_ps(data + 8);
+  // sincos256_ps and log256_ps are from avx_mathfun.h
+  const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1)));
+  const __m256 theta = _mm256_mul_ps(*two_pi, u2);
+  __m256 sintheta, costheta;
+  sincos256_ps(theta, &sintheta, &costheta);
+  const __m256 n1 = _mm256_mul_ps(radius, costheta);
+  const __m256 n2 = _mm256_mul_ps(radius, sintheta);
+  // Scale and shift with a fused multiply-add: n * std + mean.
+  _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *std_v, *mean));
+  _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *std_v, *mean));
+}
+
+// Fills a float tensor with normal(mean, std) samples using the AVX2 helper.
+// Steps: (1) fill the whole buffer with uniform(0, 1) samples under the
+// generator's mutex, (2) transform it in place 16 floats at a time,
+// (3) if numel is not a multiple of 16, redraw and transform the final
+// 16 elements.
+// The tail step addresses data + size - 16, so numel must be at least 16.
+// NOTE(review): data is accessed as a flat array — presumably the tensor must
+// be contiguous; confirm at the call site.
+template<typename RNG>
+void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) {
+  float *data = self.data_ptr<float>();
+  auto size = self.numel();
+  std::lock_guard<std::mutex> lock(generator->mutex_);
+  for (const auto i : c10::irange(size)) {
+    at::uniform_real_distribution<float> uniform(0, 1);
+    data[i] = uniform(generator);
+  }
+  // Broadcast the scalar constants once, outside the block loop.
+  const __m256 two_pi = _mm256_set1_ps(2.0f * c10::pi<double>);
+  const __m256 one = _mm256_set1_ps(1.0f);
+  const __m256 minus_two = _mm256_set1_ps(-2.0f);
+  const __m256 mean_v = _mm256_set1_ps(mean);
+  const __m256 std_v = _mm256_set1_ps(std);
+
+  // Process every complete block of 16 floats.
+  for (int64_t i = 0; i < size - 15; i += 16) {
+    normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &std_v);
+  }
+
+  if (size % 16 != 0) {
+    // Recompute the last 16 values.
+    // This window overlaps elements already transformed above; they are
+    // overwritten with fresh uniforms before being transformed again.
+    data = data + size - 16;
+    for (const auto i : c10::irange(16)) {
+      at::uniform_real_distribution<float> uniform(0, 1);
+      data[i] = uniform(generator);
+    }
+    normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &std_v);
+  }
+}
+#endif
+
+// Scalar Box-Muller step over a block of 16 values, done in place.
+// Element k (0 <= k < 8) is paired with element k + 8: both are read as
+// uniform samples and replaced by
+//   radius * cos(theta) * std + mean   and   radius * sin(theta) * std + mean.
+template <typename scalar_t>
+void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) {
+  for (const auto k : c10::irange(8)) {
+    scalar_t& first = data[k];
+    scalar_t& second = data[k + 8];
+    const scalar_t u1 = 1 - first; // [0, 1) -> (0, 1] for log.
+    const scalar_t u2 = second;
+    const scalar_t radius = std::sqrt(-2 * std::log(u1));
+    const scalar_t theta = 2.0f * c10::pi<double> * u2;
+    first = radius * std::cos(theta) * std + mean;
+    second = radius * std::sin(theta) * std + mean;
+  }
+}
+
+#if defined(__VSX__)  || defined(CPU_CAPABILITY_VSX)
+// Vectorized<float> counterpart of normal_fill_16: loads data[0..] and
+// data[8..] as two vectors of uniform samples and stores
+//   radius * cos(theta) * std + mean   and   radius * sin(theta) * std + mean
+// back to the same positions, with radius = sqrt(-2 * log(1 - u1)) and
+// theta = 2*pi * u2.  The second vector sits at offset 8, so this only
+// covers 16 values when Vec::size() == 8.
+static void normal_fill_16_VSX(float *data,const Vectorized<float> &two_pi,const Vectorized<float> &one,const Vectorized<float> &minus_two,const Vectorized<float> &mean,const Vectorized<float> &std) {
+  using Vec = Vectorized<float>;
+  Vec u1=one-Vec::loadu(data);
+  Vec u2=Vec::loadu(data+8);
+  Vec radius=(minus_two * u1.log());
+  radius=radius.sqrt();
+  Vec theta=two_pi * u2;
+  Vec output_vec=radius * theta.cos() * std + mean;
+  Vec output_vec2=radius * theta.sin() * std + mean;
+  output_vec.store(data);
+  output_vec2.store(data+8);
+}
+
+// VSX variant of normal_fill.  Fills the tensor with uniform(0, 1) samples
+// under the generator's mutex, then transforms them in place 16 at a time,
+// using the vector helper when Vec::size() == 8 and the scalar
+// normal_fill_16 otherwise.  If numel is not a multiple of 16, the final
+// 16 elements are redrawn and transformed again.
+// The buffer is always accessed as float regardless of scalar_t, and the
+// tail step addresses data + size - 16, so numel must be at least 16.
+template <typename scalar_t, typename RNG>
+void normal_fill_VSX(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
+  float *data = self.data_ptr<float>();
+  auto size = self.numel();
+  std::lock_guard<std::mutex> lock(generator->mutex_);
+  for (const auto i : c10::irange(size)) {
+    at::uniform_real_distribution<scalar_t> uniform(0, 1);
+    data[i] = uniform(generator);
+  }
+
+  using Vec = Vectorized<float>;
+  const Vec two_pi = Vec(2.0f * c10::pi<double>);
+  const Vec one = Vec(1.0f);
+  const Vec minus_two = Vec(-2.0f);
+  // Despite its name, var_vec holds the standard deviation (std), not a variance.
+  const Vec var_vec  = Vec(std);
+  const Vec mean_vec = Vec(mean);
+
+  // Process every complete block of 16 floats.
+  for (int64_t i = 0; i < size - 15; i += 16) {
+    if(Vec::size()==8) {
+      normal_fill_16_VSX(data + i, two_pi, one, minus_two, mean_vec, var_vec);
+    }
+    else{
+      normal_fill_16<scalar_t>(data + i, mean, std);
+    }
+  }
+  if (size % 16 != 0) {
+    // Recompute the last 16 values.
+    data = data + size - 16;
+    for (const auto i : c10::irange(16)) {
+      at::uniform_real_distribution<scalar_t> uniform(0, 1);
+      data[i] = uniform(generator);
+    }
+    if(Vec::size()==8){
+      normal_fill_16_VSX(data, two_pi, one, minus_two, mean_vec, var_vec);
+    }
+    else{
+      normal_fill_16<scalar_t>(data, mean, std);
+    }
+  }
+}
+#endif //VSX
+#endif //VSX
+
+// Portable fill of a tensor with normal(mean, std) samples.
+// The buffer is first filled with uniform(0, 1) samples under the
+// generator's mutex, then transformed in place by normal_fill_16 one block
+// of 16 at a time.  When numel is not a multiple of 16, the last 16 elements
+// are redrawn and transformed again.
+// The tail step addresses (start + numel - 16), so numel must be at least 16.
+template <typename scalar_t, typename RNG>
+void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
+  scalar_t* const start = self.data_ptr<scalar_t>();
+  const auto numel = self.numel();
+  std::lock_guard<std::mutex> guard(generator->mutex_);
+
+  // Pass 1: uniform samples for every element.
+  for (int64_t idx = 0; idx < numel; ++idx) {
+    at::uniform_real_distribution<scalar_t> uniform(0, 1);
+    start[idx] = uniform(generator);
+  }
+
+  // Pass 2: transform each complete block of 16.
+  for (int64_t offset = 0; offset + 16 <= numel; offset += 16) {
+    normal_fill_16<scalar_t>(start + offset, mean, std);
+  }
+
+  if (numel % 16 == 0) {
+    return;
+  }
+
+  // Recompute the last 16 values.
+  scalar_t* const tail = start + numel - 16;
+  for (int64_t idx = 0; idx < 16; ++idx) {
+    at::uniform_real_distribution<scalar_t> uniform(0, 1);
+    tail[idx] = uniform(generator);
+  }
+  normal_fill_16<scalar_t>(tail, mean, std);
+}
+// Fills `self` with normal(mean, std) samples, choosing between the 16-wide fill routines and a per-element serial kernel.
+template<typename RNG>
+void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) {
+  auto size = self.numel();
+  if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) {  // float path; variant is fixed at compile time
+#ifdef CPU_CAPABILITY_AVX2
+    normal_fill_AVX2(self, static_cast<float>(mean), static_cast<float>(std), generator);
+#elif defined(__VSX__)  || defined(CPU_CAPABILITY_VSX)
+    normal_fill_VSX(self, static_cast<float>(mean), static_cast<float>(std), generator);
+#else
+    normal_fill(self, static_cast<float>(mean), static_cast<float>(std), generator);
+#endif
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "normal_kernel_cpu", [&] {
+      if (size >= 16 && self.is_contiguous()) {
+        normal_fill<scalar_t>(self, static_cast<scalar_t>(mean), static_cast<scalar_t>(std), generator);
+      } else {  // fewer than 16 elements or non-contiguous: go through a TensorIterator
+        auto iter = TensorIterator::borrowing_nullary_op(self);
+        std::lock_guard<std::mutex> lock(generator->mutex_);
+        cpu_serial_kernel(iter, [mean, std, generator]() -> scalar_t {
+          at::normal_distribution<double> normal(mean, std);  // sampled in double, then cast to scalar_t
+          return static_cast<scalar_t>(normal(generator));
+        });
+      }
+    });
+  }
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to normal_kernel.
+template<typename RNG>
+struct NormalKernel {
+  void operator()(Tensor& self, double mean, double std, std::optional<Generator> gen) {
+    normal_kernel(self, mean, std, check_generator<RNG>(gen));
+  }
+};
+
+// ==================================================== Uniform =======================================================
+// Runs cpu_serial_kernel over iter with a sampler drawing from at::uniform_real_distribution<scalar_t>(from, to).
+template<typename RNG>
+void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() {
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    auto from = static_cast<scalar_t>(from_);  // bounds are cast to the dispatched dtype before use
+    auto to = static_cast<scalar_t>(to_);
+    at::uniform_real_distribution<scalar_t> uniform(from, to);
+    cpu_serial_kernel(iter, [&uniform, generator]() -> scalar_t {
+      return static_cast<scalar_t>(uniform(generator));
+    });
+  });
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to uniform_kernel.
+template<typename RNG>
+struct UniformKernel {
+  void operator()(TensorIteratorBase& iter, double from, double to, std::optional<Generator> gen) {
+    uniform_kernel(iter, from, to, check_generator<RNG>(gen));
+  }
+};
+
+// ==================================================== Cauchy ========================================================
+// Runs cpu_serial_kernel over iter with a sampler drawing from at::cauchy_distribution<double>(median, sigma).
+template<typename RNG>
+void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() {
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    at::cauchy_distribution<double> cauchy(median, sigma);  // sampled in double regardless of the dispatched dtype
+    cpu_serial_kernel(iter, [&cauchy, generator]() -> scalar_t {
+      return static_cast<scalar_t>(cauchy(generator));
+    });
+  });
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to cauchy_kernel.
+template<typename RNG>
+struct CauchyKernel {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional<Generator> gen) {
+    cauchy_kernel(iter, median, sigma, check_generator<RNG>(gen));
+  }
+};
+
+// ================================================== LogNormal =======================================================
+// Runs cpu_serial_kernel over iter with a sampler drawing from at::lognormal_distribution<double>(mean, std).
+template<typename RNG>
+void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() {
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    at::lognormal_distribution<double> logNormal(mean, std);  // sampled in double regardless of the dispatched dtype
+    cpu_serial_kernel(iter, [&logNormal, generator]() -> scalar_t {
+      return static_cast<scalar_t>(logNormal(generator));
+    });
+  });
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to log_normal_kernel.
+template<typename RNG>
+struct LogNormalKernel {
+  void operator()(TensorIteratorBase& iter, double mean, double std, std::optional<Generator> gen) {
+    log_normal_kernel(iter, mean, std, check_generator<RNG>(gen));
+  }
+};
+
+// =================================================== Geometric ======================================================
+// Runs cpu_serial_kernel over iter with a sampler drawing from at::geometric_distribution<double>(p).
+template<typename RNG>
+void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() {
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    at::geometric_distribution<double> geometric(p);  // sampled in double, then cast to the dispatched dtype
+    cpu_serial_kernel(iter, [&geometric, generator]() -> scalar_t {
+      return static_cast<scalar_t>(geometric(generator));
+    });
+  });
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to geometric_kernel.
+template<typename RNG>
+struct GeometricKernel {
+  void operator()(TensorIteratorBase& iter, double p, std::optional<Generator> gen) {
+    geometric_kernel(iter, p, check_generator<RNG>(gen));
+  }
+};
+
+// ================================================== Exponential =====================================================
+// Runs cpu_serial_kernel over iter with a sampler drawing from at::exponential_distribution<double>(lambda).
+template<typename RNG>
+void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() {
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    at::exponential_distribution<double> exponential(lambda);  // sampled in double, then cast to the dispatched dtype
+    cpu_serial_kernel(iter, [&exponential, generator]() -> scalar_t {
+      return static_cast<scalar_t>(exponential(generator));
+    });
+  });
+}
+// Functor wrapper: passes `gen` through check_generator<RNG> and forwards to exponential_kernel.
+template<typename RNG>
+struct ExponentialKernel {
+  void operator()(TensorIteratorBase& iter, double lambda, std::optional<Generator> gen) {
+    exponential_kernel(iter, lambda, check_generator<RNG>(gen));
+  }
+};
+
+// ================================================== Bernoulli =======================================================
+// Bernoulli sampling with a tensor of probabilities: each output element is drawn using the p value the iterator pairs with it.
+template<typename RNG>
+void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
+  self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    using self_t = scalar_t;  // keep the output dtype; the nested dispatch below rebinds scalar_t
+    auto p_cpu = p_.to(kCPU);
+    auto p = expand_inplace(self, p_cpu);  // presumably expands p_cpu to self's shape -- confirm against expand_inplace
+    auto iter = TensorIteratorConfig()
+        .add_output(self)
+        .add_const_input(*p)
+        .check_all_same_dtype(false)
+        .build();
+    if (p->scalar_type() == kDouble) {  // double probabilities keep a double-precision distribution
+      cpu_serial_kernel(iter, [&](const double p_val) -> self_t {
+        at::bernoulli_distribution<double> bernoulli(p_val);
+        return static_cast<self_t>(bernoulli(generator));
+      });
+    } else {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half,
+      p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
+        using p_t = scalar_t;
+        cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
+          at::bernoulli_distribution<float> bernoulli(p_val);  // every non-double p dtype samples through a float distribution
+          return static_cast<self_t>(bernoulli(generator));
+        });
+      });
+    }
+  });
+}
+// Bernoulli sampling with one scalar probability p applied to every element of `self`.
+template<typename RNG>
+void bernoulli_kernel(const TensorBase &self, double p, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
+  self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(generator->mutex_);
+    auto iter = TensorIterator::borrowing_nullary_op(self);
+    cpu_serial_kernel(iter, [p, generator]() -> scalar_t {
+      at::bernoulli_distribution<double> bernoulli(p);  // constructed on every invocation of the lambda
+      return static_cast<scalar_t>(bernoulli(generator));
+    });
+  });
+}
+// Functor wrapper with one overload per bernoulli_kernel variant (scalar p and tensor p_).
+template<typename RNG>
+struct BernoulliKernel {
+  void operator()(const TensorBase &self, double p, std::optional<Generator> gen) {
+    bernoulli_kernel(self, p, check_generator<RNG>(gen));
+  }
+  void operator()(const TensorBase &self, const TensorBase &p_, std::optional<Generator> gen) {
+    bernoulli_kernel(self, p_, check_generator<RNG>(gen));
+  }
+};
+
+}}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e214126e00d106b18702cb926c9a0c1fe5550c27
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h
@@ -0,0 +1,88 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to
+// access constants such as M_SQRT2 and M_2_SQRTPI.
+#ifdef _WIN32
+#define _USE_MATH_DEFINES
+#include <cmath>
+#include <math.h>
+#endif // _WIN32
+
+#include <ATen/cpu/vec/vec.h>
+#include <c10/util/BFloat16.h> // For c10::is_reduced_floating_point_v.
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+constexpr double kGeluBeta = M_SQRT2 * M_2_SQRTPI * 0.5;  // sqrt(2) * (2 / sqrt(pi)) * 0.5 == sqrt(2 / pi)
+constexpr double kGeluKappa = 0.044715;  // coefficient of the x^3 term in the tanh approximation below
+// Maps T to float when c10::is_reduced_floating_point_v<T> is true; every other T maps to itself.
+template <typename T>
+using reduced_fp_to_float_t = std::conditional_t<c10::is_reduced_floating_point_v<T>, float, T>;
+// Overload selected for reduced floating-point T: converts x to float.
+template <typename T, std::enable_if_t<c10::is_reduced_floating_point_v<T>, bool> = true>
+float reduced_fp_to_float(T x) {
+  return float(x);
+}
+// Overload selected for every other T: returns x unchanged.
+template <typename T, std::enable_if_t<!c10::is_reduced_floating_point_v<T>, bool> = true>
+T reduced_fp_to_float(T x) {
+  return x;
+}
+// Computes 0.5 * x * (1 + tanh(kGeluBeta * (x + kGeluKappa * x^3))), with the math done in opmath_t.
+template <typename T>
+T scalar_gelu_approximated_with_tanh(T x) {
+  using opmath_t = reduced_fp_to_float_t<T>;  // float for reduced-precision T, otherwise T itself
+  auto x_float = reduced_fp_to_float(x);
+  auto x_cube = x_float * x_float * x_float;
+  auto inner = opmath_t(kGeluBeta) * (x_float + opmath_t(kGeluKappa) * x_cube);
+  return opmath_t(0.5) * x_float * (opmath_t(1) + std::tanh(inner));  // result is converted back to T on return
+}
+// Vectorized form for non-reduced T: 0.5 * x * (1 + tanh(kGeluBeta * (x + kGeluKappa * x^3))).
+template <typename T, std::enable_if_t<!c10::is_reduced_floating_point_v<T>, bool> = true>
+vec::Vectorized<T> vectorized_gelu_approximated_with_tanh(vec::Vectorized<T> x) {
+  const vec::Vectorized<T> kPointFiveVec(T(0.5));
+  const vec::Vectorized<T> kOneVec(T(1));
+  const vec::Vectorized<T> kGeluBetaVec((T(kGeluBeta)));  // the double constants are narrowed to T before broadcasting
+  const vec::Vectorized<T> kGeluKappaVec((T(kGeluKappa)));
+  auto x_cube = x * x * x;
+  vec::Vectorized<T> inner_vec = kGeluBetaVec * (x + kGeluKappaVec * x_cube);
+  return kPointFiveVec * x * (kOneVec + inner_vec.tanh());
+}
+// Reduced-precision overload: converts x into two vectors, applies the approximation to each, and converts back to T.
+template <typename T, std::enable_if_t<c10::is_reduced_floating_point_v<T>, bool> = true>
+vec::Vectorized<T> vectorized_gelu_approximated_with_tanh(vec::Vectorized<T> x) {
+  auto [x0, x1] = at::vec::convert_to_float<T>(x);  // presumably two float vectors covering all lanes of x -- confirm
+  return at::vec::convert_from_float<T>(
+      vectorized_gelu_approximated_with_tanh(x0),
+      vectorized_gelu_approximated_with_tanh(x1));
+}
+
+// Computes x * 0.5 * (1 + erf(x * M_SQRT1_2)), with the math done in opmath_t.
+template <typename T>
+T scalar_gelu(T x) {
+  using opmath_t = reduced_fp_to_float_t<T>;  // float for reduced-precision T, otherwise T itself
+  const auto kAlpha = opmath_t(M_SQRT1_2);
+  return reduced_fp_to_float(x) * opmath_t(0.5) * (opmath_t(1) + std::erf(reduced_fp_to_float(x) * kAlpha));
+}
+// Vectorized form for non-reduced T: x * 0.5 * (1 + erf(x * M_SQRT1_2)).
+template<typename T, std::enable_if_t<!c10::is_reduced_floating_point_v<T>, bool> = true>
+vec::Vectorized<T> vectorized_gelu(vec::Vectorized<T> x) {
+  const vec::Vectorized<T> kAlphaVec(T(M_SQRT1_2));
+  const vec::Vectorized<T> kOneVec(T(1));
+  const vec::Vectorized<T> kPointFiveVec(T(0.5));
+  return x * kPointFiveVec * (kOneVec + (x * kAlphaVec).erf());
+}
+// Reduced-precision overload: converts x into two vectors, applies vectorized_gelu to each, and converts back to T.
+template<typename T, std::enable_if_t<c10::is_reduced_floating_point_v<T>, bool> = true>
+vec::Vectorized<T> vectorized_gelu(vec::Vectorized<T> x) {
+  auto [x0, x1] = at::vec::convert_to_float<T>(x);  // presumably two float vectors covering all lanes of x -- confirm
+  return at::vec::convert_from_float<T>(vectorized_gelu(x0), vectorized_gelu(x1));
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c3f03718a9d5c08dc5c0fa47bedfe12e2a8fd95
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h
@@ -0,0 +1,66 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/complex.h>
+#include <ATen/NumericUtils.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+// Returns the pair {min, max} of x and y ordered by real part only; a NaN argument fills both slots (y is checked first).
+// custom min and max to be used in logcumsumexp for complex arguments
+template <typename scalar_t>
+std::pair<c10::complex<scalar_t>, c10::complex<scalar_t>> _logcumsumexp_minmax(c10::complex<scalar_t> x, c10::complex<scalar_t> y) {
+  if (at::_isnan(y)) {  // either real is nan or imag is nan
+    return std::make_pair(y, y);
+  } else if (at::_isnan(x)) {  // either real is nan or imag is nan
+    return std::make_pair(x, x);
+  } else {
+    return (x.real() < y.real()) ? std::make_pair(x, y) : std::make_pair(y, x);  // equal real parts yield (y, x)
+  }
+}
+// Computes log(exp(x) + exp(y)) as log1p(exp(min - max)) + max; returns x when min == max and the value is not finite.
+template <typename scalar_t>
+scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
+  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
+  scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan
+  scalar_t max = at::_isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan
+  if (min != max || std::isfinite(min)) {  // also taken when min is NaN, since NaN != NaN is true
+    // nan will be propagated here
+    return std::log1p(std::exp(min - max)) + max;
+  } else {
+    // special case to correctly handle infinite cases
+    return x;  // min == max and not finite, so x and y are the same infinity
+  }
+}
+// Complex overload: orders x and y by real part, special-cases NaN and equal infinite real parts, else uses the log1p form.
+template <typename scalar_t>
+c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
+  auto [min, max] = _logcumsumexp_minmax<scalar_t>(x, y);  // if either input is NaN, both min and max hold it
+  auto min_real = std::real(min);
+  auto max_real = std::real(max);
+
+  if (at::_isnan(min)) {  // either real is nan or imag is nan
+    // handling the "infectious" NaNs
+    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
+  } else if (!std::isfinite(min_real) && (min_real == max_real)) {  // both real parts are the same infinity
+    if (min_real < 0) {
+      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
+      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
+      // It does not matter if we're taking the exp of this value
+      return min;
+    } else {
+      // handle the +inf case, we don't need the special precision for log1p for small values
+      // and to avoid producing nan in case of real(max) == real(min) == +inf
+      return std::log(std::exp(min) + std::exp(max));
+    }
+  } else {
+    return std::log1p(std::exp(min - max)) + max;  // general case, same form as the real-valued helper
+  }
+}
+
+} // end namespace
+} //end at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..6af3a57749a51e46a6a536d164b2a1218e5bb269
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h
@@ -0,0 +1,315 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/native/cpu/Loops.h>
+#include <ATen/Parallel.h>
+#include <c10/util/TypeList.h>
+#include <c10/core/Scalar.h>
+#include <c10/util/irange.h>
+
+#include <type_traits>
+
+namespace at::native { inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define VEC_LOOP_HEADER(func_t, data) \
+  using scalar_t = typename function_traits<func_t>::result_type; \
+  using Vec = Vectorized<scalar_t>; \
+  char* out_ptr = data[0]; \
+  (void) out_ptr;  // declares scalar_t, Vec and out_ptr; the void cast presumably silences unused-variable warnings
+// As used by binary_kernel_reduce_vec: strides[0]/[1] are the dim-0 output/input strides, strides[2]/[3] the dim-1 ones.
+// reduction that is contiguous over the input in dim 0
+template <typename traits>
+inline bool is_contiguous_reduction(const int64_t* strides) {
+  return strides[0] == 0 &&  // output pointer does not advance along dim 0
+         strides[1] == sizeof(typename traits::arg2_t);  // input advances by exactly one element
+}
+// True when the output does not advance along dim 0 and both output and input step by one element along dim 1.
+// reduction that is contiguous over the input in dim 1
+template <typename traits>
+inline bool is_outer_reduction(const int64_t* strides) {
+  return strides[0] == 0 &&
+         strides[2] == sizeof(typename traits::result_type) &&
+         strides[3] == sizeof(typename traits::arg2_t);
+}
+// Combines n rows of 4 * Vec::size() elements (row i starts at data[1] + stride * i bytes) with vop, then merges into data[0].
+template <typename func_t, typename vec_func_t>
+inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
+                                        func_t op, vec_func_t vop, bool reduce) {
+  VEC_LOOP_HEADER(func_t, data)
+  const char* in1_ptr = data[1];
+  Vec acc[4];
+  for (const auto j : c10::irange(4)) {  // row 0 seeds the four accumulators
+    acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
+  }
+  for (const auto i : c10::irange(1, n)) {  // rows 1..n-1 are folded into the accumulators
+    const char* ptr = in1_ptr + stride * i;
+    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
+    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
+    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
+    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
+  }
+  if (reduce) {  // collapse every accumulated lane into the single scalar at out_ptr
+    scalar_t buffer[Vec::size()];
+    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
+    acc[0].store(buffer);
+    for (const auto j : c10::irange(1, Vec::size())) {
+      buffer[0] = op(buffer[0], buffer[j]);
+    }
+    auto dst = (scalar_t*)out_ptr;
+    *dst = op(*dst, buffer[0]);  // merged with the value already stored in the output
+  } else {  // keep lanes separate: merge each accumulator into the matching output vector
+    for (const auto j : c10::irange(4)) {
+      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
+      acc[j] = vop(acc[j], Vec::loadu(dst));
+      acc[j].store(dst);
+    }
+  }
+}
+// Calls f() n times, advancing data[0] and data[1] by strides[0] and strides[1] bytes after each call.
+template <typename F>
+inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
+  for ([[maybe_unused]] const auto j : c10::irange(n)) {
+    f();
+    data[0] += strides[0];  // the caller's array is modified: `data` is a pointer parameter
+    data[1] += strides[1];
+  }
+}
+// Splits n input elements into full groups of 4 * Vec::size() (vectorized_reduction) and a remainder (basic_loop).
+// computes the reduction out = op(out, in)
+template <typename func_t, typename vec_func_t>
+inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
+  VEC_LOOP_HEADER(func_t, data)
+  constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+  int64_t count = n / (4 * Vec::size());  // number of full groups
+  if (count > 0) {
+    vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
+  }
+  char* ptrs[3] = { data[0], data[0], data[1] };  // op's result and first argument both live at data[0]
+  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
+  basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op);  // presumably covers indices [count * 4 * Vec::size(), n) -- confirm
+}
+// Reduces size0 rows into size1 outputs: blocks of 4 * Vec::size() columns are vectorized, leftover columns go one at a time.
+// computes the reduction out = op(out, in)
+template <typename func_t, typename vec_func_t>
+inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
+  VEC_LOOP_HEADER(func_t, data)
+
+  // reduce down each column of 4 * Vec::size() elements.
+  constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+  int64_t outer_stride[2] = { vector_stride, vector_stride };
+  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
+    vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
+  });
+
+  // reduce down the remaining columns
+  int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
+  int64_t remaining = size1 % (4 * Vec::size());
+  UNARY_OUTER_LOOP(data, step, remaining, [&] {  // data was already advanced past the vectorized blocks by the loop above
+    char* ptrs[3] = { data[0], data[0], data[1] };
+    int64_t strides[] = { 0, 0, inner_stride };
+    basic_loop(ptrs, strides, 0, size0, op);
+  });
+}
+// Writes `result` through iter.data_ptr(index) when index < num_outputs; otherwise does nothing.
+template<typename traits, typename res_t>
+static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
+  // static_assert(std::is_same_v<res_t, typename traits::arg2_t>, "data types must match");
+  if (index < num_outputs) {
+    char *out = (char *) iter.data_ptr(index);
+    *(res_t *) out = result;  // stored as res_t; the type check above is commented out
+  }
+}
+// Non-tuple overload: asserts there is exactly one output and stores `result` at index 0.
+template<typename traits, typename res_t>
+static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs == 1);
+  set_result<traits>(0, result, iter, num_outputs);
+}
+// Recursion terminator, selected when i equals the tuple size: nothing is left to store, returns i.
+template<typename traits, std::size_t i = 0, typename... tuple_t>
+inline std::enable_if_t<i == sizeof...(tuple_t), std::size_t>
+for_each_in_tuple(const std::tuple<tuple_t...>& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
+  return i;
+}
+// Stores tuple element i via set_result while i < num_outputs, then recurses on i + 1; returns the index where it stopped.
+template<typename traits, std::size_t i = 0, typename... tuple_t>
+inline std::enable_if_t<i < sizeof...(tuple_t), std::size_t>
+for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIteratorBase &iter, const int num_outputs) {
+  if (i < (size_t)num_outputs) {
+    set_result<traits>(i, std::get<i>(t), iter, num_outputs);
+    return for_each_in_tuple<traits, i + 1, tuple_t...>(t, iter, num_outputs);
+  }
+  return i;  // reached when the tuple has more elements than there are outputs
+}
+// Tuple overload: stores the tuple elements via for_each_in_tuple and asserts the stored count equals num_outputs.
+template<typename traits, typename... res_t>
+static void set_results(const std::tuple<res_t...>& result, const TensorIteratorBase &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs >= 1);
+  std::size_t result_size = for_each_in_tuple<traits>(result, iter, num_outputs);
+  AT_ASSERT((size_t)num_outputs == result_size);
+}
+// Type trait: value is true when every type in Args is the same as T.
+template <typename T, typename... Args>
+struct all_same : std::conjunction<
+  std::is_same<T, Args>...
+> {};
+
+// data_t is the input/output data type.
+// acc_t is a type that contains all the necessary data
+// to continue reducing.
+// index_t is a one-dimensional index
+//
+// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy
+// the following.
+// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value.
+// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one.
+// project: acc_t -> out_t finishes the reduction, getting the required output.
+//
+// Additionally, acc_t must be default-constructible:
+// acc_t {} is an identity for combine,
+// and project(acc_t {}) is the value of the operation on zero elements.
+//
+// The point of `combine` is to support parallelization -
+// the idea is to run one sequence of `reduce` calls per thread of execution,
+// and then to combine them at the end with `combine`.
+//
+// If there is more than one output element,
+// our parallelization strategy is to use one thread for each of them,
+// which means that `combine` will never be called.
+//
+// If, on the other hand, there is only one, then we split the input into
+// several pieces, reduce each separately, and then combine them.
+// `init` is the starting accumulator for every reduced output element and for every per-thread partial below.
+template <typename ops_t, typename init_t>
+void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
+  using rf_t = decltype(&ops_t::reduce);
+  using cf_t = decltype(&ops_t::combine);
+  using pf_t = decltype(&ops_t::project);
+  using r_traits = binary_function_traits<rf_t>;
+  using c_traits = binary_function_traits<cf_t>;
+  using p_traits = unary_function_traits<pf_t>;
+  using acc_t = typename p_traits::arg1_t;  // accumulator type is taken from project's argument
+  using data_t = typename r_traits::arg2_t;  // element type is taken from reduce's second argument
+  static_assert(
+    all_same<
+      acc_t,
+      init_t,
+      typename r_traits::arg1_t,
+      typename r_traits::result_type,
+      typename c_traits::arg1_t,
+      typename c_traits::arg2_t,
+      typename c_traits::result_type>::value,
+    "all accumulate types must match");
+  static_assert(
+    std::is_default_constructible_v<acc_t>,
+    "the accumulate type must be default-constructible"
+  );
+  const int num_outputs = iter.noutputs();
+  iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) {
+    auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {  // folds the range {begin, end} into acc
+      int ntensors = sub_iter.ntensors();
+      sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) {
+        AT_ASSERT(ntensors - num_outputs == 1);  // exactly one non-output tensor
+        char *in = data[ntensors - 1];  // the input is the last tensor
+        int64_t stride = strides[ntensors - 1];
+        for (const auto i : c10::irange(size)) {
+          acc = ops.reduce(acc, c10::load<data_t>(in), begin + i);
+          in += stride;
+        }
+      }, {begin, end});
+      return ops.translate_idx(acc, sub_iter.view_offsets()[0]);
+    };
+    acc_t total_acc = init;
+    auto numel = sub_iter.numel();
+    if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
+        at::in_parallel_region()) {
+      total_acc = reduction_body(total_acc, 0, numel);  // serial path: a single pass over all elements
+    } else {
+      int max_threads = at::get_num_threads();
+      AT_ASSERT(max_threads > 0);
+      static_assert(
+        !std::is_same_v<acc_t, bool>,
+        "Concurrently modifying different references into std::vector<bool> is UB."
+      );
+      std::vector<acc_t> buffer((unsigned)max_threads, init);  // one accumulator slot per thread, each starting from init
+      at::parallel_for(0, numel, internal::GRAIN_SIZE,
+        [&](int64_t begin, int64_t end) {
+          auto& acc = buffer[at::get_thread_num()];
+          acc = reduction_body(acc, begin, end);
+        }
+      );
+      for (const auto i : c10::irange(max_threads)) {  // merge the per-thread partials in slot order
+        total_acc = ops.combine(total_acc, buffer[i]);
+      }
+    }
+    set_results<r_traits>(ops.project(total_acc), sub_iter, num_outputs);
+  });
+}
+// Fills the output with `ident`, then folds the input into it with op/vop, using a vectorized path when the strides allow.
+template <typename func_t, typename vec_func_t>
+void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
+  using traits = binary_function_traits<func_t>;
+  static_assert(
+    all_same<
+      typename traits::result_type,
+      typename traits::arg1_t,
+      typename traits::arg2_t>::value,
+    "all types must match");
+
+  iter.output_base().fill_(ident);
+  iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
+    int64_t outer_strides[] = { strides[2], strides[3] };  // dim-1 strides of output and input
+    if (is_contiguous_reduction<traits>(strides)) {
+      // input is contiguous in dim 0, output is reduced in dim 0
+      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
+        vectorized_inner_reduction(data, size0, op, vop);
+      });
+    } else if (is_outer_reduction<traits>(strides)) {
+      // input and output are contiguous in dim 1
+      int64_t inner_stride = strides[1]; // stride of input in dim 0
+      vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop);
+    } else {  // neither stride pattern matched: scalar loop through basic_loop
+      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
+        char* ptrs[3] = { data[0], data[0], data[1] };
+        int64_t inner_strides[3] = { strides[0], strides[0], strides[1] };
+        basic_loop(ptrs, inner_strides, 0, size0, op);
+      });
+    }
+  });
+}
+// True when iter reduces exactly one dim (dim 0), has one input, and operand 1's dim-0 stride equals its element size.
+// when reduction is on most inner dimension (dim 0 in TensorIterator)
+// and input has contiguous most inner dimension, `binary_kernel_reduce_lastdim`
+// can be used.
+inline bool is_reduce_lastdim(TensorIteratorBase& iter) {
+  return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0)
+      && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1);  // operand 1 is presumably the input -- confirm
+}
+// Calls reduce_op(out, in, dim_size) once per element visited by a copy of iter narrowed along dim 0.
+template <typename reduce_func_t>
+void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) {
+  auto shape = iter.shape();
+  int64_t dim_size = shape[0];  // extent of dim 0, passed to reduce_op as the element count
+  int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size);  // each visited element stands for dim_size inputs
+  TensorIterator sub_iter(iter);
+  // create sub iterator to parallel on all non-reduce-dims
+  sub_iter.narrow(0, 0, 1);  // presumably restricts dim 0 to start 0, size 1 -- confirm against TensorIterator::narrow
+  auto loop = [&](char** data, const int64_t* strides, int64_t size) {
+    char* out = data[0];
+    char* in = data[1];
+    for (int64_t i = 0; i < size; ++i) {
+      reduce_op(out, in, dim_size);
+      out += strides[0];
+      in += strides[1];
+    }
+  };
+  sub_iter.for_each(loop, grain_size);
+}
+
+}} // namespace at::native::<anonymous>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2b3bf4061ffc690c1eb1b6a57577510403f1eeb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h
@@ -0,0 +1,25 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/native/DispatchStub.h>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+// Function-pointer types for the weight_norm and weight_norm_backward kernels, plus the dispatch stubs declared for them.
+using weight_norm_fn = void(*)(
+    TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, int64_t);
+using weight_norm_backward_fn = void(*)(
+    TensorBase&, TensorBase&, const TensorBase&, const TensorBase&,
+    const TensorBase&, const TensorBase&, int64_t);
+
+DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub)
+DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub)
+
+} // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa31a27e798745e9a638473a30c97ff83510de7e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h
@@ -0,0 +1,216 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <utility>
+
+#include <ATen/Parallel.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/native/cpu/utils.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+template<typename T> using opmath_t = at::opmath_type<T>;  // local shorthand for at::opmath_type<T>, the type the moments are accumulated in
+
+constexpr int64_t kChunkSize = 16;  // number of vectors accumulated per chunk before the partial moments enter the cascade in RowwiseMomentsImpl
+
+template <typename T>  // Merges the partial moments (m0_add, m1_add, m2_add) into the running moments (m0, m1, m2), in place.
+void AddMoments(
+    int64_t m0_add,  // sample count of the partial result
+    const T& m1_add,  // mean of the partial result
+    const T& m2_add,  // sum of squared deviations of the partial result
+    int64_t& m0,  // in/out: running sample count
+    T& m1,  // in/out: running mean
+    T& m2) {  // in/out: running sum of squared deviations
+  const int64_t n = m0 + m0_add;
+  const T c = n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);  // weight of the incoming mean; the n == 0 guard avoids 0 / 0
+  const T delta = m1_add - m1;
+  m1 += c * delta;
+  m2 += m2_add + delta * delta * c * static_cast<T>(m0);  // cross term delta^2 * m0 * m0_add / n; reads the old m0, which is why m0 is updated last
+  m0 = n;
+}
+
+template <typename T>  // Vectorized counterpart of AddMoments: every lane holds independent mean/m2 values that share one scalar sample count.
+C10_ALWAYS_INLINE void AddMomentsVec(
+    int64_t m0_add,  // sample count (per lane) of the partial result
+    const vec::Vectorized<T>& m1_add,  // per-lane means of the partial result
+    const vec::Vectorized<T>& m2_add,  // per-lane sums of squared deviations of the partial result
+    int64_t& m0,  // in/out: running per-lane sample count
+    vec::Vectorized<T>& m1,  // in/out: running per-lane means
+    vec::Vectorized<T>& m2) {  // in/out: running per-lane sums of squared deviations
+  using Vec = vec::Vectorized<T>;
+  const int64_t n = m0 + m0_add;
+  const T c = n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);  // weight of the incoming mean; the n == 0 guard avoids 0 / 0
+  const Vec c_vec(c);  // broadcast the scalar weight to all lanes
+  const Vec delta = m1_add - m1;
+  const Vec m2_tmp = m2 + m2_add;
+  const Vec c_vec_delta = c_vec * delta;
+  const Vec m0_delta = delta * Vec(static_cast<T>(m0));  // uses the old m0; m0 is overwritten only at the end
+  m1 = m1 + c_vec_delta;
+  m2 = fmadd(m0_delta, c_vec_delta, m2_tmp);  // m2 + m2_add + (delta * m0) * (c * delta), assuming fmadd(a, b, c) = a * b + c
+  m0 = n;
+}
+
+template <typename T>  // Overload enabled when T is its own opmath type: accumulates m0 vectors from X_ptr Welford-style and merges the result into the *_stk0 accumulators.
+inline std::enable_if_t<std::is_same_v<T, opmath_t<T>>, void>
+UpdateMomentsVec(
+    int64_t m0,  // number of vectors to read; also indexes c_vecs, so it must not exceed kChunkSize
+    const T* X_ptr,  // start of m0 * Vec::size() consecutive elements, read with Vec::loadu
+    const std::array<vec::Vectorized<opmath_t<T>>, kChunkSize>& c_vecs,  // per-step weights; RowwiseMomentsImpl fills c_vecs[j] with 1 / (j + 1)
+    int64_t& m0_stk0,  // in/out: per-lane sample count of the running accumulator
+    vec::Vectorized<opmath_t<T>>& m1_stk0,  // in/out: per-lane running mean
+    vec::Vectorized<opmath_t<T>>& m2_stk0) {  // in/out: per-lane running sum of squared deviations
+  using Vec = vec::Vectorized<opmath_t<T>>;
+  Vec m1_vec(0);
+  Vec m2_vec(0);
+  for (const auto j : c10::irange(m0)) {
+    const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size());
+    const Vec tmpVec = c_vecs[j];
+    const Vec delta_vec = x_vec - m1_vec;
+    m1_vec = fmadd(tmpVec, delta_vec, m1_vec);  // mean += c_vecs[j] * (x - mean), assuming fmadd(a, b, c) = a * b + c
+    const Vec tmpVec2 = x_vec - m1_vec;  // deviation from the updated mean
+    m2_vec = fmadd(delta_vec, tmpVec2, m2_vec);  // m2 += (x - old mean) * (x - new mean)
+  }
+  AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0);  // merge this chunk into the caller's accumulator
+}
+
+// Overload enabled when T differs from its opmath type: each bfloat16/half vector is converted to two float vectors,
+// whose moments are tracked separately and then merged one after the other into m1_stk0/m2_stk0.
+template <typename T>
+inline std::enable_if_t<!std::is_same_v<T, at::opmath_type<T>>, void>
+UpdateMomentsVec(
+    int64_t m0,  // number of vectors of T to read; also indexes c_vecs, so it must not exceed kChunkSize
+    const T* X_ptr,  // start of m0 * Vec::size() consecutive elements of T, read with Vec::loadu
+    const std::array<vec::Vectorized<at::opmath_type<T>>, kChunkSize>& c_vecs,  // per-step weights; RowwiseMomentsImpl fills c_vecs[j] with 1 / (j + 1)
+    int64_t& m0_stk0,  // in/out: per-lane sample count of the running accumulator
+    vec::Vectorized<at::opmath_type<T>>& m1_stk0,  // in/out: per-lane running mean
+    vec::Vectorized<at::opmath_type<T>>& m2_stk0) {  // in/out: per-lane running sum of squared deviations
+  using Vec = vec::Vectorized<T>;
+  using fVec = vec::Vectorized<at::opmath_type<T>>;
+  fVec m1_fvec0(0), m1_fvec1(0);  // one mean accumulator per converted half
+  fVec m2_fvec0(0), m2_fvec1(0);  // one m2 accumulator per converted half
+  for (const auto j : c10::irange(m0)) {
+    const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size());
+    const fVec tmpVec = c_vecs[j];
+    auto [x_fvec0, x_fvec1] = convert_to_float<T>(x_bvec);  // widen one vector of T into two opmath vectors
+    const fVec delta_fvec0 = x_fvec0 - m1_fvec0;
+    const fVec delta_fvec1 = x_fvec1 - m1_fvec1;
+    m1_fvec0 = fmadd(delta_fvec0, tmpVec, m1_fvec0);  // mean += (x - mean) * c_vecs[j], assuming fmadd(a, b, c) = a * b + c
+    m1_fvec1 = fmadd(delta_fvec1, tmpVec, m1_fvec1);
+    const fVec delta_fvec2 = x_fvec0 - m1_fvec0;  // deviation from the updated mean
+    const fVec delta_fvec3 = x_fvec1 - m1_fvec1;
+    m2_fvec0 = fmadd(delta_fvec0, delta_fvec2, m2_fvec0);  // m2 += (x - old mean) * (x - new mean)
+    m2_fvec1 = fmadd(delta_fvec1, delta_fvec3, m2_fvec1);
+  }
+  AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0);  // each half adds m0 samples per lane,
+  AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0);  // so m0_stk0 grows by 2 * m0 in total
+}
+
+// Computes the mean and variance of X[0..N) with the Welford algorithm, merging per-chunk partial moments through a
+// cascade (pairwise) sum of at most kMaxDepth levels to improve numerical stability. Returns {mean, m2 / (N - ddof)}.
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+// https://en.wikipedia.org/wiki/Pairwise_summation
+template <typename T, int64_t kMaxDepth>
+std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
+  using math_t = opmath_t<T>;
+
+  constexpr int64_t kVecSize = vec::Vectorized<T>::size();
+  constexpr int64_t kAccVecSize = vec::Vectorized<math_t>::size();
+  const int64_t n = N / kVecSize;  // number of full vectors of T in the row
+  const int64_t m = divup(n, kChunkSize);  // number of chunks of up to kChunkSize vectors
+  const int64_t depth = utils::CeilLog2(m);  // cascade levels used below; the stacks hold kMaxDepth levels, so depth must not exceed kMaxDepth
+
+  using Vec = vec::Vectorized<math_t>;
+  const Vec kZeroVec(math_t(0));
+  std::array<int64_t, kMaxDepth> m0_stk = {{0}};  // per-level sample counts
+  std::array<Vec, kMaxDepth> m1_stk;  // per-level, per-lane means
+  m1_stk.fill(kZeroVec);
+  std::array<Vec, kMaxDepth> m2_stk;  // per-level, per-lane sums of squared deviations
+  m2_stk.fill(kZeroVec);
+
+  for (const auto i : c10::irange(m)) {
+    const T* X_ptr = X + i * kChunkSize * kVecSize;
+    const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize);  // the last chunk may hold fewer than kChunkSize vectors
+    static std::array<Vec, kChunkSize> c_vecs = ([]() {  // function-local static, so the table is built once; c_vecs[k] = 1 / (k + 1)
+      std::array<Vec, kChunkSize> result;
+      for (const auto i : c10::irange(kChunkSize)) {
+        result[i] = Vec(math_t(1) / static_cast<math_t>(i + 1));
+      }
+      return result;
+    })();
+    UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]);  // fold this chunk into level 0
+
+    int64_t mask = i + 1;  // each trailing zero bit of (i + 1) triggers one merge into the next level
+    for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) {
+      AddMomentsVec(
+          m0_stk[j - 1],
+          m1_stk[j - 1],
+          m2_stk[j - 1],
+          m0_stk[j],
+          m1_stk[j],
+          m2_stk[j]);
+      m0_stk[j - 1] = 0;  // level j - 1 has been folded into level j; reset it
+      m1_stk[j - 1] = kZeroVec;
+      m2_stk[j - 1] = kZeroVec;
+      mask >>= 1;
+    }
+  }
+  for (const auto i : c10::irange(1, depth)) {  // fold whatever is left on the higher levels into level 0
+    AddMomentsVec(
+        m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]);
+  }
+
+  std::array<math_t, kAccVecSize> m1_arr{};
+  std::array<math_t, kAccVecSize> m2_arr{};
+  m1_stk[0].store(m1_arr.data());  // spill the per-lane results so they can be combined as scalars
+  m2_stk[0].store(m2_arr.data());
+
+  int64_t m0 = 0;
+  math_t m1 = 0;
+  math_t m2 = 0;
+  for (int64_t i = n * kVecSize; i < N; ++i) {  // scalar Welford pass over the tail that does not fill a whole vector
+    math_t x = static_cast<math_t>(X[i]);
+    const math_t delta = x - m1;
+    ++m0;
+    m1 += delta / static_cast<math_t>(m0);
+    m2 += delta * (x - m1);
+  }
+  // m0_add is the sample count behind each accumulator lane; for BFloat16, each vector in m1_arr/m2_arr holds 2*n accumulated results
+  int64_t m0_add = n * kVecSize / kAccVecSize;
+  for (const auto i : c10::irange(kAccVecSize)) {
+    AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2);  // merge every lane into the scalar tail result
+  }
+
+  return std::make_pair(m1, m2 / static_cast<math_t>(N - ddof));  // NOTE(review): no guard here for N - ddof <= 0
+}
+
+template <typename T>  // Entry point: computes the cascade depth needed for N elements and forwards to the RowwiseMomentsImpl instantiation that covers it.
+std::pair<opmath_t<T>, opmath_t<T>> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
+  using Vec = vec::Vectorized<T>;
+  constexpr int64_t kVecSize = Vec::size();
+  const int64_t n = N / kVecSize;  // same n / m / depth computation as inside RowwiseMomentsImpl
+  const int64_t m = divup(n, kChunkSize);
+  const int64_t depth = utils::CeilLog2(m);
+  if (depth <= 4) {  // pick the smallest kMaxDepth in {4, 8, 16, 32, 64} that is >= depth
+    return RowwiseMomentsImpl<T, 4>(X, N, ddof);
+  } else if (depth <= 8) {
+    return RowwiseMomentsImpl<T, 8>(X, N, ddof);
+  } else if (depth <= 16) {
+    return RowwiseMomentsImpl<T, 16>(X, N, ddof);
+  } else if (depth <= 32) {
+    return RowwiseMomentsImpl<T, 32>(X, N, ddof);
+  } else {  // every larger depth falls back to 64 levels
+    return RowwiseMomentsImpl<T, 64>(X, N, ddof);
+  }
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1c036c8edba98f808f615ff5a522942a10b05bd3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh
@@ -0,0 +1,161 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cutlass/util/packed_stride.hpp>
+
+namespace at::cuda::detail {
+
+using Strides = std::array<int64_t, 3>;  // three int64_t values; used below for per-dimension strides and for per-dimension sizes
+// Fills the per-group arrays of a grouped GEMM: thread `tid` writes the operand pointers, problem shape and strides of group `tid`.
+template <
+    typename DtypeA,
+    typename DtypeB,
+    typename DtypeOutput,
+    typename DtypeScale,
+    typename ProblemShape,
+    typename StrideA,
+    typename StrideB,
+    typename StrideOutput>
+__global__ void prepare_grouped_gemm_data(
+    DtypeA* A,  // base pointers of the full operands and output
+    DtypeB* B,
+    DtypeOutput* output,
+    DtypeScale* scale_A,  // may be nullptr, in which case no scale pointers are written
+    DtypeScale* scale_B,
+    DtypeA** A_ptrs,  // outputs: per-group start pointers, indexed by tid
+    DtypeB** B_ptrs,
+    DtypeOutput** output_ptrs,
+    DtypeScale** inputA_scale_ptrs,
+    DtypeScale** inputB_scale_ptrs,
+    ProblemShape* problem_sizes,  // output: per-group (M, N, K)
+    // Strides for cutlass, cute::Stride
+    StrideA* stride_A,
+    StrideB* stride_B,
+    StrideOutput* stride_output,
+    const int32_t* offs,  // may be nullptr; otherwise offs[tid] is the end offset of group tid along the dynamic dimension
+    int32_t M,  // a negative M, N or K marks that dimension as the dynamic one; its per-group size is taken from offs
+    int32_t N,
+    int32_t K,
+    // Original strides of the input tensors
+    Strides tensor_StrideA,
+    Strides tensor_StrideB,
+    Strides tensor_StrideOutput,
+    Strides tensor_ShapeA,  // sizes (not strides) of A and B; only read by the bounds asserts below
+    Strides tensor_ShapeB,
+    int64_t a_scale_stride,
+    int64_t b_scale_stride,
+    bool a_row_major = true,
+    bool b_row_major = false) {
+  int32_t tid = threadIdx.x;  // NOTE(review): blockIdx is never read, so this presumably runs as one block with one thread per group — confirm at the launch site
+  int32_t delta = 0;
+  int32_t offset = 0;
+  if (offs != nullptr) {
+    int32_t start = tid == 0 ? 0 : offs[tid - 1];  // offs holds cumulative end offsets, so group 0 starts at 0
+    offset = offs[tid];
+    delta = offset - start;  // size of this group along the dynamic dimension
+    CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n");
+
+    // TMA transfers require global memory tensor addresses to be
+    // aligned to 16 bytes.
+    if (tid < blockDim.x - 1) {  // the last thread is exempt; presumably because its group size does not move the start of any later group
+      // Check this requirement for input tensors, in case group
+      // addresses are increased along the dynamic dimension.
+      if ((K < 0 && a_row_major) ||       // 2D/2D: check along K dimension
+          (M < 0 && !a_row_major)) {      // 2D/3D: check along M dimension
+        int align = 128 / cutlass::sizeof_bits<DtypeA>::value;  // number of DtypeA elements in 128 bits (16 bytes)
+        CUDA_KERNEL_ASSERT(
+                           delta % align == 0 &&
+                           "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n");
+      }
+      if ((K < 0 && !b_row_major) ||      // 2D/2D: check along K dimension
+          (N < 0 && b_row_major)) {       // 3D/2D: check along N dimension
+        int align = 128 / cutlass::sizeof_bits<DtypeB>::value;  // number of DtypeB elements in 128 bits (16 bytes)
+        CUDA_KERNEL_ASSERT(
+                           delta % align == 0 &&
+                           "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n");
+      }
+
+      // Check the same requirement for output tensor (that is always
+      // contiguous, and in row-major layout).
+      if (N < 0) {
+        int align = 128 / cutlass::sizeof_bits<DtypeOutput>::value;  // number of DtypeOutput elements in 128 bits (16 bytes)
+        CUDA_KERNEL_ASSERT(
+                           delta % align == 0 &&
+                           "expected output tensor dynamic dimension byte size to be non-negative multiple of 16\n");
+      }
+    }
+  }
+  int64_t lda, ldb, ldoutput;  // leading dimensions handed to make_cute_packed_stride at the end
+  if (M < 0) {
+    // A and output are 2d and sliced along M by offs; B is indexed per group
+    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n");
+    M = delta;
+    lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
+    ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];
+    ldoutput = tensor_StrideOutput[0];
+    A_ptrs[tid] = tid == 0 ? A : A + offs[tid - 1] * tensor_StrideA[0];
+    if (scale_A != nullptr) {
+      inputA_scale_ptrs[tid] = tid == 0 ? scale_A : scale_A + offs[tid - 1];
+      inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride;
+    }
+    output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput;
+    B_ptrs[tid] = B + tid * tensor_StrideB[0];
+  } else if (N < 0) {  // N is dynamic: B and output are sliced by offs, A is indexed per group
+    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n");
+    N = delta;
+    lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
+    ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed
+    ldoutput = tensor_StrideOutput[0];
+    A_ptrs[tid] = A + tid * tensor_StrideA[0];
+    output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1];
+    B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[1];
+    if (scale_A != nullptr) {
+      inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride;
+      inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1];
+    }
+  } else if (K < 0) {
+    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n");
+    // A, B is 2d, output is 3d
+    K = delta;
+    lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
+    ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1];
+    ldoutput = tensor_StrideOutput[1];
+    A_ptrs[tid] = tid == 0 ? A : A + offs[tid - 1] * tensor_StrideA[1];
+    B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[0];
+    output_ptrs[tid] = output + tid * tensor_StrideOutput[0];
+    if (scale_A != nullptr) {
+      inputA_scale_ptrs[tid] = scale_A + tid * M;  // this branch steps the scales by M and N entries per group,
+      inputB_scale_ptrs[tid] = scale_B + tid * N;  // not by a_scale_stride / b_scale_stride as the other branches do
+    }
+  } else {
+    // A, B, output are 3D
+    lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
+    ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];
+    ldoutput = tensor_StrideOutput[1];
+    A_ptrs[tid] = A + tid * tensor_StrideA[0];
+    B_ptrs[tid] = B + tid * tensor_StrideB[0];
+    output_ptrs[tid] = output + tid * tensor_StrideOutput[0];
+    if (scale_A != nullptr) {
+      inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride;
+      inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride;
+    }
+  }
+  problem_sizes[tid] = ProblemShape(M, N, K);  // the dynamic dimension has been replaced by delta above
+
+  // make_cute_packed_stride only replaces one of the stride elements with
+  // one of the provided values in the shape arguments
+  // the indices of the src/dst depend on whether A/B are row-major
+  // so constructing shape argument with two similar lda values
+  // while it looks nonsensical (and it is a nonsensical shape)
+  // is fine for these stride construction purposes - the one that will be used
+  // for replacement is correct, the other one is ignored, and we don't have to
+  // branch on whether A/B are row-major
+  stride_A[tid] = cutlass::make_cute_packed_stride(StrideA{}, {lda, lda, 1});
+  stride_B[tid] = cutlass::make_cute_packed_stride(StrideB{}, {ldb, ldb, 1});
+  stride_output[tid] =
+      cutlass::make_cute_packed_stride(StrideOutput{}, {M, ldoutput, 1});
+}
+} // namespace at::cuda::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..793922407e457e5971bd14d03cb29fb9d275655c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h
@@ -0,0 +1,21 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <algorithm>
+
+namespace at::native {
+
+// Returns the largest power of two <= n, i.e. 2**floor(log2(n)); returns 1 for n == 0.
+static int lastPow2(unsigned int n) {
+  n |= (n >> 1);  // these five steps copy the highest set bit into every lower bit position
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);  // n now has every bit at or below its highest set bit turned on
+  return std::max<int>(1, n - (n >> 1));  // keeps only the top bit; max() maps n == 0 to 1. NOTE(review): for n >= 2**31 the top bit does not fit in int, so the result presumably clamps to 1 — confirm callers stay below that
+}
+
+} // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c98379b40d11e0a293bf8b13bc547321eaebc901
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
@@ -0,0 +1,407 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <cuda_fp16.h>
+#include <c10/macros/Macros.h>
+
+#include <ATen/cuda/DeviceUtils.cuh>
+
+namespace {
+
+int log2_ceil(int value) {  // returns the smallest log2_value >= 0 with (1 << log2_value) >= value; yields 0 for value <= 1
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;  // linear search over the exponent
+    return log2_value;
+}
+
+template<typename T>  // Device-side binary functor returning a + b; passed to warp_reduce as the ReduceOp for sums.
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+template<typename T>  // Device-side binary functor returning the larger argument; passed to warp_reduce as the ReduceOp for maxima.
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;  // returns a whenever a < b is false, which includes ties and unordered comparisons
+  }
+};
+
+template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template<typename> class ReduceOp>  // Reduces each of the WARP_BATCH values in `sum` across WARP_SIZE lanes with ReduceOp, in place.
+__device__ __forceinline__ void warp_reduce(acc_t* sum) {
+    ReduceOp<acc_t> r;
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {  // the exchange distance halves every step, giving log2(WARP_SIZE) steps
+        #pragma unroll
+        for (int i = 0;  i < WARP_BATCH;  ++i) {
+            acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE);  // partner lane's value; assumes WARP_SHFL_XOR has __shfl_xor semantics (lane index XOR offset)
+            sum[i] = r(sum[i], b);
+        }
+    }
+}
+
+// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension.
+// Each sample contains element_count scalar elements. element_count can be any integer value <= 2048 (forward) or <= 1024 (backward).
+// The template arguments have the following meaning:
+// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
+// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
+// A "WARP" contains "C10_WARP_SIZE" threads, these threads are guaranteed to belong to the same warp.
+// This is important because it means only __shfl_ instructions are required for reductions.
+// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
+// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
+// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
+// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
+// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed.
+// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.
+// This allows SoftMax to be fused with a cast immediately following the SoftMax.
+// The mask should have the same shape as input, with a boolean indicating whether each value is masked.
+// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D.
+// For instance:
+// input_t=half,  acc_t=float, output_t=half  => read half tensor, float accumulators, write half tensor.
+// input_t=half,  acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor.
+// input_t=float, acc_t=float, output_t=half  => read float tensor, float accumulators, write half tensor.
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>  // Forward kernel: each group of WARP_SIZE threads computes (log-)softmax for up to WARP_BATCH rows of element_count values.
+__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false)
+{
+    // WARP_SIZE and WARP_BATCH must match the warp_size and batches_per_warp values computed in dispatch_softmax_forward.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;  // elements of one row handled by each thread
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;  // rows handled by each warp
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to be computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+    int idx_offset = first_batch * stride + local_idx;
+
+    src += idx_offset;
+    dst += idx_offset;
+
+    if (is_transformer_mask) {
+        mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx;  // mask row is selected by dividing the element offset by head_chunk_size
+    } else {
+        mask += idx_offset;  // NOTE(review): runs even when is_masked is false (mask may be nullptr); mask is only dereferenced under is_masked
+    }
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t elements[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;  // rows past the end of the batch load nothing
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                elements[i][it] = src[i*element_count+it*WARP_SIZE];  // NOTE(review): rows inside the warp are addressed with element_count while first_batch uses stride — confirm callers pass stride == element_count
+            } else {
+                elements[i][it] = -std::numeric_limits<acc_t>::infinity();  // padding: never wins the max and contributes exp(-inf) to the sum
+            }
+        }
+    }
+
+    // compute max_value
+    acc_t max_value[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        bool is_meaningful_max = false;  // becomes true once this thread has seen an unmasked, in-range element
+        max_value[i] = elements[i][0];
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (is_masked) {
+                int idx = it*WARP_SIZE;
+                if ((idx + local_idx) < batch_element_count) {
+                    if (!is_transformer_mask) {
+                        idx += i*element_count;  // transformer masks are not advanced per row inside the warp
+                    }
+                    if (!mask[idx]) {  // a true mask entry excludes the element
+                        max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+                        is_meaningful_max = true;
+                    }
+                }
+            } else {
+                max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it];
+            }
+        }
+        if (is_masked) {
+            if (!is_meaningful_max) {
+                max_value[i] = -std::numeric_limits<acc_t>::infinity();  // this thread saw only masked or padded elements
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);  // combine the per-thread maxima across the warp
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked) {
+                if (is_log_softmax) {
+                    sum[i] += std::exp(elements[i][it] - max_value[i]);  // log-softmax keeps the raw inputs in elements for the final subtraction
+                } else {
+                    elements[i][it] = std::exp(elements[i][it] - max_value[i]);  // softmax overwrites elements with the shifted exponentials
+                    sum[i] += elements[i][it];
+                }
+            } else {
+                int idx = it*WARP_SIZE;
+                bool valid = (idx + local_idx) < batch_element_count;
+                if (!is_transformer_mask) {
+                    idx += i*element_count;
+                }
+                if (valid) {
+                    if (!mask[idx]) {
+                        if (is_log_softmax) {
+                            sum[i] += std::exp(elements[i][it] - max_value[i]);
+                        } else {
+                            elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                            sum[i] += elements[i][it];
+                        }
+                    } else {
+                        if (!is_log_softmax) {
+                            // Masked values are treated as -infinity, and std::exp(-infinity) is 0.
+                            elements[i][it] = 0;
+                        }
+                    }
+                } else {
+                    if (!is_log_softmax) {
+                        elements[i][it] = 0.;  // padded slots contribute nothing
+                    }
+                }
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);  // combine the per-thread partial sums across the warp
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        if (is_log_softmax) sum[i] = std::log(sum[i]);
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_log_softmax) {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i];  // x - max - log(sum of exp(x - max))
+                } else if (sum[i] == 0) {
+                    dst[i*element_count+it*WARP_SIZE] = std::numeric_limits<acc_t>::quiet_NaN();  // a zero sum produces NaN output instead of dividing by zero
+                } else {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i];
+                }
+            } else {
+                break;  // element_index only grows with it, so later iterations are out of range too
+            }
+        }
+    }
+}
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>  // Backward kernel: each group of WARP_SIZE threads computes the input gradient for up to WARP_BATCH rows of element_count values.
+__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr)
+{
+    // WARP_SIZE and WARP_BATCH must match the warp_size and batches_per_warp values computed in dispatch_softmax_backward.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;  // elements of one row handled by each thread
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;  // rows handled by each warp
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to be computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x % WARP_SIZE;
+
+    // the first element to process by the current thread
+    int thread_offset = first_batch * stride + local_idx;
+    grad += thread_offset;
+    output += thread_offset;
+    gradInput += thread_offset;
+    if (is_masked) {
+        mask += thread_offset;  // mask is indexed exactly like grad / output
+    }
+
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
+    acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;  // rows past the end of the batch load nothing
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE];  // NOTE(review): rows inside the warp are addressed with element_count while first_batch uses stride — confirm callers pass stride == element_count
+                output_reg[i][it] = output[i*element_count+it*WARP_SIZE];
+            } else {
+                grad_reg[i][it] = acc_t(0);  // padding adds nothing to the sum below
+                output_reg[i][it] = acc_t(0);
+            }
+        }
+    }
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) {  // NOTE(review): when is_masked, mask is read for every (i, it) slot, including slots the load loop treated as out of range — confirm the mask buffer covers them
+                sum[i] += grad_reg[i][it];
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);  // per-row sum of the unmasked grad values across the warp
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_masked && mask[i*element_count+it*WARP_SIZE]) {
+                    gradInput[i*element_count+it*WARP_SIZE] = 0;  // masked positions receive a zero gradient
+                }
+                // compute gradients
+                else if (is_log_softmax) {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);  // exp(output) is used as the multiplier here
+                } else {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);  // NOTE(review): no extra multiplication by output here — presumably grad already carries that factor; confirm at the call site
+                }
+            }
+        }
+    }
+}
+
+} // end of anonymous namespace
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>  // Host-side launcher: picks the softmax_warp_forward instantiation for ceil(log2(softmax_elements)) and launches it on the current CUDA stream.
+void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 );
+    if (softmax_elements == 0) {
+        return;  // nothing to compute; no kernel is launched
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);  // selects the kernel instantiation in the switch below
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;  // ceil(batch_count / batches_per_block)
+        dim3 threads(warp_size, warps_per_block, 1);  // x indexes the thread within a warp-sized group, y selects the group; the kernel reads threadIdx.x / threadIdx.y accordingly
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E:                    \
+            softmax_warp_forward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked>   \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst,   \
+                    src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                       \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_FORWARD(0);  // 1
+            LAUNCH_SOFTMAX_WARP_FORWARD(1);  // 2
+            LAUNCH_SOFTMAX_WARP_FORWARD(2);  // 4
+            LAUNCH_SOFTMAX_WARP_FORWARD(3);  // 8
+            LAUNCH_SOFTMAX_WARP_FORWARD(4);  // 16
+            LAUNCH_SOFTMAX_WARP_FORWARD(5);  // 32
+            LAUNCH_SOFTMAX_WARP_FORWARD(6);  // 64
+            LAUNCH_SOFTMAX_WARP_FORWARD(7);  // 128
+            LAUNCH_SOFTMAX_WARP_FORWARD(8);  // 256
+            LAUNCH_SOFTMAX_WARP_FORWARD(9);  // 512
+            LAUNCH_SOFTMAX_WARP_FORWARD(10); // 1024
+            LAUNCH_SOFTMAX_WARP_FORWARD(11); // 2048
+            default:  // not reached while the assert above holds (log2_elements <= 11)
+                break;
+        }
+    }
+}
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>  // Host-side launcher: picks the softmax_warp_backward instantiation for ceil(log2(softmax_elements)) and launches it on the current CUDA stream.
+void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
+    if (softmax_elements == 0) {
+       return;  // nothing to compute; no kernel is launched
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);  // selects the kernel instantiation in the switch below
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;  // ceil(batch_count / batches_per_block)
+        dim3 threads(warp_size, warps_per_block, 1);  // x indexes the thread within a warp-sized group, y selects the group; the kernel reads threadIdx.x / threadIdx.y accordingly
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E:                      \
+            softmax_warp_backward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked> \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>       \
+                (grad_input, grad, output, batch_count, softmax_elements_stride, \
+                softmax_elements, mask);                                              \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                      \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1
+            LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2
+            LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4
+            LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8
+            LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16
+            LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32
+            LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64
+            LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128
+            LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256
+            LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512
+            LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024
+            default:  // not reached while the assert above holds (log2_elements <= 10)
+                break;
+        }
+    }
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..9584f4710ea0657269da74889322589df83d54c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
@@ -0,0 +1,23 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+
+namespace native {
+
+// NOTE: these functions require output tensors to be contiguous. Only declarations appear in this header.
+void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);  // presumably values/indices are the outputs -- TODO confirm against the definition
+void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);  // same parameter list as the cummax launcher
+void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);  // 'result' is the first parameter here
+void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);  // 'result' is the first parameter here
+void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);  // 'result' is the first parameter here
+
+}}  // namespace at::native
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc0fb8ca6043142a5c4c34b83ca1da4208878bc0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/macros/Macros.h>
+
+// Marks a lambda as executable on both the host and device. The __host__
+// attribute is important so that we can access static type information from
+// the host, even if the function is typically only executed on the device.
+#ifndef GPU_LAMBDA
+#define GPU_LAMBDA __host__ __device__
+#endif
+
+#if defined(USE_ROCM)  // ROCm branch: both constants are literals
+constexpr int num_threads() {
+  return 256;
+}
+// Per-branch work-size constant; it is multiplied with num_threads() in block_work_size() below.
+constexpr int thread_work_size() { return 4; }
+#else  // non-ROCm branch: num_threads() is derived from C10_WARP_SIZE
+constexpr uint32_t num_threads() {
+  return C10_WARP_SIZE * 4;
+}
+// Note the return type of num_threads() differs between branches (uint32_t here, int under USE_ROCM).
+constexpr int thread_work_size() { return 8; }
+#endif
+// Product of the two constants selected above; the result is returned as int in both branches.
+constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..97ece5bfd2d7cba3fac04076f1abc3719d8024f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+// Declarations only: two in-place overloads that differ in 'other' (Tensor vs Scalar); alpha defaults to 1 in both.
+TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1);
+TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..42118c217630a793e3dc99db81ced73f4cec6f65
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/_addmm_activation_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_addmm_activation_out_cpu : public at::meta::structured__addmm_activation {  // CPU-named struct deriving from the shared meta base
+void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out);  // declared only; 'out' is taken as const at::Tensor &
+};
+struct TORCH_API structured_addmm_activation_out_cuda : public at::meta::structured__addmm_activation {  // CUDA-named struct deriving from the same base
+void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out);  // declared only; same parameter list as the CPU struct
+};
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..d99cfde46f1f4850e4a4815ac71c18cc06762be9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_amp_update_scale_ops.h>
+
+namespace at {
+
+
+// aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale_::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);  // thin wrapper: forwards every argument in order
+}
+
+// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_out(at::Tensor & out, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);  // 'out' is the first parameter here but the last argument to call()
+}
+// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_outf(const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out) {
+    return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);  // parameters already match call() order; targets the same _amp_update_scale_out op as the _out spelling
+}
+
+// aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out)
+inline ::std::tuple<at::Tensor,at::Tensor> _amp_update_scale(const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);  // all tensors are const here; the result is a tuple of two Tensors
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h
new file mode 100644
index 0000000000000000000000000000000000000000..49cf9563e00deab38c11abfb68d9b81a5c9fa885
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_autocast_to_reduced_precision_ops.h>
+
+namespace at {
+
+
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..12883720942c874ab4d6755ead3051337bfe5163
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Char_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor _cast_Char(const at::Tensor & self, bool non_blocking=false);  // declaration only; non_blocking defaults to false
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc20a697e6fb4fbb749eaca4c04de45050f5cb5a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only; returns a Tensor by value and non_blocking defaults to false.
+TORCH_API at::Tensor _cast_Half(const at::Tensor & self, bool non_blocking=false);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..000e41741551c82a2c2653b2f5e81cc38e117201
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_for_cpu_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _convert_weight_to_int4pack_for_cpu {  // operator descriptor: schema aliases, name strings, and call/redispatch entry points
+  using schema = at::Tensor (const at::Tensor &, int64_t);  // function type with the same parameters and return as call()
+  using ptr_schema = schema*;  // pointer-to-function form of schema
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_convert_weight_to_int4pack_for_cpu";
+  static constexpr const char* overload_name = "";  // empty overload name
+  static constexpr const char* schema_str = "_convert_weight_to_int4pack_for_cpu(Tensor self, int innerKTiles) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, int64_t innerKTiles);  // declared only
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t innerKTiles);  // same as call() plus a leading DispatchKeySet parameter
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..80219580c7f35528cb79e40c2eb6492d98ae9ba2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _cudnn_ctc_loss {  // descriptor for the overload whose lengths are at::IntArrayRef
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool);  // function type matching call()
+  using ptr_schema = schema*;  // pointer-to-function form of schema
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_cudnn_ctc_loss";
+  static constexpr const char* overload_name = "";  // empty overload name
+  static constexpr const char* schema_str = "_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)";
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity);  // declared only
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity);  // same as call() plus a leading DispatchKeySet parameter
+};
+
+struct TORCH_API _cudnn_ctc_loss_Tensor {  // descriptor for the "Tensor" overload: lengths are passed as tensors
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, bool, bool);  // function type matching call()
+  using ptr_schema = schema*;  // pointer-to-function form of schema
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_cudnn_ctc_loss";
+  static constexpr const char* overload_name = "Tensor";  // matches the ".Tensor" suffix in schema_str
+  static constexpr const char* schema_str = "_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)";
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity);  // declared only
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity);  // same as call() plus a leading DispatchKeySet parameter
+};
+
+struct TORCH_API _cudnn_ctc_loss_out {  // descriptor for the "out" overload: two trailing at::Tensor & parameters, tuple of references returned
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, at::Tensor &, at::Tensor &);  // function type matching call()
+  using ptr_schema = schema*;  // pointer-to-function form of schema
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_cudnn_ctc_loss";
+  static constexpr const char* overload_name = "out";  // matches the ".out" suffix in schema_str
+  static constexpr const char* schema_str = "_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))";
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1);  // declared only
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1);  // same as call() plus a leading DispatchKeySet parameter
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h
new file mode 100644
index 0000000000000000000000000000000000000000..a52157f65be2298de1d1c76e35533c9d7415c904
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cufft_get_plan_cache_max_size.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_cufft_get_plan_cache_max_size_ops.h>
+
+namespace at {
+
+
+// aten::_cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int
+inline int64_t _cufft_get_plan_cache_max_size(at::DeviceIndex device_index) {
+    return at::_ops::_cufft_get_plan_cache_max_size::call(device_index);  // thin wrapper: forwards device_index and returns the int64_t result unchanged
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a0e776f5b0e4a5f001bdf36e7e5bdc5d50f12af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cummin_helper_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declaration only; returns void. values/indices are non-const references, so presumably they receive the results -- TODO confirm.
+TORCH_API void _cummin_helper(const at::Tensor & self, at::Tensor & values, at::Tensor & indices, int64_t dim);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c438b64ce7aca7a6a231b50468e1cea1b8d8150
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only; takes one tensor and returns an int64_t (the meaning of the returned value is not shown in this header).
+TORCH_API int64_t _debug_has_internal_overlap(const at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c2541f982101aad537118295f41ac7d34a57a06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h
@@ -0,0 +1,97 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_embedding_bag_dense_backward_ops.h>
+
+namespace at {
+
+
+// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+inline at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);  // int64_t num_weights spelling; forwards every argument in order
+}
+namespace symint {  // template twin of the wrapper above, selected by an explicit template argument
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = int64_t
+  at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);  // same call as the non-template wrapper
+  }
+}
+
+// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+inline at::Tensor _embedding_bag_dense_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);  // c10::SymInt num_weights spelling; forwards every argument in order
+}
+namespace symint {  // template twin of the _symint wrapper above, under the un-suffixed name
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = c10::SymInt
+  at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);  // same call as the non-template wrapper
+  }
+}
+
+// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // 'out' is the first parameter here but the last argument to call()
+}
+namespace symint {  // template twin of the wrapper above, selected by an explicit template argument
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = int64_t
+  at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // same call as the non-template wrapper
+  }
+}
+
+// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // parameters already match call() order; padding_idx has no default in this spelling
+}
+namespace symint {  // template twin of the wrapper above, selected by an explicit template argument
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = int64_t
+  at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // same call as the non-template wrapper
+  }
+}
+
+// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _embedding_bag_dense_backward_symint_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // c10::SymInt num_weights spelling; 'out' is the first parameter but the last argument to call()
+}
+namespace symint {  // template twin of the _symint_out wrapper above, under the un-suffixed _out name
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = c10::SymInt
+  at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // same call as the non-template wrapper
+  }
+}
+
+// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _embedding_bag_dense_backward_symint_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // c10::SymInt num_weights spelling; parameters already match call() order
+}
+namespace symint {  // template twin of the _symint_outf wrapper above, under the un-suffixed _outf name
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>  // T appears in no parameter, so it cannot be deduced; enabled only for T = c10::SymInt
+  at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+    return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);  // same call as the non-template wrapper
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d74307ee632e81fcbe5efb98546dc71bac2d4469
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_forward_only_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_forward_only_outf(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional<at::Tensor> & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b24ce717374bcf9d99ab82248d6d9f47abe81796
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1);
+TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..80ca9c4276dd8ddb545a314ffa58883ff727e74a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _fft_r2c {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, int64_t, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_fft_r2c";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided);
+};
+
+struct TORCH_API _fft_r2c_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, int64_t, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_fft_r2c";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f302ef4d3f695a88175a4a0c4f3c29fbba5d496
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_flash_attention_backward_ops.h>
+
+namespace at {
+
+
+// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional<double> scale=::std::nullopt, ::std::optional<int64_t> window_size_left=::std::nullopt, ::std::optional<int64_t> window_size_right=::std::nullopt) {
+    return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? ::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional<double> scale=::std::nullopt, ::std::optional<int64_t> window_size_left=::std::nullopt, ::std::optional<int64_t> window_size_right=::std::nullopt) {
+    return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? ::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt);
+  }
+}
+
+// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward_symint(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional<double> scale=::std::nullopt, ::std::optional<c10::SymInt> window_size_left=::std::nullopt, ::std::optional<c10::SymInt> window_size_right=::std::nullopt) {
+    return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional<double> scale=::std::nullopt, ::std::optional<c10::SymInt> window_size_left=::std::nullopt, ::std::optional<c10::SymInt> window_size_right=::std::nullopt) {
+    return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d2f4593a28ce22251a9bec54095b180ec346467
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_foreach_asin_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_asin(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_asin(at::TensorList self) {
+    return at::_ops::_foreach_asin::call(self);
+}
+
+// aten::_foreach_asin_(Tensor(a!)[] self) -> ()
+inline void _foreach_asin_(at::TensorList self) {
+    return at::_ops::_foreach_asin_::call(self);
+}
+
+// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_asin_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_asin_out::call(self, out);
+}
+// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_asin_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_asin_out::call(self, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eebf66a88b9303f4a283f2bdcb53087d63750d01
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_floor_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_floor(at::TensorList self);
+TORCH_API void _foreach_floor_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_floor_outf(at::TensorList self, at::TensorList out);
+TORCH_API void _foreach_floor_(at::TensorList self);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb4c773498213cd132e3a21e637e33a2c793d133
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_frac(at::TensorList self);
+TORCH_API void _foreach_frac_(at::TensorList self);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bf4590e7cce3190f772f739e2c0c4c21d352ee9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_sign(at::TensorList self);
+TORCH_API void _foreach_sign_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_sign_outf(at::TensorList self, at::TensorList out);
+TORCH_API void _foreach_sign_(at::TensorList self);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1af5d09c4e152720cb81533a7219875cba743ef2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _functional_sym_constrain_range_for_size {
+  using schema = at::Tensor (const at::Scalar &, ::std::optional<int64_t>, ::std::optional<int64_t>, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_functional_sym_constrain_range_for_size";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor";
+  static at::Tensor call(const at::Scalar & size, ::std::optional<int64_t> min, ::std::optional<int64_t> max, const at::Tensor & dep_token);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, ::std::optional<int64_t> min, ::std::optional<int64_t> max, const at::Tensor & dep_token);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h
new file mode 100644
index 0000000000000000000000000000000000000000..d21f9bf639992cfca76c45bac8a1fec3e70654cf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h
@@ -0,0 +1,69 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_fused_adagrad_ops.h>
+
+namespace at {
+
+
+// aten::_fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad_::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf);
+}
+
+// aten::_fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad__tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf);
+}
+
+// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out);
+}
+// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale, const ::std::optional<at::Tensor> & found_inf, at::TensorList out) {
+    return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out);
+}
+
+// aten::_fused_adagrad(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out, Tensor[] state_steps_out)
+inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf);
+}
+
+// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out);
+}
+// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale, const ::std::optional<at::Tensor> & found_inf, at::TensorList out) {
+    return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out);
+}
+
+// aten::_fused_adagrad.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out)
+inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional<at::Tensor> & grad_scale={}, const ::std::optional<at::Tensor> & found_inf={}) {
+    return at::_ops::_fused_adagrad_tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3add70e853927d8d9e80cecba4d68a5942489d00
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _fused_rms_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, ::std::array<bool,2> output_mask);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e82a8d1315e29aa2450793e5de14442d79f44a5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _fused_rms_norm_backward_cuda(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, ::std::array<bool,2> output_mask);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c952ad3b5af8c9d69705226fe50cf75ec93ca94
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fw_primal_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _fw_primal_copy_out(at::Tensor & out, const at::Tensor & self, int64_t level);
+TORCH_API at::Tensor & _fw_primal_copy_outf(const at::Tensor & self, int64_t level, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..8393df9dee2248d540eb77c90bc2ed9f2a82657f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_gather_sparse_backward_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor _gather_sparse_backward(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & grad);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ddfbdf06414d0cc7133b58ddf1096dbf6f6f3a8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor & _index_put_impl_(at::Tensor & self, const c10::List<::std::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad43c636a72fd962b80fd51acabc0d044e9333df
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_native.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/_log_softmax_backward_data_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_log_softmax_backward_cpu_out : public at::meta::structured__log_softmax_backward_data {
+void impl(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, const at::Tensor & out);
+};
+struct TORCH_API structured_log_softmax_backward_cuda_out : public at::meta::structured__log_softmax_backward_data {
+void impl(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..13442417e38ef45875e9da399ac74601c018ea66
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _logcumsumexp(const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor & _logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor & _logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcb38bb0e9868c094e8f68f4495c7e55ab8f553e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & _make_per_tensor_quantized_tensor_out(const at::Tensor & self, double scale, int64_t zero_point, at::Tensor & out);
+TORCH_API at::Tensor make_per_tensor_quantized_tensor_cpu(const at::Tensor & self, double scale, int64_t zero_point);
+TORCH_API at::Tensor make_per_tensor_quantized_tensor_cuda(const at::Tensor & self, double scale, int64_t zero_point);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f9cfb8c53f58e7f261b2a22c1c4c7eff5ad0ad0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _mkldnn_reshape_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef shape);
+TORCH_API at::Tensor & _mkldnn_reshape_outf(const at::Tensor & self, at::IntArrayRef shape, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h
new file mode 100644
index 0000000000000000000000000000000000000000..41a15aa5c7727a84409e6368a48798fa7f151d75
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_transpose.h
@@ -0,0 +1,97 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_mps_convolution_transpose_ops.h>
+
+namespace at {
+
+
+// aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+inline at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::_mps_convolution_transpose::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::_mps_convolution_transpose::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+  }
+}
+
+// aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+inline at::Tensor _mps_convolution_transpose_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::_mps_convolution_transpose::call(self, weight, padding, output_padding, stride, dilation, groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor _mps_convolution_transpose(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::_mps_convolution_transpose::call(self, weight, padding, output_padding, stride, dilation, groups);
+  }
+}
+
+// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+  }
+}
+
+// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+  }
+}
+
+// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _mps_convolution_transpose_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & _mps_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out);
+  }
+}
+
+// aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _mps_convolution_transpose_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & _mps_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+    return at::_ops::_mps_convolution_transpose_out::call(self, weight, padding, output_padding, stride, dilation, groups, out);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..57ab178108d3a34403ae4fa7b9da11cc03b41e49
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_cuda_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_outf(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_outf(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae773bd7d5b671669035611a14696b38a1275505
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & _pdist_forward_out(const at::Tensor & self, double p, at::Tensor & out);
+TORCH_API at::Tensor _pdist_forward(const at::Tensor & self, double p=2);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e85728ec26480e5cd20ea8c052e4ec92cbe768d4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_print_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API void _print(c10::string_view s);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7e4d85e0c0fa9589b11b0d6e858a32752de9e0d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _scaled_dot_product_flash_attention_backward {
+  using schema = ::std::tuple<at::Tensor,at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, c10::SymInt, c10::SymInt, double, bool, const at::Tensor &, const at::Tensor &, ::std::optional<double>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_scaled_dot_product_flash_attention_backward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)";
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> call(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional<double> scale);
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional<double> scale);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d166a3dd1b0fac5450b2185fec166f0a7a012e8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_flash_attention_cpu(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const ::std::optional<at::Tensor> & attn_mask={}, ::std::optional<double> scale=::std::nullopt);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fe1a4c7aecef2ddb3655a34a879343aeb342956
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_fused_attention_overrideable_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,c10::SymInt,c10::SymInt,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_fused_attention_overrideable(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional<at::Tensor> & attn_bias={}, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional<double> scale=::std::nullopt);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ecb42910d2beacebd87847ecd466dfd00dbe015
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_slow_conv2d_forward_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _slow_conv2d_forward_output {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_slow_conv2d_forward";
+  static constexpr const char* overload_name = "output";
+  static constexpr const char* schema_str = "_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output);
+};
+
+struct TORCH_API _slow_conv2d_forward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_slow_conv2d_forward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2745c3ffc6efb4a77af718e2a95cb20e7915895
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured__softmax_backward_data : public at::impl::MetaBase {
+
+
+    void meta(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype);
+};
+
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8904318ac09d06dfc2675dcfbb4cf0110902f4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d5db02db2381c6ad24205ff14e29c66be4c1c3f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax_ops.h
@@ -0,0 +1,67 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _sparse_log_softmax_int {
+  using schema = at::Tensor (const at::Tensor &, int64_t, ::std::optional<at::ScalarType>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_sparse_log_softmax";
+  static constexpr const char* overload_name = "int";
+  static constexpr const char* schema_str = "_sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, int64_t dim, ::std::optional<at::ScalarType> dtype);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, ::std::optional<at::ScalarType> dtype);
+};
+
+struct TORCH_API _sparse_log_softmax_Dimname {
+  using schema = at::Tensor (const at::Tensor &, at::Dimname, ::std::optional<at::ScalarType>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_sparse_log_softmax";
+  static constexpr const char* overload_name = "Dimname";
+  static constexpr const char* schema_str = "_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype);
+};
+
+struct TORCH_API _sparse_log_softmax {
+  using schema = at::Tensor (const at::Tensor &, int64_t, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_sparse_log_softmax";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, int64_t dim, bool half_to_float);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float);
+};
+
+struct TORCH_API _sparse_log_softmax_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_sparse_log_softmax";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..be054e403728f881d3ea5a073c6c151e40a4f08c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _sparse_mm_reduce_impl_backward_sparse_csr_cpu(const at::Tensor & self, const at::Tensor & grad_out, const at::Tensor & weight, c10::string_view reduce, const at::Tensor & arg_out, ::std::array<bool,2> output_mask);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ba9054ff7481c1502e229ec74d57e621450ffe7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _sparse_semi_structured_addmm(const at::Tensor & input, const at::Tensor & mat1, const at::Tensor & mat1_meta, const at::Tensor & mat2, const at::Scalar & alpha=1, const at::Scalar & beta=1, ::std::optional<at::ScalarType> out_dtype=::std::nullopt);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..dadd446e1558cdd30f8916690c307be7f38e78ef
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h
@@ -0,0 +1,55 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_sparse_softmax_ops.h>
+
+namespace at {
+
+
+// aten::_sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, ::std::optional<at::ScalarType> dtype=::std::nullopt) {
+    return at::_ops::_sparse_softmax_int::call(self, dim, dtype);
+}
+
+// aten::_sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor _sparse_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype=::std::nullopt) {
+    return at::_ops::_sparse_softmax_Dimname::call(self, dim, dtype);
+}
+
+// aten::_sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
+    return at::_ops::_sparse_softmax::call(self, dim, half_to_float);
+}
+
+// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _sparse_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) {
+    return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out);
+}
+// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _sparse_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) {
+    return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9270da98fca4dc399ae91cdaa6644841de9b95b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_serialization_subcmul {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_test_serialization_subcmul";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebc37dbc78804386bbea23d0abd0fe00d28939ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_warn_in_autograd {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_test_warn_in_autograd";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_test_warn_in_autograd(Tensor self) -> Tensor";
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API _test_warn_in_autograd_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_test_warn_in_autograd";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d2586820a918d37e3b4d7bf43d3af4e05b4edca
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _thnn_fused_gru_cell {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, const ::std::optional<at::Tensor> &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_thnn_fused_gru_cell";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)";
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional<at::Tensor> & input_bias, const ::std::optional<at::Tensor> & hidden_bias);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional<at::Tensor> & input_bias, const ::std::optional<at::Tensor> & hidden_bias);
+};
+
+struct TORCH_API _thnn_fused_gru_cell_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, const ::std::optional<at::Tensor> &, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_thnn_fused_gru_cell";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))";
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional<at::Tensor> & input_bias, const ::std::optional<at::Tensor> & hidden_bias, at::Tensor & out0, at::Tensor & out1);
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional<at::Tensor> & input_bias, const ::std::optional<at::Tensor> & hidden_bias, at::Tensor & out0, at::Tensor & out1);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb3cb03ff9a8114fd6e4c087e6915f18cbd93c8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _to_sparse_bsr(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional<int64_t> dense_dim=::std::nullopt);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc469abb37d75ebad8b18be6da4e2679bdb4a1af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor _to_sparse_csr(const at::Tensor & self, ::std::optional<int64_t> dense_dim=::std::nullopt);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..8af14d73c569b0cba19c8d49f694ab8c068fff3b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _triton_scaled_dot_attention {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, double);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_triton_scaled_dot_attention";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor";
+  static at::Tensor call(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p);
+};
+
+struct TORCH_API _triton_scaled_dot_attention_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, double, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_triton_scaled_dot_attention";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "_triton_scaled_dot_attention.out(Tensor q, Tensor k, Tensor v, float dropout_p=0.0, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..36172229b4fc54d1a0e0ad09f64f589570324127
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Declaration only (the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file). Returns a 2-tuple of tensors; sorted defaults to true, return_inverse to false.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d26e88f4d0dd26288c607b5d12b3824cb1cc7975
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+// Two declarations of the same backward op: the first takes at::IntArrayRef sizes, the _symint one takes c10::SymIntArrayRef sizes. Both scale arguments default to nullopt.
+TORCH_API at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor _upsample_bicubic2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..32a94a5a500d88c3d390761e8aeab05074f4aba8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Naming pattern visible below: *_out takes `out` first and defaults `scales`; *_outf takes `out` last with no defaults; *_symint* take c10::SymIntArrayRef instead of at::IntArrayRef.
+TORCH_API at::Tensor _upsample_nearest_exact1d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales=::std::nullopt);
+TORCH_API at::Tensor _upsample_nearest_exact1d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales=::std::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales=::std::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales, at::Tensor & out);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales=::std::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..d52eb044dd747fbf60232e6c36a5a7c044eb772a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h
@@ -0,0 +1,119 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_upsample_nearest_exact3d_ops.h>
+
+namespace at {
+// Inline front-ends for aten::_upsample_nearest_exact3d; every function below forwards to the matching at::_ops::...::call.
+// The at::symint::*<T> templates repeat the same calls and are enabled by the explicit template argument T (int64_t or c10::SymInt).
+// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors); // an absent output_size stays nullopt; a present one is converted with c10::fromIntArrayRefSlow
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>> // enabled only for T = int64_t
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors); // SymInt sizes are passed through unchanged
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>> // enabled only for T = c10::SymInt
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); // `out` is the first parameter here but the last argument of the op call
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>> // enabled only for T = int64_t
+  at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); // _outf: parameters in schema order, no defaults
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>> // enabled only for T = int64_t
+  at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>> // enabled only for T = c10::SymInt
+  at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>> // enabled only for T = c10::SymInt
+  at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>> // enabled only for T = int64_t
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+  }
+} // namespace symint
+
+// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>> // enabled only for T = c10::SymInt
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_d=::std::nullopt, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w);
+  }
+} // namespace symint
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c3bf8234734e728d7c8885c5c59954cc8c73edc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured__upsample_nearest_exact3d : public at::impl::MetaBase {
+    // Only the meta() member is declared here; its definition is not part of this header.
+    // NOTE(review): presumably meta() validates the arguments and configures the output — confirm against the definition.
+    void meta(const at::Tensor & self, at::ArrayRef<int64_t> output_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h
new file mode 100644
index 0000000000000000000000000000000000000000..143ebaf74c8ba2b8b54159a1ae5586287301f4d4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_use_cudnn_ctc_loss_ops.h>
+
+namespace at {
+
+// Two overloads returning bool; they differ only in how the lengths are passed (at::IntArrayRef vs. at::Tensor) and forward to different at::_ops entries.
+// aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
+inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank) {
+    return at::_ops::_use_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank);
+}
+
+// aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool
+inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank) {
+    return at::_ops::_use_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank);
+}
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7d6708303629d8bab44c9e45f0f0e6ca3337256
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator descriptor: schema type aliases, name/overload/schema strings, and the call/redispatch entry points (declared only; no definitions in this header).
+struct TORCH_API _validate_sparse_bsr_tensor_args {
+  using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional<bool>); // C++ function type matching schema_str below; returns nothing
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_validate_sparse_bsr_tensor_args";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()";
+  static void call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional<bool> check_pinning);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional<bool> check_pinning); // same parameters as call, preceded by a c10::DispatchKeySet
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbc189450438d8ded976143386c9d8bd87ffd971
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only (the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file). Takes tensors v and g; dim defaults to 0.
+TORCH_API at::Tensor _weight_norm(const at::Tensor & v, const at::Tensor & g, int64_t dim=0);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f2401ac35082d0798cc3ec6b34a9babdeecee25
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // declarations only: one out-variant writing into out0/out1, plus two functions returning a fresh 2-tuple (presumably per-backend, judging by the _cpu/_cuda suffixes)
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> _weight_norm_interface_backward_out(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> weight_norm_backward_cpu(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> weight_norm_backward_cuda(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddc82b6fb6b64aa734c70bd4aed92de92411f972
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // single declaration; the definition is not part of this header
+TORCH_API at::Tensor _wrapped_linear_prepack(const at::Tensor & weight, const at::Tensor & weight_scale, const at::Tensor & weight_zero_point, const at::Tensor & bias); // all four inputs are tensors, including scale and zero point
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..98bc32b7020176294200047a1b4d50348297ae3d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator descriptor: schema type aliases, name/overload/schema strings, and the call/redispatch entry points (declared only; no definitions in this header).
+struct TORCH_API _wrapped_quantized_linear_prepacked {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); // six tensors then out_channel, matching schema_str below
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::_wrapped_quantized_linear_prepacked";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "_wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor";
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel); // same parameters as call, preceded by a c10::DispatchKeySet
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h
new file mode 100644
index 0000000000000000000000000000000000000000..332e9284731804c6e80dfca8275a72ada7bd5d1f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/addcdiv_ops.h>
+
+namespace at {
+
+// Inline front-ends for aten::addcdiv. addcdiv_out and addcdiv_outf call the same at::_ops::addcdiv_out entry and differ only in parameter order and in whether `value` has a default.
+// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+    return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out); // `out` is the first parameter here but the last argument of the op call
+}
+// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) {
+    return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out);
+}
+
+// aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+inline at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+    return at::_ops::addcdiv::call(self, tensor1, tensor2, value);
+}
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdfeaae63201db89c45e052362ff367d93587717
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/align_tensors.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/align_tensors_ops.h>
+
+namespace at {
+
+// Inline front-end: takes an at::TensorList and returns a std::vector of tensors produced by at::_ops::align_tensors::call.
+// aten::align_tensors(Tensor[] tensors) -> Tensor[]
+inline ::std::vector<at::Tensor> align_tensors(at::TensorList tensors) {
+    return at::_ops::align_tensors::call(tensors);
+}
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d98ecc829e952fb085603756920c6379c29c267a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptors for the three schemas below (arctanh, arctanh_, arctanh.out). call/redispatch are declared only; this header contains no definitions for them.
+struct TORCH_API arctanh {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::arctanh";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "arctanh(Tensor self) -> Tensor";
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API arctanh_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::arctanh_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "arctanh_(Tensor(a!) self) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API arctanh_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::arctanh";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae5e637068bf39a33a78767ca007532e941aa2e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+// Declaration only (no body in this header). `dim` defaults to nullopt and `keepdim` defaults to false.
+TORCH_API at::Tensor argmin(const at::Tensor & self, ::std::optional<int64_t> dim=::std::nullopt, bool keepdim=false);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..92f62ded996c33504b4a68f8084d7cfb82308ec8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_atanh : public TensorIteratorBase {
+
+    // Declared here only; the definition of meta() is not part of this header.
+    void meta(const at::Tensor & self);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9336475ca68a0798779ac558e3fa01cccf4ce01d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Two overloads, declarations only: one takes a single tensor and returns a tensor, the other takes a TensorList and returns a vector of tensors.
+TORCH_API at::Tensor atleast_3d(const at::Tensor & self);
+TORCH_API ::std::vector<at::Tensor> atleast_3d(at::TensorList tensors);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..da53ab8c9c4bbf3c17af663b7dca05550c0508f9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool1d_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptors for avg_pool1d and its `out` overload. call/redispatch are declared only; they take every argument explicitly (the defaults appear only in schema_str).
+struct TORCH_API avg_pool1d {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::avg_pool1d";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad);
+};
+
+struct TORCH_API avg_pool1d_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::avg_pool1d";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "avg_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0d17657ba5d3f84d66d8845f8a56e621b2575bf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_backward.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/avg_pool2d_backward_ops.h>
+
+namespace at {
+
+// The _out and _outf wrappers differ only in where grad_input sits in their parameter lists (first vs. last); both forward to avg_pool2d_backward_grad_input::call with grad_input as the final argument.
+// aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & avg_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional<int64_t> divisor_override) {
+    return at::_ops::avg_pool2d_backward_grad_input::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+}
+// aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional<int64_t> divisor_override, at::Tensor & grad_input) {
+    return at::_ops::avg_pool2d_backward_grad_input::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+}
+
+// aten::avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
+inline at::Tensor avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional<int64_t> divisor_override) {
+    return at::_ops::avg_pool2d_backward::call(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override);
+}
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..47d50a3a94b7846bd188c146f28221d1d8b269de
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declarations only. batch_norm_elemt_out takes `out` as its first parameter; batch_norm_elemt_outf takes it as its last.
+TORCH_API at::Tensor batch_norm_elemt(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps);
+TORCH_API at::Tensor & batch_norm_elemt_out(at::Tensor & out, const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps);
+TORCH_API at::Tensor & batch_norm_elemt_outf(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h
new file mode 100644
index 0000000000000000000000000000000000000000..92a00d968d26a39e66cdc1a6282b779484f331db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or.h
@@ -0,0 +1,73 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/bitwise_or_ops.h>
+
+namespace at {
+
+// Each wrapper below forwards to the at::_ops struct named in its body. Every *_out overload takes `out` first and every *_outf overload takes it last; both pass `out` as the final call argument.
+// aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::bitwise_or_Tensor_out::call(self, other, out);
+}
+// aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::bitwise_or_Tensor_out::call(self, other, out);
+}
+
+// aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::bitwise_or_Scalar_out::call(self, other, out);
+}
+// aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+    return at::_ops::bitwise_or_Scalar_out::call(self, other, out);
+}
+
+// aten::bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor bitwise_or(const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::bitwise_or_Scalar::call(self, other);
+}
+
+// aten::bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+inline at::Tensor bitwise_or(const at::Scalar & self, const at::Tensor & other) {
+    return at::_ops::bitwise_or_Scalar_Tensor::call(self, other);
+}
+
+// aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor bitwise_or(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::bitwise_or_Tensor::call(self, other);
+}
+
+// aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_out(at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+    return at::_ops::bitwise_or_Scalar_Tensor_out::call(self, other, out);
+}
+// aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_or_outf(const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::bitwise_or_Scalar_Tensor_out::call(self, other, out);
+}
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..93d0456a3071dbee9ca22831a46d34ce89bb4c01
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declarations only, Tensor/Tensor signatures: value-returning, out-first (_out), out-last (_outf), and bitwise_or_ which takes a non-const `self` and returns Tensor&.
+TORCH_API at::Tensor bitwise_or(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & bitwise_or_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f50b11858ae9b250220560f392e9227a373b0c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // declarations only; neither function below is defined in this header
+TORCH_API at::Tensor ccol_indices_default(const at::Tensor & self);
+TORCH_API at::Tensor ccol_indices_sparse_csr(const at::Tensor & self);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..272a44791d5ac1b585d6767eb4bf08b5952e35df
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for aten::chunk. call/redispatch are declared only and take `dim` explicitly (the dim=0 default appears only in schema_str).
+struct TORCH_API chunk {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::chunk";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]";
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, int64_t chunks, int64_t dim);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t chunks, int64_t dim);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fa32c1c64f5659a5fe2396c97288ff9abb2a5cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declarations only, in two groups: `dim` as int64_t and `dim` as at::Dimname. Only the int64_t concat and concat_out give `dim` a default (0); concat_outf requires it.
+TORCH_API at::Tensor concat(at::TensorList tensors, int64_t dim=0);
+TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, int64_t dim=0);
+TORCH_API at::Tensor & concat_outf(at::TensorList tensors, int64_t dim, at::Tensor & out);
+TORCH_API at::Tensor concat(at::TensorList tensors, at::Dimname dim);
+TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim);
+TORCH_API at::Tensor & concat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4d2151ff261579f164d7b0c872f587e0e34c5e0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for the `input` overload of aten::conv_transpose2d. call/redispatch are declared only and take every argument explicitly (the defaults appear only in schema_str).
+struct TORCH_API conv_transpose2d_input {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::conv_transpose2d";
+  static constexpr const char* overload_name = "input";
+  static constexpr const char* schema_str = "conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor";
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc9ce492c781bce2d82d0d5a81f8f99827d7db9a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copysign_meta_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+// Declarations only, Tensor/Tensor signatures: value-returning, out-first (_out), out-last (_outf), and copysign_ which takes a non-const `self` and returns Tensor&.
+TORCH_API at::Tensor copysign(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & copysign_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & copysign_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & copysign_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h
new file mode 100644
index 0000000000000000000000000000000000000000..807162ef475fe5f91c30c624d51af68fda23dac4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/crow_indices_ops.h>
+
+namespace at {
+
+// Nothing is declared in namespace at by this header; its only content is the include of crow_indices_ops.h above.
+
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f672eadac3c2572b68f5592a1f97d1fa1727abc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/crow_indices_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & crow_indices_copy_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & crow_indices_copy_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8edd9fa6500d41801fb8bd5b40b5cc800e926ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor cumprod(const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype=::std::nullopt);
+TORCH_API at::Tensor & cumprod_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype=::std::nullopt);
+TORCH_API at::Tensor & cumprod_outf(const at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype, at::Tensor & out);
+TORCH_API at::Tensor & cumprod_(at::Tensor & self, at::Dimname dim, ::std::optional<at::ScalarType> dtype=::std::nullopt);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d287b0ffa62abb12ab9e9e954626a1b88c4e4f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor diagonal(const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1);
+TORCH_API at::Tensor diagonal(const at::Tensor & self, at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d141b737498ee3d889f18e58c8effe7b93dd574
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/div.h
@@ -0,0 +1,87 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/div_ops.h>
+
+namespace at {
+
+
+// aten::div.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor div(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::div_Tensor::call(self, other);
+}
+
+// aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::div_out::call(self, other, out);
+}
+// aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::div_out::call(self, other, out);
+}
+
+// aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+inline at::Tensor div(const at::Tensor & self, const at::Tensor & other, ::std::optional<c10::string_view> rounding_mode) {
+    return at::_ops::div_Tensor_mode::call(self, other, rounding_mode);
+}
+
+// aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, ::std::optional<c10::string_view> rounding_mode) {
+    return at::_ops::div_out_mode::call(self, other, rounding_mode, out);
+}
+// aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_outf(const at::Tensor & self, const at::Tensor & other, ::std::optional<c10::string_view> rounding_mode, at::Tensor & out) {
+    return at::_ops::div_out_mode::call(self, other, rounding_mode, out);
+}
+
+// aten::div.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor div(const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::div_Scalar::call(self, other);
+}
+
+// aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+inline at::Tensor div(const at::Tensor & self, const at::Scalar & other, ::std::optional<c10::string_view> rounding_mode) {
+    return at::_ops::div_Scalar_mode::call(self, other, rounding_mode);
+}
+
+// aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::div_Scalar_out::call(self, other, out);
+}
+// aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+    return at::_ops::div_Scalar_out::call(self, other, out);
+}
+
+// aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other, ::std::optional<c10::string_view> rounding_mode) {
+    return at::_ops::div_Scalar_mode_out::call(self, other, rounding_mode, out);
+}
+// aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & div_outf(const at::Tensor & self, const at::Scalar & other, ::std::optional<c10::string_view> rounding_mode, at::Tensor & out) {
+    return at::_ops::div_Scalar_mode_out::call(self, other, rounding_mode, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9d815f19182a50598b9f9bee08feb6948b5ada6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API embedding {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::embedding";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor";
+  static at::Tensor call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse);
+};
+
+struct TORCH_API embedding_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::embedding";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h
new file mode 100644
index 0000000000000000000000000000000000000000..884429fe107b4da3d41b36063ad27a1473d37d20
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_renorm.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/embedding_renorm_ops.h>
+
+namespace at {
+
+
+// aten::embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
+inline at::Tensor & embedding_renorm_(at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+    return at::_ops::embedding_renorm_::call(self, indices, max_norm, norm_type);
+}
+
+// aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & embedding_renorm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+    return at::_ops::embedding_renorm_out::call(self, indices, max_norm, norm_type, out);
+}
+// aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & embedding_renorm_outf(const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type, at::Tensor & out) {
+    return at::_ops::embedding_renorm_out::call(self, indices, max_norm, norm_type, out);
+}
+
+// aten::embedding_renorm(Tensor self, Tensor indices, float max_norm, float norm_type) -> Tensor
+inline at::Tensor embedding_renorm(const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+    return at::_ops::embedding_renorm::call(self, indices, max_norm, norm_type);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab0f2932d6233a6e98fb9bc799c8ab63e32b42aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_meta_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor empty(at::IntArrayRef size, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=::std::nullopt);
+TORCH_API at::Tensor empty(at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format);
+TORCH_API at::Tensor empty_symint(c10::SymIntArrayRef size, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=::std::nullopt);
+TORCH_API at::Tensor empty_symint(c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1c1de4b1aaa29a53eeb472bf56ad20e32393fa1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_cuda_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={});
+TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={});
+TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7a620bbb4388e3bd07d3394374fd62805f5ce84
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & empty_strided_out_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out);
+TORCH_API at::Tensor empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+TORCH_API at::Tensor empty_strided_cuda(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+TORCH_API at::Tensor empty_strided_meta_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+TORCH_API at::Tensor empty_strided_unknown_quantized(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e3a573c421272df7801a2e543af7258f83f30d5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API bool equal(const at::Tensor & self, const at::Tensor & other);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..80fdd2fda4ae2f7f62fe4cf34bf9394604f4d322
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erf_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor erf(const at::Tensor & self);
+TORCH_API at::Tensor & erf_(at::Tensor & self);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c73c50b315a4081638a84b92913dec1ea184cb9d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor fft_fft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional<c10::string_view> norm=::std::nullopt);
+TORCH_API at::Tensor & fft_fft2_symint_out(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm, at::Tensor & out);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..973228e10f49ee4f467b18b51678653e6743b4b4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API fft_irfft2 {
+  using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional<c10::string_view>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::fft_irfft2";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm);
+};
+
+struct TORCH_API fft_irfft2_out {
+  using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional<c10::string_view>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::fft_irfft2";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h
new file mode 100644
index 0000000000000000000000000000000000000000..4981643fd116edd8e23cd307fd8033135d1ceccf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft.h
@@ -0,0 +1,97 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/fft_rfft_ops.h>
+
+namespace at {
+
+
+// aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+inline at::Tensor fft_rfft(const at::Tensor & self, ::std::optional<int64_t> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor fft_rfft(const at::Tensor & self, ::std::optional<int64_t> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm);
+  }
+}
+
+// aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+inline at::Tensor fft_rfft_symint(const at::Tensor & self, ::std::optional<c10::SymInt> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft::call(self, n, dim, norm);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor fft_rfft(const at::Tensor & self, ::std::optional<c10::SymInt> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft::call(self, n, dim, norm);
+  }
+}
+
+// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional<int64_t> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional<int64_t> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out);
+  }
+}
+
+// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional<int64_t> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out) {
+    return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional<int64_t> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out) {
+    return at::_ops::fft_rfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out);
+  }
+}
+
+// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_rfft_symint_out(at::Tensor & out, const at::Tensor & self, ::std::optional<c10::SymInt> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft_out::call(self, n, dim, norm, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & fft_rfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional<c10::SymInt> n=::std::nullopt, int64_t dim=-1, ::std::optional<c10::string_view> norm=::std::nullopt) {
+    return at::_ops::fft_rfft_out::call(self, n, dim, norm, out);
+  }
+}
+
+// aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_rfft_symint_outf(const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out) {
+    return at::_ops::fft_rfft_out::call(self, n, dim, norm, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & fft_rfft_outf(const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out) {
+    return at::_ops::fft_rfft_out::call(self, n, dim, norm, out);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7cdf9f3fe42adc6e91ca4e8bcc2d59af88abc82
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_rfft2_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor fft_rfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional<c10::string_view> norm=::std::nullopt);
+TORCH_API at::Tensor fft_rfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional<c10::string_view> norm=::std::nullopt);
+TORCH_API at::Tensor & fft_rfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional<c10::string_view> norm=::std::nullopt);
+TORCH_API at::Tensor & fft_rfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm, at::Tensor & out);
+TORCH_API at::Tensor & fft_rfft2_symint_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional<c10::string_view> norm=::std::nullopt);
+TORCH_API at::Tensor & fft_rfft2_symint_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional<c10::string_view> norm, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c6c37bb8c172d61fbf62b81dfcb569917dab01f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> fractional_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool2d_out(at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4e5766304ee387cbd823d5bfd42b162de8c8250
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API fractional_max_pool3d_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::fractional_max_pool3d_backward";
+  static constexpr const char* overload_name = "grad_input";
+  static constexpr const char* schema_str = "fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input);
+};
+
+struct TORCH_API fractional_max_pool3d_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::fractional_max_pool3d_backward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor";
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..3eff09091d9093cb254b7f68c2ff42fa39caf47f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_gather : public at::impl::MetaBase {
+
+
+    void meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad);
+};
+
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fe0b341091ae1fa0cd1292458e411b2b9da9436
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor gcd(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & gcd_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & gcd_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6c6b347793670678fd73a7d6a1b6623465d714f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor gelu(const at::Tensor & self, c10::string_view approximate="none");
+TORCH_API at::Tensor & gelu_out(at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none");
+TORCH_API at::Tensor & gelu_outf(const at::Tensor & self, c10::string_view approximate, at::Tensor & out);
+TORCH_API at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate="none");
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f031c7c8b84858b071de8b1fb4eea18dc4ab4fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ger.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/ger_ops.h>
+
+namespace at {
+
+
+// aten::ger(Tensor self, Tensor vec2) -> Tensor
+inline at::Tensor ger(const at::Tensor & self, const at::Tensor & vec2) {
+    return at::_ops::ger::call(self, vec2);
+}
+
+// aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & ger_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & vec2) {
+    return at::_ops::ger_out::call(self, vec2, out);
+}
+// aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & ger_outf(const at::Tensor & self, const at::Tensor & vec2, at::Tensor & out) {
+    return at::_ops::ger_out::call(self, vec2, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1c4de7dfcee9f1485c9250491919bd048ec7110
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h
@@ -0,0 +1,89 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API greater_Scalar_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater";
+  static constexpr const char* overload_name = "Scalar_out";
+  static constexpr const char* schema_str = "greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+};
+
+struct TORCH_API greater_Scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater";
+  static constexpr const char* overload_name = "Scalar";
+  static constexpr const char* schema_str = "greater.Scalar(Tensor self, Scalar other) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API greater_Tensor_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater";
+  static constexpr const char* overload_name = "Tensor_out";
+  static constexpr const char* schema_str = "greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+struct TORCH_API greater_Tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater";
+  static constexpr const char* overload_name = "Tensor";
+  static constexpr const char* schema_str = "greater.Tensor(Tensor self, Tensor other) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API greater__Scalar {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater_";
+  static constexpr const char* overload_name = "Scalar";
+  static constexpr const char* schema_str = "greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API greater__Tensor {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::greater_";
+  static constexpr const char* overload_name = "Tensor";
+  static constexpr const char* schema_str = "greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..a737519c3ff3a27f5511b2882871fd70c0979561
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & grid_sampler_2d_out(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out);
+TORCH_API at::Tensor grid_sampler_2d_cpu(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+TORCH_API at::Tensor grid_sampler_2d_cuda(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..37d273dc8cdbac5af6dc0b8c9c531597028d1e9d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/hardsigmoid_backward_ops.h>
+
+namespace at {
+
+
+// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & hardsigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) {
+    return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input);
+}
+// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & hardsigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) {
+    return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input);
+}
+
+// aten::hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
+inline at::Tensor hardsigmoid_backward(const at::Tensor & grad_output, const at::Tensor & self) {
+    return at::_ops::hardsigmoid_backward::call(grad_output, self);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7b49a3e190fcf4929dde4b1dd3ecdfec803db8d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_hardsigmoid_backward : public TensorIteratorBase {
+
+
+    void meta(const at::Tensor & grad_output, const at::Tensor & self);
+};
+
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..892ece6f84ae8ca9f80e528372bb932cbfdb66c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor hardswish(const at::Tensor & self);
+TORCH_API at::Tensor & hardswish_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & hardswish_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & hardswish_(at::Tensor & self);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7709ef73430311dc50a236a864e7c249821f600
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/hardtanh_backward_ops.h>
+
+namespace at {
+
+
+// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) {
+    return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input);
+}
+// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input) {
+    return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input);
+}
+
+// aten::hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
+inline at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) {
+    return at::_ops::hardtanh_backward::call(grad_output, self, min_val, max_val);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..144e59c679abdd10096eb55ce08399d902953a4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor hash_tensor(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false, int64_t mode=0);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h
new file mode 100644
index 0000000000000000000000000000000000000000..3afc8b577723d664a639158fa802b006e5c69a8b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hstack.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/hstack_ops.h>
+
+namespace at {
+
+
+// aten::hstack(Tensor[] tensors) -> Tensor
+inline at::Tensor hstack(at::TensorList tensors) {
+    return at::_ops::hstack::call(tensors);
+}
+
+// aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & hstack_out(at::Tensor & out, at::TensorList tensors) {
+    return at::_ops::hstack_out::call(tensors, out);
+}
+// aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & hstack_outf(at::TensorList tensors, at::Tensor & out) {
+    return at::_ops::hstack_out::call(tensors, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..17d4f3472f9050511a1c1520ffc533c332c43a3d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss_backward_cuda_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declarations only. Both take the same operands: _out puts grad_input first, _outf puts it last.
+TORCH_API at::Tensor & huber_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta);
+TORCH_API at::Tensor & huber_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..e94757084730f1d57cf366f293fcd0d40a91c365
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h
@@ -0,0 +1,44 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+// Meta-function declaration for index_add; meta() returns a precompute_out<true>.
+struct TORCH_API structured_index_add : public at::impl::MetaBase {
+                // DIM is a compile-time flag: false until set_dim() has produced a precompute_out<true>.
+                template <bool DIM = false>
+                struct TORCH_API precompute_out {
+                    // Returns a precompute_out<true> holding `value`; fails to compile if DIM is already true.
+                    precompute_out<true> set_dim(int64_t value) {
+                        static_assert(DIM == false, "dim already set");
+                        precompute_out<true> ret;
+ret.dim = value;
+return ret;
+                    }
+                    // Value stored by set_dim().
+                    int64_t dim;
+            };
+    using meta_return_ty = precompute_out <true>;
+    meta_return_ty meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5cd9056522b1cd97c04df9979e441d014ba1b92
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/instance_norm_ops.h>
+
+namespace at {
+
+// Inline wrapper: passes all nine arguments, unchanged and in order, to at::_ops::instance_norm::call.
+// aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
+inline at::Tensor instance_norm(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, const ::std::optional<at::Tensor> & running_mean, const ::std::optional<at::Tensor> & running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) {
+    return at::_ops::instance_norm::call(input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c2790b1388072cc79512fd3a2fc204963f3226d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declarations only: a value-returning form plus two out forms (_out: `out` first, _outf: `out` last).
+TORCH_API at::Tensor inverse(const at::Tensor & self);
+TORCH_API at::Tensor & inverse_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & inverse_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e6ca3c2824ed95472376cc405c81c6a26590954
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only: a bool-returning query on a single tensor.
+TORCH_API bool is_inference(const at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e024c5d39f0f9686ec781bff3dd0ced0b57f1c4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_pinned_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declaration only: `device` is optional and defaults to ::std::nullopt.
+TORCH_API bool is_pinned(const at::Tensor & self, ::std::optional<at::Device> device=::std::nullopt);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..29f56db48e94344501185cee43825347bde69b5c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isclose_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declaration only. Defaults: rtol=1e-05, atol=1e-08, equal_nan=false.
+namespace at {
+namespace native {
+TORCH_API at::Tensor isclose(const at::Tensor & self, const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c15158f09213113bed4a3d760ec562413210f4b2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Declarations only: a value-returning form plus two out forms (_out: `out` first, _outf: `out` last).
+TORCH_API at::Tensor isneginf(const at::Tensor & self);
+TORCH_API at::Tensor & isneginf_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & isneginf_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..14951b5ccbe251aa7c99244b6a7d89c8fe3d7e10
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only: takes one tensor and returns an at::Tensor by value.
+TORCH_API at::Tensor isreal(const at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd191a208873053d7b4671337f5b4b9882fd47ae
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// `grad_input` overload: takes a mutable grad_input as the last operand and returns at::Tensor &.
+struct TORCH_API leaky_relu_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::leaky_relu_backward";
+  static constexpr const char* overload_name = "grad_input";
+  static constexpr const char* schema_str = "leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input);
+};
+// Default overload (overload_name is the empty string): returns at::Tensor by value.
+struct TORCH_API leaky_relu_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::leaky_relu_backward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor";
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..25429a732d69d19811a4a8cdf0fa104ff4fe5ddf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: just the two out forms are declared here (_out: `out` first, _outf: `out` last).
+TORCH_API at::Tensor & lift_fresh_copy_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & lift_fresh_copy_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe07b2a5eb9fcdec33ce921f20af02cc13fb7e93
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_cpu_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Declarations only. `dim` defaults to -1, except in _outf, where every argument is required.
+TORCH_API at::Tensor linalg_cross(const at::Tensor & self, const at::Tensor & other, int64_t dim=-1);
+TORCH_API at::Tensor & linalg_cross_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1);
+TORCH_API at::Tensor & linalg_cross_outf(const at::Tensor & self, const at::Tensor & other, int64_t dim, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f3368dff66b454c7fdfa868b723e9fc6add9fbc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+// Meta-function declaration for linalg_cross; derives from at::impl::MetaBase.
+struct TORCH_API structured_linalg_cross : public at::impl::MetaBase {
+
+    // Declared only; takes the same (self, other, dim) operands as the operator and returns nothing.
+    void meta(const at::Tensor & self, const at::Tensor & other, int64_t dim);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba5bcb116dd51221203fcc67e73b135aa83768e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/linalg_eigh_ops.h>
+
+namespace at {
+
+// Inline wrappers forwarding to at::_ops::linalg_eigh / linalg_eigh_eigvals; UPLO defaults to "L" where defaulted.
+// aten::linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors)
+inline ::std::tuple<at::Tensor,at::Tensor> linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L") {
+    return at::_ops::linalg_eigh::call(self, UPLO);
+}
+// _out takes the two outputs first, _outf takes them last (no default for UPLO); both make the same call.
+// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eigh_out(at::Tensor & eigvals, at::Tensor & eigvecs, const at::Tensor & self, c10::string_view UPLO="L") {
+    return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs);
+}
+// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eigh_outf(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs) {
+    return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a15412312fe0d962a841853dfc4eda8f4ac3bb4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/linalg_ldl_factor_ops.h>
+
+namespace at {
+
+// Inline wrappers forwarding to at::_ops::linalg_ldl_factor / linalg_ldl_factor_out; hermitian defaults to false where defaulted.
+// aten::linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)
+inline ::std::tuple<at::Tensor,at::Tensor> linalg_ldl_factor(const at::Tensor & self, bool hermitian=false) {
+    return at::_ops::linalg_ldl_factor::call(self, hermitian);
+}
+// _out takes LD and pivots first, _outf takes them last (no default for hermitian); both make the same call.
+// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_ldl_factor_out(at::Tensor & LD, at::Tensor & pivots, const at::Tensor & self, bool hermitian=false) {
+    return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots);
+}
+// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_ldl_factor_outf(const at::Tensor & self, bool hermitian, at::Tensor & LD, at::Tensor & pivots) {
+    return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h
new file mode 100644
index 0000000000000000000000000000000000000000..85f62a7c75b1076e23ef9fd3c2c147bd1675fdf5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/linalg_multi_dot_ops.h>
+
+namespace at {
+
+// Inline wrappers that forward directly to the at::_ops::linalg_multi_dot / linalg_multi_dot_out operator structs.
+// aten::linalg_multi_dot(Tensor[] tensors) -> Tensor
+inline at::Tensor linalg_multi_dot(at::TensorList tensors) {
+    return at::_ops::linalg_multi_dot::call(tensors);
+}
+// _out takes `out` first, _outf takes it last; both call linalg_multi_dot_out::call(tensors, out).
+// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & linalg_multi_dot_out(at::Tensor & out, at::TensorList tensors) {
+    return at::_ops::linalg_multi_dot_out::call(tensors, out);
+}
+// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & linalg_multi_dot_outf(at::TensorList tensors, at::Tensor & out) {
+    return at::_ops::linalg_multi_dot_out::call(tensors, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9464d45a7751087c8c387341fcdd9cd3128445a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declarations only. Three overload families, keyed on the tolerance argument: optional atol/rtol, double rcond, Tensor rcond.
+TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, ::std::optional<double> atol, ::std::optional<double> rtol, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, ::std::optional<double> atol, ::std::optional<double> rtol, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, ::std::optional<double> atol, ::std::optional<double> rtol, bool hermitian, at::Tensor & out);
+TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, double rcond, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, double rcond, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, double rcond, bool hermitian, at::Tensor & out);
+TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false);
+TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, const at::Tensor & rcond, bool hermitian, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..295f38fcd3e70fe992ea43595bf9082285b88149
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_slogdet_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> linalg_slogdet(const at::Tensor & A);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> linalg_slogdet_out(const at::Tensor & A, at::Tensor & sign, at::Tensor & logabsdet);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3b8ce9c20e90bd2fd7cd33ad1596422f0ab5669
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_log1p : public TensorIteratorBase {
+
+
+    void meta(const at::Tensor & self);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bbe6f9a3fcf089cc13fb1e35bb9adabc0d2df27
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/log_sigmoid_ops.h>
+
+namespace at {
+
+
+// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & log_sigmoid_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::log_sigmoid_out::call(self, out);
+}
+// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & log_sigmoid_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::log_sigmoid_out::call(self, out);
+}
+
+// aten::log_sigmoid(Tensor self) -> Tensor
+inline at::Tensor log_sigmoid(const at::Tensor & self) {
+    return at::_ops::log_sigmoid::call(self);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ccb46eb4082be9da8936764e63812541fd2de
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid_backward_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor log_sigmoid_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer);
+TORCH_API at::Tensor & log_sigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer);
+TORCH_API at::Tensor & log_sigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h
new file mode 100644
index 0000000000000000000000000000000000000000..34b6706df7fc2f21b0e063143a195248d9b7f43f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/logaddexp2_ops.h>
+
+namespace at {
+
+
+// aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::logaddexp2_out::call(self, other, out);
+}
+// aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::logaddexp2_out::call(self, other, out);
+}
+
+// aten::logaddexp2(Tensor self, Tensor other) -> Tensor
+inline at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::logaddexp2::call(self, other);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b12c09dd34eca0e5badcd52e906a82b01b45a9f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..34fc2a808efa0afdaf1571f6c09b775d1715c623
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp_native.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor logcumsumexp(const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor & logcumsumexp_out(const at::Tensor & self, int64_t dim, at::Tensor & out);
+TORCH_API at::Tensor logcumsumexp(const at::Tensor & self, at::Dimname dim);
+TORCH_API at::Tensor & logcumsumexp_out(const at::Tensor & self, at::Dimname dim, at::Tensor & out);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..29e594c7dfdb46fb0fcd474187b291ff97588823
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor logical_and(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & logical_and_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cea312ae4ba6b55d3a33bd4225e306f1a75d8b1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_xor_cuda_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor & logical_xor_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & logical_xor_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3730bcb3b0673d0a65036a90891af1b99c2a2a1c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor logit(const at::Tensor & self, ::std::optional<double> eps=::std::nullopt);
+TORCH_API at::Tensor & logit_out(at::Tensor & out, const at::Tensor & self, ::std::optional<double> eps=::std::nullopt);
+TORCH_API at::Tensor & logit_outf(const at::Tensor & self, ::std::optional<double> eps, at::Tensor & out);
+TORCH_API at::Tensor & logit_(at::Tensor & self, ::std::optional<double> eps=::std::nullopt);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f5e6583ecc8b91fa3300c50dd00112e002736f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/logspace_ops.h>
+
+namespace at {
+
+
+// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+    return at::_ops::logspace::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::logspace::call(start, end, steps, base, dtype, layout, device, pin_memory);
+}
+
+// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+    return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory);
+}
+
+// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+    return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, dtype, layout, device, pin_memory);
+}
+
+// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+    return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory);
+}
+
+// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0) {
+    return at::_ops::logspace_out::call(start, end, steps, base, out);
+}
+// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) {
+    return at::_ops::logspace_out::call(start, end, steps, base, out);
+}
+
+// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0) {
+    return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out);
+}
+// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) {
+    return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out);
+}
+
+// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0) {
+    return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out);
+}
+// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) {
+    return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out);
+}
+
+// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0) {
+    return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out);
+}
+// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) {
+    return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffcc5a3501559667389b789183a7e38319000c1b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_native.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor masked_scatter(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+TORCH_API at::Tensor & masked_scatter_out(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out);
+TORCH_API at::Tensor & masked_scatter__cpu(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+TORCH_API at::Tensor & masked_scatter__cuda(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c350897170b58f9cbcf9b33af34b9ff1abc02cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API matmul {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::matmul";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "matmul(Tensor self, Tensor other) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API matmul_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::matmul";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a1b521df218186e05aac90ad748f3a676ee6c3f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API max_pool2d_with_indices_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::max_pool2d_with_indices";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))";
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);
+};
+
+struct TORCH_API max_pool2d_with_indices {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::max_pool2d_with_indices";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)";
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c23a22e039712ca29fc5167bfc8ad56f87cef493
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional<at::Scalar> & alpha, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..85f0e0f71c0eabf5c60e970ed075cfa158c550e9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_cuda_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor miopen_convolution(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic);
+TORCH_API at::Tensor miopen_convolution_symint(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bee78a94be892d967760dfcb280055fc93866e23
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API miopen_convolution {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::miopen_convolution";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic);
+};
+
+struct TORCH_API miopen_convolution_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::miopen_convolution";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ed35c78d273e01fc25cc5619ca2bf6ce9a19df5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor math_mish_backward(const at::Tensor & grad_output, const at::Tensor & self);
+TORCH_API at::Tensor mish_backward(const at::Tensor & grad_output, const at::Tensor & self);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca183b58f8bee439911855a302823bbfea12c013
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/mkldnn_linear_backward_weights_ops.h>
+
+namespace at {
+
+
+// aten::mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) -- functional variant; thin inline wrapper that forwards its arguments unchanged.
+inline ::std::tuple<at::Tensor,at::Tensor> mkldnn_linear_backward_weights(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) {
+    return at::_ops::mkldnn_linear_backward_weights::call(grad_output, input, weight, bias_defined);  // same argument order as the schema above
+}
+
+// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) -- outputs-first overload: out0/out1 lead the C++ parameter list.
+inline ::std::tuple<at::Tensor &,at::Tensor &> mkldnn_linear_backward_weights_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) {
+    return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1);  // out0/out1 are moved to the end to match the schema order
+}
+// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) -- schema-order overload: out0/out1 trail the C++ parameter list.
+inline ::std::tuple<at::Tensor &,at::Tensor &> mkldnn_linear_backward_weights_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined, at::Tensor & out0, at::Tensor & out1) {
+    return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1);  // forwards to the same operator entry point as mkldnn_linear_backward_weights_out
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f2860569a561e95875cdd8a62ac606b60514c8a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> mps_convolution_transpose_backward_out_symint(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1ac11224ac580d337b8e356e2fdf8ca22390cb6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_mse_loss : public TensorIteratorBase {
+    // Declared in namespace at::meta; derives from TensorIteratorBase.
+    // Only the meta() member is declared here; its definition is not part of this header.
+    void meta(const at::Tensor & self, const at::Tensor & target, int64_t reduction);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c21f17eaee5c12646779c857df5137098ec8b6dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/mse_loss_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_mse_loss_out : public at::meta::structured_mse_loss {
+void impl(const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..569dbe7a9092b4533a08e55777c29fb4855bb8e3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multi_margin_loss_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API multi_margin_loss_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const ::std::optional<at::Tensor> &, int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::multi_margin_loss";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out);
+};
+
+struct TORCH_API multi_margin_loss {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const ::std::optional<at::Tensor> &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::multi_margin_loss";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional<at::Tensor> & weight, int64_t reduction);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const ::std::optional<at::Tensor> & weight, int64_t reduction);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a903e11f66b5159099f50633fb789810ef92d31c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor multilabel_margin_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target);
+TORCH_API at::Tensor & multilabel_margin_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target);
+TORCH_API at::Tensor & multilabel_margin_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..82775810c09aa3e8051265f0b839141b399fffb7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multiply_native.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor multiply(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & multiply_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & multiply_(at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor multiply(const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & multiply_(at::Tensor & self, const at::Scalar & other);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4873dd71df73b8cb90e4bbebbb44670fb1337ad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional<at::ScalarType> dtype=::std::nullopt);
+TORCH_API at::Tensor & nansum_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional<at::ScalarType> dtype=::std::nullopt);
+TORCH_API at::Tensor & nansum_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional<at::ScalarType> dtype, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbb064726cc38b64e0547ef51525f555cc9ca732
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_batch_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & running_mean, const ::std::optional<at::Tensor> & running_var, const ::std::optional<at::Tensor> & save_mean, const ::std::optional<at::Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..803c442144d231099ef3abb4af3a5850a06311a8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_backward_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API native_layer_norm_backward {
+  using schema = ::std::tuple<at::Tensor,at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, const ::std::optional<at::Tensor> &, ::std::array<bool,3>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::native_layer_norm_backward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)";
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> call(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask);
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask);
+};
+
+struct TORCH_API native_layer_norm_backward_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, const ::std::optional<at::Tensor> &, ::std::array<bool,3>, at::Tensor &, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::native_layer_norm_backward";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))";
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> call(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..22746c258d7705a2e5c23aa935c0ae5448554e7b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_norm.h
@@ -0,0 +1,59 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/native_norm_ops.h>
+
+namespace at {
+
+
+// aten::native_norm(Tensor self, Scalar p=2) -> Tensor -- functional variant; p defaults to 2 when omitted.
+inline at::Tensor native_norm(const at::Tensor & self, const at::Scalar & p=2) {
+    return at::_ops::native_norm::call(self, p);  // thin forward to the operator's call() entry point
+}
+
+// aten::native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor -- overload with optional p and dtype; no parameter has a default here.
+inline at::Tensor native_norm(const at::Tensor & self, const ::std::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, ::std::optional<at::ScalarType> dtype) {
+    return at::_ops::native_norm_ScalarOpt_dim_dtype::call(self, p, dim, keepdim, dtype);  // arguments forwarded unchanged, in schema order
+}
+
+// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!) -- outputs-first overload: out leads the C++ parameter list; p defaults to 2.
+inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2) {
+    return at::_ops::native_norm_out::call(self, p, out);  // out is moved to the end to match the schema order
+}
+// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!) -- schema-order overload: out trails the parameter list and p has no C++ default.
+inline at::Tensor & native_norm_outf(const at::Tensor & self, const at::Scalar & p, at::Tensor & out) {
+    return at::_ops::native_norm_out::call(self, p, out);  // forwards to the same operator entry point as the p=2 native_norm_out overload
+}
+
+// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) -- outputs-first overload: out leads the C++ parameter list.
+inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, ::std::optional<at::ScalarType> dtype) {
+    return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out);  // out is moved to the end to match the schema order
+}
+// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) -- schema-order overload: out trails the C++ parameter list.
+inline at::Tensor & native_norm_outf(const at::Tensor & self, const ::std::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, ::std::optional<at::ScalarType> dtype, at::Tensor & out) {
+    return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out);  // forwards to the same operator entry point as the outputs-first overload
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a563f2c91b7e7623cb8349f08a1370bd3e16061
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor new_full(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+TORCH_API at::Tensor & new_full_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2734b3f39712e2641a9cf44a702817722bcb034
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_zeros.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/new_zeros_ops.h>
+
+namespace at {
+
+
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor new_zeros(const at::Tensor & self, at::IntArrayRef size, at::TensorOptions options={}) {
+    return at::_ops::new_zeros::call(self, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor new_zeros(const at::Tensor & self, at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::new_zeros::call(self, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+  }
+}
+
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor new_zeros(const at::Tensor & self, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+    return at::_ops::new_zeros::call(self, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor new_zeros(const at::Tensor & self, c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::new_zeros::call(self, size, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+    return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+    return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out);
+  }
+}
+
+// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & new_zeros_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+    return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor & new_zeros_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+    return at::_ops::new_zeros_out::call(self, c10::fromIntArrayRefSlow(size), out);
+  }
+}
+
+// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & new_zeros_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+    return at::_ops::new_zeros_out::call(self, size, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & new_zeros_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+    return at::_ops::new_zeros_out::call(self, size, out);
+  }
+}
+
+// aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & new_zeros_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+    return at::_ops::new_zeros_out::call(self, size, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor & new_zeros_outf(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+    return at::_ops::new_zeros_out::call(self, size, out);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2daf30c26a82433de871943c16cfbf3ba6c4a02c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pad_sequence_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor pad_sequence(at::TensorList sequences, bool batch_first=false, double padding_value=0.0, c10::string_view padding_side="right");
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h
new file mode 100644
index 0000000000000000000000000000000000000000..b10009bb2594e43389be17ae319a2e144ba1e4b7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/pixel_shuffle_ops.h>
+
+namespace at {
+
+
+// aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+inline at::Tensor pixel_shuffle(const at::Tensor & self, int64_t upscale_factor) {
+    return at::_ops::pixel_shuffle::call(self, upscale_factor);
+}
+
+// aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & pixel_shuffle_out(at::Tensor & out, const at::Tensor & self, int64_t upscale_factor) {
+    return at::_ops::pixel_shuffle_out::call(self, upscale_factor, out);
+}
+// aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & pixel_shuffle_outf(const at::Tensor & self, int64_t upscale_factor, at::Tensor & out) {
+    return at::_ops::pixel_shuffle_out::call(self, upscale_factor, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..afd8ffc3aa48640cfbd697fcfe4c54a6991c10b8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor pixel_shuffle(const at::Tensor & self, int64_t upscale_factor);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5519e52dc4a77d3dbb1381fbf0e504b902c50c0e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor poisson(const at::Tensor & self, ::std::optional<at::Generator> generator=::std::nullopt);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5094d41c3fd272b7244d514365d1344725e2df0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & poisson_out(const at::Tensor & self, ::std::optional<at::Generator> generator, at::Tensor & out);
+TORCH_API at::Tensor _s_poisson_cpu(const at::Tensor & self, ::std::optional<at::Generator> generator=::std::nullopt);
+TORCH_API at::Tensor _s_poisson_cuda(const at::Tensor & self, ::std::optional<at::Generator> generator=::std::nullopt);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d78ac59e59dc61b49ace81989edb0059fed0f57
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/quantized_max_pool2d_ops.h>
+
+namespace at {
+
+
+// aten::quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+inline at::Tensor quantized_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+    return at::_ops::quantized_max_pool2d::call(self, kernel_size, stride, padding, dilation, ceil_mode);
+}
+
+// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & quantized_max_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+    return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out);
+}
+// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & quantized_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+    return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..09a116a24bc596fdc4833164b11e441ccf49f91b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_ops.h
@@ -0,0 +1,111 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API randn {
+  using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional<at::ScalarType>, ::std::optional<at::Layout>, ::std::optional<at::Device>, ::std::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor";
+  static at::Tensor call(c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+};
+
+struct TORCH_API randn_generator {
+  using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional<at::Generator>, ::std::optional<at::ScalarType>, ::std::optional<at::Layout>, ::std::optional<at::Device>, ::std::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "generator";
+  static constexpr const char* schema_str = "randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor";
+  static at::Tensor call(c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+};
+
+struct TORCH_API randn_names {
+  using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional<at::DimnameList>, ::std::optional<at::ScalarType>, ::std::optional<at::Layout>, ::std::optional<at::Device>, ::std::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "names";
+  static constexpr const char* schema_str = "randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor";
+  static at::Tensor call(c10::SymIntArrayRef size, ::std::optional<at::DimnameList> names, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::DimnameList> names, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+};
+
+struct TORCH_API randn_generator_with_names {
+  using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional<at::Generator>, ::std::optional<at::DimnameList>, ::std::optional<at::ScalarType>, ::std::optional<at::Layout>, ::std::optional<at::Device>, ::std::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "generator_with_names";
+  static constexpr const char* schema_str = "randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor";
+  static at::Tensor call(c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::DimnameList> names, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::DimnameList> names, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+};
+
+struct TORCH_API randn_out {
+  using schema = at::Tensor & (c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(c10::SymIntArrayRef size, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out);
+};
+
+struct TORCH_API randn_generator_out {
+  using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "generator_out";
+  static constexpr const char* schema_str = "randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, at::Tensor & out);
+};
+
+struct TORCH_API randn_names_out {
+  using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional<at::DimnameList>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "names_out";
+  static constexpr const char* schema_str = "randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional<at::DimnameList> names, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::DimnameList> names, at::Tensor & out);
+};
+
+struct TORCH_API randn_generator_with_names_out {
+  using schema = at::Tensor & (c10::SymIntArrayRef, ::std::optional<at::Generator>, ::std::optional<at::DimnameList>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::randn";
+  static constexpr const char* overload_name = "generator_with_names_out";
+  static constexpr const char* schema_str = "randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::DimnameList> names, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional<at::Generator> generator, ::std::optional<at::DimnameList> names, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..97ac74755dff5e11d9f7ce51133b63c0463d3256
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor reflection_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor reflection_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input);
+TORCH_API at::Tensor & reflection_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d17e59fbe477843c687e0c5d17708e5ff819643
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward_cuda_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor reflection_pad3d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input);
+TORCH_API at::Tensor & reflection_pad3d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad3d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7be4f7bc996fe76403a9e9aa71cb120604516df0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu_native.h
@@ -0,0 +1,40 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & relu_out(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor relu(const at::Tensor & self);
+TORCH_API at::Tensor & relu_(at::Tensor & self);
+TORCH_API at::Tensor NestedTensor_relu(const at::Tensor & self);
+TORCH_API at::Tensor & NestedTensor_relu_(at::Tensor & self);
+TORCH_API at::Tensor relu_sparse(const at::Tensor & self);
+TORCH_API at::Tensor & relu_sparse_(at::Tensor & self);
+TORCH_API at::Tensor relu_sparse_csr(const at::Tensor & self);
+TORCH_API at::Tensor & relu_sparse_csr_(at::Tensor & self);
+TORCH_API at::Tensor mkldnn_relu(const at::Tensor & self);
+TORCH_API at::Tensor & mkldnn_relu_(at::Tensor & self);
+TORCH_API at::Tensor relu_quantized_cpu(const at::Tensor & self);
+TORCH_API at::Tensor & relu_quantized_cpu_(at::Tensor & self);
+TORCH_API at::Tensor relu_quantized_cuda(const at::Tensor & self);
+TORCH_API at::Tensor & relu_quantized_cuda_(at::Tensor & self);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fe323549a841c89f619351ee507d998f68b5752
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API repeat {
+  using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::repeat";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "repeat(Tensor self, SymInt[] repeats) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef repeats);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats);
+};
+
+struct TORCH_API repeat_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::repeat";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef repeats, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h
new file mode 100644
index 0000000000000000000000000000000000000000..1716001e1491d005a32bd07f1841e47d1869323b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/resolve_conj_ops.h>
+
+namespace at {
+
+
+// aten::resolve_conj(Tensor(a) self) -> Tensor(a)
+inline at::Tensor resolve_conj(const at::Tensor & self) {
+    return at::_ops::resolve_conj::call(self);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef36ac996710062c0e5b9be66a4d05008562e75b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor resolve_conj(const at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..568f16aa3c8682ee9a04f704f3bf3fad827e427d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> rnn_relu(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> rnn_relu(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h
new file mode 100644
index 0000000000000000000000000000000000000000..119c29f2182ab840e7d8b89cf835ce4e9189da52
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/rnn_tanh_ops.h>
+
+namespace at {
+
+
+// aten::rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor> rnn_tanh(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+    return at::_ops::rnn_tanh_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+}
+
+// aten::rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor> rnn_tanh(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) {
+    return at::_ops::rnn_tanh_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..15e7ebc65529d456a9a29245e94a010433129e6e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor rnn_tanh_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const ::std::optional<at::Tensor> & b_ih={}, const ::std::optional<at::Tensor> & b_hh={});
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e07f05ce2d8a210161e28d79ccb1f5b577b42a40
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor row_indices(const at::Tensor & self);
+
+} // namespace compositeexplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dabbc6a65a867bd952f67eaaa766425dbec614b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scaled_dot_product_attention.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/scaled_dot_product_attention_ops.h>
+
+namespace at {
+
+
+// aten::scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> Tensor
+inline at::Tensor scaled_dot_product_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional<at::Tensor> & attn_mask={}, double dropout_p=0.0, bool is_causal=false, ::std::optional<double> scale=::std::nullopt, bool enable_gqa=false) {
+    return at::_ops::scaled_dot_product_attention::call(query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ece27168ab5549fd1bf7fbf882113e2a282a2ba
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API scatter_reduce_two {
+  using schema = at::Tensor (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::scatter_reduce";
+  static constexpr const char* overload_name = "two";
+  static constexpr const char* schema_str = "scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self);
+};
+
+struct TORCH_API scatter_reduce__two {
+  using schema = at::Tensor & (at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::scatter_reduce_";
+  static constexpr const char* overload_name = "two";
+  static constexpr const char* schema_str = "scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self);
+};
+
+struct TORCH_API scatter_reduce_two_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::scatter_reduce";
+  static constexpr const char* overload_name = "two_out";
+  static constexpr const char* schema_str = "scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5d7f31da8cd46fbd4b4f55a459321e4f338d650
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/searchsorted_cpu_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, ::std::optional<c10::string_view> side=::std::nullopt, const ::std::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, ::std::optional<c10::string_view> side=::std::nullopt, const ::std::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32, bool right, ::std::optional<c10::string_view> side, const ::std::optional<at::Tensor> & sorter, at::Tensor & out);
+TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, ::std::optional<c10::string_view> side=::std::nullopt, const ::std::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, ::std::optional<c10::string_view> side=::std::nullopt, const ::std::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32, bool right, ::std::optional<c10::string_view> side, const ::std::optional<at::Tensor> & sorter, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..74f4e4aa25f333e542c60f160eb6a765dad787e8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API sigmoid_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::sigmoid_backward";
+  static constexpr const char* overload_name = "grad_input";
+  static constexpr const char* schema_str = "sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input);
+};
+
+struct TORCH_API sigmoid_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::sigmoid_backward";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor";
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & output);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e40f5ef108e0b9cef30c1afe8703e248aedda5f2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/silu_ops.h>
+
+namespace at {
+
+
+// aten::silu(Tensor self) -> Tensor
+inline at::Tensor silu(const at::Tensor & self) {
+    return at::_ops::silu::call(self);
+}
+
+// aten::silu_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & silu_(at::Tensor & self) {
+    return at::_ops::silu_::call(self);
+}
+
+// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & silu_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::silu_out::call(self, out);
+}
+// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & silu_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::silu_out::call(self, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2251a63ac29ccf06494eb8df8622736a754a08b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor sin(const at::Tensor & self);
+TORCH_API at::Tensor & sin_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & sin_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & sin_(at::Tensor & self);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..62d1c634bf47293125583b913fd9edf96bbfa316
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sin_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API sin {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::sin";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "sin(Tensor self) -> Tensor";
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API sin_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::sin_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "sin_(Tensor(a!) self) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API sin_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::sin";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a86e4eef1f7309606cb092256b05211845e84f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/slice_inverse_ops.h>
+
+namespace at {
+
+
+// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional<int64_t> start=::std::nullopt, ::std::optional<int64_t> end=::std::nullopt, int64_t step=1) {
+    return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional<int64_t> start=::std::nullopt, ::std::optional<int64_t> end=::std::nullopt, int64_t step=1) {
+    return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step);
+  }
+}
+
+// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor slice_inverse_symint(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional<c10::SymInt> start=::std::nullopt, ::std::optional<c10::SymInt> end=::std::nullopt, c10::SymInt step=1) {
+    return at::_ops::slice_inverse::call(self, src, dim, start, end, step);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional<c10::SymInt> start=::std::nullopt, ::std::optional<c10::SymInt> end=::std::nullopt, c10::SymInt step=1) {
+    return at::_ops::slice_inverse::call(self, src, dim, start, end, step);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..6494b3d433d7ef3aa3a5435170bdcfc9685034bc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0);
+TORCH_API at::Tensor & slow_conv3d_out(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..6888047ba23da2fa9e80978015744e34ea5d6675
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/softplus_backward_ops.h>
+
+namespace at {
+
+
+// aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & softplus_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) {
+    return at::_ops::softplus_backward_grad_input::call(grad_output, self, beta, threshold, grad_input);
+}
+// aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & softplus_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input) {
+    return at::_ops::softplus_backward_grad_input::call(grad_output, self, beta, threshold, grad_input);
+}
+
+// aten::softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor
+inline at::Tensor softplus_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) {
+    return at::_ops::softplus_backward::call(grad_output, self, beta, threshold);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..2aae61de1bd0a0ef6ca7688d1b9dc241509ac471
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_and_clear_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor sparse_resize_and_clear(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim);
+TORCH_API const at::Tensor & sparse_resize_and_clear_out(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out);
+TORCH_API const at::Tensor & sparse_resize_and_clear_(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..986d60cb7e9facb2ab3859fa3a5ce800f6493194
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor special_bessel_y0(const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y0_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y0_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bc66a6ebc2ab6f913fd8c9032857ba1238418cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor special_bessel_y1(const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y1_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y1_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1c555c260e2d9158ec2b61f87d19c2be9c37042
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API special_bessel_y1 {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::special_bessel_y1";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "special_bessel_y1(Tensor self) -> Tensor";
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API special_bessel_y1_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::special_bessel_y1";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..65e27a1cce3e2e180ab32ddd8d94b10c1a292555
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..12161eef9dc647334bdfc539d73bc6b2defb3e00
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor special_digamma(const at::Tensor & self);
+TORCH_API at::Tensor & special_digamma_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_digamma_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..6176e342151b38208607c276c4b9e138718d17bd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor special_exp2(const at::Tensor & self);
+TORCH_API at::Tensor & special_exp2_out(const at::Tensor & self, at::Tensor & out);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..96f8c2be0776afd38c7b00d8f65108aa2378a869
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor special_modified_bessel_k0(const at::Tensor & self);
+TORCH_API at::Tensor & special_modified_bessel_k0_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_modified_bessel_k0_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec3c35b9e42a558ce480834879c955e672214292
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_ndtri(const at::Tensor & self);
+TORCH_API at::Tensor & special_ndtri_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_ndtri_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f15a5d8b45211e41a5e4554c57ce9bf27e3fc8e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/special_sinc_ops.h>
+
+namespace at {
+
+
+// aten::special_sinc(Tensor self) -> Tensor
+inline at::Tensor special_sinc(const at::Tensor & self) {
+    return at::_ops::special_sinc::call(self);
+}
+
+// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_sinc_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::special_sinc_out::call(self, out);
+}
+// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_sinc_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::special_sinc_out::call(self, out);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..65c5b4205074ea6c5b75896d7df8e840b1b7ae3c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_native.h
@@ -0,0 +1,39 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor squeeze(const at::Tensor & self);
+TORCH_API at::Tensor squeeze_nested(const at::Tensor & self);
+TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self);
+TORCH_API at::Tensor & squeeze_(at::Tensor & self);
+TORCH_API at::Tensor squeeze(const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor squeeze_dim_nested(const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor & squeeze_(at::Tensor & self, int64_t dim);
+TORCH_API at::Tensor squeeze(const at::Tensor & self, at::Dimname dim);
+TORCH_API at::Tensor & squeeze_(at::Tensor & self, at::Dimname dim);
+TORCH_API at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim);
+TORCH_API at::Tensor squeeze_dim_nested(const at::Tensor & self, at::IntArrayRef dim);
+TORCH_API at::Tensor squeeze_quantized(const at::Tensor & self, at::IntArrayRef dim);
+TORCH_API at::Tensor & squeeze_(at::Tensor & self, at::IntArrayRef dim);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1ac1fa61468f0b667435b2e48390ac1bb87ee0e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/squeeze_ops.h
@@ -0,0 +1,111 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API squeeze {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "squeeze(Tensor(a) self) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API squeeze_dim {
+  using schema = at::Tensor (const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze";
+  static constexpr const char* overload_name = "dim";
+  static constexpr const char* schema_str = "squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, int64_t dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim);
+};
+
+struct TORCH_API squeeze_dimname {
+  using schema = at::Tensor (const at::Tensor &, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze";
+  static constexpr const char* overload_name = "dimname";
+  static constexpr const char* schema_str = "squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, at::Dimname dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim);
+};
+
+struct TORCH_API squeeze_dims {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze";
+  static constexpr const char* overload_name = "dims";
+  static constexpr const char* schema_str = "squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim);
+};
+
+struct TORCH_API squeeze_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "squeeze_(Tensor(a!) self) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API squeeze__dim {
+  using schema = at::Tensor & (at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze_";
+  static constexpr const char* overload_name = "dim";
+  static constexpr const char* schema_str = "squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, int64_t dim);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim);
+};
+
+struct TORCH_API squeeze__dims {
+  using schema = at::Tensor & (at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze_";
+  static constexpr const char* overload_name = "dims";
+  static constexpr const char* schema_str = "squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, at::IntArrayRef dim);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::IntArrayRef dim);
+};
+
+struct TORCH_API squeeze__dimname {
+  using schema = at::Tensor & (at::Tensor &, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::squeeze_";
+  static constexpr const char* overload_name = "dimname";
+  static constexpr const char* schema_str = "squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, at::Dimname dim);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c85851c80c6cb5e9e92c30cbaf7b19474a45ab3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API void sym_constrain_range_for_size(const at::Scalar & size, ::std::optional<int64_t> min=::std::nullopt, ::std::optional<int64_t> max=::std::nullopt);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f5630e6a8687594ccd381d35d334730ed61c0a1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_cpu_dispatch.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor tanh(const at::Tensor & self);
+TORCH_API at::Tensor & tanh_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & tanh_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & tanh_(at::Tensor & self);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..526f7e21b55768396de83aa0bde4a3e550867f16
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API threshold {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::threshold";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value);
+};
+
+struct TORCH_API threshold_ {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::threshold_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value);
+};
+
+struct TORCH_API threshold_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::threshold";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e18c2984c2b94046761531a117e47ffcfe880902
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> topk(const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> topk_symint(const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> topk_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> topk_outf(const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> topk_symint_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> topk_symint_outf(const at::Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9f458df316e1475a7f375f6c02e232685d3dadf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API transpose_int {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::transpose";
+  static constexpr const char* overload_name = "int";
+  static constexpr const char* schema_str = "transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, int64_t dim0, int64_t dim1);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1);
+};
+
+struct TORCH_API transpose_Dimname {
+  using schema = at::Tensor (const at::Tensor &, at::Dimname, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::transpose";
+  static constexpr const char* overload_name = "Dimname";
+  static constexpr const char* schema_str = "transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, at::Dimname dim0, at::Dimname dim1);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim0, at::Dimname dim1);
+};
+
+struct TORCH_API transpose_ {
+  using schema = at::Tensor & (at::Tensor &, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::transpose_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, int64_t dim0, int64_t dim1);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim0, int64_t dim1);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fbf6401b983fbd12ee15fd82cf01b45f00b224c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API tril_ {
+  using schema = at::Tensor & (at::Tensor &, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::tril_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, c10::SymInt diagonal);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::SymInt diagonal);
+};
+
+struct TORCH_API tril_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymInt, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::tril";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out);
+};
+
+struct TORCH_API tril {
+  using schema = at::Tensor (const at::Tensor &, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::tril";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "tril(Tensor self, SymInt diagonal=0) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, c10::SymInt diagonal);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..f53872f4a5bfab95e042e0043b8c9187945bc303
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor triplet_margin_loss(const at::Tensor & anchor, const at::Tensor & positive, const at::Tensor & negative, double margin=1.0, double p=2, double eps=1e-06, bool swap=false, int64_t reduction=at::Reduction::Mean);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c2a6f786014abebc1b4ec403dd0720f80801f5f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h
@@ -0,0 +1,78 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API true_divide_Tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::true_divide";
+  static constexpr const char* overload_name = "Tensor";
+  static constexpr const char* schema_str = "true_divide.Tensor(Tensor self, Tensor other) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API true_divide__Tensor {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::true_divide_";
+  static constexpr const char* overload_name = "Tensor";
+  static constexpr const char* schema_str = "true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API true_divide_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::true_divide";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+struct TORCH_API true_divide_Scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::true_divide";
+  static constexpr const char* overload_name = "Scalar";
+  static constexpr const char* schema_str = "true_divide.Scalar(Tensor self, Scalar other) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API true_divide__Scalar {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::true_divide_";
+  static constexpr const char* overload_name = "Scalar";
+  static constexpr const char* schema_str = "true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef1fa5feb1eaaaf1d1f908c2d6a852997180d29c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/unflatten_dense_tensors_ops.h>
+
+namespace at {
+
+
+// aten::unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]
+inline ::std::vector<at::Tensor> unflatten_dense_tensors(const at::Tensor & flat, at::TensorList tensors) {
+    return at::_ops::unflatten_dense_tensors::call(flat, tensors);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0379fcfa0d812120afa20bbd52453af91ac58e4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor unfold_backward(const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step);
+TORCH_API at::Tensor unfold_backward_symint(const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a3d3b6008169b231cf3a0f9f842db6c542be3b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step);
+
+} // namespace cuda
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc67d28ed4dab093dd813b25d332783bd35bd793
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API unfold {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::unfold";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)";
+  static at::Tensor call(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d33ffa20727f7847813886dd89e13497b796ecb1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API uniform_ {
+  using schema = at::Tensor & (at::Tensor &, double, double, ::std::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::uniform_";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)";
+  static at::Tensor & call(at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator);
+};
+
+struct TORCH_API uniform_out {
+  using schema = at::Tensor & (const at::Tensor &, double, double, ::std::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::uniform";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator, at::Tensor & out);
+};
+
+struct TORCH_API uniform {
+  using schema = at::Tensor (const at::Tensor &, double, double, ::std::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::uniform";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional<at::Generator> generator);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae3167dbb12abf13532dd80f7b89f2ef9f73a61f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_out(const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_dim_cpu(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_dim_cuda(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false);
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..785055511a03466b9d59da907b1fffbf6f1d4a2c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptors for aten::unsafe_split, one struct per overload (Tensor, Tensor_out). Each carries the
+// schema function type, the name/overload/schema strings, and static call/redispatch declarations (defined elsewhere).
+struct TORCH_API unsafe_split_Tensor {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, c10::SymInt, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::unsafe_split";
+  static constexpr const char* overload_name = "Tensor";
+  static constexpr const char* schema_str = "unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]";
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, c10::SymInt split_size, int64_t dim);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim);
+};
+
+struct TORCH_API unsafe_split_Tensor_out {
+  using schema = void (const at::Tensor &, c10::SymInt, int64_t, at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::unsafe_split";
+  static constexpr const char* overload_name = "Tensor_out";
+  static constexpr const char* schema_str = "unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()";
+  static void call(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h
new file mode 100644
index 0000000000000000000000000000000000000000..aba5e57625ec88ec2145a1677d40d509fed4a65c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unsafe_split_with_sizes.h
@@ -0,0 +1,97 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/unsafe_split_with_sizes_ops.h>
+
+namespace at {
+// Inline wrappers for aten::unsafe_split_with_sizes; each forwards to the matching at::_ops struct's call().
+// IntArrayRef overloads convert sizes with c10::fromIntArrayRefSlow; SymIntArrayRef overloads pass them through unchanged.
+// aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> unsafe_split_with_sizes(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes::call(self, c10::fromIntArrayRefSlow(split_sizes), dim);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  ::std::vector<at::Tensor> unsafe_split_with_sizes(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes::call(self, c10::fromIntArrayRefSlow(split_sizes), dim);
+  }
+}
+
+// aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> unsafe_split_with_sizes_symint(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes::call(self, split_sizes, dim);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  ::std::vector<at::Tensor> unsafe_split_with_sizes(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes::call(self, split_sizes, dim);
+  }
+}
+
+// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+  }
+}
+
+// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void unsafe_split_with_sizes_outf(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  void unsafe_split_with_sizes_outf(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+  }
+}
+
+// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void unsafe_split_with_sizes_symint_out(at::TensorList out, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  void unsafe_split_with_sizes_out(at::TensorList out, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out);
+  }
+}
+
+// aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void unsafe_split_with_sizes_symint_outf(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  void unsafe_split_with_sizes_outf(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+    return at::_ops::unsafe_split_with_sizes_out::call(self, split_sizes, dim, out);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d2905d80cd04a9e19a7f5f0ff07dba8e4bd87c1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_meta_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only. The *_out overloads take `out` as the first parameter and keep defaults; the *_outf overloads take it last with no defaults.
+namespace meta {
+
+TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_bilinear2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_bilinear2d_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+TORCH_API at::Tensor & upsample_bilinear2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_bilinear2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1079ca478abeab92be748e6e7f96271b9f6caf41
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cpu_dispatch.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only. The *_out overloads take `out` as the first parameter and keep defaults; the *_outf overloads take it last with no defaults.
+namespace cpu {
+
+TORCH_API at::Tensor upsample_nearest2d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor upsample_nearest2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+TORCH_API at::Tensor & upsample_nearest2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..f509fe9a7b4b775ac8ae85a5694b33416b333a97
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_backward_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_upsample_nearest3d_backward : public at::impl::MetaBase {
+    // Declares the meta() member for upsample_nearest3d_backward; its definition is not in this header.
+    // Parameters: the output gradient, the output and input sizes, and optional scales named scales_d/h/w.
+    void meta(const at::Tensor & grad_output, at::ArrayRef<int64_t> output_size, at::ArrayRef<int64_t> input_size, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w);
+};
+
+} // namespace meta
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cb89bf96a592c91aedfd4fad33f25b2ddbef542
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/zero_ops.h>
+
+namespace at {
+// Inline wrappers for aten::zero_, aten::zero.out and aten::zero; each forwards to the matching at::_ops::*::call.
+// zero_out takes `out` first and zero_outf takes it last; both pass (self, out) to zero_out::call in that order.
+// aten::zero_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & zero_(at::Tensor & self) {
+    return at::_ops::zero_::call(self);
+}
+
+// aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & zero_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::zero_out::call(self, out);
+}
+// aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & zero_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::zero_out::call(self, out);
+}
+
+// aten::zero(Tensor self) -> Tensor
+inline at::Tensor zero(const at::Tensor & self) {
+    return at::_ops::zero::call(self);
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1771b96ff4094c38527df0e3e08e3796637654a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h
@@ -0,0 +1,38 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <ATen/detail/XPUHooksInterface.h>
+
+namespace at::xpu::detail {
+
+// The real implementation of XPUHooksInterface
+struct XPUHooks : public at::XPUHooksInterface {
+  XPUHooks(at::XPUHooksArgs) {}
+  void init() const override;
+  bool hasXPU() const override;
+  std::string showConfig() const override;
+  int32_t getGlobalIdxFromDevice(const at::Device& device) const override;
+  const Generator& getDefaultGenerator(
+      DeviceIndex device_index = -1) const override;
+  Generator getNewGenerator(DeviceIndex device_index = -1) const override;
+  Device getDeviceFromPtr(void* data) const override;
+  c10::DeviceIndex getNumGPUs() const override;
+  DeviceIndex current_device() const override;
+  void deviceSynchronize(DeviceIndex device_index) const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+  // isBuilt() is the only override defined inline here; it unconditionally returns true. All others are declared only.
+  bool isBuilt() const override {
+    return true;
+  }
+  bool isAvailable() const override;
+  bool isPinnedPtr(const void* data) const override;
+  bool hasPrimaryContext(DeviceIndex device_index) const override;
+  DeviceIndex deviceCount() const override;
+  DeviceIndex getCurrentDevice() const override;
+};
+
+} // namespace at::xpu::detail
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..b66f075ec73fb77290e317e911c66e4497ca1469
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Allocator.h
@@ -0,0 +1,455 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/alignment.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/ThreadLocalDebugInfo.h>
+#include <c10/util/UniqueVoidPtr.h>
+#include <c10/util/irange.h>
+
+namespace c10 {
+
+using CaptureId_t = unsigned long long;
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+// Hash functor for MempoolId_t: yields the pair's first component when it is non-zero, otherwise the second.
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
+// A DataPtr is a unique pointer (with an attached deleter and some
+// context for the deleter) to some memory, which also records what
+// device is for its data.
+//
+// nullptr DataPtrs can still have a nontrivial device; this allows
+// us to treat zero-size allocations uniformly with non-zero allocations.
+//
+class C10_API DataPtr {
+ private:
+  c10::detail::UniqueVoidPtr ptr_;
+  Device device_;
+  // ptr_ is constructed from the data pointer plus its context and deleter; device_ is stored separately.
+ public:
+  // Choice of CPU here is arbitrary; if there's an "undefined" device
+  // we could use that too
+  DataPtr() : device_(DeviceType::CPU) {}
+  DataPtr(void* data, Device device) : ptr_(data), device_(device) {}
+  DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device)
+      : ptr_(data, ctx, ctx_deleter), device_(device) {}
+  void* operator->() const {
+    return ptr_.get();
+  }
+  C10_ALWAYS_INLINE bool /* success */ unsafe_reset_data_and_ctx(
+      void* new_data_and_ctx) {
+    return ptr_.unsafe_reset_data_and_ctx(new_data_and_ctx);
+  }
+  void clear() {
+    ptr_.clear();
+  }
+  void* get() const {
+    return ptr_.get();
+  }
+  void* mutable_get() { // same ptr_.get() call as get(), exposed through a non-const member
+    return ptr_.get();
+  }
+  void* get_context() const {
+    return ptr_.get_context();
+  }
+  void* release_context() {
+    return ptr_.release_context();
+  }
+  std::unique_ptr<void, DeleterFnPtr>&& move_context() {
+    return ptr_.move_context();
+  }
+  operator bool() const { // non-explicit; forwards to ptr_'s conversion to bool
+    return static_cast<bool>(ptr_);
+  }
+  template <typename T>
+  T* cast_context(DeleterFnPtr expected_deleter) const {
+    return ptr_.cast_context<T>(expected_deleter);
+  }
+  DeleterFnPtr get_deleter() const {
+    return ptr_.get_deleter();
+  }
+  /**
+   * Compare the deleter in a DataPtr to expected_deleter.
+   * If it matches, replace the deleter with new_deleter
+   * and return true; otherwise, does nothing and returns
+   * false.
+   *
+   * In general, it is not safe to unconditionally set the
+   * deleter on a DataPtr, because you don't know what
+   * the deleter is, and thus will have a hard time properly
+   * disposing of the deleter without storing the original
+   * deleter (this is difficult to do, because DeleterFnPtr
+   * is not a closure, and because the context on DataPtr is
+   * only a single word, you generally don't have enough
+   * space to store both the original deleter and its context).
+   * However, in some cases, you know /exactly/ what the deleter
+   * is, and you have a new deleter that manually wraps
+   * the old one.  In this case, you can safely swap the deleter
+   * after asserting that the deleters line up.
+   *
+   * What are the requirements on new_deleter?  It must still
+   * properly dispose of the void* pointer passed in as its argument,
+   * where void* is whatever the context of the original deleter
+   * is.  So in general, you expect the new deleter to look something
+   * like this:
+   *
+   *      [](void* ptr) {
+   *        some_new_stuff(ptr);
+   *        get_orig_allocator()->raw_deleter(ptr);
+   *      }
+   *
+   * Note that it won't work to close over the original
+   * allocator; you don't have enough space to do that!  Also,
+   * it's unsafe to assume that the passed in pointer in
+   * question is the memory pointer in question; it might not
+   * be; be sure to read the source code of the Allocator
+   * in question to confirm this.
+   */
+  [[nodiscard]] bool compare_exchange_deleter(
+      DeleterFnPtr expected_deleter,
+      DeleterFnPtr new_deleter) {
+    return ptr_.compare_exchange_deleter(expected_deleter, new_deleter);
+  }
+  Device device() const {
+    return device_;
+  }
+  // Unsafely mutates the device on a DataPtr.  Under normal use,
+  // you should never actually need to call this function.
+  // We need this for the implementation of the hack detailed
+  // in Note [Masquerading as CUDA]
+  void unsafe_set_device(Device device) {
+    device_ = device;
+  }
+};
+
+// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a
+// CPU nullptr
+// Each comparison below is expressed through DataPtr's conversion to bool.
+inline bool operator==(const DataPtr& dp, std::nullptr_t) noexcept {
+  return !dp;
+}
+inline bool operator==(std::nullptr_t, const DataPtr& dp) noexcept {
+  return !dp;
+}
+inline bool operator!=(const DataPtr& dp, std::nullptr_t) noexcept {
+  return dp;
+}
+inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept {
+  return dp;
+}
+
+// Note [raw_allocate/raw_deallocate and Thrust]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Thrust's support for custom allocators requires us to write something
+// like this:
+//
+//  class ThrustAllocator {
+//    char* allocate(size_t);
+//    void deallocate(char*, size_t);
+//  };
+//
+// This is not good for our unique_ptr based allocator interface, as
+// there is no way to get to the context when we free.
+//
+// However, in some cases the context is exactly the same as
+// the data pointer.  In this case, we can support the "raw"
+// allocate and deallocate interface.  This is what
+// raw_deleter signifies.  By default, it returns a nullptr, which means that
+// the raw interface is not implemented.  Be sure to implement it whenever
+// possible, or the raw interface will be incorrectly reported as unsupported,
+// when it is actually possible.
+
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+struct C10_API Allocator {
+  virtual ~Allocator() = default;
+  // Pure virtual allocation hook returning a DataPtr (n is presumably a byte count — TODO confirm).
+  virtual DataPtr allocate(size_t n) = 0;
+
+  // Clones an allocation that came from this allocator.
+  //
+  // To perform the copy, this function calls `copy_data`, which
+  // must be implemented by derived classes.
+  //
+  // Note that this explicitly ignores any context that may have been
+  // attached to the input data.
+  //
+  // Requires: input data was allocated by the same allocator.
+  DataPtr clone(const void* data, std::size_t n);
+
+  // Checks if DataPtr has a simple context, not wrapped with any out of the
+  // ordinary contexts.
+  virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const;
+
+  // If this returns a non nullptr, it means that allocate()
+  // is guaranteed to return a unique_ptr with this deleter attached;
+  // it means the rawAllocate and rawDeallocate APIs are safe to use.
+  // This function MUST always return the same deleter (a DeleterFnPtr).
+  virtual DeleterFnPtr raw_deleter() const {
+    return nullptr;
+  }
+  void* raw_allocate(size_t n) {
+    auto dptr = allocate(n);
+    AT_ASSERT(dptr.get() == dptr.get_context()); // the raw interface requires context == data pointer
+    return dptr.release_context();
+  }
+  void raw_deallocate(void* ptr) {
+    auto d = raw_deleter();
+    AT_ASSERT(d); // fails when raw_deleter() returns nullptr, which is what the default implementation does
+    d(ptr);
+  }
+
+  // Copies data from one allocation to another.
+  // Pure virtual, so derived classes must define behavior.
+  // Derived class implementation can simply call `default_copy_data`
+  // to use `std::memcpy`.
+  //
+  // Requires: src and dest were allocated by this allocator
+  // Requires: src and dest both have length >= count
+  virtual void copy_data(void* dest, const void* src, std::size_t count)
+      const = 0;
+
+ protected:
+  // Uses `std::memcpy` to copy data.
+  // Child classes can use this as `copy_data` when an alternative copy
+  // API is not needed.
+  void default_copy_data(void* dest, const void* src, std::size_t count) const;
+};
+
+// This context is used to generate DataPtr which have arbitrary
+// std::function deleters associated with them.  In some user facing
+// functions, we give a (user-friendly) interface for constructing
+// tensors from external data which take an arbitrary std::function
+// deleter.  Grep for InefficientStdFunctionContext to find these
+// occurrences.
+//
+// This context is inefficient because we have to do a dynamic
+// allocation of InefficientStdFunctionContext, on top of the dynamic
+// allocation which is implied by std::function itself.
+struct C10_API InefficientStdFunctionContext {
+  void* ptr_{nullptr};
+  std::function<void(void*)> deleter_;
+  InefficientStdFunctionContext(void* ptr, std::function<void(void*)> deleter)
+      : ptr_(ptr), deleter_(std::move(deleter)) {}
+  InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete;
+  InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept
+      : ptr_(std::exchange(rhs.ptr_, nullptr)),
+        deleter_(std::move(rhs.deleter_)) {}
+  InefficientStdFunctionContext& operator=(
+      const InefficientStdFunctionContext&) = delete;
+  // NOLINTNEXTLINE(*-noexcept-move-*)
+  InefficientStdFunctionContext& operator=(
+      InefficientStdFunctionContext&& rhs) {
+    this->~InefficientStdFunctionContext(); // NOTE(review): members are assigned below after this explicit destructor call, with no self-move guard — confirm intended
+    ptr_ = std::exchange(rhs.ptr_, nullptr);
+    deleter_ = std::move(rhs.deleter_);
+    return *this;
+  }
+  ~InefficientStdFunctionContext() {
+    if (deleter_) { // an empty std::function means there is nothing to invoke
+      deleter_(ptr_);
+    }
+  }
+  static DataPtr makeDataPtr(
+      void* ptr,
+      std::function<void(void*)> deleter,
+      Device device);
+};
+
+/** Set the allocator for DeviceType `t`. The passed in allocator pointer is
+ *  expected to have static lifetime; this function does NOT take ownership
+ *  of the raw pointer. (The reason for this is to prevent existing pointers
+ *  to an allocator of a particular device from being invalidated when
+ *  SetAllocator is called.)
+ *
+ *  Also note that this is not thread-safe, and we assume this function will
+ *  only be called during initialization.
+ *
+ *  The 'priority' flag is introduced when we want to overwrite the default
+ *  allocator, since the allocators are set statically. The default priority
+ *  is 0, which means the lowest. Only higher or equal priority can overwrite
+ *  existing ones.
+ */
+C10_API void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0);
+C10_API Allocator* GetAllocator(const DeviceType& t);
+// Helper whose constructor calls SetAllocator(t, alloc), leaving the priority argument at its default.
+template <DeviceType t>
+struct AllocatorRegisterer {
+  explicit AllocatorRegisterer(Allocator* alloc) {
+    SetAllocator(t, alloc);
+  }
+};
+// Defines a static AllocatorRegisterer<t> named g_allocator_d in an unnamed namespace; the fixed name allows one use per translation unit.
+#define REGISTER_ALLOCATOR(t, f)                       \
+  namespace {                                          \
+  static c10::AllocatorRegisterer<t> g_allocator_d(f); \
+  }
+
+// An interface for reporting thread local memory usage
+// per device
+struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase {
+  /**
+   * alloc_size corresponds to the size of the ptr.
+   *
+   * total_allocated corresponds to total allocated memory.
+   *
+   * total_reserved corresponds to total size of memory pool, both used and
+   * unused, if applicable.
+   */
+  virtual void reportMemoryUsage(
+      void* ptr,
+      int64_t alloc_size,
+      size_t total_allocated,
+      size_t total_reserved,
+      Device device) = 0;
+  // Unlike its two siblings this member is not pure virtual; its definition is not in this header.
+  virtual void reportOutOfMemory(
+      int64_t alloc_size,
+      size_t total_allocated,
+      size_t total_reserved,
+      Device device);
+
+  virtual bool memoryProfilingEnabled() const = 0;
+};
+
+C10_API bool memoryProfilingEnabled();
+C10_API void reportMemoryUsageToProfiler(
+    void* ptr,
+    int64_t alloc_size,
+    size_t total_allocated,
+    size_t total_reserved,
+    Device device);
+
+C10_API void reportOutOfMemoryToProfiler(
+    int64_t alloc_size,
+    size_t total_allocated,
+    size_t total_reserved,
+    Device device);
+
+// used to hold traceback information in allocators
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+struct GatheredContext {
+  virtual ~GatheredContext() = default;
+};
+
+namespace CachingAllocator {
+struct Stat {
+  void increase(size_t amount) {
+    current += static_cast<int64_t>(amount);
+    peak = std::max(current, peak); // peak is a running maximum of current
+    allocated += static_cast<int64_t>(amount);
+  }
+  // Lowers `current` and adds to `freed`; `peak` is not touched. The non-negativity check uses a *_DEBUG_ONLY assert.
+  void decrease(size_t amount) {
+    current -= static_cast<int64_t>(amount);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        current >= 0,
+        "Negative tracked stat in device allocator (likely logic error).");
+    freed += static_cast<int64_t>(amount);
+  }
+  // Clears only the cumulative allocated/freed counters; current and peak are kept.
+  void reset_accumulated() {
+    allocated = 0;
+    freed = 0;
+  }
+  // Restarts peak tracking from the present value of `current`.
+  void reset_peak() {
+    peak = current;
+  }
+  // All four counters are signed 64-bit and start at zero.
+  int64_t current = 0;
+  int64_t peak = 0;
+  int64_t allocated = 0;
+  int64_t freed = 0;
+};
+
+enum struct StatType : uint64_t {
+  AGGREGATE = 0,
+  SMALL_POOL = 1,
+  LARGE_POOL = 2,
+  NUM_TYPES = 3 // remember to update this whenever a new stat type is added
+};
+
+using StatArray = std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)>;
+using StatTypes = std::array<bool, static_cast<size_t>(StatType::NUM_TYPES)>;
+// Calls f(i) for each index i in c10::irange(stat_types.size()) whose flag is set; f receives the index, not a StatType.
+template <typename Func>
+void for_each_selected_stat_type(const StatTypes& stat_types, Func f) {
+  for (const auto stat_type : c10::irange(stat_types.size())) {
+    if (stat_types[stat_type]) {
+      f(stat_type);
+    }
+  }
+}
+
+// Structure for keeping timing information
+struct DurationStat {
+  void increase(int64_t amount) {
+    total += amount;
+    count += 1;
+    max = std::max(amount, max);
+    if (min == 0) { // min == 0 doubles as "no sample yet", so a stored 0 is replaced by the next sample
+      min = amount;
+    } else {
+      min = std::min(amount, min);
+    }
+  }
+  // Clears total and count only; min and max are kept.
+  void reset_accumulated() {
+    total = 0;
+    count = 0;
+  }
+  // Clears min and max only; setting min to 0 puts it back in the "no sample yet" state used by increase().
+  void reset_peak() {
+    min = 0;
+    max = 0;
+  }
+  // All fields are signed 64-bit and start at zero; the time unit of `amount` is not fixed by this struct.
+  int64_t total = 0;
+  int64_t max = 0;
+  int64_t min = 0;
+  int64_t count = 0;
+};
+
+// Size pretty-printer: picks bytes/KiB/MiB/GiB by inclusive upper thresholds; scaled values print in fixed notation with 2 decimals.
+inline std::string format_size(uint64_t size) {
+  std::ostringstream os; // NOTE(review): <sstream>/<string> are not among this header's direct includes — presumably transitive
+  os.precision(2);
+  os << std::fixed;
+  if (size <= 1024) { // inclusive: exactly 1024 is printed as "1024 bytes"
+    os << size << " bytes";
+  } else if (size <= 1048576) {
+    os << (static_cast<double>(size) / 1024.0);
+    os << " KiB";
+  } else if (size <= 1073741824ULL) {
+    os << static_cast<double>(size) / 1048576.0;
+    os << " MiB";
+  } else {
+    os << static_cast<double>(size) / 1073741824.0;
+    os << " GiB";
+  }
+  return os.str();
+}
+
+} // namespace CachingAllocator
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab6a23d24d0884d72c869947857c02c22584b9c3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AllocatorConfig.h
@@ -0,0 +1,390 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/util/Exception.h>
+#include <c10/util/llvmMathExtras.h>
+
+#include <atomic>
+#include <mutex>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace c10::CachingAllocator {
+
+// "large" allocations may be packed in 20 MiB blocks (20971520 bytes)
+constexpr size_t kLargeBuffer = 20 * 1024 * 1024;
+// "small" allocations are packed in 2 MiB blocks (2097152 bytes)
+constexpr size_t kSmallBuffer = 2 * 1024 * 1024;
+// every size is rounded up to at least 512 bytes
+constexpr size_t kMinBlockSize = 512;
+// the largest "small" allocation is 1 MiB (1048576 bytes)
+constexpr size_t kSmallSize = 1024 * 1024;
+// allocations between 1 and 10 MiB (10485760 bytes) may use kLargeBuffer
+constexpr size_t kMinLargeAlloc = 10 * 1024 * 1024;
+// large allocations are rounded up to a multiple of 2 MiB (2097152 bytes)
+constexpr size_t kRoundLarge = 2 * 1024 * 1024;
+
+// A utility class for tokenizing allocator configuration strings into discrete
+// parts. For example, the config string:
+//   "key1:val1,key2:[val2,val3]"
+// is tokenized into:
+//   "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
+//
+// Tokens include keys, values, and special characters (':', ',', '[', ']').
+// Whitespace is ignored.
+class ConfigTokenizer {
+ public:
+  explicit ConfigTokenizer(const std::string& env) {
+    std::string buffer; // characters of the key/value token being built
+    for (char ch : env) {
+      if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
+        if (!buffer.empty()) {
+          config_.emplace_back(std::move(buffer));
+          buffer.clear(); // reset the moved-from string before reuse
+        }
+        config_.emplace_back(1, ch); // each delimiter is its own token
+      } else if (!std::isspace(static_cast<unsigned char>(ch))) {
+        buffer += ch;
+      }
+    }
+    if (!buffer.empty()) {
+      config_.emplace_back(std::move(buffer));
+    }
+  }
+
+  const std::string& operator[](size_t i) const {
+    checkIndex(i); // same bounds assertion as every other accessor
+    return config_[i];
+  }
+
+  size_t size() const {
+    return config_.size();
+  }
+
+  bool checkToken(size_t i, const std::string& token) const {
+    checkIndex(i);
+    return config_[i] == token; // a mismatch is reported, never raised
+  }
+
+  size_t toSizeT(size_t i) const {
+    checkIndex(i);
+    return std::stoull(config_[i]); // std::stoull throws on bad input
+  }
+
+  double toDouble(size_t i) const {
+    checkIndex(i);
+    return std::stod(config_[i]); // std::stod throws on bad input
+  }
+
+  bool toBool(size_t i) const {
+    checkIndex(i);
+    const auto& token = config_[i];
+    if (token == "True") {
+      return true;
+    } else if (token == "False") {
+      return false;
+    } else {
+      TORCH_CHECK_VALUE(
+          false,
+          "Expected 'True' or 'False' at index ",
+          i,
+          " in ConfigTokenizer but got '",
+          token,
+          "'");
+    }
+  }
+
+  // Skips the value that follows the key at index `i` and returns the index of
+  // that value's last token: the value itself, or the closing ']' of a list.
+  size_t skipKey(size_t i) const {
+    // checkToken() only reports a mismatch, so the colon is enforced here.
+    ++i;
+    TORCH_CHECK_VALUE(
+        checkToken(i, ":"), "Expected ':' after key in ConfigTokenizer");
+    ++i; // Move to the value
+    checkIndex(i);
+    if (config_[i] != "[") {
+      // Value is a single token (not a list) -> return its index
+      return i;
+    }
+
+    // Skip tokens inside the list until matching ']'
+    // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
+    while (++i < config_.size() && config_[i] != "]") {
+    }
+
+    TORCH_INTERNAL_ASSERT(
+        i < config_.size(),
+        "Expected closing bracket ']' in ConfigTokenizer but reached end of config");
+
+    return i; // Return the index of the closing ']'
+  }
+
+ private:
+  void checkIndex(size_t i) const {
+    TORCH_INTERNAL_ASSERT(
+        i < config_.size(), "Index out of bounds in ConfigTokenizer");
+  }
+
+  std::vector<std::string> config_;
+};
+
+/**
+ * Note [AcceleratorAllocatorConfig design]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This class configures memory allocation for both device and host memory. A
+ * single `AcceleratorAllocatorConfig` instance is shared across all accelerator
+ * backends, such as CUDA and XPU, under the assumption that relevant
+ * environment variables apply uniformly to all accelerators. Device-specific
+ * configuration extensions are supported via hooks (see
+ * `registerDeviceConfigParserHook`).
+ *
+ * Recommended design:
+ * - Place common configurations in `AcceleratorAllocatorConfig`.
+ * - Extend backend-specific configurations in corresponding device-specific
+ *     classes, such as `CUDAAllocatorConfig`, etc.
+ *
+ * Scope:
+ * - Configuration options must be environment-variable driven.
+ *
+ * Naming Convention:
+ * - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
+ * - Members prefixed with `pinned_` are specific to the host/pinned allocator.
+ * - Environment variable names should be generic across backends.
+ * - Comma-separated key-value pairs in the format: `key:value`. Use square
+ *     brackets `[]` for list values. Example: `key1:123, key2:[val1,val2]`
+ *
+ * Environment Variables:
+ * - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
+ * - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
+ *     with lower priority.
+ */
+
+class C10_API AcceleratorAllocatorConfig {
+ public:
+  static AcceleratorAllocatorConfig& instance();
+
+  C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
+  AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
+  AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
+  ~AcceleratorAllocatorConfig() = default;
+
+  /* Device allocator settings */
+
+  // Returns the maximum block size that is allowed to be split. The default is
+  // SIZE_MAX, i.e. unlimited. NOTE(review): unit looks like bytes — confirm.
+  static size_t max_split_size() {
+    return instance().max_split_size_;
+  }
+
+  // Returns the maximum extra size a block may be rounded up by without being
+  // split when searching for a free block. The default is kLargeBuffer
+  // (20971520, i.e. 20 MiB expressed in bytes, not in MB).
+  static size_t max_non_split_rounding_size() {
+    return instance().max_non_split_rounding_size_;
+  }
+
+  // Return the number of divisions used when rounding up allocation sizes (in
+  // MB) to the nearest power-of-2 boundary.
+  static size_t roundup_power2_divisions(size_t size);
+
+  // Returns the vector of division factors used for rounding up allocation
+  // sizes. These divisions apply to size intervals between 1MB and 64GB.
+  static const std::vector<size_t>& roundup_power2_divisions() {
+    return instance().roundup_power2_divisions_;
+  }
+
+  // Returns the threshold that triggers garbage collection when the ratio of
+  // used memory to maximum allowed memory exceeds this value. The default is 0,
+  // meaning no garbage collection is triggered. The value should be in the
+  // range (0.0, 1.0).
+  static double garbage_collection_threshold() {
+    return instance().garbage_collection_threshold_;
+  }
+
+  // Returns whether the expandable segment feature is enabled. This allows the
+  // allocator to start with one segment that grows as needed, rather than
+  // creating a new segment for each allocation. Default is false (expandable
+  // segments disabled).
+  static bool use_expandable_segments() {
+    return instance().use_expandable_segments_;
+  }
+
+  /* Host allocator settings */
+
+  // Returns whether the pinned host allocator uses background threads for
+  // processing events. This is useful for improving performance in scenarios
+  // where many small allocations are made. Default is false (background threads
+  // disabled).
+  static bool pinned_use_background_threads() {
+    return instance().pinned_use_background_threads_;
+  }
+
+  /* Settings for both device and host allocator */
+
+  // Returns a copy of the last recorded allocator settings string, read under
+  // the mutex. Useful to expand device-specific allocator configurations.
+  static std::string last_allocator_settings() {
+    std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
+    return instance().last_allocator_settings_;
+  }
+
+  // Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
+  // issue.
+  static std::unordered_set<std::string>& getMutableKeys() {
+    static std::unordered_set<std::string> keys{
+        "max_split_size_mb",
+        "max_non_split_rounding_mb",
+        "garbage_collection_threshold",
+        "roundup_power2_divisions",
+        "expandable_segments",
+        "pinned_use_background_threads"};
+    return keys;
+  }
+
+  // Returns the set of valid keys for the allocator configuration.
+  // This set is used to validate the presence and correctness of keys in
+  // device-specific configuration parsers.
+  static const std::unordered_set<std::string>& getKeys() {
+    return getMutableKeys();
+  }
+
+  // Registers a device-specific configuration parser hook and the keys it
+  // owns, so that backends can parse additional options from the same
+  // configuration string. The hook receives that string when
+  // callDeviceConfigParserHook() runs. A previously registered hook is
+  // replaced. Each key must be new: a key already present in getKeys() fails
+  // with TORCH_CHECK_VALUE, and because the hook is swapped first, the new
+  // hook (and any keys inserted before the duplicate) stay registered.
+  static void registerDeviceConfigParserHook(
+      std::function<void(const std::string&)>&& hook,
+      const std::unordered_set<std::string>& keys) {
+    device_config_parser_hook_ = std::move(hook);
+    auto& mutable_keys = getMutableKeys();
+    for (auto& key : keys) {
+      TORCH_CHECK_VALUE(
+          mutable_keys.insert(key).second,
+          "Duplicated key '",
+          key,
+          "' found in device-specific configuration parser hook registration");
+    }
+  }
+
+  // Calls the registered device-specific configuration parser hook with the
+  // provided environment string. This allows backends to parse additional
+  // device-specific configuration options from the environment variable.
+  // If no hook is registered, this function does nothing.
+  static void callDeviceConfigParserHook(const std::string& env) {
+    if (device_config_parser_hook_) {
+      device_config_parser_hook_(env);
+    }
+  }
+
+  // Parses the configuration string `env` to update the allocator settings; if
+  // the environment variable is not set, it does nothing. `env` is a
+  // comma-separated list of `key:value` pairs, for example:
+  // "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
+  // NOTE(review): ConfigTokenizer::toBool() accepts only "True"/"False"; confirm
+  // that the lowercase `true` values in this example are actually accepted.
+  void parseArgs(const std::string& env);
+
+ private:
+  AcceleratorAllocatorConfig();
+
+  /* Internal functions for device allocator */
+
+  // Parse `max_split_size_mb` from environment variable.
+  size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
+  // Parse `max_non_split_rounding_mb` from environment variable.
+  size_t parseMaxNonSplitRoundingSize(
+      const ConfigTokenizer& tokenizer,
+      size_t i);
+  // Parse `garbage_collection_threshold` from environment variable.
+  size_t parseGarbageCollectionThreshold(
+      const ConfigTokenizer& tokenizer,
+      size_t i);
+  // Parse `roundup_power2_divisions` from environment variable.
+  size_t parseRoundUpPower2Divisions(
+      const ConfigTokenizer& tokenizer,
+      size_t i);
+  // Parse `expandable_segments` from environment variable.
+  size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
+
+  /* Internal functions for host allocator */
+
+  // Parse `pinned_use_background_threads` from environment variable.
+  size_t parsePinnedUseBackgroundThreads(
+      const ConfigTokenizer& tokenizer,
+      size_t i);
+
+  /* The following members are specifically used for the device allocator. */
+
+  // The maximum block size that is allowed to be split.
+  std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
+  // The maximum allowable extra size of a memory block without requiring
+  // splitting when searching for a free block.
+  std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
+  // Used to store how memory allocations of different sizes should be rounded
+  // up to the nearest power of 2 divisions.
+  std::vector<size_t> roundup_power2_divisions_;
+  // The threshold that triggers garbage collection when the ratio of used
+  // memory to maximum allowed memory exceeds this value.
+  std::atomic<double> garbage_collection_threshold_{0};
+  // A flag to enable expandable segments feature.
+  std::atomic<bool> use_expandable_segments_{false};
+
+  /* The following members are specifically used for the host allocator. */
+
+  // A flag to enable background thread for processing events.
+  std::atomic<bool> pinned_use_background_threads_{false};
+
+  /* The following members are used for both device and host allocator. */
+
+  // Record the last allocator config environment setting.
+  std::mutex last_allocator_settings_mutex_;
+  std::string last_allocator_settings_;
+
+  // Optional hook for parsing additional device-specific allocator settings.
+  // This allows backends (e.g., CUDA, XPU) to register a custom parser for
+  // their own environment configuration extensions.
+  inline static std::function<void(const std::string&)>
+      device_config_parser_hook_{nullptr};
+};
+
+C10_API inline void setAllocatorSettings(const std::string& env) {
+  AcceleratorAllocatorConfig::instance().parseArgs(env); // common keys first
+  AcceleratorAllocatorConfig::callDeviceConfigParserHook(env); // then device
+}
+
+C10_API inline std::string getAllocatorSettings() {
+  return AcceleratorAllocatorConfig::last_allocator_settings(); // static call
+}
+
+struct DeviceConfigParserHookRegistry {
+  explicit DeviceConfigParserHookRegistry(
+      std::function<void(const std::string&)>&& hook,
+      const std::unordered_set<std::string>& keys) {
+    // Constructing this object registers `hook` and `keys`. A static method is
+    // used to avoid static initialization order fiasco issues.
+    AcceleratorAllocatorConfig::registerDeviceConfigParserHook(
+        std::move(hook), keys);
+  }
+};
+
+// parser_cls must provide `instance()`, `parseArgs(env)` and static `getKeys()`
+#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(parser_cls)      \
+  namespace {                                                 \
+  static at::CachingAllocator::DeviceConfigParserHookRegistry \
+      g_device_config_parse_hook_registry_instance(           \
+          [](const std::string& env) {                        \
+            parser_cls::instance().parseArgs(env);            \
+          },                                                  \
+          parser_cls::getKeys());                             \
+  }
+
+} // namespace c10::CachingAllocator
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d596b01d233dad00702dcad5269f146672861c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/AutogradState.h
@@ -0,0 +1,90 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SafePyObject.h>
+#include <c10/macros/Export.h>
+#include <optional>
+
+namespace c10 {
+
+// Packs the thread-local boolean flags used by autograd into single-bit
+// fields, together with an optional graph-exec-group handle.
+struct C10_API AutogradState {
+  static AutogradState& get_tls_state();
+  static void set_tls_state(AutogradState state);
+
+  AutogradState(
+      bool grad_mode,
+      bool inference_mode,
+      bool fw_grad_mode,
+      bool multithreading_enabled)
+      : graph_exec_group_(std::nullopt), // no group until explicitly set
+        grad_mode_(grad_mode),
+        inference_mode_(inference_mode),
+        fw_grad_mode_(fw_grad_mode),
+        multithreading_enabled_(multithreading_enabled),
+        view_replay_enabled_(false) {} // not a ctor argument; starts disabled
+
+  void set_grad_mode(bool enabled) {
+    grad_mode_ = enabled;
+  }
+
+  void set_fw_grad_mode(bool enabled) {
+    fw_grad_mode_ = enabled;
+  }
+
+  void set_inference_mode(bool enabled) {
+    inference_mode_ = enabled;
+  }
+
+  void set_multithreading_enabled(bool multithreading_enabled) {
+    multithreading_enabled_ = multithreading_enabled;
+  }
+
+  void set_view_replay_enabled(bool view_replay_enabled) {
+    view_replay_enabled_ = view_replay_enabled;
+  }
+
+  void set_graph_exec_group(std::optional<SafePyObject> group) {
+    graph_exec_group_ = std::move(group); // std::nullopt clears the group
+  }
+
+  bool get_grad_mode() const {
+    return grad_mode_;
+  }
+
+  bool get_fw_grad_mode() const {
+    return fw_grad_mode_;
+  }
+
+  bool get_inference_mode() const {
+    return inference_mode_;
+  }
+
+  bool get_multithreading_enabled() const {
+    return multithreading_enabled_;
+  }
+
+  bool get_view_replay_enabled() const {
+    return view_replay_enabled_;
+  }
+
+  const std::optional<SafePyObject>& get_graph_exec_group() const {
+    return graph_exec_group_;
+  }
+
+ private:
+  std::optional<SafePyObject> graph_exec_group_;
+  bool grad_mode_ : 1; // each flag below occupies a single bit
+  bool inference_mode_ : 1;
+  bool fw_grad_mode_ : 1;
+  bool multithreading_enabled_ : 1;
+  // NOLINTNEXTLINE(cppcoreguidelines-use-default-member-init)
+  bool view_replay_enabled_ : 1;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..d26c0089ae024b876be0df2821e3f562737ff35d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Backend.h
@@ -0,0 +1,414 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+
+#include <stdexcept>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+
+namespace c10 {
+
+/**
+ * This legacy enum class defines the set of backends supported by old school,
+ * code generated Type-based ATen.  A "backend" in this sense roughly
+ * corresponds to the cartesian product of (device type, layout), but restricted
+ * only to combinations which we actually have kernels for.  Backend does NOT
+ * include dtype.
+ *
+ * The reason we are sunsetting this enum class is because it doesn't allow for
+ * open registration; e.g., if you want to add SparseXLA, you'd have to
+ * edit this enum; you wouldn't be able to do it out of tree.  DispatchKey is
+ * the replacement for Backend which supports open registration.
+ *
+ * NB: The concept of 'Backend' here disagrees with the notion of backend
+ * exposed to users in torch.backends.  Backend here is something like "CPU"
+ * or "SparseCUDA"; backend in torch.backends is something like "MKL" or
+ * "CUDNN".
+ */
+enum class Backend { // enumerator order determines the underlying values
+  CPU,
+  CUDA,
+  HIP,
+  VE,
+  FPGA,
+  IPU,
+  XPU,
+  SparseCPU,
+  SparseCUDA,
+  SparseCsrCPU,
+  SparseCsrCUDA,
+  SparseCsrMPS,
+  SparseMPS,
+  SparseHIP,
+  SparseVE,
+  SparseXPU,
+  SparsePrivateUse1,
+  SparseCsrHIP,
+  SparseCsrVE,
+  SparseCsrXPU,
+  SparseCsrPrivateUse1,
+  MAIA,
+  XLA,
+  Vulkan,
+  Metal,
+  Meta,
+  QuantizedCPU,
+  QuantizedCUDA,
+  QuantizedXPU,
+  QuantizedPrivateUse1,
+  Undefined,
+  MkldnnCPU,
+  MPS,
+  HPU,
+  Lazy,
+  MTIA,
+  PrivateUse1,
+  NumOptions // count of the enumerators above; not itself a backend
+};
+
+inline Backend dispatchKeyToBackend(DispatchKey t) { // DispatchKey -> Backend
+  if (t == DispatchKey::CPU || t == DispatchKey::AutogradCPU) {
+    return Backend::CPU; // an Autograd* key maps like its plain counterpart
+  } else if (t == DispatchKey::CUDA || t == DispatchKey::AutogradCUDA) {
+    return Backend::CUDA;
+  } else if (t == DispatchKey::HIP) {
+    return Backend::HIP;
+  } else if (t == DispatchKey::VE) {
+    return Backend::VE;
+  } else if (t == DispatchKey::FPGA) {
+    return Backend::FPGA;
+  } else if (t == DispatchKey::MAIA || t == DispatchKey::AutogradMAIA) {
+    return Backend::MAIA;
+  } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
+    return Backend::XLA;
+  } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
+    return Backend::Lazy;
+  } else if (t == DispatchKey::MPS || t == DispatchKey::AutogradMPS) {
+    return Backend::MPS;
+  } else if (t == DispatchKey::Vulkan) {
+    return Backend::Vulkan;
+  } else if (t == DispatchKey::Metal) {
+    return Backend::Metal;
+  } else if (t == DispatchKey::Meta) {
+    return Backend::Meta;
+  } else if (t == DispatchKey::SparseCPU) {
+    return Backend::SparseCPU;
+  } else if (t == DispatchKey::SparseCUDA) {
+    return Backend::SparseCUDA;
+  } else if (t == DispatchKey::SparseMPS) {
+    return Backend::SparseMPS;
+  } else if (t == DispatchKey::SparseCsrMPS) {
+    return Backend::SparseCsrMPS;
+  } else if (t == DispatchKey::SparseHIP) {
+    return Backend::SparseHIP;
+  } else if (t == DispatchKey::SparseVE) {
+    return Backend::SparseVE;
+  } else if (t == DispatchKey::SparsePrivateUse1) {
+    return Backend::SparsePrivateUse1;
+  } else if (t == DispatchKey::SparseCsrCPU) {
+    return Backend::SparseCsrCPU;
+  } else if (t == DispatchKey::SparseCsrCUDA) {
+    return Backend::SparseCsrCUDA;
+  } else if (t == DispatchKey::SparseCsrHIP) {
+    return Backend::SparseCsrHIP;
+  } else if (t == DispatchKey::SparseCsrVE) {
+    return Backend::SparseCsrVE;
+  } else if (t == DispatchKey::SparseCsrPrivateUse1) {
+    return Backend::SparseCsrPrivateUse1;
+  } else if (t == DispatchKey::MkldnnCPU) {
+    return Backend::MkldnnCPU;
+  } else if (t == DispatchKey::QuantizedCPU) {
+    return Backend::QuantizedCPU;
+  } else if (t == DispatchKey::QuantizedCUDA) {
+    return Backend::QuantizedCUDA;
+  } else if (t == DispatchKey::IPU || t == DispatchKey::AutogradIPU) {
+    return Backend::IPU;
+  } else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) {
+    return Backend::XPU;
+  } else if (t == DispatchKey::SparseXPU) {
+    return Backend::SparseXPU;
+  } else if (t == DispatchKey::SparseCsrXPU) {
+    return Backend::SparseCsrXPU;
+  } else if (t == DispatchKey::QuantizedXPU) {
+    return Backend::QuantizedXPU;
+  } else if (t == DispatchKey::QuantizedPrivateUse1) {
+    return Backend::QuantizedPrivateUse1;
+  } else if (t == DispatchKey::HPU || t == DispatchKey::AutogradHPU) {
+    return Backend::HPU;
+  } else if (t == DispatchKey::MTIA || t == DispatchKey::AutogradMTIA) {
+    return Backend::MTIA;
+  } else if (
+      t == DispatchKey::PrivateUse1 || t == DispatchKey::AutogradPrivateUse1) {
+    return Backend::PrivateUse1;
+  } else if (t == DispatchKey::Undefined) {
+    return Backend::Undefined;
+  } else {
+    TORCH_CHECK(false, "Unrecognized tensor type ID: ", t); // no Backend match
+  }
+}
+
+inline DispatchKey backendToDispatchKey(Backend b) { // Backend -> plain key
+  switch (b) {
+    case Backend::CPU:
+      return DispatchKey::CPU;
+    case Backend::CUDA:
+      return DispatchKey::CUDA;
+    case Backend::HIP:
+      return DispatchKey::HIP;
+    case Backend::VE:
+      return DispatchKey::VE;
+    case Backend::FPGA:
+      return DispatchKey::FPGA;
+    case Backend::MAIA:
+      return DispatchKey::MAIA;
+    case Backend::XLA:
+      return DispatchKey::XLA;
+    case Backend::Lazy:
+      return DispatchKey::Lazy;
+    case Backend::IPU:
+      return DispatchKey::IPU;
+    case Backend::XPU:
+      return DispatchKey::XPU;
+    case Backend::SparseXPU:
+      return DispatchKey::SparseXPU;
+    case Backend::SparseCsrXPU:
+      return DispatchKey::SparseCsrXPU;
+    case Backend::SparseCPU:
+      return DispatchKey::SparseCPU;
+    case Backend::SparseCUDA:
+      return DispatchKey::SparseCUDA;
+    case Backend::SparseMPS:
+      return DispatchKey::SparseMPS;
+    case Backend::SparseCsrMPS:
+      return DispatchKey::SparseCsrMPS;
+    case Backend::SparseHIP:
+      return DispatchKey::SparseHIP;
+    case Backend::SparseVE:
+      return DispatchKey::SparseVE;
+    case Backend::SparsePrivateUse1:
+      return DispatchKey::SparsePrivateUse1;
+    case Backend::SparseCsrCPU:
+      return DispatchKey::SparseCsrCPU;
+    case Backend::SparseCsrCUDA:
+      return DispatchKey::SparseCsrCUDA;
+    case Backend::SparseCsrHIP:
+      return DispatchKey::SparseCsrHIP;
+    case Backend::SparseCsrVE:
+      return DispatchKey::SparseCsrVE;
+    case Backend::SparseCsrPrivateUse1:
+      return DispatchKey::SparseCsrPrivateUse1;
+    case Backend::MkldnnCPU:
+      return DispatchKey::MkldnnCPU;
+    case Backend::Vulkan:
+      return DispatchKey::Vulkan;
+    case Backend::Metal:
+      return DispatchKey::Metal;
+    case Backend::Meta:
+      return DispatchKey::Meta;
+    case Backend::QuantizedCPU:
+      return DispatchKey::QuantizedCPU;
+    case Backend::QuantizedCUDA:
+      return DispatchKey::QuantizedCUDA;
+    case Backend::QuantizedPrivateUse1:
+      return DispatchKey::QuantizedPrivateUse1;
+    case Backend::Undefined:
+      return DispatchKey::Undefined;
+    case Backend::MPS:
+      return DispatchKey::MPS;
+    case Backend::HPU:
+      return DispatchKey::HPU;
+    case Backend::MTIA:
+      return DispatchKey::MTIA;
+    case Backend::PrivateUse1:
+      return DispatchKey::PrivateUse1;
+    default: // NOTE(review): Backend::QuantizedXPU has no case and lands here
+      TORCH_CHECK(false, "Unknown backend"); // also reached for NumOptions
+  }
+}
+
+inline DeviceType backendToDeviceType(Backend b) { // variants share a device
+  switch (b) {
+    case Backend::CPU:
+    case Backend::MkldnnCPU:
+    case Backend::SparseCPU:
+    case Backend::SparseCsrCPU:
+    case Backend::QuantizedCPU:
+      return DeviceType::CPU;
+    case Backend::CUDA:
+    case Backend::SparseCUDA:
+    case Backend::QuantizedCUDA:
+    case Backend::SparseCsrCUDA:
+      return DeviceType::CUDA;
+    case Backend::HIP:
+      return DeviceType::HIP;
+    case Backend::VE:
+      return DeviceType::VE;
+    case Backend::FPGA:
+      return DeviceType::FPGA;
+    case Backend::MAIA:
+      return DeviceType::MAIA;
+    case Backend::XLA:
+      return DeviceType::XLA;
+    case Backend::Lazy:
+      return DeviceType::Lazy;
+    case Backend::SparseHIP:
+      return DeviceType::HIP;
+    case Backend::SparseVE:
+      return DeviceType::VE;
+    case Backend::SparseCsrHIP:
+      return DeviceType::HIP;
+    case Backend::SparseCsrVE:
+      return DeviceType::VE;
+    case Backend::IPU:
+      return DeviceType::IPU;
+    case Backend::XPU:
+    case Backend::SparseXPU:
+    case Backend::SparseCsrXPU:
+    case Backend::QuantizedXPU:
+      return DeviceType::XPU;
+    case Backend::Vulkan:
+      return DeviceType::Vulkan;
+    case Backend::Metal:
+      return DeviceType::Metal;
+    case Backend::Meta:
+      return DeviceType::Meta;
+    case Backend::MPS:
+    case Backend::SparseMPS:
+    case Backend::SparseCsrMPS:
+      return DeviceType::MPS;
+    case Backend::HPU:
+      return DeviceType::HPU;
+    case Backend::MTIA:
+      return DeviceType::MTIA;
+    case Backend::PrivateUse1:
+    case Backend::SparsePrivateUse1:
+    case Backend::SparseCsrPrivateUse1:
+    case Backend::QuantizedPrivateUse1:
+      return DeviceType::PrivateUse1;
+    case Backend::Undefined: // rejected explicitly, with its own message
+      TORCH_CHECK(false, "Undefined backend is not a valid device type");
+    default: // any remaining value, e.g. NumOptions
+      TORCH_CHECK(false, "Unknown backend");
+  }
+}
+
+inline const char* toString(Backend b) { // returns a string literal
+  switch (b) {
+    case Backend::CPU:
+      return "CPU";
+    case Backend::CUDA:
+      return "CUDA";
+    case Backend::HIP:
+      return "HIP";
+    case Backend::VE:
+      return "VE";
+    case Backend::FPGA:
+      return "FPGA";
+    case Backend::XPU:
+      return "XPU";
+    case Backend::IPU:
+      return "IPU";
+    case Backend::MAIA:
+      return "MAIA";
+    case Backend::XLA:
+      return "XLA";
+    case Backend::Lazy:
+      return "Lazy";
+    case Backend::MPS:
+      return "MPS";
+    case Backend::SparseCPU:
+      return "SparseCPU";
+    case Backend::SparseCUDA:
+      return "SparseCUDA";
+    case Backend::SparseMPS:
+      return "SparseMPS";
+    case Backend::SparseCsrMPS:
+      return "SparseCsrMPS";
+    case Backend::SparseHIP:
+      return "SparseHIP";
+    case Backend::SparseVE:
+      return "SparseVE";
+    case Backend::SparseXPU:
+      return "SparseXPU";
+    case Backend::SparsePrivateUse1:
+      return "SparsePrivateUse1";
+    case Backend::SparseCsrCPU:
+      return "SparseCsrCPU";
+    case Backend::SparseCsrCUDA:
+      return "SparseCsrCUDA";
+    case Backend::SparseCsrHIP:
+      return "SparseCsrHIP";
+    case Backend::SparseCsrVE:
+      return "SparseCsrVE";
+    case Backend::SparseCsrXPU:
+      return "SparseCsrXPU";
+    case Backend::SparseCsrPrivateUse1:
+      return "SparseCsrPrivateUse1";
+    case Backend::MkldnnCPU:
+      return "MkldnnCPU";
+    case Backend::Vulkan:
+      return "Vulkan";
+    case Backend::Metal:
+      return "Metal";
+    case Backend::Meta:
+      return "Meta";
+    case Backend::QuantizedCPU:
+      return "QuantizedCPU";
+    case Backend::QuantizedCUDA:
+      return "QuantizedCUDA";
+    case Backend::QuantizedXPU:
+      return "QuantizedXPU";
+    case Backend::QuantizedPrivateUse1:
+      return "QuantizedPrivateUse1";
+    case Backend::HPU:
+      return "HPU";
+    case Backend::MTIA:
+      return "MTIA";
+    case Backend::PrivateUse1:
+      return "PrivateUseOne"; // spelled "One", unlike the enumerator name
+    default: // Backend::Undefined has no case above, so it lands here too
+      return "UNKNOWN_BACKEND";
+  }
+}
+
+// Returns true for the Sparse* backends (XPU, CPU, CUDA, MPS, HIP, VE and
+// PrivateUse1) and false for everything else. The SparseCsr* backends are
+// not matched here.
+inline bool isSparse(Backend b) {
+  const bool is_sparse_backend =
+      b == Backend::SparseXPU ||
+      b == Backend::SparseCPU ||
+      b == Backend::SparseCUDA ||
+      b == Backend::SparseMPS ||
+      b == Backend::SparseHIP ||
+      b == Backend::SparseVE ||
+      b == Backend::SparsePrivateUse1;
+  return is_sparse_backend;
+}
+
+// Returns true for every SparseCsr* backend and false for everything else.
+// SparseCsrMPS is listed as well: it is a Backend enumerator handled by the
+// other helpers in this file, and leaving it out made it the only SparseCsr*
+// backend that was reported as non-CSR.
+inline bool isSparseCsr(Backend b) {
+  return b == Backend::SparseCsrXPU ||
+      b == Backend::SparseCsrCPU ||
+      b == Backend::SparseCsrCUDA ||
+      b == Backend::SparseCsrMPS ||
+      b == Backend::SparseCsrHIP ||
+      b == Backend::SparseCsrVE ||
+      b == Backend::SparseCsrPrivateUse1;
+}
+
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..d43d48e32ee794092b23a488cbb8518a6d5d2623
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CPUAllocator.h
@@ -0,0 +1,64 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+
+#include <c10/core/Allocator.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Flags.h>
+
+// TODO: rename to c10
+C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
+
+namespace c10 {
+
+using MemoryDeleter = void (*)(void*);
+
+// A helper function that is basically doing nothing.
+C10_API void NoDelete(void* /*unused*/);
+
+// Reports C10's memory allocation, deallocation status and out-of-memory
+// events to the profiler. Member functions are only declared in this header.
+class C10_API ProfiledCPUMemoryReporter {
+ public:
+  ProfiledCPUMemoryReporter() = default;
+  void New(void* ptr, size_t nbytes); // report an allocation
+  void OutOfMemory(size_t nbytes); // report an out-of-memory event
+  void Delete(void* ptr); // report a deallocation
+
+ private:
+  std::mutex mutex_; // presumably guards the fields below — TODO confirm
+  std::unordered_map<void*, size_t> size_table_; // presumably ptr -> nbytes
+  size_t allocated_ = 0; // presumably total live bytes — TODO confirm
+  size_t log_cnt_ = 0; // presumably a log counter — TODO confirm
+};
+
+C10_API ProfiledCPUMemoryReporter& profiledCPUMemoryReporter();
+
+// Get the CPU Allocator.
+C10_API at::Allocator* GetCPUAllocator();
+// Sets the CPU allocator to the given allocator: the caller gives away the
+// ownership of the pointer.
+C10_API void SetCPUAllocator(at::Allocator* alloc, uint8_t priority = 0);
+
+// Get the Default CPU Allocator
+C10_API at::Allocator* GetDefaultCPUAllocator();
+
+// Get the Default Mobile CPU Allocator
+C10_API at::Allocator* GetDefaultMobileCPUAllocator();
+
+// The CPUCachingAllocator is experimental and might disappear in the future.
+// The only place that uses it is in StaticRuntime.
+// Set the CPU Caching Allocator
+C10_API void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority = 0);
+// Get the CPU Caching Allocator
+C10_API Allocator* GetCPUCachingAllocator();
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..23b413de834aae788e8f763f60cd75ec7750dbea
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CachingDeviceAllocator.h
@@ -0,0 +1,126 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Stream.h>
+
+namespace c10::CachingDeviceAllocator {
+
+using namespace c10::CachingAllocator;
+
+// Allocator summary statistics per device; fields are tagged COUNT/SUM/SIZE.
+struct DeviceStats {
+  // COUNT: allocations requested by client code
+  StatArray allocation;
+  // COUNT: number of allocated segments from device memory allocation.
+  StatArray segment;
+  // COUNT: number of active memory blocks (allocated or used by stream)
+  StatArray active;
+  // COUNT: number of inactive, split memory blocks (unallocated but can't be
+  // released via device memory deallocation)
+  StatArray inactive_split;
+
+  // SUM: bytes allocated by this memory allocator
+  StatArray allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  StatArray reserved_bytes;
+  // SUM: bytes within active memory blocks
+  StatArray active_bytes;
+  // SUM: bytes within inactive, split memory blocks
+  StatArray inactive_split_bytes;
+  // SUM: bytes requested by client code
+  StatArray requested_bytes;
+
+  // COUNT: total number of failed calls to device malloc necessitating cache
+  // flushes.
+  int64_t num_alloc_retries = 0;
+
+  // COUNT: total number of OOMs (i.e. failed calls to device memory allocation
+  // after cache flush)
+  int64_t num_ooms = 0;
+
+  // COUNT: total number of oversize blocks allocated from pool
+  Stat oversize_allocations;
+
+  // COUNT: total number of oversize blocks requiring malloc
+  Stat oversize_segments;
+
+  // COUNT: total number of synchronize_and_free_events() calls
+  int64_t num_sync_all_streams = 0;
+
+  // COUNT: total number of device memory allocation calls. This includes both
+  // mapped and malloced memory.
+  int64_t num_device_alloc = 0;
+
+  // COUNT: total number of device memory deallocation calls. This includes both
+  // un-mapped and free memory.
+  int64_t num_device_free = 0;
+
+  // SIZE: maximum block size that is allowed to be split.
+  int64_t max_split_size = 0;
+};
+
+} // namespace c10::CachingDeviceAllocator
+
+namespace c10 {
+
+using CaptureId_t = unsigned long long;
+
+// first is set if the instance is created by Graph mode capture_begin.
+// second is set if the instance is created by Graph mode graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct C10_API DeviceAllocator : public c10::Allocator {
+  DeviceAllocator();
+  ~DeviceAllocator() override;
+
+  // Returns true if the allocator has been properly initialized and is ready
+  // for use
+  virtual bool initialized() = 0;
+
+  // Releases all cached device memory from the specified memory pool back to
+  // the system. The default argument is the pool id {0, 0}.
+  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
+
+  // Associates a memory allocation with a stream to establish dependency
+  // tracking. Prevents memory reuse until all operations on the specified
+  // stream complete
+  virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;
+
+  // Retrieves comprehensive memory statistics for the specified device,
+  // including allocation patterns and usage metrics
+  virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
+      c10::DeviceIndex device) = 0;
+
+  // Resets cumulative allocation statistics for the specified device to zero
+  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
+
+  // Resets peak memory usage statistics for the specified device
+  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
+
+  // Returns {free, total} memory in bytes for the specified device. The default
+  // body always fails via TORCH_CHECK_NOT_IMPLEMENTED; allocators may override.
+  virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "getMemoryInfo is not implemented for this allocator yet.");
+  }
+};
+
+// Returns the allocator that c10::GetAllocator yields for device type `t`,
+// downcast to DeviceAllocator; CPU is rejected and a failed cast asserts.
+C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
+  TORCH_CHECK(
+      t != DeviceType::CPU,
+      "getDeviceAllocator is not supported for CPU device type.");
+  auto* device_allocator =
+      dynamic_cast<DeviceAllocator*>(c10::GetAllocator(t));
+  TORCH_INTERNAL_ASSERT(
+      device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
+  return device_allocator;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..28dd52759e8de0f4f2f2947e96ccd0dd7467a95c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h
@@ -0,0 +1,62 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/TypeTraits.h>
+#include <type_traits>
+
+namespace c10 {
+
+/**
+ * Represent a function pointer as a C++ type.
+ * This allows using the function pointer as a type
+ * in a template and calling it from inside the template
+ * allows the compiler to inline the call because it
+ * knows the function pointer at compile time.
+ *
+ * Example 1:
+ *  int add(int a, int b) {return a + b;}
+ *  using Add = TORCH_FN_TYPE(add);
+ *  template<class Func> struct Executor {
+ *    int execute(int a, int b) {
+ *      return Func::func_ptr()(a, b);
+ *    }
+ *  };
+ *  Executor<Add> executor;
+ *  EXPECT_EQ(3, executor.execute(1, 2));
+ *
+ * Example 2:
+ *  int add(int a, int b) {return a + b;}
+ *  template<class Func> int execute(Func, int a, int b) {
+ *    return Func::func_ptr()(a, b);
+ *  }
+ *  EXPECT_EQ(3, execute(TORCH_FN(add), 1, 2));
+ */
+template <class FuncType_, FuncType_* func_ptr_>
+struct CompileTimeFunctionPointer final {
+  using FuncType = FuncType_;
+  static_assert(
+      guts::is_function_type<FuncType_>::value,
+      "TORCH_FN can only wrap function types.");
+
+  // Returns the function pointer supplied as the second template argument.
+  static constexpr FuncType* func_ptr() { return func_ptr_; }
+};
+
+// Trait: std::true_type only for CompileTimeFunctionPointer instantiations.
+template <class T>
+struct is_compile_time_function_pointer : std::false_type {};
+template <class Fn, Fn* fn_ptr>
+struct is_compile_time_function_pointer<CompileTimeFunctionPointer<Fn, fn_ptr>>
+    : std::true_type {};
+
+} // namespace c10
+
+#define TORCH_FN_TYPE(func)                                           \
+  ::c10::CompileTimeFunctionPointer<                                  \
+      std::remove_pointer_t<std::remove_reference_t<decltype(func)>>, \
+      func>
+#define TORCH_FN(func) TORCH_FN_TYPE(func)() // a value of that type
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..22a3cf2104d1c55c0d18681906cc4ae9c2c85800
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h
@@ -0,0 +1,115 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <variant>
+
+namespace c10 {
+
+// Unlike other SymNodeImpl, this cannot be "dispatched" conventionally,
+// as it typically needs to defer to another SymNodeImpl
+//
+// Represents a bool or an int constant (float is not supported yet). Useful
+// for otherwise unrepresentable large negative integer constants.
+template <typename T>
+class C10_API ConstantSymNodeImpl : public SymNodeImpl {
+  static_assert(
+      ::std::is_same_v<T, int64_t> || ::std::is_same_v<T, bool>,
+      "ConstantSymNodeImpl can only accept int64_t or bool types");
+
+ public:
+  // Stores `val`; T is restricted to int64_t or bool by the static_assert.
+  ConstantSymNodeImpl(T val) : value_(val) {}
+
+  // Kind queries are answered from the template argument T alone.
+  bool is_int() override { return is_int_(); }
+  bool is_bool() override { return is_bool_(); }
+  bool is_float() override { return false; }
+
+  // The guard_* overrides ignore the file/line arguments: they check the
+  // kind and hand back the stored constant.
+  int64_t guard_int(
+      const char* file [[maybe_unused]],
+      int64_t line [[maybe_unused]]) override {
+    TORCH_CHECK(is_int(), "not an int");
+    return int_();
+  }
+  bool guard_bool(
+      const char* file [[maybe_unused]],
+      int64_t line [[maybe_unused]]) override {
+    TORCH_CHECK(is_bool(), "not a bool");
+    return bool_();
+  }
+  // Float constants are not supported, so this check always fails.
+  double guard_float(
+      const char* file [[maybe_unused]],
+      int64_t line [[maybe_unused]]) override {
+    TORCH_CHECK(false, "not a float");
+  }
+
+  // Checked accessors for the stored value.
+  int64_t int_() override {
+    TORCH_CHECK(is_int(), "not an int");
+    return ::std::get<int64_t>(value_);
+  }
+  bool bool_() override {
+    TORCH_CHECK(is_bool(), "not a bool");
+    return ::std::get<bool>(value_);
+  }
+  bool has_hint() override { return true; }
+  bool is_constant() override { return true; }
+  bool is_symbolic() override { return false; }
+
+  // Declared here only; the definitions are not part of this header.
+  c10::SymNode eq(const c10::SymNode& other) override;
+  c10::SymNode ne(const c10::SymNode& other) override;
+  c10::SymNode ge(const c10::SymNode& other) override;
+  c10::SymNode le(const c10::SymNode& other) override;
+  c10::SymNode lt(const c10::SymNode& other) override;
+  c10::SymNode gt(const c10::SymNode& other) override;
+  c10::SymNode mul(const c10::SymNode& other) override;
+
+  // Formats an int with std::to_string and a bool as "true" / "false".
+  ::std::string str() override {
+    if constexpr (is_bool_()) {
+      return ::std::get<bool>(value_) ? "true" : "false";
+    } else {
+      return ::std::to_string(::std::get<int64_t>(value_));
+    }
+  }
+
+  // constant_int() / constant_bool() return the stored value when T is the
+  // requested kind and std::nullopt otherwise.
+  std::optional<int64_t> constant_int() override {
+    if constexpr (!is_int_()) {
+      return std::nullopt;
+    } else {
+      return ::std::get<int64_t>(value_);
+    }
+  }
+  std::optional<bool> constant_bool() override {
+    if constexpr (!is_bool_()) {
+      return std::nullopt;
+    } else {
+      return ::std::get<bool>(value_);
+    }
+  }
+
+ private:
+  // Initialized from a T, so the alternative it holds matches T.
+  ::std::variant<int64_t, bool> value_;
+
+  static constexpr bool is_int_() { return ::std::is_same_v<T, int64_t>; }
+  static constexpr bool is_bool_() { return ::std::is_same_v<T, bool>; }
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h
new file mode 100644
index 0000000000000000000000000000000000000000..014903df018c3db2b2df40ca72ee4cd40ebf21c6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Contiguity.h
@@ -0,0 +1,314 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/SymBool.h>
+#include <c10/core/SymInt.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/irange.h>
+
+#include <algorithm>
+#include <cstdint>
+
+namespace c10 {
+
+// Contiguity check. Walking from the last dim to the first, every dim whose
+// size is not 1 needs stride == product of the later sizes. numel == 0: true.
+template <typename T>
+bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
+  if (numel == 0) {
+    return true;
+  }
+  T running_stride = 1;
+  // Signed index on purpose, so the countdown terminates after dim 0.
+  for (int64_t dim = int64_t(sizes.size()); dim-- > 0;) {
+    const auto& extent = sizes[dim];
+    if (extent == 1) {
+      continue; // a size-1 dim puts no constraint on its stride
+    }
+    if (strides[dim] != running_stride) {
+      return false;
+    }
+    running_stride *= extent;
+  }
+  return true;
+}
+
+// Returns a SymBool whose underlying symbolic expression represents
+// contiguity. Guaranteed not to throw a data dependent error (DDE); the result
+// is either a symbolic expression or a symbolic True.
+inline static c10::SymBool _compute_contiguous_sym(
+    ArrayRef<c10::SymInt> sizes,
+    ArrayRef<c10::SymInt> strides,
+    const c10::SymInt& numel) {
+  // If this returns true, the tensor is definitely contiguous. If it returns
+  // false, the tensor may or may not be contiguous.
+  auto is_contiguous_or_false = [&]() {
+    if (TORCH_GUARD_OR_FALSE(sym_eq(numel, 0))) {
+      return true;
+    }
+
+    // When calculating the expected stride, we can choose to multiply
+    // with max(1, size[d]) or size[d]. Regardless, this is ok for this
+    // function. Why?
+    // (1) If size[d] == 0, then the tensor is contiguous and if
+    //     we return true or false it won't break this function.
+    // (2) If size[d] is not 0, then max(1,size[d]) and size[d] are equal.
+    //     Therefore, if we choose to use max(1, size[d]) or size[d] to
+    //     calculate the expected stride, the result is the same.
+    //
+    // We symbolically check both paths to maximize the cases where this
+    // function returns true. This is because make_contiguous_strides_for adds
+    // the max symbolically, and in some other situations the max might not be
+    // there. And we want to ensure we return true in both cases.
+    c10::SymInt expected_stride = 1;
+    c10::SymInt expected_stride_max = 1;
+    // NB: make sure we do signed arithmetic
+    for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
+      if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
+        continue;
+      }
+
+      if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected_stride)) &&
+          TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected_stride_max))) {
+        return false;
+      }
+      expected_stride_max *= sizes[d].max(1);
+      expected_stride *= sizes[d];
+    }
+    return true;
+  };
+
+  // We try to minimize creating large symbolic expressions when not needed to
+  // avoid symbolic evaluation perf issues.
+  if (is_contiguous_or_false()) {
+    return c10::SymBool(true);
+  }
+
+  // Build a single expression that represents contiguity and return it.
+  c10::SymBool is_empty = sym_eq(numel, 0);
+  c10::SymBool is_contiguous_cond = true;
+
+  c10::SymInt expected_stride = 1;
+  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
+    const auto& size_d = sizes[d];
+    is_contiguous_cond = is_contiguous_cond.sym_and(
+        size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
+    expected_stride = expected_stride * size_d;
+  }
+  return is_contiguous_cond.sym_or(is_empty);
+}
+
+// When T is SymInt this function may throw a data dependent error.
+// _compute_channels_last_contiguous_2d_sym does not. Only use this function
+// when inputs are hinted.
+template <typename T>
+bool _compute_channels_last_contiguous_2d(
+    ArrayRef<T> sizes,
+    ArrayRef<T> strides) {
+  // Please don't combine these code, constant array is used here to let
+  // compiler fully unroll the loop to get better performance
+  switch (sizes.size()) {
+    case 4: {
+      T expected = 1;
+      for (auto& d : {1, 3, 2, 0}) {
+        const auto& size_d = sizes[d];
+        if (size_d != 1) {
+          if (strides[d] != expected) {
+            return false;
+          }
+          expected *= size_d;
+        }
+      }
+      return true;
+    }
+      // NOLINTNEXTLINE(bugprone-branch-clone)
+    case 3:
+      // TODO dim == 3 case will be enabled once it is fully tested
+      return false;
+    default:
+      return false;
+  }
+}
+
+// Returns a SymBool whose underlying symbolic expression represents
+// contiguity. Guaranteed not to throw a data dependent error (DDE); the result
+// is either a symbolic expression or a symbolic True.
+inline static c10::SymBool _compute_channels_last_contiguous_2d_sym(
+    ArrayRef<c10::SymInt> sizes,
+    ArrayRef<c10::SymInt> strides) {
+  switch (sizes.size()) {
+    case 4: {
+      // When this lambda returns true, the result is always true. When it
+      // returns false, the result could be false or data dependent.
+      auto guard_or_false = [&]() {
+        c10::SymInt expected = 1;
+        for (auto& d : {1, 3, 2, 0}) {
+          const auto& size_d = sizes[d];
+          // Not taking this branch could make this return False instead of True
+          // but not vice versa, so it's ok.
+          if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
+            continue;
+          }
+          // Taking this branch could make this return False instead of True
+          // but not vice versa, so it's ok.
+          if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
+            return false;
+          }
+          expected *= size_d;
+        }
+        return true;
+      };
+
+      // We try to minimize creating large symbolic expressions when not needed
+      // to avoid symbolic evaluation perf issues.
+      if (guard_or_false()) {
+        return c10::SymBool(true);
+      }
+
+      // Result is either false, or data dependent.
+      c10::SymInt expected_stride = 1;
+      c10::SymBool cond = true;
+
+      for (auto& d : {1, 3, 2, 0}) {
+        const auto& size_d = sizes[d];
+        cond = cond.sym_and(
+            size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
+        expected_stride *= size_d;
+      }
+      return cond;
+    }
+      // NOLINTNEXTLINE(bugprone-branch-clone)
+    case 3:
+      // TODO dim == 3 case will be enabled once it is fully tested
+      return c10::SymBool(false);
+    default:
+      return c10::SymBool(false);
+  }
+}
+
+// When T is SymInt this function may throw a data dependent error.
+// _compute_channels_last_contiguous_3d_sym does not. Only use this function
+// when inputs are hinted.
+template <typename T>
+bool _compute_channels_last_contiguous_3d(
+    ArrayRef<T> sizes,
+    ArrayRef<T> strides) {
+  // Please don't combine these code, constant array is used here to let
+  // compiler fully unroll the loop to get better performance
+  switch (sizes.size()) {
+    case 5: {
+      T expected = 1;
+      for (auto& d : {1, 4, 3, 2, 0}) {
+        const auto& size_d = sizes[d];
+        if (size_d != 1) {
+          if (strides[d] != expected) {
+            return false;
+          }
+          expected *= size_d;
+        }
+      }
+      return true;
+    }
+      // NOLINTNEXTLINE(bugprone-branch-clone)
+    case 4:
+      // TODO dim == 4 case will be enabled once it is fully tested
+      return false;
+    default:
+      return false;
+  }
+}
+
+inline static c10::SymBool _compute_channels_last_contiguous_3d_sym(
+    ArrayRef<c10::SymInt> sizes,
+    ArrayRef<c10::SymInt> strides) {
+  switch (sizes.size()) {
+    case 5: {
+      // When this lambda returns true, the result is always true. When it
+      // returns false, the result could be false or data dependent.
+      auto guard_or_false = [&]() {
+        c10::SymInt expected = 1;
+        for (auto& d : {1, 4, 3, 2, 0}) {
+          const auto& size_d = sizes[d];
+          // Not taking this branch could make this return False instead of True
+          // but not vice versa, so it's ok.
+          if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
+            continue;
+          }
+          // Taking this branch could make this return False instead of True
+          // but not vice versa, so it's ok.
+          if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
+            return false;
+          }
+          expected *= size_d;
+        }
+        return true;
+      };
+
+      // We try to minimize creating large symbolic expressions when not needed
+      // to avoid symbolic evaluation perf issues.
+      if (guard_or_false()) {
+        return c10::SymBool(true);
+      }
+
+      // Result is either false, or data dependent.
+      c10::SymInt expected_stride = 1;
+      c10::SymBool cond = true;
+
+      for (auto& d : {1, 4, 3, 2, 0}) {
+        const auto& size_d = sizes[d];
+        cond = cond.sym_and(
+            size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
+        expected_stride *= size_d;
+      }
+      return cond;
+    }
+      // NOLINTNEXTLINE(bugprone-branch-clone)
+    case 4:
+      // TODO dim == 4 case will be enabled once it is fully tested
+      return c10::SymBool(false);
+    default:
+      return c10::SymBool(false);
+  }
+}
+
+// Orders dims by ascending stride (dims of size < 2 last), then checks that
+// each stride equals the product of the sizes ordered before it.
+template <typename T>
+bool _compute_non_overlapping_and_dense(
+    ArrayRef<T> sizes,
+    ArrayRef<T> strides) {
+  auto ndim = sizes.size();
+  if (ndim == 1) {
+    return sizes[0] < 2 || strides[0] == 1;
+  }
+  SmallVector<int64_t, 5> order;
+  order.resize(ndim);
+  for (const auto i : c10::irange(ndim)) {
+    order[i] = i;
+  }
+  // Sort by strides, leaving 0 and 1 sized dims at the end of the array
+  std::sort(order.begin(), order.end(), [&](int64_t lhs, int64_t rhs) {
+    if (sizes[lhs] < 2) {
+      return false;
+    }
+    return sizes[rhs] < 2 || strides[lhs] < strides[rhs];
+  });
+  T next_stride = 1;
+  for (const auto i : c10::irange(ndim)) {
+    const auto& extent = sizes[order[i]];
+    if (extent < 2) {
+      return true; // reached the size-0/1 tail; nothing left to check
+    }
+    if (strides[order[i]] != next_stride) {
+      return false;
+    }
+    next_stride *= extent;
+  }
+  return true;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc2632794299da5a6c9c5d30be0b4591600bab2a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/CopyBytes.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <cstddef>
+
+namespace c10 {
+
+using CopyBytesFunction = void (*)(
+    size_t nbytes,
+    const void* src,
+    Device src_device,
+    void* dst,
+    Device dst_device);
+
+struct C10_API _CopyBytesFunctionRegisterer {
+  _CopyBytesFunctionRegisterer(
+      DeviceType from, // presumably the source device type — TODO confirm
+      DeviceType to, // presumably the destination device type — TODO confirm
+      CopyBytesFunction func_sync,
+      CopyBytesFunction func_async = nullptr); // async function is optional
+};
+
+#define REGISTER_COPY_BYTES_FUNCTION(from, to, ...)           \
+  namespace {                                                 \
+  static _CopyBytesFunctionRegisterer C10_ANONYMOUS_VARIABLE( \
+      g_copy_function)(from, to, __VA_ARGS__);                \
+  } // defines one file-local static registerer object per use
+
+/*
+ * WARNING: Implementations for this function are currently registered from
+ * ATen and caffe2, not yet from c10. Don't use this unless ATen or caffe2
+ * is present as well.
+ * We can't move them yet, because the CUDA implementations aren't unified yet
+ * between ATen and caffe2.
+ * We're planning to move the implementations into c10/backend/xxx
+ * to make c10 self contained again.
+ */
+C10_API void CopyBytes(
+    size_t nbytes,
+    const void* src,
+    Device src_device,
+    void* dst,
+    Device dst_device,
+    bool async);
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h
new file mode 100644
index 0000000000000000000000000000000000000000..240c173ca22ae28ab20e243890b2f8a054156fa5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultDtype.h
@@ -0,0 +1,20 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/macros/Export.h>
+
+namespace caffe2 {
+class TypeMeta;
+} // namespace caffe2
+
+namespace c10 {
+C10_API void set_default_dtype(caffe2::TypeMeta dtype);
+C10_API const caffe2::TypeMeta get_default_dtype();
+C10_API ScalarType get_default_dtype_as_scalartype();
+C10_API const caffe2::TypeMeta get_default_complex_dtype();
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d5e66ec405ddeb1494d987a034cf1b945663667
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DefaultTensorOptions.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Layout.h>
+#include <c10/core/ScalarType.h>
+#include <c10/util/typeid.h>
+
+namespace c10 {
+
+struct TensorOptions;
+
+/// Like TensorOptions, but all fields are guaranteed to be filled.
+/// A default-constructed instance holds: float dtype, at::kCPU device,
+/// at::kStrided layout and requires_grad == false.
+struct DefaultTensorOptions {
+  DefaultTensorOptions() = default;
+
+  // Read-only accessors; each one returns the corresponding field by value
+  // and is declared noexcept.
+  caffe2::TypeMeta dtype() const noexcept { return dtype_; }
+  Device device() const noexcept { return device_; }
+  Layout layout() const noexcept { return layout_; }
+  bool requires_grad() const noexcept { return requires_grad_; }
+
+  // Only declared here; the definition is in TensorOptions.h (TensorOptions is
+  // merely forward-declared at this point).
+  inline DefaultTensorOptions& merge(const TensorOptions& options);
+
+ private:
+  // NOTE(review): the trailing size annotations are carried over unchanged
+  // and are not verified by anything visible in this header — confirm them
+  // against sizeof() before relying on them.
+  caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 64-bit
+  Device device_ = at::kCPU; // 32-bit
+  Layout layout_ = at::kStrided; // 8-bit
+  bool requires_grad_ = false; // 8-bit
+};
+
+inline const DefaultTensorOptions& getDefaultTensorOptions() {
+  static const DefaultTensorOptions shared_defaults{}; // built on first call
+  return shared_defaults;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3380f434c6c8284476ac3bc662fd88e10289a86
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Device.h
@@ -0,0 +1,221 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iosfwd>
+#include <string>
+
+namespace c10 {
+
+/// An index representing a specific device; e.g., the 1 in GPU 1.
+/// A DeviceIndex is not independently meaningful without knowing
+/// the DeviceType it is associated with; try to use Device rather than
+/// DeviceIndex directly.
+using DeviceIndex = int8_t;
+
+/// Represents a compute device on which a tensor is located. A device is
+/// uniquely identified by a type, which specifies the type of machine it is
+/// (e.g. CPU or CUDA GPU), and a device index or ordinal, which identifies the
+/// specific compute device when there is more than one of a certain type. The
+/// device index is optional, and in its defaulted state represents (abstractly)
+/// "the current device". Further, there are two constraints on the value of the
+/// device index, if one is explicitly stored:
+/// 1. An index of -1 represents the current device (smaller values fail the
+/// debug-only check in validate()); a non-negative index is a concrete device,
+/// 2. When the device type is CPU, the device index must be -1 or zero.
+struct C10_API Device final {
+  using Type = DeviceType;
+
+  /// Constructs a new `Device` from a `DeviceType` and an optional device
+  /// index. Implicit on purpose, so a bare `DeviceType` converts to a `Device`.
+  /* implicit */ Device(DeviceType type, DeviceIndex index = -1)
+      : type_(type), index_(index) {
+    validate();
+  }
+
+  /// Constructs a `Device` from a string description, for convenience.
+  /// The string supplied must follow the following schema:
+  /// `(cpu|cuda)[:<device-index>]`
+  /// where `cpu` or `cuda` specifies the device type, and
+  /// `:<device-index>` optionally specifies a device index.
+  /* implicit */ Device(const std::string& device_string);
+
+  /// Returns true if the type and index of this `Device` matches that of
+  /// `other`.
+  bool operator==(const Device& other) const noexcept {
+    return this->type_ == other.type_ && this->index_ == other.index_;
+  }
+
+  /// Returns true if the type or index of this `Device` differs from that of
+  /// `other`.
+  bool operator!=(const Device& other) const noexcept {
+    return !(*this == other);
+  }
+
+  /// Sets the device index. Unlike the constructor, this does not call validate().
+  void set_index(DeviceIndex index) {
+    index_ = index;
+  }
+
+  /// Returns the type of device this is.
+  DeviceType type() const noexcept {
+    return type_;
+  }
+
+  /// Returns the stored device index; -1 means no explicit index is stored.
+  DeviceIndex index() const noexcept {
+    return index_;
+  }
+
+  /// Returns true if the device has a non-default index (anything other than -1).
+  bool has_index() const noexcept {
+    return index_ != -1;
+  }
+
+  /// Return true if the device is of CUDA type.
+  bool is_cuda() const noexcept {
+    return type_ == DeviceType::CUDA;
+  }
+
+  /// Return true if the device is of PrivateUse1 type.
+  bool is_privateuseone() const noexcept {
+    return type_ == DeviceType::PrivateUse1;
+  }
+
+  /// Return true if the device is of MPS type.
+  bool is_mps() const noexcept {
+    return type_ == DeviceType::MPS;
+  }
+
+  /// Return true if the device is of HIP type.
+  bool is_hip() const noexcept {
+    return type_ == DeviceType::HIP;
+  }
+
+  /// Return true if the device is of VE type.
+  bool is_ve() const noexcept {
+    return type_ == DeviceType::VE;
+  }
+
+  /// Return true if the device is of XPU type.
+  bool is_xpu() const noexcept {
+    return type_ == DeviceType::XPU;
+  }
+
+  /// Return true if the device is of IPU type.
+  bool is_ipu() const noexcept {
+    return type_ == DeviceType::IPU;
+  }
+
+  /// Return true if the device is of XLA type.
+  bool is_xla() const noexcept {
+    return type_ == DeviceType::XLA;
+  }
+
+  /// Return true if the device is of MTIA type.
+  bool is_mtia() const noexcept {
+    return type_ == DeviceType::MTIA;
+  }
+
+  /// Return true if the device is of HPU type.
+  bool is_hpu() const noexcept {
+    return type_ == DeviceType::HPU;
+  }
+
+  /// Return true if the device is of Lazy type.
+  bool is_lazy() const noexcept {
+    return type_ == DeviceType::Lazy;
+  }
+
+  /// Return true if the device is of Vulkan type.
+  bool is_vulkan() const noexcept {
+    return type_ == DeviceType::Vulkan;
+  }
+
+  /// Return true if the device is of Metal type.
+  bool is_metal() const noexcept {
+    return type_ == DeviceType::Metal;
+  }
+
+  /// Return true if the device is of MAIA type.
+  bool is_maia() const noexcept {
+    return type_ == DeviceType::MAIA;
+  }
+
+  /// Return true if the device is of Meta type.
+  bool is_meta() const noexcept {
+    return type_ == DeviceType::Meta;
+  }
+
+  /// Return true if the device is of CPU type.
+  bool is_cpu() const noexcept {
+    return type_ == DeviceType::CPU;
+  }
+
+  /// Return true if the device supports arbitrary strides (every type except IPU, XLA and Lazy).
+  bool supports_as_strided() const noexcept {
+    return type_ != DeviceType::IPU && type_ != DeviceType::XLA &&
+        type_ != DeviceType::Lazy;
+  }
+
+  /// Same string as returned from operator<<.
+  std::string str() const;
+
+ private:
+  DeviceType type_;
+  DeviceIndex index_ = -1;
+  void validate() { // called from the (DeviceType, DeviceIndex) constructor only
+    // Removing these checks in release builds noticeably improves
+    // performance in micro-benchmarks.
+    // This is safe to do, because backends that use the DeviceIndex
+    // have a later check when we actually try to switch to that device.
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        index_ >= -1,
+        "Device index must be -1 or non-negative, got ",
+        static_cast<int>(index_));
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        !is_cpu() || index_ <= 0,
+        "CPU device index must be -1 or zero, got ",
+        static_cast<int>(index_));
+  }
+};
+
+C10_API std::ostream& operator<<(std::ostream& stream, const Device& device);
+
+} // namespace c10
+
+namespace std {
+template <>
+struct hash<c10::Device> {
+  size_t operator()(c10::Device d) const noexcept {
+    // The packing below assumes both fields are exactly one byte wide; if
+    // either assert fires, revisit the casts and the shift together.
+    static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
+    // Note [Hazard when concatenating signed integers]
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Each field is first converted to a same-sized unsigned type and only
+    // then widened to the result type. Widening a signed -1 directly would
+    // sign-extend it and clobber every bit in the MSB half of the result.
+    //
+    // By the C/C++ integer promotion rules a single uint32_t cast would be
+    // enough; both are spelled out for explicitness's sake.
+    const auto type_bits =
+        static_cast<uint32_t>(static_cast<uint8_t>(d.type()));
+    const auto index_bits =
+        static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
+    const uint32_t bits = (type_bits << 16) | index_bits;
+    return std::hash<uint32_t>{}(bits);
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2b179b4d2d82385aefe1f1b79cb2069120500d7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceArray.h
@@ -0,0 +1,54 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Exception.h>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace c10 {
+
+/// Owns a buffer large enough for `size` objects of trivial type `T`, obtained
+/// from the supplied allocator and held in a c10::DataPtr for the lifetime of
+/// this object.
+template <typename T>
+class DeviceArray {
+ public:
+  /// Requests `size * sizeof(T)` bytes from `allocator`.
+  /// Fails a TORCH_CHECK if that byte count would overflow size_t, and a
+  /// TORCH_INTERNAL_ASSERT if the returned pointer is not aligned for `T`.
+  DeviceArray(c10::Allocator& allocator, size_t size)
+      : data_ptr_(allocator.allocate(checked_nbytes(size))) {
+    static_assert(std::is_trivial_v<T>, "T must be a trivial type");
+    TORCH_INTERNAL_ASSERT(
+        0 == (reinterpret_cast<uintptr_t>(data_ptr_.get()) % alignof(T)),
+        "c10::DeviceArray: Allocated memory is not aligned for this data type");
+  }
+
+  /// Returns the start of the allocation, typed as `T*`.
+  T* get() {
+    return static_cast<T*>(data_ptr_.get());
+  }
+
+ private:
+  /// Converts an element count to a byte count, rejecting counts whose byte
+  /// size does not fit in size_t (the multiplication would otherwise wrap and
+  /// silently request a too-small buffer).
+  static size_t checked_nbytes(size_t size) {
+    TORCH_CHECK(
+        size <= SIZE_MAX / sizeof(T),
+        "c10::DeviceArray: element count ",
+        size,
+        " overflows the allocation size");
+    return size * sizeof(T);
+  }
+
+  c10::DataPtr data_ptr_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h
new file mode 100644
index 0000000000000000000000000000000000000000..85477281261bed35e2652ddc471c9bae4042707a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceCapability.h
@@ -0,0 +1,81 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/macros/Export.h>
+#include <cstdint>
+
+namespace c10 {
+
+constexpr size_t NUMBER_OF_DEVICE_CAPABILITIES = NumScalarTypes; // number of low bits set by DeviceCapability's default constructor
+
+// Expands to one 1-bit bitfield member `has_<n>` per scalar type (the first macro argument is ignored)
+#define DEFINE_SCALAR_TYPE(_1, n) unsigned int has_##n : 1;
+
+// Expands to one enumerator `kIndex_<name>` per scalar type (the first macro argument is ignored)
+#define DEFINE_SCALAR_ENUM(_1, name) kIndex_##name,
+
+enum ScalarTypeIndex { // enumerators follow the order of the AT_FORALL_... list, starting at 0
+  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_SCALAR_ENUM)
+};
+
+/**
+ * @brief DeviceCapability represents the common capabilities that all
+ * devices should support.
+ *
+ * This struct provides a compact way to represent the common capabilities that
+ * all devices should support. Includes the following capabilities:
+ * - Supported data types
+ *
+ * Purpose
+ * - Enable device-specific optimizations based on supported capabilities
+ *
+ * Contract
+ *
+ * Supported data types:
+ * - Each bitfield represents support for one device capability
+ * - Bit value 1 means the capability is supported, 0 means not supported
+ * - The struct is initialized with all capabilities enabled by default
+ *
+ * @note Adding New Capabilities
+ *
+ * 1. Define the new capability in the `DeviceCapability` struct
+ * 2. Update the support of the new capability in each accelerator
+ *    implementation
+ * 3. Add the new capability to the returned PyObject Dictionary
+ */
+struct C10_API DeviceCapability {
+  union { // two views of one storage: named 1-bit flags and a raw 64-bit word
+    struct {
+      AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_SCALAR_TYPE)
+    } supported_scalar_types;
+    uint64_t capability_bits; // Raw view of the same storage, for direct bit manipulation
+  } capability_data;
+
+  // Default constructor: sets the low NUMBER_OF_DEVICE_CAPABILITIES bits, i.e. every capability starts enabled.
+  DeviceCapability() {
+    capability_data.capability_bits =
+        ((1ULL << NUMBER_OF_DEVICE_CAPABILITIES) - 1); // NOTE(review): the shift is only well-defined while the count stays below 64
+  }
+
+  // Calls visitor(ScalarType::X) for every scalar type whose has_X flag is set; no container is allocated.
+  template <typename F>
+  void forEachSupportedScalarType(F&& visitor) const {
+#define VISIT_SCALAR_TYPE(_1, n)                        \
+  if (capability_data.supported_scalar_types.has_##n) { \
+    visitor(ScalarType::n);                             \
+  }
+
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(VISIT_SCALAR_TYPE)
+
+#undef VISIT_SCALAR_TYPE
+  }
+};
+
+#undef DEFINE_SCALAR_ENUM
+#undef DEFINE_SCALAR_TYPE
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..389ac29d10029d915279857f4fb4e2ffeb880307
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceGuard.h
@@ -0,0 +1,207 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/core/impl/VirtualGuardImpl.h>
+#include <c10/util/Optional.h>
+
+namespace c10 {
+
+/// RAII guard that sets a certain default device in its constructor, and
+/// changes it back to the device that was originally active upon destruction.
+///
+/// The device is always reset to the one that was active at the time of
+/// construction of the guard. Even if you `reset_device` after construction, the
+/// destructor will still reset the device to the one that was active at
+/// construction time.
+///
+/// This device guard does NOT have an uninitialized state; it is guaranteed
+/// to reset a device on exit.  If you are in a situation where you *might*
+/// want to setup a guard (i.e., are looking for the moral equivalent
+/// of std::optional<DeviceGuard>), see OptionalDeviceGuard.
+class DeviceGuard {
+ public:
+  /// No default constructor; see Note [Omitted default constructor from RAII]
+  explicit DeviceGuard() = delete;
+
+  /// Set the current device to the passed Device.
+  explicit DeviceGuard(Device device) : guard_(device) {}
+
+  /// This constructor is for testing only: it forwards an explicit guard impl.
+  explicit DeviceGuard(
+      Device device,
+      const impl::DeviceGuardImplInterface* impl)
+      : guard_(device, impl) {}
+
+  ~DeviceGuard() = default;
+
+  /// Copy is disallowed
+  DeviceGuard(const DeviceGuard&) = delete;
+  DeviceGuard& operator=(const DeviceGuard&) = delete;
+
+  /// Move is disallowed, as DeviceGuard does not have an uninitialized state,
+  /// which is required for moves on types with nontrivial destructors.
+  DeviceGuard(DeviceGuard&& other) = delete;
+  DeviceGuard& operator=(DeviceGuard&& other) = delete;
+
+  /// Sets the device to the given one.  The specified device must be consistent
+  /// with the device type originally specified during guard construction.
+  ///
+  /// TODO: The consistency check here is inconsistent with StreamGuard's
+  /// behavior with set_stream, where a stream on a different device than
+  /// the original one isn't an error; we just reset the stream and then
+  /// switch devices.
+  void reset_device(at::Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// This method is for testing only: it forwards an explicit guard impl.
+  void reset_device(
+      at::Device device,
+      const impl::DeviceGuardImplInterface* impl) {
+    guard_.reset_device(device, impl);
+  }
+
+  /// Sets the device index to the given one.  The device type is inferred
+  /// from the original device type the guard was constructed with.
+  void set_index(DeviceIndex index) {
+    guard_.set_index(index);
+  }
+
+  /// Returns the device that was set at the time the guard was constructed.
+  Device original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via reset_device / set_index.
+  Device current_device() const {
+    return guard_.current_device();
+  }
+
+ private:
+  impl::InlineDeviceGuard<impl::VirtualGuardImpl> guard_; // every method above forwards to this member
+};
+
+/**
+ * An OptionalDeviceGuard is an RAII class that sets a device to some value on
+ * initialization, and resets the device to its original value on destruction.
+ * Morally, an OptionalDeviceGuard is equivalent to std::optional<DeviceGuard>,
+ * but with extra constructors and methods as appropriate.
+ *
+ * Besides its obvious use (optionally applying a DeviceGuard),
+ * OptionalDeviceGuard is often also used for the following idiom:
+ *
+ *    OptionalDeviceGuard g;
+ *    for (const auto& t : tensors) {
+ *      g.reset_device(t.device());
+ *      do_something_with(t);
+ *    }
+ *
+ * This usage is marginally more efficient than constructing a DeviceGuard every
+ * iteration of the for loop, as it avoids an unnecessary device reset.
+ *
+ * Unlike DeviceGuard, an OptionalDeviceGuard may be uninitialized.  This occurs
+ * when you use the nullary constructor, or pass a nullopt to the constructor.
+ * Uninitialized OptionalDeviceGuards do *nothing*; they do not know what the
+ * original device was and they do not reset on destruction.  This is why
+ * original_device() and current_device() return std::optional<Device> rather
+ * than Device (as they do in DeviceGuard), and also is why we didn't just
+ * provide OptionalDeviceGuard by default and hide DeviceGuard from users.
+ *
+ * The semantics of an OptionalDeviceGuard are exactly explained by thinking
+ * of it as an std::optional<DeviceGuard>.  In particular, an initialized
+ * OptionalDeviceGuard doesn't restore device to its value at construction; it
+ * restores device to its value *at initialization*.  So if you have the
+ * program:
+ *
+ *     setDevice(1);
+ *     OptionalDeviceGuard g;
+ *     setDevice(2);
+ *     g.reset_device(Device(DeviceType::CUDA, 3));  // initializes!
+ *
+ * On destruction, g will reset device to 2, rather than 1.
+ *
+ * An uninitialized OptionalDeviceGuard is distinct from a (initialized)
+ * DeviceGuard whose original_device_ and current_device_ match, since the
+ * DeviceGuard will still reset the device to original_device_.
+ */
+class OptionalDeviceGuard {
+ public:
+  /// Create an uninitialized guard.  Set the guard later using reset_device.
+  explicit OptionalDeviceGuard() = default;
+
+  /// Initialize the guard, setting the current device to the passed Device.
+  explicit OptionalDeviceGuard(Device device) : guard_(device) {}
+
+  /// Initialize the guard if a Device is passed; otherwise leave the
+  /// guard uninitialized.
+  explicit OptionalDeviceGuard(std::optional<Device> device) : guard_(device) {}
+
+  /// Constructor for testing only: it forwards an explicit guard impl.
+  explicit OptionalDeviceGuard(
+      Device device,
+      const impl::DeviceGuardImplInterface* impl)
+      : guard_(device, impl) {}
+
+  ~OptionalDeviceGuard() = default;
+  /// Copy is disallowed
+  OptionalDeviceGuard(const OptionalDeviceGuard&) = delete;
+  OptionalDeviceGuard& operator=(const OptionalDeviceGuard&) = delete;
+
+  /// Move is disallowed
+  /// See Note [Explicit initialization of optional fields]
+  /// and Note [Move construction for RAII guards is tricky]
+  /// for rationale.
+  OptionalDeviceGuard(OptionalDeviceGuard&& other) = delete;
+  OptionalDeviceGuard& operator=(OptionalDeviceGuard&& other) = delete;
+
+  /// Sets the device to the given one.  The specified device must be consistent
+  /// with the device type originally specified during guard construction.
+  void reset_device(at::Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// For testing only: it forwards an explicit guard impl.
+  void reset_device(
+      at::Device device,
+      const impl::DeviceGuardImplInterface* impl) {
+    guard_.reset_device(device, impl);
+  }
+
+  /// Returns the device that was set at the time the guard was initialized.
+  std::optional<Device> original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via reset_device.
+  std::optional<Device> current_device() const {
+    return guard_.current_device();
+  }
+
+ private:
+  impl::InlineOptionalDeviceGuard<impl::VirtualGuardImpl> guard_; // every method above forwards to this member
+};
+
+// Note [Whither the DeviceGuard boilerplate]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Design note: in principle, we could avoid these wrappers using:
+//
+// using DeviceGuard = impl::InlineDeviceGuard<impl::VirtualGuardImpl>;
+// using OptionalDeviceGuard =
+// impl::InlineOptionalDeviceGuard<impl::VirtualGuardImpl>;
+//
+// But the error messages are worse, and our users can't just look at the
+// header file to find out what's going on.  Furthermore, for specializations
+// like CUDAStreamGuard, it can be profitable to replace some interfaces with
+// refined types (e.g., return CUDAStream instead of Stream).  So, we eat
+// the boilerplate and write out the API explicitly.
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h
new file mode 100644
index 0000000000000000000000000000000000000000..3847b5e2650e4100d19dc0031747769f709b92f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DeviceType.h
@@ -0,0 +1,35 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+
+// If you modified DeviceType in caffe2/proto/caffe2.proto, please also sync
+// your changes into torch/headeronly/core/DeviceType.h.
+#include <torch/headeronly/core/DeviceType.h>
+
+#include <ostream>
+#include <string>
+
+namespace c10 {
+
+C10_API std::string DeviceTypeName(DeviceType d, bool lower_case = false);
+
+C10_API bool isValidDeviceType(DeviceType d);
+
+C10_API std::ostream& operator<<(std::ostream& stream, DeviceType type);
+
+C10_API void register_privateuse1_backend(const std::string& backend_name);
+C10_API std::string get_privateuse1_backend(bool lower_case = true);
+
+C10_API bool is_privateuse1_backend_registered();
+
+} // namespace c10
+
+namespace torch {
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using c10::DeviceType;
+} // namespace torch
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h
new file mode 100644
index 0000000000000000000000000000000000000000..2aa647574ccbc1112d10a5558255d9a5b625a9b2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKey.h
@@ -0,0 +1,750 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Export.h>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <ostream>
+#include <string>
+
+namespace c10 {
+
+// Semantically, each value of BackendComponent identifies a "backend" for our
+// dispatch. Some functionalities that we may dispatch to are allowed to
+// register different handlers for each backend. The BackendComponent is then
+// used to figure out which backend implementation to dispatch to.
+
+// In implementation terms, the backend component identifies a specific "bit" in
+// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom
+// 16 "BackendComponent" bits, while the remaining upper bits are assigned to
+// functionalities. When we encounter a functionality bit that is known to be
+// customizable per-backend, then we also look at the lower BackendComponent
+// bits and take the highest bit to determine which backend's implementation to
+// use.
+
+// WARNING!  If you add a new backend component to the end of this list,
+// make sure you register it before Meta.
+// Meta must be at the end so that meta key in tls triggers meta kernels.
+// (But you shouldn't: private use keys should have higher precedence than all
+// built-in keys)
+
+// If you add a new (non-privateuse) backend here,
+// make sure to add an Autograd<Backend> fallthrough kernel
+// in aten/src/ATen/core/VariableFallbackKernel.cpp
+
+#define C10_FORALL_BACKEND_COMPONENTS(_, extra) \
+  _(CPU, extra)                                 \
+  _(CUDA, extra)                                \
+  _(HIP, extra)                                 \
+  _(XLA, extra)                                 \
+  _(MPS, extra)                                 \
+  _(IPU, extra)                                 \
+  _(XPU, extra)                                 \
+  _(HPU, extra)                                 \
+  _(VE, extra)                                  \
+  _(Lazy, extra)                                \
+  _(MTIA, extra)                                \
+  _(MAIA, extra)                                \
+  _(PrivateUse1, extra)                         \
+  _(PrivateUse2, extra)                         \
+  _(PrivateUse3, extra)                         \
+  _(Meta, extra)
+
+// WARNING!  If we add a new per-backend functionality key that has higher
+// priority than Autograd, then make sure you update EndOfRuntimeBackendKeys
+
+#define C10_FORALL_FUNCTIONALITY_KEYS(_) \
+  _(Dense, )                             \
+  _(Quantized, Quantized)                \
+  _(Sparse, Sparse)                      \
+  _(SparseCsr, SparseCsr)                \
+  _(NestedTensor, NestedTensor)          \
+  _(AutogradFunctionality, Autograd)
+
+enum class BackendComponent : uint8_t {
+
+  // A "backend" is colloquially used to refer to handlers for dispatch
+  // which actually implement the numerics of an operation in question.
+  //
+  // Due to the nature of the enum, these backends are specified in
+  // an ordered way, but for most backends this order is not semantically
+  // meaningful (e.g., it's valid to reorder these backends without changing
+  // semantics).  The only situation when backend ordering is meaningful
+  // is when the backend participates in multiple dispatch with another
+  // backend; e.g., CPU and CUDA (cuda must have higher priority).
+
+  // These keys don't correspond to individual kernels.
+  // Instead, they represent the backends that are allowed to override specific
+  // pieces of functionality:
+  // - dense kernels (e.g. DispatchKey::CPU)
+  // - sparse kernels (e.g. DispatchKey::SparseCPU)
+  // - quantized kernels (e.g. DispatchKey::QuantizedCPU)
+  // - autograd kernels (e.g. DispatchKey::AutogradCPU)
+  // We reserve space in the runtime operator table for this full cross product
+  // of
+  // [backends in this enum] x [keys below that are explicitly marked as having
+  // per-backend functionality]
+  //
+  // A meta tensor is a tensor without any data associated with it.  (They
+  // have also colloquially been referred to as tensors on the "null" device).
+  // A meta tensor can be used to dry run operators without actually doing any
+  // computation, e.g., add on two meta tensors would give you another meta
+  // tensor with the output shape and dtype, but wouldn't actually add anything.
+
+  InvalidBit = 0, // value 0 is reserved, so the generated <Backend>Bit enumerators start at 1
+#define DEFINE_BACKEND_COMPONENT(n, _) n##Bit,
+  C10_FORALL_BACKEND_COMPONENTS(DEFINE_BACKEND_COMPONENT, unused)
+#undef DEFINE_BACKEND_COMPONENT
+
+  // Define an alias to represent end of backend dispatch keys.
+  // If you add new backend keys after PrivateUse3, please also update it here.
+  EndOfBackendKeys = MetaBit, // alias of the last entry generated from C10_FORALL_BACKEND_COMPONENTS
+};
+
+// Semantically, a dispatch key identifies a possible "level" in our
+// dispatch, for which a handler may be registered. Each handler corresponds
+// to a type of functionality.
+//
+// In implementation terms, the dispatch key identifies a specific "bit" in a
+// DispatchKeySet.  Higher bit indexes get handled by dispatching first (because
+// we "count leading zeros" when we extract the highest priority dispatch
+// key.)
+//
+// Note [DispatchKey Classification]
+// This enum actually contains several types of keys, which are explained
+// in more detail further down:
+// (1) non-customizable backends (e.g. FPGA)
+// (2) non-customizable functionalities (e.g. Functionalize)
+// (3) functionalities that are customizable per backend (e.g. Dense, Sparse,
+// AutogradFunctionality) (4) per-backend instances of customizable
+// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g.
+// CompositeImplicitAutograd)
+//
+// Of the categories above, it's important to note:
+// (a) which keys are assigned individual bits in a DispatchKeySet
+// (b) which keys are assigned individual slots in the runtime operator table
+// ("Runtime keys")
+//
+// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet.
+// (1), (2) and (4) all get their own dedicated slots in the runtime operator
+// table.
+
+// See Note [DispatchKeySet Internal Representation] for more details.
+//
+// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py
+enum class DispatchKey : uint16_t {
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+  // This is not a "real" functionality, but it exists to give us a "nullopt"
+  // element we can return for cases when a DispatchKeySet contains no elements.
+  // You can think a more semantically accurate definition of DispatchKey is:
+  //
+  //    using DispatchKey = std::optional<RealDispatchKey>
+  //
+  // and Undefined == nullopt.  We didn't actually represent
+  // it this way because std::optional<RealDispatchKey> would take two
+  // words, when DispatchKey fits in sixteen bits.
+
+  Undefined = 0,
+
+  // Define an alias for Undefined to represent CatchAll (long term
+  // this will get eliminated, but for now it's convenient)
+  CatchAll = Undefined,
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ //
+  // Every value in the enum (up to EndOfFunctionalityKeys)
+  // corresponds to an individual "functionality" that can be dispatched to.
+  // This is represented in the DispatchKeySet by assigning each of these enum
+  // values
+  // to each of the remaining (64 - len(BackendComponent)) bits.
+  //
+  // Most of these functionalities have a single handler assigned to them,
+  // making them "runtime keys"
+  // that map to a single slot in the runtime operator table.
+  //
+  // A few functionalities are allowed to be customizable per backend.
+  // See [Note: Per-Backend Functionality Dispatch Keys] for details.
+
+  // See [Note: Per-Backend Functionality Dispatch Keys]
+  Dense,
+
+  // Below are non-extensible backends.
+  // These are backends that currently don't have their own overrides for
+  // Autograd/Sparse/Quantized kernels,
+  // and we therefore don't waste space in the runtime operator table allocating
+  // space for them.
+  // If any of these backends ever need to customize, e.g., Autograd, then we'll
+  // need to add a DispatchKey::*Bit for them.
+
+  // TODO: put this in BackendComponents
+  FPGA, // Xilinx support lives out of tree at
+  // https://gitlab.com/pytorch-complex/vitis_kernels
+
+  Vulkan, // TODO: put this in BackendComponents
+  Metal, // TODO: put this in BackendComponents
+
+  // See [Note: Per-Backend Functionality Dispatch Keys]
+  Quantized,
+
+  // This backend is to support custom RNGs; it lets you go
+  // to a different kernel if you pass in a generator that is not a
+  // traditional CPUGeneratorImpl/CUDAGeneratorImpl.  To make use of this
+  // key:
+  //  1) set it as a second parameter of at::Generator constructor call in
+  //     the user-defined PRNG class.
+  //  2) use it as a dispatch key while registering custom kernels
+  //     (templatized kernels specialized for user-defined PRNG class)
+  // intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp
+  CustomRNGKeyId,
+
+  // TODO: Make Mkldnn a functionality key, so we can give it Meta
+  // support
+  // Here are backends which specify more specialized operators
+  // based on the layout of the tensor.  Note that the sparse backends
+  // are one case where ordering matters: sparse multi-dispatches with
+  // the corresponding dense tensors, and must be handled before them.
+  MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp
+  // NB: not to be confused with MKLDNN, which is Caffe2 only
+
+  // See [Note: Per-Backend Functionality Dispatch Keys]
+  Sparse,
+
+  SparseCsr,
+
+  NestedTensor,
+
+  // In some situations, it is not immediately obvious what the correct
+  // backend for function is, because the function in question doesn't
+  // have any "tensor" arguments.  In this case, a BackendSelect function
+  // can be registered to implement the custom determination of the
+  // correct backend.
+  BackendSelect,
+
+  Python,
+
+  // Out-of-core key for Fake Tensor in torchdistx.
+  // See https://pytorch.org/torchdistx/latest/fake_tensor.html
+  // TODO: delete this in favor of Python-implemented fake tensor
+  Fake,
+  // See Note [Out-of-tree vmap+grad prototype]. The purpose of this key
+  // is to insert code after the "autograd subsystem" runs, so this key should
+  // be directly after ADInplaceOrView and all of the autograd keys.
+  FuncTorchDynamicLayerBackMode,
+
+  // Alias and mutation removal.
+  // If some backends want to opt into only alias removal or only mutation
+  // removal,
+  // we can consider adding separate keys dedicated to those individual passes.
+  // See Note [Functionalization Pass In Core] for details.
+  Functionalize,
+
+  // The named dispatch key is set for any tensors with named dimensions.
+  // Although we have a dispatch key for named tensors, for historical reasons,
+  // this dispatch key doesn't do any of the substantive functionality for named
+  // tensor (though, hypothetically, it could!)  At the moment, it's just
+  // responsible for letting us give good error messages when operations
+  // don't support named tensors.
+  //
+  // NB: If you ever consider moving named tensor functionality into
+  // this dispatch key, note that it might be necessary to add another dispatch
+  // key that triggers before composite operators, in case a composite operator
+  // has named dimension propagation that doesn't match that of its
+  // constituent parts.
+  // TODO: delete this once torchdim lands in functorch
+  Named,
+
+  // The Conjugate dispatch key is set for any tensors that need to perform
+  // conjugation
+  // This is implemented at a dispatch level right before any backends run
+  Conjugate,
+
+  // The Negative dispatch key is set for any tensors that need to perform
+  // negation
+  // This is implemented at a dispatch level right before any backends run
+  Negative,
+
+  ZeroTensor, // registered at build/aten/src/ATen/RegisterZeroTensor.cpp
+
+  // Note [ADInplaceOrView key]
+  // ADInplaceOrView key is used by inplace or view ops to register a kernel
+  // that does additional setup for future autograd computation.
+  //
+  // 1. For inplace ops this kernel does version bump
+  // 2. For view ops this kernel does `as_view` setup where we properly setup
+  //    DifferentiableViewMeta on the view tensors.
+  //
+  // For other ops it's fallthrough kernel since there's no extra
+  // work to do.
+  //
+  // Note [Dream: skip VariableType kernel when requires_grad=false]
+  //
+  // In an ideal world where we can skip VariableType kernel for inputs
+  // with requires_grad=false, instead of a fallthrough kernel, we'll
+  // register a kernel shown below to all functional ops as well:
+  // torch::Tensor my_functional_op(...) {
+  //   {
+  //     // Note for every op in VariableType, you need to go through
+  //     // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the
+  //     // key to TLS excluded set. If you don't go through it at all,
+  //     // inplace/view ops called through `at::` inside your backend
+  //     // kernel will dispatch to ADInplaceOrView kernels and do a lot
+  //     // of extra work.
+  //     at::AutoDispatchBelowADInplaceOrView guard;
+  //     at::redispatch::my_functional_op(...);
+  //   }
+  // }
+  // But this work is currently blocked since it adds an extra dispatch
+  // for all ops and it's non-trivial overhead at model level(a few percents).
+  // Thus our current approach takes advantage of the fact every kernel goes
+  // through VariableType kernel first and pulls the
+  // `at::AutoDispatchBelowADInplaceOrView` guard of functional ops
+  // up to the `VariableType` kernel. Thus we only add the extra dispatch
+  // to view/inplace ops to minimize its perf impact to real models.
+  ADInplaceOrView,
+  // Note [Alias Dispatch Key : Autograd]
+  // All backends are oblivious to autograd; autograd is handled as a
+  // layer which happens on top of all backends. It inspects the autograd
+  // metadata of all inputs, determines what autograd metadata should be
+  // constructed by the output, and otherwise defers to the backend to
+  // actually do the numeric computation.  Autograd contains
+  // the bulk of this logic.
+
+  // Autograd is now an alias dispatch key which by default maps to all
+  // backend-specific autograd keys.
+  // Backend-specific autograd keys allow backends to override the default
+  // kernel registered to the Autograd key as needed.
+  // For example, XLA wants to define autograd for einsum directly.
+  // Registering a custom autograd implementation at the XLA key won't work
+  // because we process Autograd before XLA.  This key has higher priority and
+  // gets processed first.  You generally should NOT redispatch after handling
+  // autograd here (since that would result in execution of the Autograd
+  // operator, which you're trying to skip).  In AutogradXLA implementations,
+  // you are responsible for handling autograd yourself, or deferring to other
+  // operators which support autograd.
+
+  // Currently we only have backend-specific autograd keys for CPU/CUDA/XLA and
+  // reserved user-defined backends. All other in-tree backends share the
+  // AutogradOther key. We can add specific autograd key for those backends
+  // upon request.
+  AutogradOther,
+
+  // See [Note: Per-Backend Functionality Dispatch Keys]
+  AutogradFunctionality,
+
+  // NestedTensor is an example of something that isn't a "real backend"
+  // (because it mostly consists of redispatching kernels)
+  // but it would like to override autograd functionality in C++.
+  // We can handle cases like this by adding an extra functionality key
+  // exclusively for handling autograd for NestedTensor.
+  // lives out of tree at
+  // https://github.com/pytorch/nestedtensor
+  AutogradNestedTensor,
+
+  Tracer,
+
+  // TODO: make Autocast a functionality key
+  // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed
+  // and inputs are saved for backward in the post-autocast type.
+  AutocastCPU,
+  AutocastMTIA,
+  AutocastMAIA,
+  AutocastXPU,
+  AutocastIPU,
+  AutocastHPU,
+  AutocastXLA,
+  // AutocastXLA is only being used for TPUs. XLA GPUs continue to use
+  // AutocastCUDA.
+  AutocastMPS,
+  AutocastCUDA,
+  AutocastPrivateUse1,
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+  // There are a number of alternative modes which may want to handle before
+  // autograd; for example, error checking, tracing, profiling or vmap.  They
+  // go here.
+
+  FuncTorchBatched, // See Note [Out-of-tree vmap+grad prototype]
+
+  // Dispatch key for BatchedTensorImpl wrapping a nested tensor.
+  BatchedNestedTensor,
+
+  FuncTorchVmapMode, // See Note [Out-of-tree vmap+grad prototype]
+
+  // This is the dispatch key for BatchedTensorImpl, which is used to implement
+  // batching rules for vmap.
+  Batched,
+
+  // When we are inside a vmap, all tensors dispatch on this key.
+  // See Note: [DispatchKey::VmapMode usage] for more details.
+  VmapMode,
+
+  FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype]
+
+  // Out-of-core key for Deferred Module Initialization in torchdistx.
+  // See https://pytorch.org/torchdistx/latest/deferred_init.html
+  DeferredInit,
+
+  // Used by Python key logic to know the set of tls on entry to the dispatcher
+  // This kernel assumes it is the top-most non-functorch-related DispatchKey.
+  // If you add a key above, make sure to update the fallback implementation for
+  // this.
+  PythonTLSSnapshot,
+
+  // This key should be at the very top of the dispatcher
+  FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype]
+
+  // TESTING: This is intended to be a generic testing tensor type id.
+  // Don't use it for anything real; its only acceptable use is within a single
+  // process test.  Use it by creating a TensorImpl with this DispatchKey, and
+  // then registering operators to operate on this type id.  See
+  // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example.
+  TESTING_ONLY_GenericWrapper,
+
+  // TESTING: This is intended to be a generic testing tensor type id.
+  // Don't use it for anything real; its only acceptable use is within a single
+  // process test.  Use it by toggling the mode on and off via
+  // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators
+  // to operate on this type id.  See
+  // aten/src/ATen/core/dispatch/backend_fallback_test.cpp
+  // for a usage example
+  TESTING_ONLY_GenericMode,
+
+  // This key is used for pre-dispatch tracing in make_fx.
+  // It has lower priority than the PythonDispatcher key
+  // because we use the PythonDispatcher to intercept the key from python,
+  // and avoid having to implement it in C++.
+  PreDispatch,
+
+  // This is a bypass that allows you to skip running the C++ dispatcher
+  // entirely
+  PythonDispatcher,
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+  EndOfFunctionalityKeys, // End of functionality keys.
+
+// ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ //
+// Here are backends which you think of as traditionally specifying
+// how to implement operations on some device.
+
+#define DEFINE_PER_BACKEND_KEYS_FOR_BACKEND(n, prefix) prefix##n,
+
+#define DEFINE_PER_BACKEND_KEYS(fullname, prefix)      \
+  StartOf##fullname##Backends,                         \
+      C10_FORALL_BACKEND_COMPONENTS(                   \
+          DEFINE_PER_BACKEND_KEYS_FOR_BACKEND, prefix) \
+          EndOf##fullname##Backends = prefix##Meta,
+
+  C10_FORALL_FUNCTIONALITY_KEYS(DEFINE_PER_BACKEND_KEYS)
+
+#undef DEFINE_PER_BACKEND_KEYS
+#undef DEFINE_PER_BACKEND_KEYS_FOR_BACKEND
+
+      EndOfRuntimeBackendKeys = EndOfAutogradFunctionalityBackends,
+
+  // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+  // Note [Alias Dispatch Keys]
+  // Alias dispatch keys are synthetic dispatch keys which map to multiple
+  // runtime dispatch keys. Alias keys have precedence, but they are always
+  // lower precedence than runtime keys. You can register a kernel to an
+  // alias key, the kernel might be populated to the mapped runtime keys
+  // during dispatch table computation.
+  // If a runtime dispatch key has multiple kernels from alias keys, which
+  // kernel wins is done based on the precedence of alias keys (but runtime
+  // keys always have precedence over alias keys).
+  // Alias keys won't be directly called during runtime.
+
+  // See Note [Alias Dispatch Key : Autograd]
+  Autograd,
+  CompositeImplicitAutograd, // registered at
+  // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp
+
+  // Note: The alias keyset for FuncTorchBatchedDecomposition is disjoint from
+  // all
+  // other alias keysets
+  // and so precedence order doesn't matter
+  FuncTorchBatchedDecomposition, // registered at
+  // build/aten/src/ATen/RegisterFuncTorchBatchedDecomposition.cpp
+  // Note: The alias keyset for CompositeImplicitAutogradNestedTensor is
+  // disjoint from all other alias keysets
+  CompositeImplicitAutogradNestedTensor, // registered at
+  // build/aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp
+  CompositeExplicitAutograd, // registered at
+  // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp
+  // See Note [CompositeExplicitAutogradNonFunctional Key]
+  CompositeExplicitAutogradNonFunctional, // registered at
+  // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp
+
+  // Define an alias key to represent end of alias dispatch keys.
+  // If you add new alias keys after Autograd, please also update it here.
+  StartOfAliasKeys = Autograd,
+  EndOfAliasKeys = CompositeExplicitAutogradNonFunctional, //
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+  // The aliases exist for backwards compatibility reasons, they shouldn't
+  // be used
+  CPUTensorId = CPU,
+  CUDATensorId = CUDA,
+  DefaultBackend = CompositeExplicitAutograd,
+  PrivateUse1_PreAutograd = AutogradPrivateUse1,
+  PrivateUse2_PreAutograd = AutogradPrivateUse2,
+  PrivateUse3_PreAutograd = AutogradPrivateUse3,
+  Autocast = AutocastCUDA,
+};
+
+// Note [Private use DispatchKey]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Private use tensor IDs are preallocated tensor type IDs for use in user
+// applications.  Similar to private use fields in HTTP, they can be used
+// by end users for experimental or private applications, without needing
+// to "standardize" the tensor ID (which would be done by submitting a PR
+// to PyTorch to add your type ID).
+//
+// Private use tensor IDs are appropriate to use if you want to experiment
+// with adding a new tensor type (without having to patch PyTorch first) or
+// have a private, non-distributed application that needs to make use of a
+// new tensor type.  Private use tensor IDs are NOT appropriate to use for
+// libraries intended to be distributed to further users: please contact
+// the PyTorch developers to get a type ID registered in this case.
+//
+// We provide two classes of private user tensor id: regular DispatchKeys
+// and Autograd DispatchKeys.  DispatchKeys serve the role of ordinary "backend"
+// DispatchKeys; if you were adding support for a new type of accelerator, you
+// would use a backend DispatchKey, and ideally automatically reuse
+// AutogradOther definitions already defined in PyTorch.  AutogradPrivateUse
+// DispatchKeys serve as "wrapper" DispatchKeys: they are only necessary for
+// tensors that compose multiple internal tensors, and for cases when the
+// built-in autograd formulas for operators are not appropriate.
+
+// Compile-time guard: the numeric values of BackendComponent::EndOfBackendKeys
+// and DispatchKey::EndOfFunctionalityKeys must sum to at most 64, matching the
+// 64-bit bitmask named in the message below.
+// NOTE(review): the condition accepts a sum of exactly 64, while the message
+// says "less than 64" -- confirm which bound is intended.
+static_assert(
+    (static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) +
+     static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys)) <= 64,
+    "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)"
+    " both map to backend and functionality bits"
+    " into a 64-bit bitmask; you must have less than 64 total entries between them");
+
+// Reports whether `k` is an alias dispatch key, i.e. whether it falls inside
+// the inclusive enum range [StartOfAliasKeys, EndOfAliasKeys].
+constexpr bool isAliasDispatchKey(DispatchKey k) {
+  return DispatchKey::StartOfAliasKeys <= k &&
+      k <= DispatchKey::EndOfAliasKeys;
+}
+
+// [Note: Per-Backend Functionality Dispatch Keys]
+// Check if a DispatchKey is a per-backend functionality key
+// Any functionalities that can be customized per-backend should be added here.
+// These keys correspond to functionalities that can be customized individually
+// per backend. While they only take up one bit in the `DispatchKeySet` bitset,
+// they map to (# backends) slots in the operator table.
+// Each of these keys also has a separate set of "runtime keys" in the dispatch
+// key enum, per backend, which *do* map to the individual operator table slots.
+// For example, the "Sparse" key maps to an individual bit in the
+// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual
+// slots in the runtime operator table.
+
+// True exactly for the functionality keys that are customizable per backend:
+// Dense, Quantized, Sparse, SparseCsr, AutogradFunctionality and NestedTensor.
+constexpr bool isPerBackendFunctionalityKey(DispatchKey k) {
+  return k == DispatchKey::Dense || k == DispatchKey::Quantized ||
+      k == DispatchKey::Sparse || k == DispatchKey::SparseCsr ||
+      k == DispatchKey::AutogradFunctionality ||
+      k == DispatchKey::NestedTensor;
+}
+
+// Count of functionality keys, taken as the numeric value of
+// DispatchKey::EndOfFunctionalityKeys.
+// Note that this includes Undefined in the total count.
+// BUT EndOfFunctionalityKeys is its own (placeholder) key.
+// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3.
+// In the above example, there are 3 total functionality keys.
+constexpr uint8_t num_functionality_keys =
+    static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys);
+
+// Backend count used by the definitions below: the numeric value of
+// BackendComponent::EndOfBackendKeys.
+constexpr uint8_t num_backends =
+    static_cast<uint8_t>(BackendComponent::EndOfBackendKeys);
+
+// Note [No More Than 16 Backends]
+// Search for this note to find places in the code where the "no more than 16
+// backends" invariant is baked in.
+static_assert(
+    static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) <= 16,
+    "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \
+there are a few places where this invariant is baked in");
+
+// Counts how many DispatchKey values in [0, num_functionality_keys] satisfy
+// isPerBackendFunctionalityKey(). Usable in constant expressions.
+constexpr uint8_t numPerBackendFunctionalityKeys() {
+  uint8_t total = 0;
+  // The upper bound is inclusive, so EndOfFunctionalityKeys is examined too.
+  for (uint8_t idx = 0; idx <= num_functionality_keys; ++idx) {
+    const auto key = static_cast<DispatchKey>(idx);
+    if (isPerBackendFunctionalityKey(key)) {
+      ++total;
+    }
+  }
+  return total;
+}
+
+// Number of runtime entries. The value depends on whether
+// C10_MOBILE_TRIM_DISPATCH_KEYS is defined at build time.
+#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
+// See [Note: Trimmed Mobile Dispatch Keys]
+constexpr uint16_t num_runtime_entries = 8;
+#else
+// One entry per functionality key, plus (num_backends - 1) further entries
+// for every per-backend functionality key.
+constexpr uint16_t num_runtime_entries = num_functionality_keys +
+    (numPerBackendFunctionalityKeys() * (num_backends - 1));
+#endif
+
+// See Note [No More Than 16 Backends]
+// Bitmask with the low `num_backends` bits set.
+constexpr uint16_t full_backend_mask =
+    (static_cast<uint16_t>(1) << num_backends) - 1;
+
+C10_API const char* toString(DispatchKey /*t*/);
+C10_API const char* toString(BackendComponent /*t*/);
+C10_API std::ostream& operator<<(std::ostream& /*str*/, DispatchKey /*rhs*/);
+C10_API std::ostream& operator<<(
+    std::ostream& /*str*/,
+    BackendComponent /*rhs*/);
+
+C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k);
+
+// Parses a string into a dispatch key.
+// If the string cannot be correctly parsed, throws an exception.
+C10_API c10::DispatchKey parseDispatchKey(const std::string& k);
+
+// These are some convenience identifiers for dispatch keys which are
+// shorter to type than their long counterparts.  Note that some of these
+// dispatch keys directly correspond to DeviceType; and most APIs that
+// accept DispatchKey also accept DeviceType; e.g.,
+// torch::dispatch(torch::kCPU, ...) is also valid.
+// kAutograd is the short spelling of DispatchKey::Autograd.
+constexpr DispatchKey kAutograd = DispatchKey::Autograd;
+
+// Recovers the BackendComponent encoded in a per-backend runtime key.
+// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
+// Relies on the invariant that the dispatch keys between
+// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
+// in the same order as `BackendComponent`, so the backend is the offset of
+// `k` from the start of the range that contains it.
+// Keys outside every per-backend range yield BackendComponent::InvalidBit.
+constexpr BackendComponent toBackendComponent(DispatchKey k) {
+  // Narrowed once here; each range below subtracts its own start from it.
+  const auto raw = static_cast<uint8_t>(k);
+
+  if (DispatchKey::StartOfDenseBackends <= k &&
+      k <= DispatchKey::EndOfDenseBackends) {
+    return static_cast<BackendComponent>(
+        raw - static_cast<uint8_t>(DispatchKey::StartOfDenseBackends));
+  }
+  if (DispatchKey::StartOfQuantizedBackends <= k &&
+      k <= DispatchKey::EndOfQuantizedBackends) {
+    return static_cast<BackendComponent>(
+        raw - static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends));
+  }
+  if (DispatchKey::StartOfSparseBackends <= k &&
+      k <= DispatchKey::EndOfSparseBackends) {
+    return static_cast<BackendComponent>(
+        raw - static_cast<uint8_t>(DispatchKey::StartOfSparseBackends));
+  }
+  if (DispatchKey::StartOfSparseCsrBackends <= k &&
+      k <= DispatchKey::EndOfSparseCsrBackends) {
+    return static_cast<BackendComponent>(
+        raw - static_cast<uint8_t>(DispatchKey::StartOfSparseCsrBackends));
+  }
+  if (DispatchKey::StartOfNestedTensorBackends <= k &&
+      k <= DispatchKey::EndOfNestedTensorBackends) {
+    return static_cast<BackendComponent>(
+        raw - static_cast<uint8_t>(DispatchKey::StartOfNestedTensorBackends));
+  }
+  if (DispatchKey::StartOfAutogradFunctionalityBackends <= k &&
+      k <= DispatchKey::EndOfAutogradFunctionalityBackends) {
+    return static_cast<BackendComponent>(
+        raw -
+        static_cast<uint8_t>(
+            DispatchKey::StartOfAutogradFunctionalityBackends));
+  }
+  return BackendComponent::InvalidBit;
+}
+
+// Maps a dispatch key to its functionality key.
+// Keys up to and including EndOfFunctionalityKeys are returned unchanged.
+// Keys inside a per-backend range collapse to that range's functionality key.
+// Anything past EndOfAutogradFunctionalityBackends gives Undefined.
+// The checks are cumulative upper bounds, so their order must be kept.
+constexpr DispatchKey toFunctionalityKey(DispatchKey k) {
+  if (k <= DispatchKey::EndOfFunctionalityKeys) {
+    return k;
+  }
+  if (k <= DispatchKey::EndOfDenseBackends) {
+    return DispatchKey::Dense;
+  }
+  if (k <= DispatchKey::EndOfQuantizedBackends) {
+    return DispatchKey::Quantized;
+  }
+  if (k <= DispatchKey::EndOfSparseBackends) {
+    return DispatchKey::Sparse;
+  }
+  if (k <= DispatchKey::EndOfSparseCsrBackends) {
+    return DispatchKey::SparseCsr;
+  }
+  if (k <= DispatchKey::EndOfNestedTensorBackends) {
+    return DispatchKey::NestedTensor;
+  }
+  if (k <= DispatchKey::EndOfAutogradFunctionalityBackends) {
+    return DispatchKey::AutogradFunctionality;
+  }
+  return DispatchKey::Undefined;
+}
+
+BackendComponent toBackendComponent(DeviceType device_type);
+
+// Combines a per-backend functionality key with a backend to produce the
+// matching runtime key, e.g. (DispatchKey::Dense, BackendComponent::CUDABit)
+// gives DispatchKey::CUDA.
+// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
+// Relies on the invariant that the dispatch keys between
+// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
+// in the same order as `BackendComponent`: the result is the start of the
+// functionality's range plus the numeric value of `backend_k`.
+// Any other functionality key yields DispatchKey::Undefined.
+constexpr DispatchKey toRuntimePerBackendFunctionalityKey(
+    DispatchKey functionality_k,
+    BackendComponent backend_k) {
+  // Offset of the backend within whichever range is selected below.
+  const auto backend_offset = static_cast<uint8_t>(backend_k);
+
+  if (functionality_k == DispatchKey::Dense) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(DispatchKey::StartOfDenseBackends) +
+        backend_offset);
+  } else if (functionality_k == DispatchKey::Sparse) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(DispatchKey::StartOfSparseBackends) +
+        backend_offset);
+  } else if (functionality_k == DispatchKey::SparseCsr) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(DispatchKey::StartOfSparseCsrBackends) +
+        backend_offset);
+  } else if (functionality_k == DispatchKey::Quantized) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends) +
+        backend_offset);
+  } else if (functionality_k == DispatchKey::NestedTensor) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(DispatchKey::StartOfNestedTensorBackends) +
+        backend_offset);
+  } else if (functionality_k == DispatchKey::AutogradFunctionality) {
+    return static_cast<DispatchKey>(
+        static_cast<uint8_t>(
+            DispatchKey::StartOfAutogradFunctionalityBackends) +
+        backend_offset);
+  } else {
+    return DispatchKey::Undefined;
+  }
+}
+
+} // namespace c10
+
+namespace torch {
+// Expose the constant, but not the TYPE (DispatchKey is an implementation
+// detail!)
+// The using-declaration below makes c10::kAutograd reachable as
+// torch::kAutograd.
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using c10::kAutograd;
+} // namespace torch
+
+// NB: You really shouldn't use this specialization; this enum is guaranteed
+// to be pretty small so a regular array should be acceptable.
+namespace std {
+template <>
+struct hash<c10::DispatchKey> {
+  using result_type = size_t;
+  using argument_type = c10::DispatchKey;
+
+  // The hash of a key is its own numeric value.
+  result_type operator()(argument_type x) const {
+    return static_cast<result_type>(x);
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec3aff4e0c2295b2490cd29d30aa1117e6bb0441
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DispatchKeySet.h
@@ -0,0 +1,977 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/DispatchKey.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/llvmMathExtras.h>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <iterator>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+
+namespace c10 {
+
+struct FunctionalityOffsetAndMask {
+  // The default constructor exists only so that the array can be created
+  // before it is populated; it shouldn't be used otherwise.
+  FunctionalityOffsetAndMask() = default;
+  FunctionalityOffsetAndMask(uint16_t offset_in, uint16_t mask_in)
+      : offset(offset_in), mask(mask_in) {}
+  // This needs to be big enough to cover the size of the operator table.
+  uint16_t offset = 0;
+  // See Note [No More Than 16 Backends]
+  // This mask needs to be wide enough to cover all of the backend bits.
+  // We probably don't ever want to have more than 16 backend bits, so
+  // uint16_t should be enough.
+  uint16_t mask = 0;
+};
+static_assert(
+    c10::num_runtime_entries < 65536,
+    "The dispatcher currently only supports up to 2^16 runtime entries");
+
+C10_API std::array<FunctionalityOffsetAndMask, num_functionality_keys>
+initializeFunctionalityOffsetsAndMasks();
+
+C10_ALWAYS_INLINE static const std::
+    array<FunctionalityOffsetAndMask, num_functionality_keys>&
+    offsetsAndMasks() {
+  static auto table = initializeFunctionalityOffsetsAndMasks(); // built once
+  return table;
+}
+
+// A representation of a set of DispatchKeys. A DispatchKeySet contains both
+// "functionality" bits and "backend bits", and every tensor holds its own
+// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the
+// keyset on every input tensor, or’ing them together, and dispatching to a
+// specific piece of functionality. The functionality bits are *ordered*. When
+// multiple functionality bits are set, we use the highest priority
+// functionality. Similarly, multiple backend bits can theoretically be set if
+// you call an operator with multiple tensors from different devices (e.g. CPU
+// and CUDA), although support for mixed device dispatch is limited (the only
+// kernels that gracefully handle mixed device inputs for now are cuda kernels
+// that take in a scalar cpu tensor).
+
+// A representation of a set of DispatchKeys.  A tensor may have multiple
+// tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the
+// DispatchKeySet specifies what type ids apply.  The internal representation is
+// as a 64-bit bit set (this means only 64 tensor type ids are supported).
+//
+// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like
+// "what is the highest priority DispatchKey in the set"?  (The set itself is
+// not ordered; two sets with the same ids will always have the ids ordered in
+// the same way.)
+//
+// Note [DispatchKeySet Internal Representation]
+// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects
+// that get passed around at runtime.
+// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset
+// and individual dispatch keys.
+//
+// First: why do we have this distinction, and why not map every dispatch key
+// directly to a bit? This is mostly because we have several types of
+// functionalities that different backends would like to customize. For example,
+// we have:
+// - "Dense":     CPU, CUDA, XLA, ... (~12 keys)
+// - "Sparse":    SparseCPU, SparseCUDA, ...
+// - "SparseCsr": SparseCsrCPU, SparseCsrCUDA, ...
+// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ...
+// - "Autograd":  AutogradCPU, AutogradCUDA, Autograd XLA, ...
+// The problem is that total number of keys grows quadratically with [#
+// backends] x [# functionalities], making it very difficult to map each key
+// directly to a bit in a bitset without dramatically increasing the size of the
+// bitset over time.
+//
+// The two enums (BackendComponent and DispatchKey) can be divided roughly into
+// 3 categories.
+//
+// (1) "Building block" keys
+//    (a) backends: Everything in the BackendComponent enum (e.g. CPUBit,
+//    CUDABit) (b) functionalities: (per-backend) functionality-bit DispatchKeys
+//    (e.g. AutogradFunctionality, SparseCsr, Sparse, Dense)
+// (2) "Runtime" keys
+//    (a) "non-customizable backends" (e.g. FPGA)
+//    (b) "non-customizable functionalities" (e.g. Functionalize)
+//    (c) "per-backend instances of customizable functionalities" (e.g. CPU,
+//    SparseCPU, AutogradCPU)
+// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys])
+//
+// (1) Building block keys always correspond to individual bits in a
+// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual
+// runtime keys. e.g.
+//     auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit,
+//     DispatchKey::Dense});
+//     // The keyset has the runtime dense-cpu key.
+//     dense_cpu_ks.has(DispatchKey::CPU);
+//     // And it contains the building block keys too.
+//     dense_cpu_ks.has(DispatchKey::CPUBit);
+//     dense_cpu_ks.has(DispatchKey::Dense);
+//
+// Not every backend and not every functionality counts as a "building block
+// key". This is mostly to give us more levers to pull in the design space.
+// Backend keys and functionality keys that count as "building blocks" will
+// contribute to a full cross product of functionality that can be overridden.
+//
+// For example, right now we have at least 12 "backend" building
+// blocks (CPU, CUDA, XLA, ...) and at least 5 "functionality"
+// building blocks (Dense, Sparse, SparseCsr, Quantized,
+// AutogradFunctionality, ...). These keys together allow every
+// dispatcher operator to be customized in up to 12*5 different
+// ways. Each of those requires a slot in the operator table of every
+// dispatcher operator.  Not every piece of functionality necessarily
+// needs to be customizable per-backend, and not every backend
+// necessarily needs to be able to customize every type of
+// functionality.
+//
+//
+// (2) Every runtime key corresponds directly to a slot in an operator's runtime
+// dispatch table, and you can directly register kernels to a runtime dispatch
+// key.
+//
+// For per-backend functionalities like "Dense" or "AutogradFunctionality",
+// you can think of the corresponding runtime dispatch keys as "instances" of
+// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all
+// runtime instances of the "Dense" building block key.
+
+// (2a) and (2b) are represented identically in the DispatchKeySet logic:
+// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT
+// customizable per backend.
+//   In order to do so, we'd need to promote it to a per-backend functionality
+//   "building block" key.
+// - non-customizable backends (e.g. FPGA) can NOT customize existing
+// functionality like Sparse, Autograd, etc.
+//   In order to do so, we'd need to promote it to a backend "building block"
+//   key.
+//
+// In both cases, these keys directly correspond to runtime slots in the
+// operator table.
+//
+//
+// (3) "Alias" keys
+// See Note [Alias Dispatch Keys]
+//
+// Final note: for anyone making future changes to the Dispatcher +
+// DispatchKeySet internals, there's a closed PR with a basic
+// python-implementation of the Dispatcher that might be useful in quickly
+// testing out and validating changes. See it at
+// https://github.com/pytorch/pytorch/pull/68743
+
+// An undefined tensor is one with an empty tensor type set.
+class DispatchKeySet final {
+ public:
+  enum Full { FULL };
+  enum FullAfter { FULL_AFTER };
+  enum Raw { RAW };
+
+  // NB: default constructor representation as zero is MANDATORY as
+  // use of DispatchKeySet in TLS requires this.
+  constexpr DispatchKeySet() = default;
+
+  constexpr DispatchKeySet(Full /*unused*/)
+      : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {}
+
+  constexpr DispatchKeySet(FullAfter /*unused*/, DispatchKey t)
+      // Sets every bit below t's functionality bit (t itself is excluded).
+      // "functionalities" have a notion of ordering (e.g. Autograd > Sparse >
+      // Quantized > Dense). But backends don't really have an ordering.
+      // Therefore, we're enforcing that FullAfter can only be used on
+      // "functionality" keys.
+      : repr_(
+            (1ULL
+             << (num_backends + static_cast<uint8_t>(toFunctionalityKey(t)) -
+                 1)) -
+            1) {
+    *this = add(DispatchKey::PythonDispatcher); // always included as well
+  }
+
+  // Public version of DispatchKeySet(uint64_t) API; external users
+  // must be explicit when they do this!
+  constexpr DispatchKeySet(Raw /*unused*/, uint64_t x) : repr_(x) {}
+
+  constexpr explicit DispatchKeySet(BackendComponent k)
+      // InvalidBit has no bit of its own and yields the empty set; every
+      // other component occupies bit (value - 1) of the representation.
+      : repr_(
+            k == BackendComponent::InvalidBit
+                ? 0
+                : 1ULL << (static_cast<uint8_t>(k) - 1)) {}
+
+  constexpr explicit DispatchKeySet(DispatchKey k) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
+    if (k == DispatchKey::Undefined) {
+      // Case 1: handle Undefined specifically
+      repr_ = 0;
+    } else if (k <= DispatchKey::EndOfFunctionalityKeys) {
+      // Case 2: handle "functionality-only" keys
+      // These keys have a functionality bit set, but no backend bits
+      // These can technically be either:
+      // - valid runtime keys (e.g. DispatchKey::AutogradOther,
+      // DispatchKey::FuncTorchBatched, etc)
+      // - "building block" keys that aren't actual runtime keys (e.g.
+      // DispatchKey::Dense or Sparse)
+      uint64_t functionality_val = 1ULL
+          << (num_backends + static_cast<uint8_t>(k) - 1);
+      repr_ = functionality_val;
+    } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) {
+      // Case 3: "runtime" keys that have a functionality bit AND a backend bit.
+      // First compute which bit to flip for the functionality.
+      auto functionality_k = toFunctionalityKey(k);
+      // The - 1 is because Undefined is technically a "functionality" that
+      // doesn't show up in the bitset. So e.g. Dense is technically the second
+      // functionality, but the lowest functionality bit.
+      uint64_t functionality_val = 1ULL
+          << (num_backends + static_cast<uint8_t>(functionality_k) - 1);
+
+      // Then compute which bit to flip for the backend. This covers the
+      // runtime instances of "per-backend functionality" keys. For example,
+      // given DispatchKey::CPU, we should set:
+      // - the Dense functionality bit
+      // - the CPUBit backend bit
+      // If toBackendComponent(k) yields InvalidBit, no backend bit is set.
+      auto backend_k = toBackendComponent(k);
+      uint64_t backend_val = backend_k == BackendComponent::InvalidBit
+          ? 0
+          : 1ULL << (static_cast<uint8_t>(backend_k) - 1);
+      repr_ = functionality_val + backend_val;
+    } else {
+      // At this point, we should have covered every case except for alias keys.
+      // Technically it would be possible to add alias dispatch keys to a
+      // DispatchKeySet, but the semantics are a little confusing and this
+      // currently isn't needed anywhere.
+      repr_ = 0;
+    }
+  }
+
+  constexpr uint64_t keys_to_repr(std::initializer_list<DispatchKey> ks) {
+    uint64_t bits = 0; // union of the single-key representations
+    for (const auto* it = ks.begin(); it != ks.end(); ++it) {
+      bits |= DispatchKeySet(*it).repr_;
+    }
+    return bits;
+  }
+
+  constexpr uint64_t backend_bits_to_repr(
+      std::initializer_list<BackendComponent> ks) {
+    uint64_t bits = 0; // union of the individual backend bits
+    for (const auto* it = ks.begin(); it != ks.end(); ++it) {
+      bits |= DispatchKeySet(*it).repr_;
+    }
+    return bits;
+  }
+
+  explicit constexpr DispatchKeySet(std::initializer_list<DispatchKey> ks)
+      : repr_(keys_to_repr(ks)) {}
+
+  explicit constexpr DispatchKeySet(std::initializer_list<BackendComponent> ks)
+      // Note: for some reason, putting this logic directly in the constructor
+      // appears to fail to compile on CUDA 10.1.
+      // See an example internal failure at
+      // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr
+      : repr_(backend_bits_to_repr(ks)) {}
+
+  // Test if a DispatchKey is in the set
+  inline bool has(DispatchKey t) const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined);
+    return has_all(DispatchKeySet(t));
+  }
+  constexpr bool has_backend(BackendComponent t) const {
+    return has_all(DispatchKeySet(t));
+  }
+
+  // Test if every key of a DispatchKeySet is in the set.
+  // Given a DispatchKeySet of functionality keys and (potentially) backend
+  // keys, tests if all of them are in the current set.
+  constexpr bool has_all(DispatchKeySet ks) const {
+    return (ks.repr_ & ~repr_) == 0;
+  }
+
+  // Given a DispatchKeySet of functionality keys and (potentially) backend
+  // keys, tests if any of them are in the current set. This could technically
+  // be implemented with repeated calls to has(); it exists strictly as a perf
+  // optimization, since many places in the code base want to test for
+  // multiple functionality keys together. HOWEVER, runtime per-backend
+  // functionality keys aren't allowed to be used with this function, because
+  // you can end up with weird results. e.g.
+  // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU))
+  // would return true.
+  inline bool has_any(DispatchKeySet ks) const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        // Either there are no backend bits in the input keyset
+        ((ks.repr_ & full_backend_mask) == 0) ||
+        // or there are no per-backend-functionality bits
+        // See [Note: Per-Backend Functionality Dispatch Keys]
+        ((ks &
+          DispatchKeySet({
+                             DispatchKey::Dense,
+                             DispatchKey::Quantized,
+                             DispatchKey::Sparse,
+                             DispatchKey::SparseCsr,
+                             DispatchKey::AutogradFunctionality,
+                         })
+              .repr_) == 0));
+    return static_cast<bool>((repr_ & ks.repr_) != 0);
+  }
+  // Test if DispatchKeySet is a superset of ks (same check as has_all()).
+  bool isSupersetOf(DispatchKeySet ks) const {
+    return has_all(ks);
+  }
+  // Set union: every bit present in either operand.
+  constexpr DispatchKeySet operator|(DispatchKeySet other) const {
+    return DispatchKeySet(RAW, repr_ | other.repr_);
+  }
+  // Set intersection: only the bits present in both operands.
+  constexpr DispatchKeySet operator&(DispatchKeySet other) const {
+    return DispatchKeySet(RAW, repr_ & other.repr_);
+  }
+  // Set difference self - other, applied ONLY to the functionality keys:
+  // functionality bits found in other are cleared, while
+  // any backend bits set on self remain unchanged.
+  // See Note [Removing keys from DispatchKeySet Only Affects Functionality
+  // Keys]
+  constexpr DispatchKeySet operator-(DispatchKeySet other) const {
+    return DispatchKeySet(RAW, repr_ & (full_backend_mask | ~other.repr_));
+  }
+
+  // Symmetric difference: bits present in exactly one operand (self ^ other).
+  constexpr DispatchKeySet operator^(DispatchKeySet other) const {
+    return DispatchKeySet(RAW, repr_ ^ other.repr_);
+  }
+  bool operator==(DispatchKeySet other) const {
+    return (repr_ ^ other.repr_) == 0;
+  }
+  bool operator!=(DispatchKeySet other) const {
+    return !(*this == other);
+  }
+  // Return a copy of this set extended with the DispatchKey t. This does NOT
+  // mutate the receiver, so the returned DispatchKeySet must be used!
+  [[nodiscard]] constexpr DispatchKeySet add(DispatchKey t) const {
+    return add(DispatchKeySet(t));
+  }
+  [[nodiscard]] constexpr DispatchKeySet add(DispatchKeySet ks) const {
+    return DispatchKeySet(RAW, repr_ | ks.repr_);
+  }
+
+  // Remove a DispatchKey from the DispatchKey set.
+  // This is generally not an operation you should be doing
+  // (it's used to implement the printing overload, operator<<)
+  //
+  // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys]
+  // For now, we're only allowing removal of "functionality bits" from the
+  // keyset, which is specifically needed by the fallthrough key calculation
+  // logic. Why is removing backend bits problematic? Consider this example:
+  //
+  // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA,
+  // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA)
+  // DispatchKeySet([DispatchKey.CPU,
+  // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA)
+  //
+  // What do we want to happen?
+  // Technically, we'd like it to be true that after removal,
+  // the first keyset still has the CUDA dispatch key while the second doesn't.
+  // Unfortunately there's no way to represent that, because the two keysets are
+  // represented the same way internally:
+  //   functionality bits: Autograd, Dense
+  //   backend bits: CPU, CUDA
+  //
+  // Instead, remove(DispatchKey.AutogradCUDA) will only remove the "Autograd"
+  // bit from the bitset.
+  [[nodiscard]] constexpr DispatchKeySet remove(DispatchKey t) const {
+    const uint64_t doomed = DispatchKeySet(t).repr_ & ~full_backend_mask;
+    return DispatchKeySet(repr_ & ~doomed);
+  }
+  // Backend bits may be removed from a DispatchKeySet as well,
+  // but only explicitly: call remove_backend() for that
+  // rather than remove().
+  constexpr DispatchKeySet remove_backend(BackendComponent b) const {
+    return DispatchKeySet(RAW, repr_ & ~DispatchKeySet(b).repr_);
+  }
+  // Is the set empty?  (AKA undefined tensor)
+  bool empty() const {
+    return repr_ == 0;
+  }
+  uint64_t raw_repr() const {
+    return repr_;
+  }
+
+  static DispatchKeySet from_raw_repr(uint64_t x) {
+    return DispatchKeySet(RAW, x);
+  }
+
+  DispatchKey highestFunctionalityKey() const {
+    const auto top_bit = indexOfHighestBit();
+    // The first num_backends bits in the keyset don't correspond to real
+    // dispatch keys, so a top bit inside that range means that none of the
+    // functionality bits were set.
+    return top_bit < num_backends
+        ? DispatchKey::Undefined
+        : static_cast<DispatchKey>(top_bit - num_backends);
+  }
+
+  // This is similar to toBackendComponent(DispatchKey), but less restrictive.
+  // toBackendComponent() errors out if the key that it was passed has no
+  // backend bits, which is useful for error checking. We need a version of that
+  // here that can also handle "fake" backends like FPGA, because they need to
+  // map to the AutogradOther key. For those backends, we return
+  // BackendComponent::InvalidBit.
+  BackendComponent highestBackendKey() const {
+    // Drop the functionality bits so that only backend bits are inspected.
+    const uint64_t backend_bits = repr_ & full_backend_mask;
+    const auto top_bit = DispatchKeySet(backend_bits).indexOfHighestBit();
+    // All zeros across the backend bits means that
+    // no backend bits are set.
+    return top_bit == 0 ? BackendComponent::InvalidBit
+                        : static_cast<BackendComponent>(top_bit);
+  }
+
+  // Returns the DispatchKey of highest priority in the set.
+  DispatchKey highestPriorityTypeId() const {
+    const auto top_functionality = highestFunctionalityKey();
+    if (!isPerBackendFunctionalityKey(top_functionality)) {
+      return top_functionality;
+    }
+    return toRuntimePerBackendFunctionalityKey(
+        top_functionality, highestBackendKey());
+  }
+
+  // Returns the index of the most-significant bit in the keyset.
+  // This is used as part of the calculation into the operator table to get:
+  // - the highest "functionality" bit in the keyset.
+  // - the highest "backend" bit in the keyset.
+  uint8_t indexOfHighestBit() const {
+    return static_cast<uint8_t>(64 - llvm::countLeadingZeros(repr_));
+  }
+
+#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
+  // [Note: Trimmed Mobile Dispatch Keys]
+  /**
+   * The method below maps the dispatch key in the enum DispatchKey to an
+   * integer index in the dispatchTable_ array in OperatorEntry. The array
+   * is trimmed for mobile to reduce peak memory usage since it's
+   * unnecessary to reserve additional space for dispatch keys that will
+   * never be used on mobile.
+   */
+  int getDispatchTableIndexForDispatchKeySet() const {
+    auto dk = highestPriorityTypeId(); // only the top-priority key is mapped
+    switch (dk) {
+      case DispatchKey::Undefined:
+        return 0;
+      case DispatchKey::CPU:
+        return 1;
+      case DispatchKey::QuantizedCPU:
+        return 2;
+      case DispatchKey::SparseCPU:
+        return 3;
+      case DispatchKey::BackendSelect:
+        return 4;
+      case DispatchKey::ADInplaceOrView:
+        return 5;
+      case DispatchKey::AutogradOther:
+        return 6;
+      case DispatchKey::AutogradCPU:
+        return 7;
+      default:
+        return -1; // every other key: no index is assigned in this mapping
+    }
+  }
+#else
+  // Returns the index in the operator table of the highest priority key in
+  // the keyset. Note that we could in theory implement this using
+  // highestPriorityTypeId(), but this code is very hotpath and we can do it
+  // faster without it.
+  int getDispatchTableIndexForDispatchKeySet() const {
+    auto functionality_idx =
+        DispatchKeySet(repr_ >> num_backends).indexOfHighestBit();
+    auto offset_and_mask = offsetsAndMasks()[functionality_idx];
+    // Mask the functionality bits out first, then right-shift by 1.
+    // right-shifting by 1 because everything is zero-indexed.
+    // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should
+    // give us an offset of 1, etc.
+    auto backend_idx =
+        DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit();
+    return offset_and_mask.offset + backend_idx;
+  }
+#endif
+
+  // Returns the "index" of the highest priority backend in the keyset.
+  // This is pretty similar to highestBackendKey(), but:
+  // - It's hotpath code (part of the runtime bitset calculation)
+  // - It returns an integer index, not an enum value
+  // - Everything is shifted to the right by 1: BackendComponent::InvalidBit is
+  //   technically the lowest enum value, but it isn't included in the runtime
+  //   table. So CPUBit = 1, CUDABit = 2, etc.
+  uint64_t getBackendIndex() const {
+    const uint64_t shifted_backend_bits = (repr_ & full_backend_mask) >> 1;
+    return DispatchKeySet(shifted_backend_bits).indexOfHighestBit();
+  }
+
+ private:
+  constexpr DispatchKeySet(uint64_t repr) : repr_(repr) {}
+  uint64_t repr_ = 0;
+
+ public:
+  // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys
+  // in the set. The iterator is only invalidated by the destruction of the
+  // underlying DispatchKeySet as the iterator stores a pointer to the raw
+  // representation of the DispatchKeySet. Note: When we encounter a per-backend
+  // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend
+  // in the keyset, for that functionality. For example, if the next
+  // functionality key to iterate over is Autograd, and the backend bits in the
+  // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit],
+  // then the next two keys we return will be DispatchKey::AutogradCPU,
+  // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than
+  // CUDA in DispatchKey.h).
+  class iterator {
+   public:
+    using self_type = iterator;
+    using iterator_category = std::input_iterator_tag;
+    using value_type = DispatchKey;
+    using difference_type = ptrdiff_t;
+    using reference = value_type&;
+    using pointer = value_type*;
+    // final mask value should mask out the entire keyset
+    static const uint8_t end_iter_mask_val =
+        num_backends + num_functionality_keys;
+    // final key value should be the last DispatchKey
+    static const uint8_t end_iter_key_val = num_functionality_keys;
+
+    // current_dispatchkey_idx_ will iterate through all functionality bits.
+    // current_backendcomponent_idx_ will iterate through all backend bits.
+    explicit iterator(
+        const uint64_t* data_ptr,
+        uint8_t next_functionality = num_backends,
+        uint8_t next_backend = 0)
+        : data_ptr_(data_ptr),
+          next_functionality_(next_functionality),
+          next_backend_(next_backend),
+          // These are in an invalid state at construction time, and set by the
+          // first increment call
+          current_dispatchkey_idx_(end_iter_key_val),
+          current_backendcomponent_idx_(end_iter_key_val) {
+      // A starting position below num_backends is rejected by this assert.
+      TORCH_INTERNAL_ASSERT(
+          next_functionality_ >= num_backends,
+          "num_backends=",
+          static_cast<uint32_t>(num_backends),
+          ", next_functionality_=",
+          static_cast<uint32_t>(next_functionality_));
+      ++(*this); // go to the first key in the set
+    }
+
+    C10_API self_type& operator++();
+
+    self_type operator++(int) {
+      self_type previous_iterator = *this;
+      ++(*this);
+      return previous_iterator;
+    }
+
+    bool operator==(const self_type& rhs) const {
+      // Iterators are equal only when every cursor field matches; data_ptr_
+      // is not part of the comparison.
+      return next_functionality_ == rhs.next_functionality_ &&
+          current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ &&
+          next_backend_ == rhs.next_backend_ &&
+          current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_;
+    }
+    // Exact negation of operator==.
+    bool operator!=(const self_type& rhs) const {
+      return !(*this == rhs);
+    }
+    DispatchKey operator*() const {
+      const auto functionality_key =
+          static_cast<DispatchKey>(current_dispatchkey_idx_);
+      if (!isPerBackendFunctionalityKey(functionality_key)) {
+        // Not a per-backend functionality: the functionality key itself is
+        // what gets returned.
+        return functionality_key;
+      }
+      const auto backend_bit =
+          static_cast<BackendComponent>(current_backendcomponent_idx_);
+      const auto runtime_key =
+          toRuntimePerBackendFunctionalityKey(functionality_key, backend_bit);
+      // We expect all of the Dense, Sparse, Quantized, and Autograd keys to
+      // be ordered the same way with respect to their backends
+      TORCH_INTERNAL_ASSERT(
+          toBackendComponent(runtime_key) == backend_bit,
+          "Tried to map functionality key ",
+          toString(functionality_key),
+          " and backend bit ",
+          toString(backend_bit),
+          " to a runtime key, but ended up with ",
+          toString(runtime_key),
+          ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.",
+          " Please double check that enum for inconsistencies.");
+      return runtime_key;
+    }
+
+   private:
+    const uint64_t* data_ptr_;
+    uint8_t next_functionality_;
+    uint8_t next_backend_;
+    uint8_t current_dispatchkey_idx_;
+    uint8_t current_backendcomponent_idx_;
+  };
+
+ public:
+  // Returns iterator to the first key in the set. If no keys are in the
+  // set, then will return the end iterator.
+  iterator begin() const {
+    return iterator(&repr_);
+  }
+
+  // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat
+  // this as the end iterator.
+  iterator end() const {
+    return iterator(&repr_, iterator::end_iter_mask_val);
+  }
+};
+
+C10_API std::string toString(DispatchKeySet /*ts*/);
+C10_API std::ostream& operator<<(std::ostream& /*os*/, DispatchKeySet /*ts*/);
+
+inline int getDispatchTableIndexForDispatchKey(DispatchKey k) {
+  return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet();
+}
+
+// Alias key DispatchKey::Autograd maps to
+// (autograd_dispatch_keyset x full_backend_mask)
+// NB: keys in this set also get associated with CompositeImplicitAutograd
+//
+// Note [autograd_dispatch_keyset Does Not Include Backend Bits]
+// We don't want to include any backend bits (BackendComponent::CPUBit, etc)
+// directly in autograd_dispatch_keyset.
+// Why? keysets like autograd_dispatch_keyset are commonly used to remove
+// autograd keys from a DispatchKeySet throughout the code base. However, you
+// are only allowed to remove functionality bits from a keyset, not backend
+// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality
+// Keys] for details. To be consistent and avoid confusion, we're explicitly
+// setting up autograd_dispatch_keyset to not have any backend bits.
+constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
+    DispatchKey::AutogradFunctionality,
+    DispatchKey::AutogradOther,
+    DispatchKey::AutogradNestedTensor,
+});
+
+constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({
+    DispatchKey::AutocastCPU,
+    DispatchKey::AutocastMPS,
+    DispatchKey::AutocastCUDA,
+    DispatchKey::AutocastXPU,
+    DispatchKey::AutocastIPU,
+    DispatchKey::AutocastHPU,
+    DispatchKey::AutocastXLA,
+    DispatchKey::AutocastPrivateUse1,
+    DispatchKey::AutocastMTIA,
+    DispatchKey::AutocastMAIA,
+});
+
+// See Note [TLS Initialization]
+constexpr DispatchKeySet default_included_set = DispatchKeySet({
+    DispatchKey::BackendSelect,
+    DispatchKey::ADInplaceOrView,
+});
+
+constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
+    DispatchKey::AutocastCPU,
+    DispatchKey::AutocastMPS,
+    DispatchKey::AutocastCUDA,
+    DispatchKey::AutocastXPU,
+    DispatchKey::AutocastIPU,
+    DispatchKey::AutocastHPU,
+    DispatchKey::AutocastXLA,
+    DispatchKey::AutocastPrivateUse1,
+    DispatchKey::AutocastMTIA,
+    DispatchKey::AutocastMAIA,
+});
+
+constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView =
+    autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView);
+
+constexpr DispatchKeySet python_ks = DispatchKeySet({
+    DispatchKey::Python,
+    DispatchKey::PythonTLSSnapshot,
+});
+
+constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse);
+
+constexpr DispatchKeySet sparse_csr_ks = DispatchKeySet(DispatchKey::SparseCsr);
+
+constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU);
+
+// backend dispatch keys that map to DispatchKey::AutogradOther
+// NB: keys in this set also get associated with CompositeImplicitAutograd
+constexpr DispatchKeySet autogradother_backends =
+    DispatchKeySet(
+        // HIP and VE aren't in this list: they now have their own backend bits
+        // which means that they can now have their own Autograd keys.
+        // Technically, HIP will now redispatch to its own custom AutogradHIP
+        // slot in the runtime table.
+        {DispatchKey::FPGA,
+         DispatchKey::Vulkan,
+         DispatchKey::Metal,
+         DispatchKey::CustomRNGKeyId,
+         DispatchKey::MkldnnCPU,
+         // Sparse and Quantized backends also live here.
+         DispatchKey::Sparse,
+         DispatchKey::SparseCsr,
+         DispatchKey::Quantized})
+    // Including the backend bits because this keyset is used during op
+    // registration, which requires looping over all runtime autogradother
+    // backend keys.
+    | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
+
+// The set of dispatch keys that come after autograd
+// n.b. this relies on the fact that AutogradOther is currently the lowest
+// Autograd key
+constexpr DispatchKeySet after_autograd_keyset =
+    DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::AutogradOther);
+
+// The set of dispatch keys that come after ADInplaceOrView
+constexpr DispatchKeySet after_ADInplaceOrView_keyset = DispatchKeySet(
+    DispatchKeySet::FULL_AFTER,
+    c10::DispatchKey::ADInplaceOrView);
+
+// The set of dispatch keys that come after Functionalize
+constexpr DispatchKeySet after_func_keyset =
+    DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::Functionalize)
+        .remove(
+            // NOTE: we also need to remove ADInplaceOrView from the keyset when
+            // redispatching after the func kernels. This is because we're not
+            // calling the same op; we originally called an inplace op, and now
+            // we aren't. The original key calculation figured out which keys
+            // were Fallthrough based on the inplace op. That means that it did
+            // not include the ADInPlaceOrView kernel as a fallthrough key.
+            // However, we WANT the ADInPlaceOrView kernel to be ignored now
+            // that we're calling an out-of-place op. Re-invoking
+            // Dispatcher::call would re-run the Fallthrough key calculation and
+            // get us that, but at::redispatch is more performant. We can get
+            // away with it by explicitly removing the key here.
+            c10::DispatchKey::ADInplaceOrView);
+
+constexpr DispatchKeySet backend_bitset_mask =
+    DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1);
+
+constexpr auto inplace_or_view_ks =
+    DispatchKeySet(DispatchKey::ADInplaceOrView);
+constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU);
+constexpr auto autograd_ipu_ks = DispatchKeySet(DispatchKey::AutogradIPU);
+constexpr auto autograd_mtia_ks = DispatchKeySet(DispatchKey::AutogradMTIA);
+constexpr auto autograd_maia_ks = DispatchKeySet(DispatchKey::AutogradMAIA);
+constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU);
+constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA);
+constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA);
+constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy);
+constexpr auto autograd_meta_ks = DispatchKeySet(DispatchKey::AutogradMeta);
+constexpr auto autograd_mps_ks = DispatchKeySet(DispatchKey::AutogradMPS);
+constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU);
+constexpr auto autograd_privateuse1_ks =
+    DispatchKeySet(DispatchKey::AutogradPrivateUse1);
+constexpr auto autograd_privateuse2_ks =
+    DispatchKeySet(DispatchKey::AutogradPrivateUse2);
+constexpr auto autograd_privateuse3_ks =
+    DispatchKeySet(DispatchKey::AutogradPrivateUse3);
+constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther);
+constexpr auto autograd_nested =
+    DispatchKeySet(DispatchKey::AutogradNestedTensor);
+// keyset corresponding to functorch keys that have their own dedicated
+// TensorImpl subclass.
+constexpr auto functorch_transforms_ks = DispatchKeySet(
+    {DispatchKey::FuncTorchBatched,
+     DispatchKey::FuncTorchVmapMode,
+     DispatchKey::Batched,
+     DispatchKey::VmapMode,
+     DispatchKey::FuncTorchGradWrapper});
+
+constexpr auto functorch_batched_ks =
+    DispatchKeySet({DispatchKey::FuncTorchBatched});
+
+// This keyset has (1) the functionality bits corresponding to backends
+// (dense, quantized, sparse, sparse CSR), and
+// (2) all of the backend bits set.
+constexpr DispatchKeySet backend_functionality_keys =
+    DispatchKeySet({
+        DispatchKey::Dense,
+        DispatchKey::Quantized,
+        DispatchKey::Sparse,
+        DispatchKey::SparseCsr,
+    }) |
+    DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
+
+struct OpTableOffsetAndMask {
+  uint16_t offset;
+  uint16_t backend_mask;
+};
+
+static_assert(
+    num_backends <= 16,
+    "Right now we expect the number of backends not to exceed 16. In the (unlikely) event"
+    " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too.");
+
+// true if t is a backend dispatch key
+C10_API bool isBackendDispatchKey(DispatchKey t);
+
+// Resolve alias dispatch key to DispatchKeySet if applicable
+C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t);
+
+// Resolve alias dispatch key to DispatchKeySet if applicable,
+// and check if k is a part of that set
+C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k);
+
+// Returns a DispatchKeySet of all backend keys mapped to Autograd dispatch key
+// t, DispatchKeySet is empty if t is not alias of DispatchKey::Autograd.
+C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t);
+
+// Returns a DispatchKeySet of autograd related keys mapped to backend.
+// for a given backend key, use the associated autograd key.
+// for non-backend keys, use AutogradOther as a default.
+// Note: it's convenient and fast to return a default here rather than (say)
+// returning an std::optional<DispatchKey>, or throwing. But it makes callers
+// responsible for either a) enforcing the invariant that only backend keys
+// be passed as arguments, or b) interpreting our return value carefully.
+inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) {
+  switch (t) {
+    case BackendComponent::CPUBit:
+      return inplace_or_view_ks | autograd_cpu_ks;
+    case BackendComponent::IPUBit:
+      return inplace_or_view_ks | autograd_ipu_ks;
+    case BackendComponent::MTIABit:
+      return inplace_or_view_ks | autograd_mtia_ks;
+    case BackendComponent::MAIABit:
+      return inplace_or_view_ks | autograd_maia_ks;
+    case BackendComponent::XPUBit:
+      return inplace_or_view_ks | autograd_xpu_ks;
+    case BackendComponent::CUDABit:
+      return inplace_or_view_ks | autograd_cuda_ks;
+    case BackendComponent::XLABit:
+      return inplace_or_view_ks | autograd_xla_ks;
+    case BackendComponent::LazyBit:
+      return inplace_or_view_ks | autograd_lazy_ks;
+    case BackendComponent::MetaBit:
+      return inplace_or_view_ks | autograd_meta_ks;
+    case BackendComponent::MPSBit:
+      return inplace_or_view_ks | autograd_mps_ks;
+    case BackendComponent::HPUBit:
+      return inplace_or_view_ks | autograd_hpu_ks;
+    case BackendComponent::PrivateUse1Bit:
+      return inplace_or_view_ks | autograd_privateuse1_ks;
+    case BackendComponent::PrivateUse2Bit:
+      return inplace_or_view_ks | autograd_privateuse2_ks;
+    case BackendComponent::PrivateUse3Bit:
+      return inplace_or_view_ks | autograd_privateuse3_ks;
+    default:
+      return inplace_or_view_ks | autograd_other_ks;
+  }
+}
+
+// Returns a DispatchKeySet of autocast related keys mapped to backend.
+inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
+  constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU);
+  constexpr auto autocast_mtia_ks = DispatchKeySet(DispatchKey::AutocastMTIA);
+  constexpr auto autocast_maia_ks = DispatchKeySet(DispatchKey::AutocastMAIA);
+  constexpr auto autocast_xpu_ks = DispatchKeySet(DispatchKey::AutocastXPU);
+  constexpr auto autocast_ipu_ks = DispatchKeySet(DispatchKey::AutocastIPU);
+  constexpr auto autocast_hpu_ks = DispatchKeySet(DispatchKey::AutocastHPU);
+  constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA);
+  constexpr auto autocast_xla_ks = DispatchKeySet(DispatchKey::AutocastXLA);
+  constexpr auto autocast_privateuse1_ks =
+      DispatchKeySet(DispatchKey::AutocastPrivateUse1);
+  constexpr auto autocast_mps_ks = DispatchKeySet(DispatchKey::AutocastMPS);
+  switch (t) {
+    case BackendComponent::CPUBit:
+      return autocast_cpu_ks;
+    case BackendComponent::MTIABit:
+      return autocast_mtia_ks;
+    case BackendComponent::MAIABit:
+      return autocast_maia_ks;
+    case BackendComponent::XPUBit:
+      return autocast_xpu_ks;
+    case BackendComponent::IPUBit:
+      return autocast_ipu_ks;
+    case BackendComponent::HPUBit:
+      return autocast_hpu_ks;
+    case BackendComponent::CUDABit:
+      return autocast_cuda_ks;
+    case BackendComponent::XLABit:
+      return autocast_xla_ks;
+    case BackendComponent::PrivateUse1Bit:
+      return autocast_privateuse1_ks;
+    case BackendComponent::MPSBit:
+      return autocast_mps_ks;
+    default:
+      return DispatchKeySet();
+  }
+}
+
+// returns the "backend" DispatchKey of highest priority in the set.
+// This is basically like highestBackendKey(), except that we have some
+// "functionality" bits that correspond to backends (Sparse, Quantized)
+inline DispatchKey highestPriorityBackendTypeId(DispatchKeySet ks) {
+  return (ks & backend_functionality_keys).highestPriorityTypeId();
+}
+
+// This API exists because we have a use case for checking
+// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined)
+// in OperatorEntry.cpp but we disallow it in has() API.
+C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias);
+
+// Historically, every tensor only had a single DispatchKey, and it was always
+// something like CPU, and there wasn't any of this business where TLS
+// could cause the DispatchKey of a tensor to change.  But we still have some
+// legacy code that is still using DispatchKey for things like instanceof
+// checks; if at all possible, refactor the code to stop using DispatchKey in
+// those cases.
+inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) {
+  // NB: If you add any extra keys that can be stored in TensorImpl on
+  // top of existing "backend" keys like CPU/CUDA, you need to add it
+  // here.  At the moment, autograd keys and ADInplaceOrView key need this
+  // treatment;
+  return (s - autograd_dispatch_keyset_with_ADInplaceOrView -
+          autocast_dispatch_keyset -
+          DispatchKeySet(
+              {DispatchKey::Functionalize,
+               DispatchKey::PythonTLSSnapshot,
+               DispatchKey::FuncTorchGradWrapper,
+               DispatchKey::FuncTorchVmapMode,
+               DispatchKey::FuncTorchBatched,
+               DispatchKey::Python}))
+      .highestPriorityTypeId();
+}
+
+template <class T>
+using is_not_DispatchKeySet = std::negation<std::is_same<DispatchKeySet, T>>;
+
+// Given a function type, constructs a function_traits type that drops the first
+// parameter type if the first parameter is of type DispatchKeySet. NB:
+// DispatchKeySet is currently explicitly hidden from JIT (mainly to avoid
+// pushing unnecessary arguments on the stack - see Note [ Plumbing Keys Through
+// the Dispatcher] for details). If at any point in the future we need to expose
+// this type to JIT, revisit the usage of this type alias.
+template <class FuncType>
+using remove_DispatchKeySet_arg_from_func = guts::make_function_traits_t<
+    typename guts::infer_function_traits_t<FuncType>::return_type,
+    typename std::conditional_t<
+        std::is_same_v<
+            DispatchKeySet,
+            typename guts::typelist::head_with_default_t<
+                void,
+                typename guts::infer_function_traits_t<
+                    FuncType>::parameter_types>>,
+        guts::typelist::drop_if_nonempty_t<
+            typename guts::infer_function_traits_t<FuncType>::parameter_types,
+            1>,
+        typename guts::infer_function_traits_t<FuncType>::parameter_types>>;
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0f0f0b27c97bf7521a09fae5c6d7c04d9e0b46e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/DynamicCast.h
@@ -0,0 +1,134 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Load.h>
+#include <c10/util/TypeCast.h>
+
+namespace c10 {
+
+// Dynamic type casting utils:
+// - fetch_and_cast
+// - cast_and_store
+//
+// fetch_and_cast fetches a value with dynamic type specified by a ScalarType
+// from a void pointer and casts it to a static type.
+//
+// cast_and_store casts a static typed value into dynamic type specified
+// by a ScalarType, and stores it into a void pointer.
+//
+// NOTE:
+//
+// Dynamic casting allows us to support type promotion without blowing up
+// the combination space: For example, without dynamic cast, in order to
+// implement `add_` with type promotion, we would need something like
+//
+// AT_DISPATCH_ALL_TYPES(output.dtype(),
+//    AT_DISPATCH_ALL_TYPES(input1.dtype(),
+//       AT_DISPATCH_ALL_TYPES(input2.dtype(),
+//           [](arg0_t a, arg1_t b) -> out_t { return a + b; }
+//       )
+//    )
+// )
+//
+// If we support N dtypes, the above code would generate the a+b kernel for
+// all the N * N * N different supported type combinations, and the
+// compilation time and binary size would become horrible.
+//
+// Dynamic casting might sound like a bad idea in terms of performance.
+// Especially if you ever do it in a loop, you are going to do a billion tests.
+// But in practice it is not as bad as it might look:
+//
+// - on CPU, this is a branch that always has the same outcome, therefore
+//   hopefully the branch predictor could do the job pretty well
+// - on GPU, these branches will not diverge, so we could still have the same
+//   warp executing the same line of code
+// - Most kernels, like `add`, are bandwidth bound, adding a few clock cycles to
+//   check an integer does not hurt the performance much because the ALUs would
+//   wait for load instructions anyway.
+//
+// For the discussion and benchmark, refer to:
+// - https://github.com/pytorch/pytorch/pull/28343
+// - https://github.com/pytorch/pytorch/pull/28344
+// - https://github.com/pytorch/pytorch/pull/28345
+//
+
+#ifdef C10_HOST_DEVICE
+#define ERROR_UNSUPPORTED_CAST CUDA_KERNEL_ASSERT(false);
+#else
+#define ERROR_UNSUPPORTED_CAST TORCH_CHECK(false, "Unexpected scalar type");
+#endif
+
+// Fetch a value with dynamic type src_type from ptr, and cast it to static type
+// dest_t.
+#define FETCH_AND_CAST_CASE(type, scalartype) \
+  case ScalarType::scalartype:                \
+    return c10::convert<dest_t>(c10::load<type>(ptr));
+
+template <typename dest_t>
+C10_HOST_DEVICE inline dest_t fetch_and_cast(
+    const ScalarType src_type,
+    const void* ptr) {
+  C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+  switch (src_type) {
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(FETCH_AND_CAST_CASE)
+    FETCH_AND_CAST_CASE(uint16_t, UInt16)
+    FETCH_AND_CAST_CASE(uint32_t, UInt32)
+    FETCH_AND_CAST_CASE(uint64_t, UInt64)
+    default:
+      ERROR_UNSUPPORTED_CAST
+  }
+  C10_DIAGNOSTIC_POP()
+  return dest_t(0); // just to avoid compiler warning
+}
+
+// Cast a value with static type src_t into dynamic dest_type, and store it to
+// ptr.
+#define CAST_AND_STORE_CASE(type, scalartype) \
+  case ScalarType::scalartype:                \
+    *(type*)ptr = c10::convert<type>(value);  \
+    return;
+template <typename src_t>
+C10_HOST_DEVICE inline void cast_and_store(
+    const ScalarType dest_type,
+    void* ptr,
+    src_t value) {
+  C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+  switch (dest_type) {
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(CAST_AND_STORE_CASE)
+    CAST_AND_STORE_CASE(uint16_t, UInt16)
+    CAST_AND_STORE_CASE(uint32_t, UInt32)
+    CAST_AND_STORE_CASE(uint64_t, UInt64)
+    default:;
+  }
+  C10_DIAGNOSTIC_POP()
+  ERROR_UNSUPPORTED_CAST
+}
+
+#define DEFINE_UNCASTABLE(T, scalartype_)                     \
+  template <>                                                 \
+  C10_HOST_DEVICE inline T fetch_and_cast<T>(                 \
+      const ScalarType src_type, const void* ptr) {           \
+    CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == src_type);  \
+    return c10::load<T>(ptr);                                 \
+  }                                                           \
+  template <>                                                 \
+  C10_HOST_DEVICE inline void cast_and_store<T>(              \
+      const ScalarType dest_type, void* ptr, T value) {       \
+    CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == dest_type); \
+    *(T*)ptr = value;                                         \
+  }
+
+AT_FORALL_QINT_TYPES(DEFINE_UNCASTABLE)
+
+#undef FETCH_AND_CAST_CASE
+#undef CAST_AND_STORE_CASE
+#undef DEFINE_UNCASTABLE
+#undef ERROR_UNSUPPORTED_CAST
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h
new file mode 100644
index 0000000000000000000000000000000000000000..aed1a213bfb4724b5019909adafc237297262f9e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Event.h
@@ -0,0 +1,142 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/InlineEvent.h>
+#include <c10/core/impl/VirtualGuardImpl.h>
+
+namespace c10 {
+
+/**
+ * A backend-generic movable, not copyable, not thread-safe event.
+ *
+ * The design of this event follows that of CUDA and HIP events. These events
+ * are recorded and waited on by streams and can be rerecorded to,
+ * each rerecording essentially creating a new version of the event.
+ * For example, if (in CPU time), stream X is asked to record E,
+ * stream Y waits on E, and stream X is asked to record E again, then Y will
+ * wait for X to finish the first call to record and not the second, because
+ * it's waiting on the first version of event E, not the second.
+ * Querying an event only returns the status of its most recent version.
+ *
+ * Backend-generic events are implemented by this class and
+ * impl::InlineEvent. In addition to these events there are also
+ * some backend-specific events, like ATen's CUDAEvent. Each of these
+ * classes has its own use.
+ *
+ * impl::InlineEvent<...> or a backend-specific event should be
+ * preferred when the backend is known at compile time and known to
+ * be compiled. Backend-specific events may have additional functionality.
+ *
+ * This Event should be used if a particular backend may not be available,
+ * or the backend required is not known at compile time.
+ *
+ * These generic events are built on top of DeviceGuardImpls, analogous
+ * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls,"
+ * is no longer entirely accurate, as these classes implement the
+ * backend-specific logic for a generic backend interface.
+ *
+ * See DeviceGuardImplInterface.h for a list of all supported flags.
+ */
+
+struct Event final {
+  // Constructors
+  Event() = delete;
+  Event(
+      const DeviceType _device_type,
+      const EventFlag _flag = EventFlag::PYTORCH_DEFAULT)
+      : impl_{_device_type, _flag} {}
+
+  // Copy constructor and copy assignment operator (deleted)
+  Event(const Event&) = delete;
+  Event& operator=(const Event&) = delete;
+
+  // Move constructor and move assignment operator
+  Event(Event&&) noexcept = default;
+  Event& operator=(Event&&) noexcept = default;
+
+  // Destructor
+  ~Event() = default;
+
+  // Getters
+  Device device() const noexcept {
+    return Device(device_type(), device_index());
+  }
+  DeviceType device_type() const noexcept {
+    return impl_.device_type();
+  }
+  DeviceIndex device_index() const noexcept {
+    return impl_.device_index();
+  }
+  EventFlag flag() const noexcept {
+    return impl_.flag();
+  }
+  bool was_marked_for_recording() const noexcept {
+    return impl_.was_marked_for_recording();
+  }
+
+  /**
+   * Calls record() if and only if record() has never been called for this
+   * event. Note: because Event is not thread-safe recordOnce() may call
+   * record() multiple times if called from multiple threads.
+   */
+  void recordOnce(const Stream& stream) {
+    impl_.recordOnce(stream);
+  }
+
+  /**
+   * Increments the event's version and enqueues a job with this version
+   * in the stream's work queue. When the stream processes that job
+   * it notifies all streams waiting on / blocked by that version of the
+   * event to continue and marks that version as recorded.
+   * */
+  void record(const Stream& stream) {
+    impl_.record(stream);
+  }
+
+  /**
+   * Does nothing if the event has not been scheduled to be recorded.
+   * If the event was previously enqueued to be recorded, a command
+   * to wait for the version of the event that exists at the time of this call
+   * is inserted in the stream's work queue.
+   * When the stream reaches this command it will stop processing
+   * additional commands until that version of the event is marked as recorded.
+   */
+  void block(const Stream& stream) const {
+    impl_.block(stream);
+  }
+
+  /**
+   * Returns true if (and only if) either
+   *  (1) the event has never been scheduled to be recorded, or
+   *  (2) the current version is marked as recorded.
+   * Returns false otherwise.
+   */
+  bool query() const {
+    return impl_.query();
+  }
+
+  double elapsedTime(const Event& event) const {
+    return impl_.elapsedTime(event.impl_);
+  }
+
+  void* eventId() const {
+    return impl_.eventId();
+  }
+
+  void synchronize() const {
+    impl_.synchronize();
+  }
+
+ private:
+  impl::InlineEvent<impl::VirtualGuardImpl> impl_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d7aac9243ffbbfc4f79471ebceee04ced485219
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GeneratorImpl.h
@@ -0,0 +1,116 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+#include <mutex>
+
+#include <c10/core/Device.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/python_stub.h>
+
+/**
+ * Note [Generator]
+ * ~~~~~~~~~~~~~~~~
+ * A Pseudo Random Number Generator (PRNG) is an engine that uses an algorithm
+ * to generate a seemingly random sequence of numbers that may later be used
+ * in creating a random distribution. Such an engine almost always maintains a
+ * state and requires a seed to start off the creation of random numbers. Often
+ * times, users have found it beneficial to be able to explicitly create,
+ * retain, and destroy PRNG states and also be able to have control over the
+ * seed value.
+ *
+ * A Generator in ATen gives users the ability to read, write and modify a PRNG
+ * engine. For instance, it does so by letting users seed a PRNG engine, fork
+ * the state of the engine, etc.
+ *
+ * By default, there is one generator per device, and a device's generator is
+ * lazily created. A user can use the torch.Generator() api to create their own
+ * generator. Currently torch.Generator() can only create a CPUGeneratorImpl.
+ */
+
+/**
+ * Note [Acquire lock when using random generators]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Generator and its derived classes are NOT thread-safe. Please note that most
+ * of the places where we have inserted locking for generators are historically
+ * based, and we haven't actually checked that everything is truly thread safe
+ * (and it probably isn't). Please use the public mutex_ when using any methods
+ * from these classes, except for the read-only methods. You can learn about the
+ * usage by looking into the unittests (aten/src/ATen/cpu_generator_test.cpp)
+ * and other places where we have used lock_guard.
+ *
+ * TODO: Look into changing the threading semantics of Generators in ATen (e.g.,
+ * making them non-thread safe and instead making the generator state
+ * splittable, to accommodate forks into other threads).
+ */
+
+namespace c10 {
+
+// The default seed is selected to be a large number
+// with good distribution of 0s and 1s in bit representation
+constexpr uint64_t default_rng_seed_val = 67280421310721;
+
+struct C10_API GeneratorImpl : public c10::intrusive_ptr_target {
+  // Constructors
+  GeneratorImpl(Device device_in, DispatchKeySet key_set);
+
+  // Delete the copy/move constructors and assignment operators in favor of
+  // the clone() method
+  GeneratorImpl(const GeneratorImpl& other) = delete;
+  GeneratorImpl(GeneratorImpl&& other) = delete;
+  GeneratorImpl& operator=(const GeneratorImpl& other) = delete;
+  GeneratorImpl& operator=(GeneratorImpl&& other) = delete;
+
+  ~GeneratorImpl() override = default;
+  c10::intrusive_ptr<GeneratorImpl> clone() const;
+
+  // Common methods for all generators
+  virtual void set_current_seed(uint64_t seed) = 0;
+  virtual void set_offset(uint64_t offset) = 0;
+  virtual uint64_t get_offset() const = 0;
+  virtual uint64_t current_seed() const = 0;
+  virtual uint64_t seed() = 0;
+  virtual void set_state(const c10::TensorImpl& new_state) = 0;
+  virtual c10::intrusive_ptr<c10::TensorImpl> get_state() const = 0;
+  virtual void graphsafe_set_state(
+      const c10::intrusive_ptr<c10::GeneratorImpl>& new_state);
+  virtual c10::intrusive_ptr<c10::GeneratorImpl> graphsafe_get_state() const;
+  Device device() const;
+
+  // See Note [Acquire lock when using random generators]
+  std::mutex mutex_;
+
+  DispatchKeySet key_set() const {
+    return key_set_;
+  }
+
+  inline void set_pyobj(PyObject* pyobj) noexcept {
+    pyobj_ = pyobj;
+  }
+
+  inline PyObject* pyobj() const noexcept {
+    return pyobj_;
+  }
+
+ protected:
+  Device device_;
+  DispatchKeySet key_set_;
+  PyObject* pyobj_ = nullptr;
+
+  virtual GeneratorImpl* clone_impl() const = 0;
+};
+
+namespace detail {
+
+C10_API uint64_t getNonDeterministicRandom(bool is_cuda = false);
+
+} // namespace detail
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h
new file mode 100644
index 0000000000000000000000000000000000000000..391b293f9f005af1035dbf9e43be91bf5b353bed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/GradMode.h
@@ -0,0 +1,57 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/AutogradState.h>
+#include <c10/macros/Export.h>
+
+namespace c10 {
+
+struct C10_API GradMode {
+  static bool is_enabled();
+  static void set_enabled(bool enabled);
+};
+
+// A RAII, thread local (!) guard that enables or disables grad mode upon
+// construction, and sets it back to the original value upon destruction.
+struct C10_API AutoGradMode {
+  AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) {
+    GradMode::set_enabled(enabled);
+  }
+  AutoGradMode(const AutoGradMode&) = delete;
+  AutoGradMode(AutoGradMode&&) = delete;
+  AutoGradMode& operator=(const AutoGradMode&) = delete;
+  AutoGradMode& operator=(AutoGradMode&&) = delete;
+  ~AutoGradMode() {
+    GradMode::set_enabled(prev_mode);
+  }
+  bool prev_mode;
+};
+
+// A RAII, thread local (!) guard that stops future operations from building
+// gradients.
+struct C10_API NoGradGuard : public AutoGradMode {
+  NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
+};
+
+// A RAII, thread local (!) guard that enables or disables forward grad mode
+// upon construction, and sets it back to the original value upon destruction.
+struct C10_API AutoFwGradMode {
+  AutoFwGradMode(bool enabled)
+      : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
+    AutogradState::get_tls_state().set_fw_grad_mode(enabled);
+  }
+  AutoFwGradMode(const AutoFwGradMode&) = delete;
+  AutoFwGradMode(AutoFwGradMode&&) = delete;
+  AutoFwGradMode& operator=(const AutoFwGradMode&) = delete;
+  AutoFwGradMode& operator=(AutoFwGradMode&&) = delete;
+  ~AutoFwGradMode() {
+    AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
+  }
+  bool prev_mode;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h
new file mode 100644
index 0000000000000000000000000000000000000000..8da25b5427e61d250268a352f11757a4e1d7ab24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/InferenceMode.h
@@ -0,0 +1,96 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/AutogradState.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+#include <c10/macros/Export.h>
+
+namespace c10 {
+
+// A RAII, thread local (!) guard that enables or disables inference mode upon
+// construction, and sets it back to the original value upon destruction.
+struct C10_API InferenceMode {
+  // Note [Expected TLS state in InferenceMode]:
+  //   InferenceMode: ADInplaceOrView not in
+  //   raw_local_dispatch_key_set.included(),
+  //                  Autograd in raw_local_dispatch_key_set.excluded()
+  //                  GradMode is disabled.
+  //   NormalMode: ADInplaceOrView in raw_local_dispatch_key_set.included(),
+  //               Autograd not in raw_local_dispatch_key_set.excluded()
+  //               GradMode is enabled by default unless toggled manually
+  //               through other APIs, e.g. NoGradGuard.
+  //
+  // Invariant:
+  // - ADInplaceOrView is never in the excluded set
+  // - Autograd is never in the included set
+  // - Setting InferenceMode will set GradMode accordingly, but not vice versa.
+  //
+  //  1. Why do we put ADInplaceOrView in included set outside InferenceMode?
+  //
+  //     Inplace update to inference tensor outside InferenceMode is not
+  //     allowed. See Note [Inplace update inference tensor] for more details.
+  //     Without going through ADInplaceOrView kernel, we cannot throw error
+  //     for `inference_tensor.add_(1)` case.
+  //
+  // 2. Why not put ADInplaceOrView in the excluded set inside InferenceMode?
+  //
+  //    For example:
+  //    torch::Tensor a = torch::ones({1, 2, 3}).set_requires_grad(true);
+  //    torch::Tensor k = a + 2;
+  //    {
+  //      c10::InferenceMode guard(true);
+  //      k.add_(2);
+  //    }
+  //    `k.add_(2)` still needs to go through the ADInplaceOrView kernel so
+  //    that it's prepared for future autograd.
+  //
+  // 3. Why does setting InferenceMode also set GradMode?
+  //
+  //    This is required since InferenceMode is a faster and more restrictive
+  //    version of NoGradGuard. All runtime checks using GradMode::is_enabled()
+  //    are applicable to InferenceMode as well, e.g.
+  //    `tensorTypeInCurrentExecutionContext` in interpreter.cpp.
+  InferenceMode(bool enabled = true)
+      : prev_mode(AutogradState::get_tls_state()),
+        prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
+    // Enabling inference mode means disabling grad modes
+    // And disabling inference mode means enabling grad modes
+    AutogradState::set_tls_state(AutogradState(
+        /* grad_mode */ !enabled,
+        /* inference_mode */ enabled,
+        /* fw_grad_mode */ !enabled,
+        /* multithreading_enabled*/ !enabled));
+    DispatchKeySet included = enabled
+        ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
+        : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
+    DispatchKeySet excluded = enabled
+        ? (prev_keyset.excluded_ | c10::autograd_dispatch_keyset)
+        : (prev_keyset.excluded_ - c10::autograd_dispatch_keyset);
+    c10::impl::PODLocalDispatchKeySet cur_keyset{};
+    cur_keyset.set_included(included);
+    cur_keyset.set_excluded(excluded);
+    c10::impl::_force_tls_local_dispatch_key_set(cur_keyset);
+  }
+
+  InferenceMode(const InferenceMode&) = delete;
+  InferenceMode(InferenceMode&&) = delete;
+  InferenceMode& operator=(const InferenceMode&) = delete;
+  InferenceMode& operator=(InferenceMode&&) = delete;
+
+  ~InferenceMode() {
+    AutogradState::set_tls_state(prev_mode);
+    c10::impl::_force_tls_local_dispatch_key_set(prev_keyset);
+  }
+  static bool is_enabled();
+
+ private:
+  AutogradState prev_mode;
+  c10::impl::LocalDispatchKeySet prev_keyset;
+};
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h
new file mode 100644
index 0000000000000000000000000000000000000000..194e1863cb18cf2759f2c4e3e1ace298efd76150
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Layout.h
@@ -0,0 +1,67 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Backend.h>
+#include <c10/util/Exception.h>
+
+#include <torch/headeronly/core/Layout.h>
+
+namespace c10 {
+
+// Map a Backend to the Layout it implies.
+//
+// The listed Sparse* backends yield Layout::Sparse and MkldnnCPU yields
+// Layout::Mkldnn. The listed SparseCsr* backends raise through TORCH_CHECK
+// because, as the error text states, they do not map to a unique layout.
+// Every backend not named explicitly falls through to Layout::Strided.
+inline Layout layout_from_backend(Backend backend) {
+  // Unlisted enumerators are intentionally handled by `default`, so
+  // -Wswitch-enum is silenced around this switch.
+  C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+  switch (backend) {
+    case Backend::SparseCPU:
+    case Backend::SparseCUDA:
+    case Backend::SparseMPS:
+    case Backend::SparseHIP:
+    case Backend::SparseVE:
+    case Backend::SparseXPU:
+    case Backend::SparsePrivateUse1:
+      return Layout::Sparse;
+    case Backend::MkldnnCPU:
+      return Layout::Mkldnn;
+    case Backend::SparseCsrCPU:
+    case Backend::SparseCsrCUDA:
+    case Backend::SparseCsrMPS:
+    case Backend::SparseCsrHIP:
+    case Backend::SparseCsrVE:
+    case Backend::SparseCsrXPU:
+      // Always fails: the condition is the literal `false`.
+      TORCH_CHECK(
+          false,
+          "Cannot map Backend SparseCsr(CPU|CUDA|HIP|VE|XPU|MPS) to a unique layout.");
+    default:
+      return Layout::Strided;
+  }
+  C10_DIAGNOSTIC_POP()
+}
+
+// Stream the textual name of `layout`. Layout::NumOptions and any value
+// outside the enum are rejected through TORCH_CHECK.
+inline std::ostream& operator<<(std::ostream& stream, at::Layout layout) {
+  // Pick the label first, then emit it from a single exit point.
+  const char* label = nullptr;
+  switch (layout) {
+    case at::kStrided:
+      label = "Strided";
+      break;
+    case at::kSparse:
+      label = "Sparse";
+      break;
+    case at::kSparseCsr:
+      label = "SparseCsr";
+      break;
+    case at::kSparseCsc:
+      label = "SparseCsc";
+      break;
+    case at::kSparseBsr:
+      label = "SparseBsr";
+      break;
+    case at::kSparseBsc:
+      label = "SparseBsc";
+      break;
+    case at::kMkldnn:
+      label = "Mkldnn";
+      break;
+    case at::kJagged:
+      label = "Jagged";
+      break;
+    case Layout::NumOptions:
+    default:
+      TORCH_CHECK(false, "Unknown layout");
+  }
+  return stream << label;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h
new file mode 100644
index 0000000000000000000000000000000000000000..63cdb757952b073d957fc91c33357136c1287679
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/MemoryFormat.h
@@ -0,0 +1,268 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+
+#include <torch/headeronly/core/MemoryFormat.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace c10 {
+
+// If you are seeing this, it means that this call site was not checked if
+// the memory format could be preserved, and it was switched to old default
+// behaviour of contiguous
+#define LEGACY_CONTIGUOUS_MEMORY_FORMAT c10::get_contiguous_memory_format()
+
+// Write the name of `memory_format` to `stream`.
+//
+// Throws through TORCH_CHECK for MemoryFormat::NumOptions or any value outside
+// the enum. The offending value is reported as an integer: streaming the
+// MemoryFormat itself into the error message would re-enter this operator with
+// the same unknown value and recurse without end.
+inline std::ostream& operator<<(
+    std::ostream& stream,
+    at::MemoryFormat memory_format) {
+  switch (memory_format) {
+    case MemoryFormat::Preserve:
+      return stream << "Preserve";
+    case MemoryFormat::Contiguous:
+      return stream << "Contiguous";
+    case MemoryFormat::ChannelsLast:
+      return stream << "ChannelsLast";
+    case MemoryFormat::ChannelsLast3d:
+      return stream << "ChannelsLast3d";
+    case MemoryFormat::NumOptions:
+    default:
+      TORCH_CHECK(
+          false,
+          "Unknown memory format ",
+          static_cast<int>(memory_format));
+  }
+}
+
+// Note: Hardcoded the channel last stride indices here to get better
+// performance
+// Compute channels-last strides for a 4-d or 3-d `sizes` array. The stride
+// indices are written out by hand (no loop) for speed. Any other rank trips
+// an internal assert.
+template <typename T>
+inline std::vector<T> get_channels_last_strides_2d(ArrayRef<T> sizes) {
+  std::vector<T> result(sizes.size());
+  if (sizes.size() == 4) {
+    // Dimension 1 gets unit stride; 3, 2 and 0 follow in that order.
+    result[1] = 1;
+    result[3] = sizes[1];
+    result[2] = result[3] * sizes[3];
+    result[0] = result[2] * sizes[2];
+    return result;
+  }
+  if (sizes.size() == 3) {
+    // Same pattern without the leading dimension: 0 is innermost, then 2, 1.
+    result[0] = 1;
+    result[2] = sizes[0];
+    result[1] = result[2] * sizes[2];
+    return result;
+  }
+  TORCH_INTERNAL_ASSERT(
+      false, "ChannelsLast2d doesn't support size ", sizes.size());
+}
+
+// IntArrayRef convenience overload: forwards to the int64_t instantiation of
+// the template of the same name.
+inline std::vector<int64_t> get_channels_last_strides_2d(IntArrayRef sizes) {
+  return get_channels_last_strides_2d<int64_t>(sizes);
+}
+
+// Compute channels-last-3d strides for a 5-d or 4-d `sizes` array, with the
+// stride indices written out by hand. Any other rank trips an internal assert.
+template <typename T>
+std::vector<T> get_channels_last_strides_3d(ArrayRef<T> sizes) {
+  std::vector<T> result(sizes.size());
+  if (sizes.size() == 5) {
+    // Dimension 1 gets unit stride; 4, 3, 2 and 0 follow in that order.
+    result[1] = 1;
+    result[4] = sizes[1];
+    result[3] = result[4] * sizes[4];
+    result[2] = result[3] * sizes[3];
+    result[0] = result[2] * sizes[2];
+    return result;
+  }
+  if (sizes.size() == 4) {
+    // Same pattern without the leading dimension: 0 is innermost, then 3, 2, 1.
+    result[0] = 1;
+    result[3] = sizes[0];
+    result[2] = result[3] * sizes[3];
+    result[1] = result[2] * sizes[2];
+    return result;
+  }
+  TORCH_INTERNAL_ASSERT(
+      false, "ChannelsLast3d doesn't support size ", sizes.size());
+}
+
+// IntArrayRef convenience overload: forwards to the int64_t instantiation of
+// the template of the same name.
+inline std::vector<int64_t> get_channels_last_strides_3d(IntArrayRef sizes) {
+  return get_channels_last_strides_3d<int64_t>(sizes);
+}
+
+// NOTE:
+// Below are Helper functions for is_channels_last_strides_xd.
+// 1. Please do not combine these helper functions, each helper function handles
+// exactly one case of sizes + memory_format, by doing this, the strides indices
+// will be a constant array and we can access it using constant index number,
+// the compiler will fully unroll the loop on strides indices to gain a better
+// performance.
+// 2. No error check in helper function, caller ensures the correctness of the
+// input
+// 3. All helper functions have similar comments, only 1st helper function is
+// commented here.
+// Best-effort check that a 4-d sizes/strides pair describes a channels-last
+// layout. Dimensions are visited in the order 1, 3, 2, 0; each stride must be
+// at least the previous stride multiplied by the previous size (sizes <= 1 do
+// not multiply). Returns false for a zero stride on dimension 1, for any zero
+// size, and for the ambiguous case explained inside the loop.
+// The caller must pass 4-element `sizes` and `strides`; nothing is checked.
+template <typename T>
+inline bool is_channels_last_strides_2d_s4(
+    const ArrayRef<T> sizes,
+    const ArrayRef<T> strides) {
+  // Lower bound that the next visited stride has to reach.
+  T min = 0;
+  // special case for trivial C dimension. default to NCHW
+  if (strides[1] == 0) {
+    return false;
+  }
+  // loop strides indices
+  for (auto& d : {1, 3, 2, 0}) {
+    if (sizes[d] == 0) {
+      return false;
+    }
+    if (strides[d] < min) {
+      return false;
+    }
+    // Fallback to NCHW as default layout for ambiguous cases
+    // This is the flaw of implicit memory_format from strides.
+    // N111 tensor with identical strides for size 1 dimension;
+    // Two cases could lead us here:
+    // a. N111 contiguous Tensor ([N,1,1,1]@[1,1,1,1])
+    // b. N11W contiguous Tensor sliced on the W-dimension.
+    // ([N,1,1,1]@[W,W,W,W])
+    if (d == 0 && min == strides[1]) {
+      return false;
+    }
+    // This is necessary to:
+    // 1. distinguish the memory_format of N1H1;
+    //     [H, 1, 1, 1] channels_last stride
+    //     [H, H, 1, 1] contiguous stride
+    // 2. permutation of 1C1W:
+    //     [1, C, 1, H]@[HC, H, H, 1] transpose(1, 3)
+    //     [1, H, 1, C]@[HC, 1, H, H] shouldn't be identified as channels_last
+    min = strides[d];
+    if (sizes[d] > 1) {
+      min *= sizes[d];
+    }
+  }
+  return true;
+}
+
+// Best-effort check that a 5-d sizes/strides pair describes a
+// channels-last-3d layout. Dimensions are visited in the order 1, 4, 3, 2, 0;
+// each stride must be at least the previous stride multiplied by the previous
+// size (sizes <= 1 do not multiply).
+// The caller must pass 5-element `sizes` and `strides`; nothing is checked.
+template <typename T>
+inline bool is_channels_last_strides_3d_s5(
+    const ArrayRef<T> sizes,
+    const ArrayRef<T> strides) {
+  // Lower bound that the next visited stride has to reach.
+  T min = 0;
+  // A zero stride on dimension 1 is never treated as channels-last.
+  if (strides[1] == 0) {
+    return false;
+  }
+  for (auto& d : {1, 4, 3, 2, 0}) {
+    // Tensors with an empty dimension are not classified as channels-last.
+    if (sizes[d] == 0) {
+      return false;
+    }
+    // Stride is too small for the dimensions already visited.
+    if (strides[d] < min) {
+      return false;
+    }
+    // On the last visited dimension (0): reject when the running bound still
+    // equals the stride of dimension 1.
+    if (d == 0 && min == strides[1]) {
+      return false;
+    }
+    // Raise the bound; size-1 dimensions contribute only their stride.
+    min = strides[d];
+    if (sizes[d] > 1) {
+      min *= sizes[d];
+    }
+  }
+  return true;
+}
+
+// Note [Ambiguous is_channels_last_strides_xd]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// The flaw of carrying memory_format implicitly through strides is very hard
+// to WAR properly. issue #24090
+// Without the history of permutation, we can't infer the memory_format of a
+// tensor from the snapshot of its size & stride
+// e.g.
+//
+// 1. We can NOT specify the memory_format of N111 tensor through strides in a
+//  meaningful way;
+//
+// 2. Two paths that end up with identical size/stride
+//  N11W contiguous tensor sliced at w-dimension becomes [N,1,1,1]@[W,W,W,W]
+//  NC11 channels_last tensor sliced at c-dimension becomes [N,1,1,1]@[C,C,C,C]
+//    So if we see a tensor [N,1,1,1]@[X,X,X,X], there's no way for us to infer
+//    the memory_format of the original tensor.
+//
+// Due to the limitations, our temporary WAR `is_channels_last_strides` does the
+// best effort to infer whether the original memory_format of a tensor is
+// at::MemoryFormat::ChannelsLast. The two objectives of this function (ordered
+// by their importance):
+//   1. Ensure that normal shape manipulation does not accidentally change the
+//      MemoryFormat of an existing tensor.
+//   2. Allows user to mark MemoryFormat::ChannelsLast to tensors;
+//
+// The function does so via checking strides of the tensor, including strides of
+// size-1 dimensions. Although conventionally PyTorch implies no restriction on
+// trivial stride (stride for size-1 dimension).
+//
+// Note that this approach is a compromise. We did not solve the problem
+// completely. Many cases we will not be able to infer the correct memory
+// format.
+// The implementation of `is_channels_last_strides` is to serve the objectives:
+// MemoryFormat::ChannelsLast has to be explicitly opted-in (no accidental
+// conversion); Best effort to maintain the ChannelsLast flag.
+//
+// Due to the fact that this is not a bulletproof solution, through testing
+// (aten/src/ATen/test/memory_format_test.cpp)
+//   a. we ensure that the common tasks are supported;
+//   b. we identify the corner cases on which the implementation compromises.
+//
+// By the time accumulated permutation is enabled to replace implicit
+// memory_format through strides, we should be updating our tests and fix the
+// issues in our tests.
+//
+// We use Channels Last 2d as an example above.
+// This is a general problem for all the is_channels_last_strides_xd
+// implementation. Please check the helper functions
+// (is_channels_last_strides_*d_s*) for more details.
+
+// Report whether `sizes`/`strides` look like a channels-last (2d) tensor.
+// Only rank 4 is inspected; every other rank answers false.
+template <typename T>
+inline bool is_channels_last_strides_2d(
+    const ArrayRef<T> sizes,
+    const ArrayRef<T> strides) {
+  // TODO dim == 3 case will be enabled once it is fully tested
+  if (sizes.size() != 4) {
+    return false;
+  }
+  return is_channels_last_strides_2d_s4(sizes, strides);
+}
+
+// Report whether `sizes`/`strides` look like a channels-last-3d tensor.
+// Only rank 5 is inspected; every other rank answers false.
+template <typename T>
+inline bool is_channels_last_strides_3d(
+    const ArrayRef<T> sizes,
+    const ArrayRef<T> strides) {
+  // TODO dim == 4 case will be enabled once it is fully tested
+  if (sizes.size() != 5) {
+    return false;
+  }
+  return is_channels_last_strides_3d_s5(sizes, strides);
+}
+
+// IntArrayRef convenience overload: forwards to the int64_t instantiation of
+// the template of the same name.
+inline bool is_channels_last_strides_2d(
+    const IntArrayRef sizes,
+    const IntArrayRef strides) {
+  return is_channels_last_strides_2d<int64_t>(sizes, strides);
+}
+
+// IntArrayRef convenience overload: forwards to the int64_t instantiation of
+// the template of the same name.
+inline bool is_channels_last_strides_3d(
+    const IntArrayRef sizes,
+    const IntArrayRef strides) {
+  return is_channels_last_strides_3d<int64_t>(sizes, strides);
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1199e1945a65866cfd17c5301e20454721dc117
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/OptionalRef.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+namespace c10 {
+
+// Non-owning, nullable reference to a const T, stored as a raw pointer.
+// An empty OptionalRef holds nullptr.
+template <typename T>
+class OptionalRef {
+ public:
+  // Construct an empty reference.
+  OptionalRef() : data_(nullptr) {}
+  // Wrap an existing pointer; a null pointer is rejected in debug builds only.
+  OptionalRef(const T* data) : data_(data) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_);
+  }
+  // Wrap a reference by taking its address. The referent's lifetime is not
+  // extended by this class.
+  OptionalRef(const T& data) : data_(&data) {}
+
+  // True when a referent is present.
+  bool has_value() const {
+    return data_ != nullptr;
+  }
+
+  // Return the referent. Emptiness is checked in debug builds only.
+  const T& get() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_);
+    return *data_;
+  }
+
+  // Implicit truthiness test; same result as has_value().
+  operator bool() const {
+    return has_value();
+  }
+
+ private:
+  // Pointer to the referent, or nullptr when empty.
+  const T* data_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c39510078bc70aa95e205176fd8bebeeb332065
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/PyHandleCache.h
@@ -0,0 +1,81 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/python_stub.h>
+
+#include <atomic>
+
+namespace c10 {
+
+// A PyHandleCache represents a cached pointer from a C++ object to
+// a Python object that represents that object analogously in Python.
+// Upon a cache hit, the relevant object can be retrieved after a test
+// and then a memory load.  Two conditions must hold to be able to use this
+// class:
+//
+//  - This must truly be a cache; e.g., the caller must be able to produce
+//    the object some other way if the cache lookup misses.
+//
+//  - This must truly be a handle; e.g., the Python object referenced by
+//    this class must have static lifetime.  This means we don't have to
+//    maintain strong ownership or deallocate the object when the C++ object
+//    dies.  Static lifetime is a good idea in conjunction with the cache,
+//    since if you are producing a fresh object on miss you won't be
+//    maintaining object identity.  If you need bidirectional ownership,
+//    you will want to factor out the pattern in TensorImpl with
+//    resurrection.
+//
+// This cache is expected to not improve perf under torchdeploy, as one
+// interpreter will fill up the cache, and all the interpreters will be
+// unable to use the slot.  A potential improvement is to have multiple
+// slots (one per interpreter), which will work in deployment scenarios
+// where there is a stable, fixed number of interpreters.  You can also store
+// the relevant state in the Python library, rather than in the non-Python
+// library (although in many cases, this is not convenient, as there may
+// not be a way to conveniently index based on the object.)
+class PyHandleCache {
+ public:
+  // Start unclaimed: no interpreter tag and (via the member initializer) a
+  // null data pointer.
+  PyHandleCache() : pyinterpreter_(nullptr) {}
+
+  // Attempt to fetch the pointer from the cache, if the PyInterpreter
+  // matches.  If it doesn't exist, or the cache entry is not valid,
+  // use slow_accessor to get the real pointer value and return that
+  // (possibly writing it to the cache, if the cache entry is
+  // available.)
+  //
+  // Three outcomes:
+  //  - stored tag == self_interpreter: return the cached data_.
+  //  - no stored tag: call slow_accessor(), try to claim the entry for
+  //    self_interpreter, and return the freshly produced pointer.
+  //  - another interpreter owns the tag: return slow_accessor() uncached.
+  template <typename F>
+  PyObject* ptr_or(impl::PyInterpreter* self_interpreter, F slow_accessor)
+      const {
+    // Note [Memory ordering on Python interpreter tag]
+    impl::PyInterpreter* interpreter =
+        pyinterpreter_.load(std::memory_order_acquire);
+    if (C10_LIKELY(interpreter == self_interpreter)) {
+      // Cache hit: the entry is already tagged with this interpreter.
+      return data_;
+    } else if (interpreter == nullptr) {
+      // Unclaimed entry: compute the value first, then try to claim the slot.
+      auto* r = slow_accessor();
+      impl::PyInterpreter* expected = nullptr;
+      // attempt to claim this cache entry with the specified interpreter tag
+      if (pyinterpreter_.compare_exchange_strong(
+              expected, self_interpreter, std::memory_order_acq_rel)) {
+        // NOTE(review): data_ is a plain member written after the tag has
+        // been installed; presumably this relies on the GIL protection
+        // mentioned below -- confirm.
+        data_ = r;
+      }
+      // This shouldn't be possible, as you should be GIL protected
+      TORCH_INTERNAL_ASSERT(expected != self_interpreter);
+      return r;
+    } else {
+      // Tagged by a different interpreter: bypass the cache entirely.
+      return slow_accessor();
+    }
+  }
+
+ private:
+  // Both members are mutable so that the const ptr_or() can fill the cache.
+  // Tag of the interpreter that claimed this entry, or nullptr if unclaimed.
+  mutable std::atomic<impl::PyInterpreter*> pyinterpreter_;
+  // Cached pointer; returned by ptr_or() only when the tag matches.
+  mutable PyObject* data_{nullptr};
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0bb6a245643a3e093c02ae80756403b931245ba
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QEngine.h
@@ -0,0 +1,51 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/Exception.h>
+#include <cstdint>
+#include <string>
+
+namespace c10 {
+
+/**
+ * QEngine is an enum that is used to select the engine to run quantized ops.
+ * Keep this enum in sync with get_qengine_id() in
+ * torch/backends/quantized/__init__.py
+ */
+enum class QEngine : uint8_t {
+  NoQEngine = 0,
+  FBGEMM = 1,
+  QNNPACK = 2,
+  ONEDNN = 3,
+  X86 = 4,
+};
+
+// Namespace-scope shorthands for the QEngine enumerators above.
+constexpr auto kNoQEngine = QEngine::NoQEngine;
+constexpr auto kFBGEMM = QEngine::FBGEMM;
+constexpr auto kQNNPACK = QEngine::QNNPACK;
+constexpr auto kONEDNN = QEngine::ONEDNN;
+constexpr auto kX86 = QEngine::X86;
+
+// Return the display name of `qengine`. A value that matches none of the
+// known engines is rejected through TORCH_CHECK, with its integer value in
+// the message.
+inline std::string toString(QEngine qengine) {
+  if (qengine == kNoQEngine) {
+    return "NoQEngine";
+  }
+  if (qengine == kFBGEMM) {
+    return "FBGEMM";
+  }
+  if (qengine == kQNNPACK) {
+    return "QNNPACK";
+  }
+  if (qengine == kONEDNN) {
+    return "ONEDNN";
+  }
+  if (qengine == kX86) {
+    return "X86";
+  }
+  TORCH_CHECK(
+      false, "Unrecognized Quantized Engine: ", static_cast<int>(qengine));
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h
new file mode 100644
index 0000000000000000000000000000000000000000..f557affb1de8ff54fc961159d3cc67e2f11ef3b7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/QScheme.h
@@ -0,0 +1,60 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <cstdint>
+#include <string>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+
+namespace c10 {
+
+/**
+ * QScheme is an enum that specifies the type of quantization. This has a one
+ * to one correspondence with Quantizer
+ * Please refer to ATen/quantized/Quantizer.h to see the Quantizers classes.
+ * Keep this file in sync with torch/nn/_qscheme.py
+ */
+enum class QScheme : uint8_t {
+  PER_TENSOR_AFFINE = 0,
+  PER_CHANNEL_AFFINE = 1,
+  PER_TENSOR_SYMMETRIC = 2,
+  PER_CHANNEL_SYMMETRIC = 3,
+  PER_CHANNEL_AFFINE_FLOAT_QPARAMS = 4,
+  // Not a scheme: one past the last real enumerator, used as a count.
+  COMPILE_TIME_NUM_QSCHEMES = 5,
+};
+
+// Namespace-scope shorthands for the QScheme enumerators above.
+constexpr auto kPerTensorAffine = QScheme::PER_TENSOR_AFFINE;
+constexpr auto kPerChannelAffine = QScheme::PER_CHANNEL_AFFINE;
+constexpr auto kPerTensorSymmetric = QScheme::PER_TENSOR_SYMMETRIC;
+constexpr auto kPerChannelSymmetric = QScheme::PER_CHANNEL_SYMMETRIC;
+constexpr auto kPerChannelAffineFloatQParams =
+    QScheme::PER_CHANNEL_AFFINE_FLOAT_QPARAMS;
+// The enumerator count as a plain int.
+constexpr int COMPILE_TIME_NUM_QSCHEMES =
+    static_cast<int>(QScheme::COMPILE_TIME_NUM_QSCHEMES);
+
+// Return the lowercase name of `qscheme`. Values without a name (including
+// COMPILE_TIME_NUM_QSCHEMES) are rejected through TORCH_CHECK, with the
+// integer value in the message.
+inline std::string toString(QScheme qscheme) {
+  // Select the name, then build the string at a single exit point.
+  const char* name = nullptr;
+  switch (qscheme) {
+    case kPerTensorAffine:
+      name = "per_tensor_affine";
+      break;
+    case kPerChannelAffine:
+      name = "per_channel_affine";
+      break;
+    case kPerTensorSymmetric:
+      name = "per_tensor_symmetric";
+      break;
+    case kPerChannelSymmetric:
+      name = "per_channel_symmetric";
+      break;
+    case kPerChannelAffineFloatQParams:
+      name = "per_channel_affine_float_qparams";
+      break;
+    default:
+      TORCH_CHECK(false, "Unrecognized qscheme: ", static_cast<int>(qscheme));
+  }
+  return name;
+}
+
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b1e9ca7071a032e6a383dc539b8010af535471b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/RefcountedDeleter.h
@@ -0,0 +1,57 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Storage.h>
+#include <c10/macros/Export.h>
+#include <c10/util/UniqueVoidPtr.h>
+
+#include <atomic>
+#include <memory>
+
+namespace c10 {
+
+// A RefcountedDeleterContext object is used as the `ctx` argument for DataPtr
+// to implement a shared DataPtr. Normally, a DataPtr is unique, but we use
+// this custom context and the `refcounted_deleter` function below to make the
+// DataPtr act like a non-unique DataPtr. This context object holds onto an
+// inner context and deleter function which handle the actual deletion of the
+// data when the refcount reaches 0.
+//
+// This shared DataPtr feature is only used when storages are shared between
+// multiple Python interpreters in MultiPy. // codespell:ignore multipy
+// Before storages had PyObject preservation, interpreters could just share the
+// same StorageImpl instance. But now a StorageImpl can only be associated with
+// one interpreter in order to properly manage a zombie PyObject. So we share
+// storages across Python interpreters by creating a different StorageImpl
+// instance for each one, but they all point to the same data.
+struct C10_API RefcountedDeleterContext {
+  // Take ownership of the inner context `other_ctx` together with the deleter
+  // that releases it, and start the reference count at 1.
+  RefcountedDeleterContext(void* other_ctx, c10::DeleterFnPtr other_deleter)
+      : other_ctx(other_ctx, other_deleter), refcount(1) {}
+
+  // Inner context; released through its stored deleter when this object is
+  // destroyed.
+  std::unique_ptr<void, c10::DeleterFnPtr> other_ctx;
+  // Reference count for this context; initialised to 1 by the constructor.
+  std::atomic_int refcount;
+};
+
+// `refcounted_deleter` is used as the `ctx_deleter` for DataPtr to implement
+// a shared DataPtr.
+//
+// Warning: This should only be called on a pointer to
+// a RefcountedDeleterContext that was allocated on the heap with `new`,
+// because when the refcount reaches 0, the context is deleted with `delete`
+C10_API void refcounted_deleter(void* ctx_);
+
+// If the storage's DataPtr does not use `refcounted_deleter`, replace it with
+// a DataPtr that does, so it can be shared between multiple StorageImpls
+C10_API void maybeApplyRefcountedDeleter(const c10::Storage& storage);
+
+// Create a new StorageImpl that points to the same data. If the original
+// StorageImpl's DataPtr does not use `refcounted_deleter`, it will be replaced
+// with one that does
+C10_API c10::Storage newStorageImplFromRefcountedDataPtr(
+    const c10::Storage& storage);
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf8eee0e004b5e49c39d9718736df1099769ef24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SafePyObject.h
@@ -0,0 +1,125 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/macros/Export.h>
+#include <c10/util/python_stub.h>
+#include <utility>
+
+namespace c10 {
+
+// This is a safe owning holder for a PyObject, akin to pybind11's
+// py::object, with two major differences:
+//
+//  - It is in c10/core; i.e., you can use this type in contexts where
+//    you do not have a libpython dependency
+//
+//  - It is multi-interpreter safe (ala torchdeploy); when you fetch
+//    the underlying PyObject* you are required to specify what the current
+//    interpreter context is and we will check that you match it.
+//
+// It is INVALID to store a reference to a Tensor object in this way;
+// you should just use TensorImpl directly in that case!
+struct C10_API SafePyObject {
+  // Steals a reference to data
+  SafePyObject(PyObject* data, c10::impl::PyInterpreter* pyinterpreter)
+      : data_(data), pyinterpreter_(pyinterpreter) {}
+  // Move: take over other's reference and leave other holding nullptr, so its
+  // destructor does not decref. The interpreter pointer is copied.
+  SafePyObject(SafePyObject&& other) noexcept
+      : data_(std::exchange(other.data_, nullptr)),
+        pyinterpreter_(other.pyinterpreter_) {}
+  // For now it's not used, so we just disallow it.
+  SafePyObject& operator=(SafePyObject&&) = delete;
+
+  // Copy: share the same PyObject and add one reference for the new holder.
+  SafePyObject(SafePyObject const& other)
+      : data_(other.data_), pyinterpreter_(other.pyinterpreter_) {
+    if (data_ != nullptr) {
+      (*pyinterpreter_)->incref(data_);
+    }
+  }
+
+  // Copy-assign: the incoming object is incref'ed before the current one is
+  // decref'ed, each through its own interpreter.
+  SafePyObject& operator=(SafePyObject const& other) {
+    if (this == &other) {
+      return *this; // Handle self-assignment
+    }
+    if (other.data_ != nullptr) {
+      (*other.pyinterpreter_)->incref(other.data_);
+    }
+    if (data_ != nullptr) {
+      (*pyinterpreter_)->decref(data_);
+    }
+    data_ = other.data_;
+    pyinterpreter_ = other.pyinterpreter_;
+    return *this;
+  }
+
+  // Drop the held reference, if any (nothing to do after a move or release).
+  ~SafePyObject() {
+    if (data_ != nullptr) {
+      (*pyinterpreter_)->decref(data_);
+    }
+  }
+
+  // The interpreter this object was created with.
+  c10::impl::PyInterpreter& pyinterpreter() const {
+    return *pyinterpreter_;
+  }
+  // Defined out of line; takes the interpreter the caller is running under.
+  PyObject* ptr(const c10::impl::PyInterpreter* /*interpreter*/) const;
+
+  // stop tracking the current object, and return it
+  // (no decref happens here; the destructor then sees nullptr and skips it)
+  PyObject* release() {
+    auto rv = data_;
+    data_ = nullptr;
+    return rv;
+  }
+
+ private:
+  // Held object, or nullptr once moved from or released.
+  PyObject* data_;
+  // Interpreter whose incref/decref hooks are used for data_.
+  c10::impl::PyInterpreter* pyinterpreter_;
+};
+
+// A newtype wrapper around SafePyObject for type safety when a python object
+// represents a specific type. Note that `T` is only used as a tag and isn't
+// actually used for any true purpose.
+// Tagged owning holder: behaves like SafePyObject, but distinct `T` tags give
+// distinct C++ types. Move-constructible only; copying and assignment are
+// disabled.
+template <typename T>
+struct SafePyObjectT : private SafePyObject {
+  // Steals a reference to `data`, exactly like the SafePyObject constructor
+  // it forwards to.
+  SafePyObjectT(PyObject* data, c10::impl::PyInterpreter* pyinterpreter)
+      : SafePyObject(data, pyinterpreter) {}
+  ~SafePyObjectT() = default;
+  // `other` is a named (lvalue) expression here, so it has to be cast back to
+  // an rvalue. Without std::move the base *copy* constructor is selected,
+  // which increfs instead of transferring ownership.
+  SafePyObjectT(SafePyObjectT&& other) noexcept
+      : SafePyObject(std::move(other)) {}
+  SafePyObjectT(SafePyObjectT const&) = delete;
+  SafePyObjectT& operator=(SafePyObjectT const&) = delete;
+  SafePyObjectT& operator=(SafePyObjectT&&) = delete;
+
+  // Re-expose the parts of the privately inherited interface callers need.
+  using SafePyObject::ptr;
+  using SafePyObject::pyinterpreter;
+  using SafePyObject::release;
+};
+
+// Like SafePyObject, but non-owning.  Good for references to global PyObjects
+// that will be leaked on interpreter exit.  You get a copy constructor/assign
+// this way.
+struct C10_API SafePyHandle {
+  // Empty handle: no object and no interpreter.
+  SafePyHandle() : data_(nullptr), pyinterpreter_(nullptr) {}
+  // Wrap `data` as-is; no reference count is taken here or released later.
+  SafePyHandle(PyObject* data, c10::impl::PyInterpreter* pyinterpreter)
+      : data_(data), pyinterpreter_(pyinterpreter) {}
+
+  // Dereferences the stored interpreter pointer, which is null for a
+  // default-constructed or reset() handle.
+  c10::impl::PyInterpreter& pyinterpreter() const {
+    return *pyinterpreter_;
+  }
+  // Defined out of line; takes the interpreter the caller is running under.
+  PyObject* ptr(const c10::impl::PyInterpreter* /*interpreter*/) const;
+  // Return the handle to the empty state.
+  void reset() {
+    data_ = nullptr;
+    pyinterpreter_ = nullptr;
+  }
+  // True when an object pointer is held.
+  // NOTE(review): not const-qualified, so it cannot be used on a const
+  // SafePyHandle -- confirm whether that is intended.
+  operator bool() {
+    return data_;
+  }
+
+ private:
+  // Borrowed object pointer, or nullptr when empty.
+  PyObject* data_;
+  // Interpreter associated with data_, or nullptr when empty.
+  c10::impl::PyInterpreter* pyinterpreter_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..863a993ed08a614ca4526fee426ebd46f5633be0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Scalar.h
@@ -0,0 +1,471 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+#include <c10/core/OptionalRef.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/SymBool.h>
+#include <c10/core/SymFloat.h>
+#include <c10/core/SymInt.h>
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Half.h>
+#include <c10/util/TypeCast.h>
+#include <c10/util/complex.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/overflows.h>
+
+namespace c10 {
+
+/**
+ * Scalar represents a 0-dimensional tensor which contains a single element.
+ * Unlike a tensor, numeric literals (in C++) are implicitly convertible to
+ * Scalar (which is why, for example, we provide both add(Tensor) and
+ * add(Scalar) overloads for many operations). It may also be used when you
+ * statically know a tensor is 0-dim and single size, but don't know its type.
+ * The value is stored in the union `v`; `tag` records which member is active.
+ */
+class C10_API Scalar {
+ public:
+  Scalar() : Scalar(int64_t(0)) {} // default value is the integer 0
+  // Decrements the refcount behind v.p; only symbolic tags store a pointer.
+  void destroy() {
+    if (Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag) {
+      raw::intrusive_ptr::decref(v.p);
+      v.p = nullptr;
+    }
+  }
+
+  ~Scalar() {
+    destroy();
+  }
+  // Implicit ctors forward to the private tag-selecting Scalar(vv, bool).
+#define DEFINE_IMPLICIT_CTOR(type, name) \
+  Scalar(type vv) : Scalar(vv, true) {}
+
+  AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR)
+
+  // Helper constructors to allow Scalar creation from long and long long types
+  // As std::is_same_v<long, long long> is false(except Android), one needs to
+  // provide a constructor from either long or long long in addition to one from
+  // int64_t
+#if defined(__APPLE__) || defined(__MACOSX)
+  static_assert(
+      std::is_same_v<long long, int64_t>,
+      "int64_t is the same as long long on MacOS");
+  Scalar(long vv) : Scalar(vv, true) {}
+#endif
+#if defined(_MSC_VER)
+  static_assert(
+      std::is_same_v<long long, int64_t>,
+      "int64_t is the same as long long on Windows");
+  Scalar(long vv) : Scalar(vv, true) {}
+#endif
+#if defined(__linux__) && !defined(__ANDROID__)
+  static_assert(
+      sizeof(void*) != 8 || std::is_same_v<long, int64_t>,
+      "int64_t is the same as long on 64 bit Linux");
+#if LONG_MAX != INT_MAX
+  Scalar(long long vv) : Scalar(vv, true) {}
+#endif /* not 32-bit system */
+#endif
+
+  Scalar(uint16_t vv) : Scalar(vv, true) {}
+  Scalar(uint32_t vv) : Scalar(vv, true) {}
+  Scalar(uint64_t vv) {
+    if (vv > static_cast<uint64_t>(INT64_MAX)) { // too large for int64_t
+      tag = Tag::HAS_u;
+      v.u = vv;
+    } else {
+      tag = Tag::HAS_i;
+      // NB: no need to use convert, we've already tested convertibility
+      v.i = static_cast<int64_t>(vv);
+    }
+  }
+
+#undef DEFINE_IMPLICIT_CTOR
+
+  // Value* is both implicitly convertible to SymbolicVariable and bool which
+  // causes ambiguity error. Specialized constructor for bool resolves this
+  // problem.
+  template <
+      typename T,
+      typename std::enable_if_t<std::is_same_v<T, bool>, bool>* = nullptr>
+  Scalar(T vv) : tag(Tag::HAS_b) {
+    v.i = convert<int64_t, bool>(vv); // booleans are stored in v.i
+  }
+
+  template <
+      typename T,
+      typename std::enable_if_t<std::is_same_v<T, c10::SymBool>, bool>* =
+          nullptr>
+  Scalar(T vv) : tag(Tag::HAS_sb) {
+    v.i = convert<int64_t, c10::SymBool>(vv);
+  }
+
+#define DEFINE_ACCESSOR(type, name)                                   \
+  type to##name() const {                                             \
+    if (Tag::HAS_d == tag) {                                          \
+      return checked_convert<type, double>(v.d, #type);               \
+    } else if (Tag::HAS_z == tag) {                                   \
+      return checked_convert<type, c10::complex<double>>(v.z, #type); \
+    } else if (Tag::HAS_sd == tag) {                                  \
+      return checked_convert<type, double>(                           \
+          toSymFloat().guard_float(__FILE__, __LINE__), #type);       \
+    }                                                                 \
+    if (Tag::HAS_b == tag) {                                          \
+      return checked_convert<type, bool>(v.i, #type);                 \
+    } else if (Tag::HAS_i == tag) {                                   \
+      return checked_convert<type, int64_t>(v.i, #type);              \
+    } else if (Tag::HAS_u == tag) {                                   \
+      return checked_convert<type, uint64_t>(v.u, #type);             \
+    } else if (Tag::HAS_si == tag) {                                  \
+      return checked_convert<type, int64_t>(                          \
+          toSymInt().guard_int(__FILE__, __LINE__), #type);           \
+    } else if (Tag::HAS_sb == tag) {                                  \
+      return checked_convert<type, int64_t>(                          \
+          toSymBool().guard_bool(__FILE__, __LINE__), #type);         \
+    }                                                                 \
+    TORCH_CHECK(false)                                                \
+  }
+
+  // TODO: Support ComplexHalf accessor
+  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ACCESSOR)
+  DEFINE_ACCESSOR(uint16_t, UInt16)
+  DEFINE_ACCESSOR(uint32_t, UInt32)
+  DEFINE_ACCESSOR(uint64_t, UInt64)
+
+#undef DEFINE_ACCESSOR
+  // toSym*: return the held symbolic node, else wrap the concrete value.
+  SymInt toSymInt() const {
+    if (Tag::HAS_si == tag) {
+      return c10::SymInt(intrusive_ptr<SymNodeImpl>::reclaim_copy(
+          static_cast<SymNodeImpl*>(v.p)));
+    } else {
+      return toLong();
+    }
+  }
+
+  SymFloat toSymFloat() const {
+    if (Tag::HAS_sd == tag) {
+      return c10::SymFloat(intrusive_ptr<SymNodeImpl>::reclaim_copy(
+          static_cast<SymNodeImpl*>(v.p)));
+    } else {
+      return toDouble();
+    }
+  }
+
+  SymBool toSymBool() const {
+    if (Tag::HAS_sb == tag) {
+      return c10::SymBool(intrusive_ptr<SymNodeImpl>::reclaim_copy(
+          static_cast<SymNodeImpl*>(v.p)));
+    } else {
+      return toBool();
+    }
+  }
+
+  // also support scalar.to<int64_t>();
+  // Deleted for unsupported types, but specialized below for supported types
+  template <typename T>
+  T to() const = delete;
+
+  // audit uses of data_ptr
+  const void* data_ptr() const {
+    TORCH_INTERNAL_ASSERT(!isSymbolic());
+    return static_cast<const void*>(&v); // address of the value union
+  }
+
+  bool isFloatingPoint() const {
+    return Tag::HAS_d == tag || Tag::HAS_sd == tag;
+  }
+
+  [[deprecated(
+      "isIntegral is deprecated. Please use the overload with 'includeBool' parameter instead.")]] bool
+  isIntegral() const {
+    return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag;
+  }
+
+  bool isIntegral(bool includeBool) const {
+    return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag ||
+        (includeBool && isBoolean());
+  }
+
+  // See Note [Meaning of HAS_u]
+  bool isUnsigned() const {
+    return Tag::HAS_u == tag || (Tag::HAS_i == tag && v.i >= 0);
+  }
+
+  bool isComplex() const {
+    return Tag::HAS_z == tag;
+  }
+  bool isBoolean() const {
+    return Tag::HAS_b == tag || Tag::HAS_sb == tag;
+  }
+
+  // you probably don't actually want these; they're mostly for testing
+  bool isSymInt() const {
+    return Tag::HAS_si == tag;
+  }
+  bool isSymFloat() const {
+    return Tag::HAS_sd == tag;
+  }
+  bool isSymBool() const {
+    return Tag::HAS_sb == tag;
+  }
+
+  bool isSymbolic() const {
+    return Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag;
+  }
+
+  C10_ALWAYS_INLINE Scalar& operator=(Scalar&& other) noexcept {
+    if (&other == this) {
+      return *this;
+    }
+
+    destroy(); // release what this Scalar currently holds first
+    moveFrom(std::move(other));
+    return *this;
+  }
+
+  C10_ALWAYS_INLINE Scalar& operator=(const Scalar& other) {
+    if (&other == this) {
+      return *this;
+    }
+
+    *this = Scalar(other); // copy-construct, then move-assign the temporary
+    return *this;
+  }
+
+  Scalar operator-() const;
+  Scalar conj() const;
+  Scalar log() const;
+  // equal(): value comparison; a boolean Scalar never equals a non-bool `num`.
+  template <
+      typename T,
+      typename std::enable_if_t<!c10::is_complex<T>::value, int> = 0>
+  bool equal(T num) const {
+    if (isComplex()) {
+      TORCH_INTERNAL_ASSERT(!isSymbolic());
+      auto val = v.z;
+      return (val.real() == num) && (val.imag() == T());
+    } else if (isFloatingPoint()) {
+      return toDouble() == num;
+    } else if (tag == Tag::HAS_i) {
+      if (overflows<T>(v.i, /* strict_unsigned */ true)) {
+        return false;
+      } else {
+        return static_cast<T>(v.i) == num;
+      }
+    } else if (tag == Tag::HAS_u) {
+      if (overflows<T>(v.u, /* strict_unsigned */ true)) {
+        return false;
+      } else {
+        return static_cast<T>(v.u) == num;
+      }
+    } else if (tag == Tag::HAS_si) {
+      TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality");
+    } else if (isBoolean()) {
+      // boolean scalar does not equal to a non boolean value
+      TORCH_INTERNAL_ASSERT(!isSymbolic());
+      return false;
+    } else {
+      TORCH_INTERNAL_ASSERT(false);
+    }
+  }
+
+  template <
+      typename T,
+      typename std::enable_if_t<c10::is_complex<T>::value, int> = 0>
+  bool equal(T num) const {
+    if (isComplex()) {
+      TORCH_INTERNAL_ASSERT(!isSymbolic());
+      return v.z == num;
+    } else if (isFloatingPoint()) {
+      return (toDouble() == num.real()) && (num.imag() == T());
+    } else if (tag == Tag::HAS_i) {
+      if (overflows<T>(v.i, /* strict_unsigned */ true)) {
+        return false;
+      } else {
+        return static_cast<T>(v.i) == num.real() && num.imag() == T();
+      }
+    } else if (tag == Tag::HAS_u) {
+      if (overflows<T>(v.u, /* strict_unsigned */ true)) {
+        return false;
+      } else {
+        return static_cast<T>(v.u) == num.real() && num.imag() == T();
+      }
+    } else if (tag == Tag::HAS_si) {
+      TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality");
+    } else if (isBoolean()) {
+      // boolean scalar does not equal to a non boolean value
+      TORCH_INTERNAL_ASSERT(!isSymbolic());
+      return false;
+    } else {
+      TORCH_INTERNAL_ASSERT(false);
+    }
+  }
+
+  bool equal(bool num) const {
+    if (isBoolean()) {
+      TORCH_INTERNAL_ASSERT(!isSymbolic());
+      return static_cast<bool>(v.i) == num;
+    } else {
+      return false; // a non-boolean Scalar never equals a bool
+    }
+  }
+  // Coarse dtype of the held value: ComplexDouble, Double, Long/UInt64 or Bool.
+  ScalarType type() const {
+    if (isComplex()) {
+      return ScalarType::ComplexDouble;
+    } else if (isFloatingPoint()) {
+      return ScalarType::Double;
+    } else if (isIntegral(/*includeBool=*/false)) {
+      // Represent all integers as long, UNLESS it is unsigned and therefore
+      // unrepresentable as long
+      if (Tag::HAS_u == tag) {
+        return ScalarType::UInt64;
+      }
+      return ScalarType::Long;
+    } else if (isBoolean()) {
+      return ScalarType::Bool;
+    } else {
+      TORCH_CHECK(false, "Unknown scalar type.");
+    }
+  }
+
+  Scalar(Scalar&& rhs) noexcept : tag(rhs.tag) {
+    moveFrom(std::move(rhs));
+  }
+
+  Scalar(const Scalar& rhs) : tag(rhs.tag), v(rhs.v) {
+    if (isSymbolic()) {
+      c10::raw::intrusive_ptr::incref(v.p); // the copy shares the same node
+    }
+  }
+  // Sym* ctors: store the concrete value when there is one, else the node.
+  Scalar(c10::SymInt si) {
+    if (auto m = si.maybe_as_int()) {
+      tag = Tag::HAS_i;
+      v.i = *m;
+    } else {
+      tag = Tag::HAS_si;
+      v.p = std::move(si).release();
+    }
+  }
+
+  Scalar(c10::SymFloat sd) {
+    if (sd.is_symbolic()) {
+      tag = Tag::HAS_sd;
+      v.p = std::move(sd).release();
+    } else {
+      tag = Tag::HAS_d;
+      v.d = sd.as_float_unchecked();
+    }
+  }
+
+  Scalar(c10::SymBool sb) {
+    if (auto m = sb.maybe_as_bool()) {
+      tag = Tag::HAS_b;
+      v.i = *m;
+    } else {
+      tag = Tag::HAS_sb;
+      v.p = std::move(sb).release();
+    }
+  }
+
+  // We can't set v in the initializer list using the
+  // syntax v{ .member = ... } because it doesn't work on MSVC
+ private:
+  enum class Tag { HAS_d, HAS_i, HAS_u, HAS_z, HAS_b, HAS_sd, HAS_si, HAS_sb };
+  // d: v.d, i/b: v.i, u: v.u, z: v.z; the symbolic sd/si/sb tags use v.p.
+  // Note [Meaning of HAS_u]
+  // ~~~~~~~~~~~~~~~~~~~~~~~
+  // HAS_u is a bit special.  On its face, it just means that we
+  // are holding an unsigned integer.  However, we generally don't
+  // distinguish between different bit sizes in Scalar (e.g., we represent
+  // float as double), instead, it represents a mathematical notion
+  // of some quantity (integral versus floating point).  So actually,
+  // HAS_u is used solely to represent unsigned integers that could
+  // not be represented as a signed integer.  That means only uint64_t
+  // potentially can get this tag; smaller types like uint8_t fits into a
+  // regular int and so for BC reasons we keep as an int.
+
+  // NB: assumes that self has already been cleared
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+  C10_ALWAYS_INLINE void moveFrom(Scalar&& rhs) noexcept {
+    v = rhs.v;
+    tag = rhs.tag;
+    if (rhs.tag == Tag::HAS_si || rhs.tag == Tag::HAS_sd ||
+        rhs.tag == Tag::HAS_sb) {
+      // Move out of scalar
+      rhs.tag = Tag::HAS_i; // rhs becomes integer 0, so destroy() skips it
+      rhs.v.i = 0;
+    }
+  }
+
+  Tag tag;
+
+  union v_t {
+    double d{};
+    int64_t i;
+    // See Note [Meaning of HAS_u]
+    uint64_t u;
+    c10::complex<double> z;
+    c10::intrusive_ptr_target* p;
+    // NOLINTNEXTLINE(modernize-use-equals-default)
+    v_t() {} // default constructor
+  } v;
+  // Private tagged constructors; the unused bool only disambiguates overloads.
+  template <
+      typename T,
+      typename std::enable_if_t<
+          std::is_integral_v<T> && !std::is_same_v<T, bool>,
+          bool>* = nullptr>
+  Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_i) {
+    v.i = convert<decltype(v.i), T>(vv);
+  }
+
+  template <
+      typename T,
+      typename std::enable_if_t<
+          !std::is_integral_v<T> && !c10::is_complex<T>::value,
+          bool>* = nullptr>
+  Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_d) {
+    v.d = convert<decltype(v.d), T>(vv);
+  }
+
+  template <
+      typename T,
+      typename std::enable_if_t<c10::is_complex<T>::value, bool>* = nullptr>
+  Scalar(T vv, bool /*unused*/) : tag(Tag::HAS_z) {
+    v.z = convert<decltype(v.z), T>(vv);
+  }
+};
+
+using OptionalScalarRef = c10::OptionalRef<Scalar>;
+
+// Define the Scalar::to<T>() specializations; each forwards to to##name().
+#define DEFINE_TO(T, name)         \
+  template <>                      \
+  inline T Scalar::to<T>() const { \
+    return to##name();             \
+  }
+AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_TO)
+DEFINE_TO(uint16_t, UInt16)
+DEFINE_TO(uint32_t, UInt32)
+DEFINE_TO(uint64_t, UInt64)
+#undef DEFINE_TO
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h
new file mode 100644
index 0000000000000000000000000000000000000000..b678a22630d3d9e625b62149a580b3a0b3bbed9a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarType.h
@@ -0,0 +1,285 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Float4_e2m1fn_x2.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
+#include <c10/util/Half.h>
+#include <c10/util/bits.h>
+#include <c10/util/complex.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint2x4.h>
+#include <c10/util/quint4x2.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+#include <cstddef>
+#include <limits>
+#include <ostream>
+#include <type_traits>
+#include <unordered_map>
+
+#include <torch/headeronly/core/ScalarType.h>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
+
+namespace c10 {
+
+// See [dtype Macros note] in torch/headeronly/core/ScalarType.h
+// regarding macros.
+
+#define DEFINE_CONSTANT(_, name) \
+  constexpr ScalarType k##name = ScalarType::name;
+// Emits one `constexpr ScalarType k<name>` per dtype the list macro names.
+// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
+AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT)
+#undef DEFINE_CONSTANT
+
+inline size_t elementSize(ScalarType t) {
+#define SIZEOF_DTYPE_CASE(cpp_type, dtype) \
+  case ScalarType::dtype:                  \
+    return sizeof(cpp_type);
+  // One case per listed dtype: the answer is sizeof() of its C++ type.
+  switch (t) {
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SIZEOF_DTYPE_CASE)
+    default:
+      TORCH_CHECK(false, "Unknown ScalarType");
+  }
+#undef SIZEOF_DTYPE_CASE
+}
+
+inline bool isIntegralType(ScalarType t, bool includeBool) {
+  if (t == ScalarType::Bool) {
+    return includeBool; // Bool counts only when the caller opts in
+  }
+  return t == ScalarType::Byte || t == ScalarType::Char ||
+      t == ScalarType::Short || t == ScalarType::Int || t == ScalarType::Long ||
+      t == ScalarType::UInt16 || t == ScalarType::UInt32 ||
+      t == ScalarType::UInt64;
+}
+
+[[deprecated(
+    "isIntegralType is deprecated. Please use the overload with 'includeBool' parameter instead.")]] inline bool
+isIntegralType(ScalarType t) {
+  return isIntegralType(t, false); // legacy form: Bool is never integral
+}
+
+inline bool isFloat8Type(ScalarType t) {
+  return t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz ||
+      t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz ||
+      t == ScalarType::Float8_e8m0fnu; // the five Float8_* dtypes
+}
+
+inline bool isReducedFloatingType(ScalarType t) {
+  return isFloat8Type(t) || t == ScalarType::Float4_e2m1fn_x2 ||
+      t == ScalarType::BFloat16 || t == ScalarType::Half;
+}
+
+inline bool isFloatingType(ScalarType t) {
+  return isReducedFloatingType(t) || t == ScalarType::Float ||
+      t == ScalarType::Double;
+}
+
+inline bool isComplexType(ScalarType t) {
+  // Matches the three complex dtypes and nothing else.
+  return t == ScalarType::ComplexDouble || t == ScalarType::ComplexFloat ||
+      t == ScalarType::ComplexHalf;
+}
+
+inline bool isBitsType(ScalarType t) {
+  return t == ScalarType::Bits16 || t == ScalarType::Bits8 ||
+      t == ScalarType::Bits4x2 || t == ScalarType::Bits2x4 ||
+      t == ScalarType::Bits1x8; // the five Bits* dtypes
+}
+
+inline bool isBarebonesUnsignedType(ScalarType t) {
+  // UInt1..UInt7 plus the 16/32/64-bit unsigned dtypes; Byte is not listed.
+  return t == ScalarType::UInt64 || t == ScalarType::UInt32 ||
+      t == ScalarType::UInt16 || t == ScalarType::UInt7 ||
+      t == ScalarType::UInt6 || t == ScalarType::UInt5 || t == ScalarType::UInt4 ||
+      t == ScalarType::UInt3 || t == ScalarType::UInt2 || t == ScalarType::UInt1;
+}
+
+inline ScalarType toQIntType(ScalarType t) {
+  // Only Byte, Char and Int are mapped to a quantized dtype here; every other
+  // dtype is handed back unchanged.
+  if (t == ScalarType::Byte) {
+    return ScalarType::QUInt8;
+  }
+  if (t == ScalarType::Char) {
+    return ScalarType::QInt8;
+  }
+  const bool is_int = t == ScalarType::Int;
+  return is_int ? ScalarType::QInt32 : t;
+}
+
+inline bool isSignedType(ScalarType t) {
+#define CASE_ISSIGNED(name)     \
+  case ScalarType::name:        \
+    return std::numeric_limits< \
+        ::c10::impl::ScalarTypeToCPPTypeT<ScalarType::name>>::is_signed;
+  // CASE_ISSIGNED answers via std::numeric_limits of the dtype's C++ type.
+  // TODO(#146647): If we expect to have numeric_limits for everything,
+  // let's just have a big macro for the whole thing.
+  // If we're hardcoding it, let's just use the macro and a "true"/"false"
+  // below?
+  switch (t) {
+    case ScalarType::QInt8:
+    case ScalarType::QUInt8:
+    case ScalarType::QInt32:
+    case ScalarType::QUInt4x2:
+    case ScalarType::QUInt2x4:
+      TORCH_CHECK(false, "isSignedType not supported for quantized types");
+    case ScalarType::Bits1x8:
+    case ScalarType::Bits2x4:
+    case ScalarType::Bits4x2:
+    case ScalarType::Bits8:
+    case ScalarType::Bits16:
+      TORCH_CHECK(false, "Bits types are undefined");
+      CASE_ISSIGNED(UInt16);
+      CASE_ISSIGNED(UInt32);
+      CASE_ISSIGNED(UInt64);
+      CASE_ISSIGNED(BFloat16);
+      CASE_ISSIGNED(Float8_e5m2);
+      CASE_ISSIGNED(Float8_e5m2fnuz);
+      CASE_ISSIGNED(Float8_e4m3fn);
+      CASE_ISSIGNED(Float8_e4m3fnuz);
+      CASE_ISSIGNED(Float8_e8m0fnu);
+      CASE_ISSIGNED(Byte);
+      CASE_ISSIGNED(Char);
+      CASE_ISSIGNED(Short);
+      CASE_ISSIGNED(Int);
+      CASE_ISSIGNED(Long);
+      CASE_ISSIGNED(Half);
+      CASE_ISSIGNED(Float);
+      CASE_ISSIGNED(Double);
+      CASE_ISSIGNED(ComplexHalf);
+      CASE_ISSIGNED(ComplexFloat);
+      CASE_ISSIGNED(ComplexDouble);
+      CASE_ISSIGNED(Bool);
+    case ScalarType::Int1:
+    case ScalarType::Int2:
+    case ScalarType::Int3:
+    case ScalarType::Int4:
+    case ScalarType::Int5:
+    case ScalarType::Int6:
+    case ScalarType::Int7:
+    case ScalarType::Float4_e2m1fn_x2:
+      return true; // hardcoded: Int1..Int7 and Float4_e2m1fn_x2
+    case ScalarType::UInt1:
+    case ScalarType::UInt2:
+    case ScalarType::UInt3:
+    case ScalarType::UInt4:
+    case ScalarType::UInt5:
+    case ScalarType::UInt6:
+    case ScalarType::UInt7:
+      return false; // hardcoded: UInt1..UInt7
+    case ScalarType::Undefined:
+    case ScalarType::NumOptions:
+      break; // reaches the "Unknown ScalarType" check after the switch
+      // Do not add default here, but rather define behavior of every new entry
+      // here.  `-Wswitch-enum` would raise a warning in those cases.
+      // TODO: get PyTorch to adopt exhaustive switches by default with a way to
+      // opt specific switches to being non-exhaustive.
+      // Exhaustive:
+      // `-Wswitch-enum`, `-Wswitch-default`, `-Wno-covered-switch-default`
+      // Non-Exhaustive:
+      // `-Wno-switch-enum`, `-Wswitch-default`, `-Wcovered-switch-default`
+  }
+  TORCH_CHECK(false, "Unknown ScalarType ", t);
+#undef CASE_ISSIGNED
+}
+
+inline bool isUnderlying(ScalarType type, ScalarType qtype) {
+  return toUnderlying(qtype) == type; // does qtype's underlying dtype match?
+}
+
+inline ScalarType toRealValueType(ScalarType t) {
+  // ComplexHalf/ComplexFloat/ComplexDouble map to Half/Float/Double; any
+  // other dtype is returned as-is.
+  if (t == ScalarType::ComplexDouble) {
+    return ScalarType::Double;
+  }
+  if (t == ScalarType::ComplexFloat) {
+    return ScalarType::Float;
+  }
+  const bool is_complex_half = t == ScalarType::ComplexHalf;
+  return is_complex_half ? ScalarType::Half : t;
+}
+
+inline ScalarType toComplexType(ScalarType t) {
+  // Half/BFloat16/Float/Double map to a complex dtype, complex dtypes map to
+  // themselves, and anything else is rejected.
+  switch (t) {
+    case ScalarType::Half:
+    case ScalarType::ComplexHalf:
+      return ScalarType::ComplexHalf;
+
+    case ScalarType::BFloat16:
+      // BFloat16 has range equivalent to Float,
+      // so we map it to ComplexFloat.
+    case ScalarType::Float:
+    case ScalarType::ComplexFloat:
+      return ScalarType::ComplexFloat;
+
+    case ScalarType::Double:
+    case ScalarType::ComplexDouble:
+      return ScalarType::ComplexDouble;
+    default:
+      TORCH_CHECK(false, "Unknown Complex ScalarType for ", t);
+  }
+}
+
+// see tensor_attributes.rst for detailed explanation and examples
+// of casting rules.
+inline bool canCast(const ScalarType from, const ScalarType to) {
+  // Three kinds of cast are rejected; each is named by one flag below.
+  // We disallow complex -> non complex, e.g., float_tensor *= complex is
+  // disallowed.
+  const bool drops_imaginary = isComplexType(from) && !isComplexType(to);
+
+  // We disallow float -> integral, e.g., int_tensor *= float is disallowed.
+  const bool drops_fraction =
+      isFloatingType(from) && isIntegralType(to, /*includeBool=*/false);
+
+  // Treat bool as a distinct "category," to be consistent with type promotion
+  // rules (e.g. `bool_tensor + 5 -> int64_tensor`). If `5` was in the same
+  // category as `bool_tensor`, we would not promote. Differing categories
+  // implies `bool_tensor += 5` is disallowed.
+  //
+  // NB: numpy distinguishes "unsigned" as a category to get the desired
+  // `bool_tensor + 5 -> int64_tensor` behavior. We don't, because:
+  // * We don't want the performance hit of checking the runtime sign of
+  // Scalars.
+  // * `uint8_tensor + 5 -> int64_tensor` would be undesirable.
+  const bool narrows_to_bool =
+      from != ScalarType::Bool && to == ScalarType::Bool;
+
+  // Everything else is allowed.
+  return !(drops_imaginary || drops_fraction || narrows_to_bool);
+}
+
+C10_API ScalarType promoteTypes(ScalarType a, ScalarType b);
+
+// Returns a pair of strings representing the names for each dtype.
+// The returned pair is (name, legacy_name_if_applicable)
+C10_API std::pair<std::string, std::string> getDtypeNames(
+    c10::ScalarType scalarType);
+
+// Returns a map of string name to dtype.
+C10_API const std::unordered_map<std::string, ScalarType>& getStringToDtypeMap();
+
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h
new file mode 100644
index 0000000000000000000000000000000000000000..d952b0dd2089207bef2bd3b53d348d6cb667e046
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h
@@ -0,0 +1,62 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/util/Optional.h>
+#include <c10/util/typeid.h>
+
+// these just expose TypeMeta/ScalarType bridge functions in c10
+// TODO move to typeid.h (or codemod away) when TypeMeta et al
+// are moved from caffe2 to c10 (see note at top of typeid.h)
+
+namespace c10 {
+
+/** Convert a ScalarType enum value to its TypeMeta handle. */
+inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) {
+  // Thin forwarder over TypeMeta::fromScalarType().
+  caffe2::TypeMeta meta = caffe2::TypeMeta::fromScalarType(scalar_type);
+  return meta;
+}
+
+/** Convert a TypeMeta handle back to its ScalarType enum value. */
+inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) {
+  // Thin forwarder over TypeMeta::toScalarType().
+  const ScalarType scalar_type = dtype.toScalarType();
+  return scalar_type;
+}
+
+/**
+ * typeMetaToScalarType(), lifted to optional (empty in, empty out)
+ */
+inline std::optional<at::ScalarType> optTypeMetaToScalarType(
+    std::optional<caffe2::TypeMeta> type_meta) {
+  if (type_meta.has_value()) {
+    return type_meta->toScalarType();
+  }
+  return std::nullopt;
+}
+
+/** Convenience: compare a ScalarType against a TypeMeta handle. */
+inline bool operator==(ScalarType t, caffe2::TypeMeta m) {
+  // The comparison itself is delegated to TypeMeta::isScalarType().
+  const bool same_dtype = m.isScalarType(t);
+  return same_dtype;
+}
+
+inline bool operator==(caffe2::TypeMeta m, ScalarType t) {
+  return m.isScalarType(t); // operand order does not matter
+}
+
+inline bool operator!=(ScalarType t, caffe2::TypeMeta m) {
+  return !m.isScalarType(t); // negation of the matching operator==
+}
+
+inline bool operator!=(caffe2::TypeMeta m, ScalarType t) {
+  return !m.isScalarType(t); // negation of the matching operator==
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..203eec24c05e28e413b69dc71fbb0b7be65538a2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Storage.h
@@ -0,0 +1,293 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/core/SymInt.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <c10/util/ExclusivelyOwned.h>
+#include <c10/util/MaybeOwned.h>
+#include <c10/util/UniqueVoidPtr.h>
+#include <c10/util/intrusive_ptr.h>
+#include <cstddef>
+#include <utility>
+
+namespace c10 {
+
+struct Storage;
+
+C10_API bool isSharedStorageAlias(
+    const Storage& storage0,
+    const Storage& storage1);
+
+struct C10_API Storage {
+ public:
+  struct use_byte_size_t {};
+  struct unsafe_borrow_t {
+    explicit unsafe_borrow_t() = default;
+  };
+
+  Storage() = default;
+  Storage(c10::intrusive_ptr<StorageImpl> ptr)
+      : storage_impl_(std::move(ptr)) {}
+
+  // Allocates memory buffer using given allocator and creates a storage with it
+  Storage(
+      use_byte_size_t /*use_byte_size*/,
+      const SymInt& size_bytes,
+      Allocator* allocator = nullptr,
+      bool resizable = false)
+      : storage_impl_(c10::make_intrusive<StorageImpl>(
+            StorageImpl::use_byte_size_t(),
+            size_bytes,
+            allocator,
+            resizable)) {}
+
+  // Creates storage with pre-allocated memory buffer. Allocator is given for
+  // potential future reallocations, however it can be nullptr if the storage
+  // is non-resizable
+  Storage(
+      use_byte_size_t /*use_byte_size*/,
+      size_t size_bytes,
+      at::DataPtr data_ptr,
+      at::Allocator* allocator = nullptr,
+      bool resizable = false)
+      : storage_impl_(c10::make_intrusive<StorageImpl>(
+            StorageImpl::use_byte_size_t(),
+            size_bytes,
+            std::move(data_ptr),
+            allocator,
+            resizable)) {}
+
+  // Creates storage with pre-allocated memory buffer. Allocator is given for
+  // potential future reallocations, however it can be nullptr if the storage
+  // is non-resizable
+  Storage(
+      use_byte_size_t /*use_byte_size*/,
+      SymInt size_bytes,
+      at::DataPtr data_ptr,
+      at::Allocator* allocator = nullptr,
+      bool resizable = false)
+      : storage_impl_(c10::make_intrusive<StorageImpl>(
+            StorageImpl::use_byte_size_t(),
+            std::move(size_bytes),
+            std::move(data_ptr),
+            allocator,
+            resizable)) {}
+
+ protected:
+  explicit Storage(unsafe_borrow_t /*unused*/, const Storage& rhs)
+      : storage_impl_(c10::intrusive_ptr<c10::StorageImpl>::reclaim(
+            rhs.storage_impl_.get())) {}
+
+  friend MaybeOwnedTraits<Storage>;
+
+ public:
+  // Legacy constructor for partially initialized (dtype or memory) storages
+  // that can be temporarily created with Caffe2 APIs. See the note on top of
+  // TensorImpl.h for details.
+  static Storage create_legacy(at::Device device) {
+    auto allocator = GetAllocator(device.type());
+    return Storage(c10::make_intrusive<StorageImpl>(
+        StorageImpl::use_byte_size_t(),
+        0,
+        allocator->allocate(0), // materialize a non-default Device.
+        allocator,
+        true));
+  }
+
+  // Mimic create_legacy, but without requiring a newly-created StorageImpl.
+  void reset_legacy() {
+    TORCH_CHECK(resizable() && allocator());
+    set_nbytes(0);
+    set_data_ptr_noswap(allocator()->allocate(0));
+  }
+
+  // TODO: remove later
+  void set_nbytes(size_t size_bytes) const {
+    storage_impl_->set_nbytes(size_bytes);
+  }
+
+  void set_nbytes(c10::SymInt size_bytes) const {
+    storage_impl_->set_nbytes(std::move(size_bytes));
+  }
+
+  bool resizable() const {
+    return storage_impl_->resizable();
+  }
+
+  size_t nbytes() const {
+    return storage_impl_->nbytes();
+  }
+
+  SymInt sym_nbytes() const {
+    return storage_impl_->sym_nbytes();
+  }
+  // Data accessors. (An earlier note here referred to a get() call for const-correctness; none is used below.)
+
+  const void* data() const { // read-only pointer, forwarded from the impl
+    return storage_impl_->data();
+  }
+
+  void* mutable_data() const { // writable pointer; const applies to this handle only
+    return storage_impl_->mutable_data();
+  }
+
+  at::DataPtr& mutable_data_ptr() const { // writable reference to the impl's DataPtr
+    return storage_impl_->mutable_data_ptr();
+  }
+
+  const at::DataPtr& data_ptr() const { // read-only reference to the impl's DataPtr
+    return storage_impl_->data_ptr();
+  }
+
+  // Returns the previous data_ptr
+  at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) const {
+    return storage_impl_->set_data_ptr(std::move(data_ptr));
+  }
+
+  void set_data_ptr_noswap(at::DataPtr&& data_ptr) const { // replaces the DataPtr without returning the old one
+    storage_impl_->set_data_ptr_noswap(std::move(data_ptr));
+  }
+
+  DeviceType device_type() const { // forwarded from the impl
+    return storage_impl_->device_type();
+  }
+
+  at::Allocator* allocator() const { // allocator pointer held by the impl; reset_legacy() checks it for null
+    return storage_impl_->allocator();
+  }
+
+  at::Device device() const { // forwarded from the impl
+    return storage_impl_->device();
+  }
+
+  StorageImpl* unsafeReleaseStorageImpl() { // hands the raw pointer to the caller via intrusive_ptr::release()
+    return storage_impl_.release();
+  }
+
+  StorageImpl* unsafeGetStorageImpl() const noexcept { // raw pointer from intrusive_ptr::get(); this handle keeps its pointer
+    return storage_impl_.get();
+  }
+
+  c10::weak_intrusive_ptr<StorageImpl> getWeakStorageImpl() const { // weak pointer constructed from storage_impl_
+    return c10::weak_intrusive_ptr<StorageImpl>(storage_impl_);
+  }
+
+  operator bool() const { // result of converting storage_impl_ to bool
+    return storage_impl_;
+  }
+
+  size_t use_count() const { // forwards intrusive_ptr::use_count()
+    return storage_impl_.use_count();
+  }
+
+  inline bool unique() const { // forwards intrusive_ptr::unique()
+    return storage_impl_.unique();
+  }
+
+  // True when both handles hold the same StorageImpl, or when
+  // isSharedStorageAlias() reports the two storages as aliases.
+  bool is_alias_of(const Storage& other) const {
+    if (storage_impl_ == other.storage_impl_) {
+      return true;
+    }
+    return isSharedStorageAlias(*this, other);
+  }
+
+  void UniqueStorageShareExternalPointer( // raw-pointer overload: checks uniqueness, then forwards to the impl
+      void* src,
+      size_t capacity,
+      DeleterFnPtr d = nullptr) {
+    if (!storage_impl_.unique()) { // refuse when the impl is not uniquely held
+      TORCH_CHECK(
+          false,
+          "UniqueStorageShareExternalPointer can only be called when use_count == 1");
+    }
+    storage_impl_->UniqueStorageShareExternalPointer(src, capacity, d);
+  }
+
+  void UniqueStorageShareExternalPointer( // DataPtr overload: checks uniqueness, then forwards to the impl
+      at::DataPtr&& data_ptr,
+      size_t capacity) {
+    if (!storage_impl_.unique()) { // refuse when the impl is not uniquely held
+      TORCH_CHECK(
+          false,
+          "UniqueStorageShareExternalPointer can only be called when use_count == 1");
+    }
+    storage_impl_->UniqueStorageShareExternalPointer(
+        std::move(data_ptr), capacity); // data_ptr is moved into the impl
+  }
+
+ protected:
+  c10::intrusive_ptr<StorageImpl> storage_impl_;
+};
+
+template <>
+struct MaybeOwnedTraits<c10::Storage> { // borrow support for Storage: a borrow is a Storage built with unsafe_borrow_t
+  using owned_type = c10::Storage;
+  using borrow_type = c10::Storage;
+
+  static borrow_type createBorrow(const owned_type& from) { // uses Storage's unsafe_borrow_t constructor
+    return borrow_type(borrow_type::unsafe_borrow_t{}, from);
+  }
+
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.unsafeReleaseStorageImpl(); // give up the old borrowed pointer via release() before rebinding
+    lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs);
+  }
+
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.unsafeReleaseStorageImpl(); // "leak" it, but it was already +0.
+  }
+
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) { // borrow_type and owned_type are the same type
+    return borrow;
+  }
+
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) { // address of the borrow object itself
+    return &borrow;
+  }
+
+  static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { // always true; no validation is performed
+    return true;
+  }
+};
+
+template <>
+struct ExclusivelyOwnedTraits<c10::Storage> { // the representation is a plain c10::Storage value
+  using repr_type = c10::Storage;
+  using pointer_type = c10::Storage*;
+  using const_pointer_type = const c10::Storage*;
+
+  static repr_type nullRepr() { // a default-constructed Storage
+    return c10::Storage();
+  }
+
+  template <class... Args>
+  static repr_type createInPlace(Args&&... args) { // perfect-forwards args to a Storage constructor
+    return c10::Storage(std::forward<Args>(args)...);
+  }
+
+  static repr_type moveToRepr(c10::Storage&& x) { // moves x into the returned value
+    return std::move(x);
+  }
+
+  static c10::Storage take(c10::Storage& x) { // moves out of x; x is left moved-from
+    return std::move(x);
+  }
+
+  static pointer_type getImpl(repr_type& x) { // pointer to the Storage itself, not to its StorageImpl
+    return &x;
+  }
+
+  static const_pointer_type getImpl(const repr_type& x) { // const overload of the above
+    return &x;
+  }
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2acfa40771c5f29fb41565a06dfd6944a1a55ea4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StorageImpl.h
@@ -0,0 +1,398 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/SymInt.h>
+#include <c10/core/impl/COW.h>
+#include <c10/core/impl/COWDeleter.h>
+#include <c10/core/impl/PyObjectSlot.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <c10/util/UniqueVoidPtr.h>
+#include <c10/util/intrusive_ptr.h>
+#include <cstddef>
+#include <utility>
+
+namespace c10 {
+
+[[noreturn]] C10_API void throwNullDataPtrError();
+C10_API void warnDeprecatedDataPtr();
+
+// Extra metadata that StorageImpl allocates on demand (see get_extra_meta()).
+// Currently it only holds a custom error message, stored by
+// release_data_and_set_meta_custom_data_ptr_error_msg_().
+struct C10_API StorageExtraMeta {
+  std::optional<std::string> custom_data_ptr_error_msg_ = std::nullopt; // empty until a message is stored
+};
+
+// A storage represents the underlying backing data buffer for a
+// tensor.  This concept was inherited from the original Torch7
+// codebase; we'd kind of like to get rid of the concept
+// (see https://github.com/pytorch/pytorch/issues/14797) but
+// it's hard work and no one has gotten around to doing it.
+//
+// NB: storage is supposed to uniquely own a data pointer; e.g.,
+// two non-null data pointers alias if and only if they are from
+// the same storage.  Technically you can violate this invariant
+// (e.g., you can create a non-owning StorageImpl with at::from_blob)
+// but a lot of things won't work correctly, including:
+//
+// - An ordinary deleter on such a storage is wrong, because normal deleters
+//   assume unique ownership, but if you have two storages at the same data,
+//   that implies there is some sort of shared ownership. So your deleter would
+//   have to actually be internally doing some sort of refcount thing
+// - Deepcopy in Python side relies on storage equality and not data pointer
+//   equality; so if there are two separate storages pointing to the same data,
+//   the data will actually get duplicated in that case (one data ptr before,
+//   two data ptrs after)
+// - Version counts won't work correctly, because we do all VC tracking at the
+//   level of storages (unless you explicitly disconnect the VC with detach);
+//   mutations that alias only via equal data pointers are totally untracked
+struct C10_API StorageImpl : public c10::intrusive_ptr_target {
+ public:
+  // Tag type selecting the constructors whose size argument is in bytes.
+  struct use_byte_size_t {};
+
+  // Builds a storage around an existing DataPtr.
+  //   size_bytes - size in bytes; may be a heap-allocated (symbolic) SymInt
+  //   data_ptr   - moved in and becomes the storage's DataPtr
+  //   allocator  - stored as-is; must be non-null when `resizable` is true
+  //   resizable  - initial value reported by resizable()
+  StorageImpl(
+      use_byte_size_t /*use_byte_size*/,
+      SymInt size_bytes,
+      at::DataPtr data_ptr,
+      at::Allocator* allocator,
+      bool resizable)
+      : data_ptr_(std::move(data_ptr)),
+        size_bytes_(std::move(size_bytes)),
+        size_bytes_is_heap_allocated_(size_bytes_.is_heap_allocated()),
+        resizable_(resizable),
+        received_cuda_(false),
+        allocator_(allocator) {
+    if (resizable) {
+      TORCH_INTERNAL_ASSERT(
+          allocator_, "For resizable storage, allocator must be provided");
+    }
+    refresh_has_data_ptr_check();
+  }
+
+  // Builds a storage whose DataPtr is obtained from `allocator`: size_bytes
+  // bytes when the size is concrete, zero bytes when it is heap-allocated.
+  StorageImpl(
+      use_byte_size_t /*use_byte_size*/,
+      const SymInt& size_bytes,
+      at::Allocator* allocator,
+      bool resizable)
+      : StorageImpl(
+            use_byte_size_t(),
+            size_bytes,
+            size_bytes.is_heap_allocated()
+                ? allocator->allocate(0)
+                : allocator->allocate(size_bytes.as_int_unchecked()),
+            allocator,
+            resizable) {}
+
+  // Not copyable, movable, or default-constructible.
+  StorageImpl& operator=(StorageImpl&& other) = delete;
+  StorageImpl& operator=(const StorageImpl&) = delete;
+  StorageImpl() = delete;
+  StorageImpl(StorageImpl&& other) = delete;
+  StorageImpl(const StorageImpl&) = delete;
+  ~StorageImpl() override = default;
+
+  // Clears the DataPtr and sets the size back to a concrete zero.
+  void reset() {
+    data_ptr_.clear();
+    size_bytes_ = 0;
+    size_bytes_is_heap_allocated_ = false;
+  }
+
+  // Destructor doesn't call release_resources because it's
+  // unnecessary; don't forget to change that if needed!
+  void release_resources() override {
+    data_ptr_.clear();
+  }
+
+  void incref_pyobject() const noexcept override final;
+
+  void decref_pyobject() const noexcept override final;
+
+  bool try_incref_pyobject() const noexcept override final;
+
+  // Concrete size in bytes. Fails the TORCH_CHECK when the size is recorded
+  // as heap-allocated (symbolic); use sym_nbytes() in that case.
+  size_t nbytes() const {
+    // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
+    TORCH_CHECK(!size_bytes_is_heap_allocated_);
+    return size_bytes_.as_int_unchecked();
+  }
+
+  // Size in bytes as a SymInt (returned by value).
+  SymInt sym_nbytes() const {
+    return size_bytes_;
+  }
+
+  // TODO: remove later
+  void set_nbytes(size_t size_bytes) {
+    size_bytes_ = static_cast<int64_t>(size_bytes);
+    size_bytes_is_heap_allocated_ = false;
+  }
+
+  // Overwrites the stored size in place; debug-asserts that the current size
+  // is not heap-allocated.
+  void unsafe_set_nbytes(size_t size_bytes) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!size_bytes_is_heap_allocated_);
+    size_bytes_.unsafe_set_data(size_bytes);
+  }
+
+  // Sets the size from a SymInt, which may be concrete or heap-allocated.
+  void set_nbytes(c10::SymInt size_bytes) {
+    size_bytes_ = std::move(size_bytes);
+    // Keep the cached flag in sync with the new value: nbytes() trusts it
+    // before calling as_int_unchecked(), so a stale flag would either read a
+    // symbolic value as an integer or reject a concrete one.
+    size_bytes_is_heap_allocated_ = size_bytes_.is_heap_allocated();
+  }
+
+  bool resizable() const {
+    return resizable_;
+  }
+
+  // Read-only access to the DataPtr. Throws via throw_data_ptr_access_error()
+  // when throw_on_immutable_data_ptr_ is set.
+  const at::DataPtr& data_ptr() const {
+    if (C10_UNLIKELY(throw_on_immutable_data_ptr_)) {
+      throw_data_ptr_access_error();
+    }
+    return data_ptr_;
+  }
+
+  // Writable access to the DataPtr. When any special check is active this
+  // may throw, warn, and/or materialize a copy-on-write DataPtr first.
+  at::DataPtr& mutable_data_ptr() {
+    if (C10_UNLIKELY(has_mutable_data_ptr_check_)) {
+      if (throw_on_immutable_data_ptr_) {
+        throw_data_ptr_access_error();
+      }
+      if (throw_on_mutable_data_ptr_) {
+        throwNullDataPtrError();
+      }
+      if (warn_deprecated_on_mutable_data_ptr_) {
+        warnDeprecatedDataPtr();
+      }
+      maybe_materialize_cow();
+    }
+    return data_ptr_;
+  }
+
+  // Returns the data_ptr. Bypasses all checks.
+  at::DataPtr& _mutable_data_ptr_no_checks() {
+    return data_ptr_;
+  }
+
+  // Returns the previous data_ptr
+  at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) {
+    // We need to materialize the old COW DataPtr because it is
+    // being returned as mutable.
+    maybe_materialize_cow();
+    return set_data_ptr_no_materialize_cow(std::move(data_ptr));
+  }
+
+  // Replaces the DataPtr without returning the old one, then recomputes the
+  // combined check flag.
+  void set_data_ptr_noswap(at::DataPtr&& data_ptr) {
+    data_ptr_ = std::move(data_ptr);
+    refresh_has_data_ptr_check();
+  }
+
+  // Read-only raw pointer; same throw condition as data_ptr().
+  const void* data() const {
+    if (C10_UNLIKELY(throw_on_immutable_data_ptr_)) {
+      throw_data_ptr_access_error();
+    }
+    return data_ptr_.get();
+  }
+
+  // Writable raw pointer; same checks as mutable_data_ptr().
+  void* mutable_data() {
+    if (C10_UNLIKELY(has_mutable_data_ptr_check_)) {
+      if (throw_on_immutable_data_ptr_) {
+        throw_data_ptr_access_error();
+      }
+      if (throw_on_mutable_data_ptr_) {
+        throwNullDataPtrError();
+      }
+      if (warn_deprecated_on_mutable_data_ptr_) {
+        warnDeprecatedDataPtr();
+      }
+      maybe_materialize_cow();
+    }
+    return data_ptr_.mutable_get();
+  }
+
+  // The device type is read from the DataPtr, not stored separately.
+  at::DeviceType device_type() const {
+    return data_ptr_.device().type();
+  }
+
+  at::Allocator* allocator() {
+    return allocator_;
+  }
+
+  const at::Allocator* allocator() const {
+    return allocator_;
+  }
+
+  // You generally shouldn't use this method, but it is occasionally
+  // useful if you want to override how a tensor will be reallocated,
+  // after it was already allocated (and its initial allocator was
+  // set)
+  void set_allocator(at::Allocator* allocator) {
+    allocator_ = allocator;
+  }
+
+  // The device is read from the DataPtr, not stored separately.
+  Device device() const {
+    return data_ptr_.device();
+  }
+
+  void set_resizable(bool resizable) {
+    if (resizable) {
+      // We need an allocator to be resizable
+      AT_ASSERT(allocator_);
+    }
+    resizable_ = resizable;
+  }
+
+  /**
+   * Can only be called when use_count is 1
+   *
+   * Wraps `src` in a DataPtr on the storage's current device, with `d` as
+   * the deleter, and forwards to the DataPtr overload.
+   */
+  void UniqueStorageShareExternalPointer(
+      void* src,
+      size_t size_bytes,
+      DeleterFnPtr d = nullptr) {
+    UniqueStorageShareExternalPointer(
+        at::DataPtr(src, src, d, data_ptr_.device()), size_bytes);
+  }
+
+  /**
+   * Can only be called when use_count is 1
+   *
+   * Adopts `data_ptr`, sets a concrete size, and leaves the storage
+   * non-resizable with no allocator.
+   */
+  void UniqueStorageShareExternalPointer(
+      at::DataPtr&& data_ptr,
+      size_t size_bytes) {
+    data_ptr_ = std::move(data_ptr);
+    size_bytes_ = static_cast<int64_t>(size_bytes);
+    size_bytes_is_heap_allocated_ = false;
+    allocator_ = nullptr;
+    resizable_ = false;
+    // The DataPtr changed, so recompute the combined check flag exactly as
+    // the other data_ptr_ setters do; otherwise mutable_data() could skip
+    // the checks for the newly adopted pointer.
+    refresh_has_data_ptr_check();
+  }
+
+  // This method can be used only after storage construction and cannot be used
+  // to modify storage status
+  void set_received_cuda(bool received_cuda) {
+    received_cuda_ = received_cuda;
+  }
+
+  bool received_cuda() {
+    return received_cuda_;
+  }
+
+  impl::PyObjectSlot* pyobj_slot() {
+    return &pyobj_slot_;
+  }
+
+  const impl::PyObjectSlot* pyobj_slot() const {
+    return &pyobj_slot_;
+  }
+
+  // Returns the extra metadata, allocating it on first use.
+  StorageExtraMeta& get_extra_meta() {
+    if (!extra_meta_) {
+      extra_meta_ = std::make_unique<StorageExtraMeta>();
+    }
+    return *extra_meta_;
+  }
+
+  [[noreturn]] void throw_data_ptr_access_error() const;
+
+  // Makes every later data()/data_ptr() access throw and records `s` as the
+  // custom error message. This function itself does not touch data_ptr_.
+  void release_data_and_set_meta_custom_data_ptr_error_msg_(
+      std::optional<std::string> s) {
+    throw_on_immutable_data_ptr_ = true;
+    get_extra_meta().custom_data_ptr_error_msg_ = std::move(s);
+    refresh_has_data_ptr_check();
+  }
+
+  void set_throw_on_mutable_data_ptr() {
+    throw_on_mutable_data_ptr_ = true;
+    refresh_has_data_ptr_check();
+  }
+
+  void set_warn_deprecated_on_mutable_data_ptr() {
+    warn_deprecated_on_mutable_data_ptr_ = true;
+    refresh_has_data_ptr_check();
+  }
+
+ protected:
+  // materialize_cow_storage needs to call set_data_ptr_no_materialize_cow
+  friend void c10::impl::cow::materialize_cow_storage(StorageImpl& storage);
+
+  // Returns the previous data_ptr. If the old data_ptr was COW,
+  // this avoids materializing it
+  at::DataPtr set_data_ptr_no_materialize_cow(at::DataPtr&& data_ptr) {
+    at::DataPtr old_data_ptr(std::move(data_ptr_));
+    data_ptr_ = std::move(data_ptr);
+    refresh_has_data_ptr_check();
+    return old_data_ptr;
+  }
+
+ private:
+  // Recomputes the single flag that guards the slow path of
+  // mutable_data()/mutable_data_ptr().
+  void refresh_has_data_ptr_check() {
+    has_mutable_data_ptr_check_ = is_cow() || throw_on_mutable_data_ptr_ ||
+        warn_deprecated_on_mutable_data_ptr_ || throw_on_immutable_data_ptr_;
+  }
+
+  inline bool is_cow() const {
+    return c10::impl::cow::is_cow_data_ptr(data_ptr_);
+  }
+
+  // Triggers a copy if this is a copy-on-write tensor.
+  void maybe_materialize_cow() {
+    if (is_cow()) {
+      impl::cow::materialize_cow_storage(*this);
+    }
+  }
+
+  DataPtr data_ptr_;
+  SymInt size_bytes_;
+  // Cached size_bytes_.is_heap_allocated(); must be updated whenever
+  // size_bytes_ is assigned.
+  bool size_bytes_is_heap_allocated_;
+  bool resizable_;
+  // Identifies that Storage was received from another process and doesn't have
+  // local to process cuda memory allocation
+  bool received_cuda_;
+  // All special checks in data/data_ptr calls are guarded behind this single
+  // boolean. This is for performance: .data/.data_ptr calls are commonly in the
+  // hot-path.
+  bool has_mutable_data_ptr_check_ = false;
+  // If we should throw when mutable_data_ptr() or mutable_data() is called.
+  bool throw_on_mutable_data_ptr_ = false;
+  // If we should throw when data_ptr() or data() is called.
+  bool throw_on_immutable_data_ptr_ = false;
+  // If we warn when mutable_data_ptr() or mutable_data() is called.
+  bool warn_deprecated_on_mutable_data_ptr_ = false;
+  Allocator* allocator_;
+  impl::PyObjectSlot pyobj_slot_;
+  std::unique_ptr<StorageExtraMeta> extra_meta_ = nullptr;
+};
+
+// Function pointer type for a StorageImpl factory; its parameters mirror the main StorageImpl constructor.
+using StorageImplCreateHelper = intrusive_ptr<StorageImpl> (*)(
+    StorageImpl::use_byte_size_t,
+    SymInt size_bytes,
+    DataPtr data_ptr,
+    Allocator* allocator,
+    bool resizable);
+
+C10_API void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr); // presumably registers fptr for device type t; defined elsewhere
+
+C10_API StorageImplCreateHelper GetStorageImplCreate(DeviceType t); // presumably returns the helper registered for t; defined elsewhere
+
+C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl( // declaration only; the body is not in this header
+    c10::StorageImpl::use_byte_size_t use_byte_size,
+    c10::SymInt size_bytes,
+    c10::DataPtr data_ptr,
+    c10::Allocator* allocator,
+    bool resizable,
+    std::optional<at::Device> device_opt);
+
+namespace detail {
+
+#ifndef C10_MOBILE
+template <class T>
+struct TargetTraits< // specialization selected for StorageImpl and classes derived from it (cv-qualifiers ignored)
+    T,
+    std::enable_if_t<
+        std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
+  static constexpr bool can_have_pyobject = true; // compiled out under C10_MOBILE by the surrounding #ifndef
+};
+#endif
+
+} // namespace detail
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d3a50984ec6e9093a321b7df2855383758e50ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/Stream.h
@@ -0,0 +1,182 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <ostream>
+
+namespace c10 {
+
+/// An index representing a specific stream.  A StreamId is not independently
+/// meaningful without knowing the Device it is associated with; try to
+/// use Stream rather than StreamId directly.
+///
+/// StreamIds are opaque; they are assigned by some DeviceType-specific
+/// numbering system which is not visible to the user.  HOWEVER, we
+/// guarantee that StreamId 0 is always a valid stream, and corresponds
+/// to some sort of "default" stream.
+using StreamId = int64_t;
+
+struct C10_API StreamData3 { // the three values produced by Stream::pack3() and accepted by Stream::unpack3()
+  StreamId stream_id;
+  DeviceIndex device_index;
+  DeviceType device_type;
+};
+
+// NB: I decided not to call the above StreamIndex to avoid confusion with
+// DeviceIndex.  This way, you access device index with index(), and stream id
+// with id()
+
+/**
+ * A stream is a software mechanism used to synchronize launched kernels
+ * without requiring explicit synchronizations between kernels.  The basic
+ * model is that every kernel launch is associated with a stream: every
+ * kernel on the same stream is implicitly synchronized so that if I launch
+ * kernels A and B on the same stream, A is guaranteed to finish before B
+ * launches.  If I want B to run concurrently with A, I must schedule
+ * it on a different stream.
+ *
+ * The Stream class is a backend agnostic value class representing a stream
+ * which I may schedule a kernel on.  Every stream is associated with a device,
+ * which is recorded in stream, which is used to avoid confusion about which
+ * device a stream refers to.
+ *
+ * Streams are explicitly thread-safe, in the sense that it is OK to pass
+ * a Stream from one thread to another, and kernels queued from two different
+ * threads will still get serialized appropriately.  (Of course, the
+ * time when the kernels get queued is undetermined unless you synchronize
+ * host side ;)
+ *
+ * Stream does NOT have a default constructor.  Streams are for expert
+ * users; if you want to use Streams, we're going to assume you know
+ * how to deal with C++ template error messages if you try to
+ * resize() a vector of Streams.
+ *
+ * Known instances of streams in backends:
+ *
+ *  - cudaStream_t (CUDA)
+ *  - hipStream_t (HIP)
+ *  - cl_command_queue (OpenCL)  (NB: Caffe2's existing OpenCL integration
+ *    does NOT support command queues.)
+ *
+ * Because this class is device agnostic, it cannot provide backend-specific
+ * functionality (e.g., get the cudaStream_t of a CUDA stream.)  There are
+ * wrapper classes which provide this functionality, e.g., CUDAStream.
+ */
+class C10_API Stream final {
+ private:
+  Device device_;
+  StreamId id_;
+
+ public:
+  enum Unsafe { UNSAFE };
+  enum Default { DEFAULT };
+
+  /// Builds a stream directly from a Device and a StreamId, with no
+  /// validation.  This is meant for the backend-specific stream
+  /// implementations; everyone else should obtain streams through the
+  /// provided APIs.  Backends owe no guarantees about non-zero StreamIds and
+  /// may allocate them however they like.
+  explicit Stream(Unsafe /*unused*/, Device device, StreamId id)
+      : device_(device), id_(id) {}
+
+  /// Builds the default stream of a Device (StreamId 0).  The default stream
+  /// is NOT the current stream: it is fixed and never changes, whereas the
+  /// current stream may be changed by StreamGuard.
+  explicit Stream(Default /*unused*/, Device device)
+      : Stream(UNSAFE, device, 0) {}
+
+  /// Two streams are equal when both the device and the id match.
+  bool operator==(const Stream& other) const noexcept {
+    return device_ == other.device_ && id_ == other.id_;
+  }
+  bool operator!=(const Stream& other) const noexcept {
+    return !operator==(other);
+  }
+
+  Device device() const noexcept {
+    return device_;
+  }
+  DeviceType device_type() const noexcept {
+    return device_.type();
+  }
+  DeviceIndex device_index() const noexcept {
+    return device_.index();
+  }
+  StreamId id() const noexcept {
+    return id_;
+  }
+
+  // Enqueues a wait instruction in the stream's work queue by calling
+  // event.block(*this).  The instruction is a no-op unless the event is
+  // marked for recording; in that case the stream stops processing until
+  // the event is recorded.
+  template <typename T>
+  void wait(const T& event) const {
+    event.block(*this);
+  }
+
+  // Return whether all asynchronous work previously enqueued on this stream
+  // has completed running on the device.
+  bool query() const;
+
+  // Wait (by blocking the calling thread) until all asynchronous work enqueued
+  // on this stream has completed running on the device.
+  void synchronize() const;
+
+  // Packs the stream into a single uint64_t so that it can be bound to and
+  // from Python without a dedicated two-field class (device and stream id).
+  //
+  // The exact packing is an implementation detail and should not be relied
+  // upon.
+  uint64_t hash() const noexcept {
+    // The id might hold a pointer, so only its low 48 bits are kept; this
+    // removes the sign-extension part of a 64-bit address.
+    const uint64_t low_48_mask = (1ull << 48) - 1;
+    const uint64_t type_bits = static_cast<uint64_t>(device_type()) << 56;
+    const uint64_t index_bits = static_cast<uint64_t>(device_index()) << 48;
+    const uint64_t id_bits = static_cast<uint64_t>(id()) & low_48_mask;
+    // Concatenate the three parts into one 64-bit integer.
+    return type_bits | index_bits | id_bits;
+  }
+
+  // Returns the stream as its three plain components.
+  struct StreamData3 pack3() const {
+    StreamData3 packed{id(), device_index(), device_type()};
+    return packed;
+  }
+
+  // Inverse of pack3(); rejects device types that fail isValidDeviceType().
+  static Stream unpack3(
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
+    TORCH_CHECK(isValidDeviceType(device_type));
+    const Device target(device_type, device_index);
+    return Stream(UNSAFE, target, stream_id);
+  }
+
+  // There are deliberately no setters on this class: a stream's device is
+  // not something to change after the fact, so construct the stream
+  // correctly from the beginning.
+};
+
+C10_API std::ostream& operator<<(std::ostream& stream, const Stream& s);
+
+} // namespace c10
+
+namespace std {
+template <>
+struct hash<c10::Stream> { // std::hash specialization for c10::Stream
+  size_t operator()(c10::Stream s) const noexcept {
+    return std::hash<uint64_t>{}(s.hash()); // hash the packed 64-bit value from Stream::hash()
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..003816d62f6ce12223cc5106eee6ae37a26e04e9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/StreamGuard.h
@@ -0,0 +1,178 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/Stream.h>
+#include <c10/core/impl/InlineStreamGuard.h>
+#include <c10/core/impl/VirtualGuardImpl.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Optional.h>
+
+namespace c10 {
+
+/**
+ * A StreamGuard is an RAII class that changes the current device
+ * to the device corresponding to some stream, and changes the
+ * current stream on that device to be this stream.
+ *
+ * Use of StreamGuard is HIGHLY discouraged in operator definitions.  In
+ * a single operator, you probably don't know enough about the global
+ * state of the world to profitably decide how to set streams.  Let
+ * the caller handle this appropriately, and just use the current stream
+ * in your operator code.
+ *
+ * This StreamGuard does NOT have an uninitialized state; it is guaranteed
+ * to reset the stream and device on exit.  If you are in a situation
+ * where you *might* want to setup a stream guard, see OptionalStreamGuard.
+ */
+struct StreamGuard {
+  /// No default constructor, see Note [Omitted default constructor from RAII]
+  explicit StreamGuard() = delete;
+  ~StreamGuard() = default;
+
+  /// Set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  explicit StreamGuard(Stream stream) : guard_(stream) {}
+
+  /// Copy is disallowed
+  StreamGuard(const StreamGuard&) = delete;
+  StreamGuard& operator=(const StreamGuard&) = delete;
+
+  /// Move is disallowed, as StreamGuard does not have an uninitialized state,
+  /// which is required for moves on types with nontrivial destructors.
+  StreamGuard(StreamGuard&& other) = delete;
+  StreamGuard& operator=(StreamGuard&& other) = delete;
+
+  /// Resets the currently set stream to the original stream and
+  /// the currently set device to the original device.  Then,
+  /// set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  ///
+  /// NOTE: this implementation may skip some stream/device setting if
+  /// it can prove that it is unnecessary.
+  ///
+  /// WARNING: reset_stream does NOT preserve previously set streams on
+  /// different devices.  If you need to set streams on multiple devices
+  /// at once, use MultiStreamGuard instead.
+  void reset_stream(Stream stream) {
+    guard_.reset_stream(stream);
+  }
+
+  /// Returns the stream that was set at the time the guard was constructed.
+  Stream original_stream() const {
+    return guard_.original_stream();
+  }
+
+  /// Returns the most recent stream that was set using this stream guard,
+  /// either from construction, or via reset_stream.
+  Stream current_stream() const {
+    return guard_.current_stream();
+  }
+
+  /// Returns the most recent device that was set using this stream guard,
+  /// either from construction, or via reset_stream.
+  Device current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Returns the device that was set at the most recent reset_stream(),
+  /// or otherwise the device at construction time.
+  Device original_device() const {
+    return guard_.original_device();
+  }
+
+ private:
+  c10::impl::InlineStreamGuard<impl::VirtualGuardImpl> guard_; // every public method forwards to this guard
+};
+
+/**
+ * An OptionalStreamGuard is a StreamGuard-like RAII class that may also be
+ * left uninitialized; once initialized it sets the current device and stream.
+ * See OptionalDeviceGuard for more guidance on how to use this class.
+ */
+struct OptionalStreamGuard {
+  /// Create an uninitialized guard.
+  explicit OptionalStreamGuard() = default;
+
+  /// Set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  explicit OptionalStreamGuard(Stream stream) : guard_(stream) {}
+
+  /// Set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream,
+  /// if the passed stream is not nullopt.
+  explicit OptionalStreamGuard(std::optional<Stream> stream_opt)
+      : guard_(stream_opt) {}
+
+  /// Copy is disallowed
+  OptionalStreamGuard(const OptionalStreamGuard&) = delete;
+  OptionalStreamGuard& operator=(const OptionalStreamGuard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  OptionalStreamGuard(OptionalStreamGuard&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  OptionalStreamGuard& operator=(OptionalStreamGuard&& other) = delete;
+  ~OptionalStreamGuard() = default;
+
+  /// Resets the currently set stream to the original stream and
+  /// the currently set device to the original device.  Then,
+  /// set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  /// Initializes the guard if it was not previously initialized.
+  void reset_stream(Stream stream) {
+    guard_.reset_stream(stream);
+  }
+
+  /// Returns the stream that was set at the time the guard was most recently
+  /// initialized, or nullopt if the guard is uninitialized.
+  std::optional<Stream> original_stream() const {
+    return guard_.original_stream();
+  }
+
+  /// Returns the most recent stream that was set using this stream guard,
+  /// either from construction, or via reset_stream, if the guard is
+  /// initialized, or nullopt if the guard is uninitialized.
+  std::optional<Stream> current_stream() const {
+    return guard_.current_stream();
+  }
+
+  /// Restore the original device and stream, resetting this guard to
+  /// uninitialized state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalStreamGuard<impl::VirtualGuardImpl> guard_; // every public method forwards to this guard
+};
+
+/**
+ * A MultiStreamGuard is an RAII class that sets the current streams of a set of
+ * devices all at once, and resets them to their original values on destruction.
+ */
+struct MultiStreamGuard {
+  /// Set the current streams to the passed streams on each of their respective
+  /// devices.
+  explicit MultiStreamGuard(ArrayRef<Stream> streams) : guard_(streams) {}
+
+  /// Copy is disallowed
+  MultiStreamGuard(const MultiStreamGuard&) = delete;
+  MultiStreamGuard& operator=(const MultiStreamGuard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  MultiStreamGuard(MultiStreamGuard&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  MultiStreamGuard& operator=(MultiStreamGuard&& other) = delete;
+  ~MultiStreamGuard() = default; // defaulted: destroying guard_ is the only cleanup this struct performs
+
+ private:
+  c10::impl::InlineMultiStreamGuard<impl::VirtualGuardImpl> guard_; // the constructor passes the stream list straight to this guard
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h
new file mode 100644
index 0000000000000000000000000000000000000000..d12fa75fb41446f3f9967a73aed8a25fc1a60f4b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymBool.h
@@ -0,0 +1,184 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+
+#pragma once
+
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <c10/util/intrusive_ptr.h>
+#include <cstdint>
+#include <optional>
+#include <ostream>
+#include <utility>
+
+namespace c10 {
+
+class SymInt;
+
+class C10_API SymBool {
+ public:
+  /*implicit*/ SymBool(bool b) : data_(b) {}
+  SymBool(SymNode ptr) : data_(false), ptr_(std::move(ptr)) {
+    TORCH_CHECK(ptr_->is_bool());
+  }
+  SymBool() : data_(false) {}
+
+  SymNodeImpl* toSymNodeImplUnowned() const {
+    return ptr_.get();
+  }
+
+  SymNodeImpl* release() && {
+    return std::move(ptr_).release();
+  }
+
+  // Only valid if is_heap_allocated()
+  SymNode toSymNodeImpl() const;
+
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
+  bool expect_bool() const {
+    std::optional<bool> c = maybe_as_bool();
+    TORCH_CHECK(c.has_value());
+    return *c;
+  }
+
+  SymBool sym_and(const SymBool& /*sci*/) const;
+  SymBool sym_or(const SymBool& /*sci*/) const;
+  SymBool sym_not() const;
+
+  SymBool operator&(const SymBool& other) const {
+    return sym_and(other);
+  }
+  SymBool operator|(const SymBool& other) const {
+    return sym_or(other);
+  }
+  SymBool operator||(const SymBool& other) const {
+    return sym_or(other);
+  }
+  SymBool operator~() const {
+    return sym_not();
+  }
+
+  // Insert a guard for the bool to be its concrete value, and then return
+  // that value.  Note that C++ comparison operations default to returning
+  // bool, so it's not so common to have to call this
+  bool guard_bool(const char* file, int64_t line) const;
+  bool expect_true(const char* file, int64_t line) const;
+  bool guard_size_oblivious(const char* file, int64_t line) const;
+  bool statically_known_true(const char* file, int64_t line) const;
+  bool guard_or_false(const char* file, int64_t line) const;
+  bool guard_or_true(const char* file, int64_t line) const;
+
+  bool has_hint() const;
+
+  bool as_bool_unchecked() const {
+    return data_;
+  }
+
+  std::optional<bool> maybe_as_bool() const {
+    if (!is_heap_allocated()) {
+      return data_;
+    }
+    return toSymNodeImplUnowned()->constant_bool();
+  }
+
+  // Convert SymBool to SymInt (0 or 1)
+  // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
+  SymInt toSymInt() const;
+
+  bool is_heap_allocated() const {
+    return ptr_;
+  }
+
+ private:
+  // TODO: optimize to union
+  bool data_;
+  SymNode ptr_;
+};
+
+C10_API std::ostream& operator<<(std::ostream& os, const SymBool& s);
+
+#define TORCH_SYM_CHECK(cond, ...) \
+  TORCH_CHECK((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__)
+#define TORCH_SYM_INTERNAL_ASSERT(cond, ...) \
+  TORCH_INTERNAL_ASSERT((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__)
+#define TORCH_MAYBE_SYM_CHECK(cond, ...)                                 \
+  if constexpr (std::is_same_v<std::decay_t<decltype(cond)>, SymBool>) { \
+    TORCH_CHECK((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__)     \
+  } else {                                                               \
+    TORCH_CHECK((cond), __VA_ARGS__)                                     \
+  }
+
+inline bool guard_size_oblivious(
+    bool b,
+    const char* file [[maybe_unused]],
+    int64_t line [[maybe_unused]]) {
+  return b;
+}
+
+inline bool guard_size_oblivious(
+    const c10::SymBool& b,
+    const char* file,
+    int64_t line) {
+  return b.guard_size_oblivious(file, line);
+}
+
+inline bool guard_or_false(
+    bool b,
+    const char* file [[maybe_unused]],
+    int64_t line [[maybe_unused]]) {
+  return b;
+}
+
+inline bool guard_or_false(
+    const c10::SymBool& b,
+    const char* file,
+    int64_t line) {
+  return b.guard_or_false(file, line);
+}
+
+inline bool statically_known_true(
+    bool b,
+    const char* file [[maybe_unused]],
+    int64_t line [[maybe_unused]]) {
+  return b;
+}
+
+inline bool statically_known_true(
+    const c10::SymBool& b,
+    const char* file,
+    int64_t line) {
+  return b.statically_known_true(file, line);
+}
+
+inline bool guard_or_true(
+    bool b,
+    const char* file [[maybe_unused]],
+    int64_t line [[maybe_unused]]) {
+  return b;
+}
+
+inline bool guard_or_true(
+    const c10::SymBool& b,
+    const char* file,
+    int64_t line) {
+  return b.guard_or_true(file, line);
+}
+
+#define TORCH_GUARD_SIZE_OBLIVIOUS(cond) \
+  c10::guard_size_oblivious((cond), __FILE__, __LINE__)
+
+#define TORCH_STATICALLY_KNOWN_TRUE(cond) \
+  c10::statically_known_true((cond), __FILE__, __LINE__)
+
+#define TORCH_GUARD_OR_FALSE(cond) \
+  c10::guard_or_false((cond), __FILE__, __LINE__)
+
+#define TORCH_GUARD_OR_TRUE(cond) c10::guard_or_true((cond), __FILE__, __LINE__)
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h
new file mode 100644
index 0000000000000000000000000000000000000000..332726ba4c5dade5accef6a3dac6076366c04d95
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymFloat.h
@@ -0,0 +1,123 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymBool.h>
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/intrusive_ptr.h>
+
+#include <cstdint>
+#include <limits>
+#include <ostream>
+#include <utility>
+
+namespace c10 {
+
+// NB: this is actually double precision; we're using the Python naming here
+class C10_API SymFloat {
+ public:
+  /*implicit*/ SymFloat(double d) : data_(d) {}
+  SymFloat(SymNode ptr)
+      : data_(std::numeric_limits<double>::quiet_NaN()), ptr_(std::move(ptr)) {
+    TORCH_CHECK(ptr_->is_float());
+  }
+  SymFloat() : data_(0.0) {}
+
+  SymNodeImpl* toSymNodeImplUnowned() const {
+    return ptr_.get();
+  }
+
+  SymNodeImpl* release() && {
+    return std::move(ptr_).release();
+  }
+
+  // Only valid if is_symbolic()
+  SymNode toSymNodeImpl() const;
+
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
+  double expect_float() const {
+    TORCH_CHECK(!is_symbolic());
+    return data_;
+  }
+
+  SymFloat operator+(const SymFloat& /*sci*/) const;
+  SymFloat operator-(const SymFloat& /*sci*/) const;
+  SymFloat operator*(const SymFloat& /*sci*/) const;
+  SymFloat operator/(const SymFloat& /*sci*/) const;
+
+  SymBool sym_eq(const SymFloat& /*sci*/) const;
+  SymBool sym_ne(const SymFloat& /*sci*/) const;
+  SymBool sym_lt(const SymFloat& /*sci*/) const;
+  SymBool sym_le(const SymFloat& /*sci*/) const;
+  SymBool sym_gt(const SymFloat& /*sci*/) const;
+  SymBool sym_ge(const SymFloat& /*sci*/) const;
+
+  bool operator==(const SymFloat& o) const {
+    return sym_eq(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator!=(const SymFloat& o) const {
+    return sym_ne(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<(const SymFloat& o) const {
+    return sym_lt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<=(const SymFloat& o) const {
+    return sym_le(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>(const SymFloat& o) const {
+    return sym_gt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>=(const SymFloat& o) const {
+    return sym_ge(o).guard_bool(__FILE__, __LINE__);
+  }
+
+  SymFloat min(const SymFloat& sci) const;
+  SymFloat max(const SymFloat& sci) const;
+
+  // Need guidance on where to put this code
+  SymFloat sqrt() const;
+
+  // Insert a guard for the float to be its concrete value, and then return
+  // that value.  This operation always works, even if the float is symbolic,
+  // so long as we know what the underlying value is. Don't blindly put this
+  // everywhere; you can cause overspecialization of PyTorch programs with
+  // this method.
+  //
+  // It should be called as guard_float(__FILE__, __LINE__).  The file and line
+  // number can be used to diagnose overspecialization.
+  double guard_float(const char* file, int64_t line) const;
+
+  bool has_hint() const;
+
+  // N.B. It's important to keep this definition in the header
+  // as we expect if checks to be folded for mobile builds
+  // where `is_symbolic` is always false
+  C10_ALWAYS_INLINE bool is_symbolic() const {
+    return ptr_;
+  }
+
+  // UNSAFELY coerce this SymFloat into a double.  You MUST have
+  // established that this is a non-symbolic by some other means,
+  // typically by having tested is_symbolic().  You will get garbage
+  // from this function if is_symbolic()
+  double as_float_unchecked() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_symbolic());
+    return data_;
+  }
+
+ private:
+  // TODO: optimize to union
+  double data_;
+  SymNode ptr_;
+};
+
+C10_API std::ostream& operator<<(std::ostream& os, const SymFloat& s);
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9fa7f645047dbf5f8a2f1831d362606e8d98e98
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymInt.h
@@ -0,0 +1,586 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymBool.h>
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Optional.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <numeric>
+#include <optional>
+#include <ostream>
+#include <type_traits>
+
+namespace c10 {
+
+class SymFloat;
+
+// SymInt represents either a regular int64_t, or a symbolic integer
+// (represented in a type erased way as SymNode).  The intention is for SymInt
+// to represent symbolic sizes that arise when doing shape computation in
+// operator kernels. This allows for tracing through programs without baking in
+// concrete sizes into kernel calls.
+//
+// SymInt has an API equivalent to int64_t.  In particular, it is a value type.
+// Internally, SymInt is represented in a clever packed way, so that it only
+// occupies one word of space; but morally, it is a union between an int64_t
+// and an intrusive pointer to SymNodeImpl.
+//
+// Invariant: the referenced SymNodeImpl is guaranteed to be a SymNode where
+// is_int() returns true
+
+class C10_API SymInt {
+ public:
+  enum Unchecked {
+    UNCHECKED,
+  };
+
+  /*implicit*/ SymInt(int64_t d) : data_(d) {
+    if (is_heap_allocated()) {
+      // Large negative number, heap allocate it
+      promote_to_negative();
+    }
+  }
+  SymInt() : data_(0) {}
+  SymInt(SymNode n);
+
+  // unchecked c-tor accepting raw `data_`
+  // One appropriate use for this is when you are constructing a symint
+  // in a situation where you know it is non-negative (or, if it is negative,
+  // the negative value is -1; i.e., not user controlled)
+  SymInt(Unchecked /*unused*/, int64_t d) : data_(d) {}
+
+  // TODO: these implementations are not optimal because they allocate a
+  // temporary and then use the move constructor/assignment
+  SymInt(const SymInt& s) : data_(0) {
+    if (s.is_heap_allocated()) {
+      *this = SymInt(s.toSymNode());
+    } else {
+      data_ = s.data_;
+    }
+  }
+  SymInt(SymInt&& s) noexcept : data_(s.data_) {
+    s.data_ = 0;
+  }
+
+  SymInt& operator=(const SymInt& s) {
+    if (this != &s) {
+      // Always go through move assignment: it calls release_() on the node
+      // we currently own. Writing data_ directly (as before, when `s` was a
+      // plain int) dropped an owned SymNode without decrementing its refcount.
+      *this = s.is_heap_allocated() ? SymInt(s.toSymNode())
+                                    : SymInt(UNCHECKED, s.data_);
+    }
+    return *this;
+  }
+  SymInt& operator=(SymInt&& s) noexcept {
+    if (this != &s) {
+      release_(); // release the current SymNode if any
+      data_ = s.data_;
+      if (s.is_heap_allocated())
+        s.data_ = 0;
+    };
+    return *this;
+  }
+
+  SymNodeImpl* toSymNodeImplUnowned() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(is_heap_allocated());
+    uint64_t unextended_bits = static_cast<uint64_t>(data_) & ~MASK;
+    uint64_t sign_bit_mask = 1ULL << (62 - 1);
+    // https://stackoverflow.com/questions/42534749/signed-extension-from-24-bit-to-32-bit-in-c
+    uint64_t extended_bits = (unextended_bits ^ sign_bit_mask) - sign_bit_mask;
+    return static_cast<SymNodeImpl*>(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr, bugprone*)
+        reinterpret_cast<void*>(static_cast<uintptr_t>(extended_bits)));
+  }
+
+  void release_() {
+    if (is_heap_allocated()) {
+      SymNode::reclaim(toSymNodeImplUnowned()); // steal
+    }
+  }
+
+  SymNodeImpl* release() && {
+#ifndef C10_MOBILE
+    TORCH_INTERNAL_ASSERT(is_heap_allocated());
+    auto* r = toSymNodeImplUnowned();
+    data_ = 0; // transfer ownership
+    return r;
+#else
+    TORCH_INTERNAL_ASSERT(false);
+#endif
+  }
+
+  // Only valid if is_heap_allocated()
+  SymNode toSymNode() const;
+
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
+  ~SymInt() {
+    release_();
+  }
+
+  // Require the int to be non-symbolic, and if it is symbolic raise an
+  // error.  This is safe to use for C++ code that doesn't work for symbolic
+  // shapes, and you don't have time to fix it immediately, as if we
+  // try to trigger the path in C++ you'll appropriately get an error
+  int64_t expect_int() const {
+    if (auto r = maybe_as_int()) {
+      return *r;
+    }
+    TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE(
+        false, "when unpacking SymInt, expected int but got ", *this);
+  }
+
+  // Test if we have a hint for this int (e.g., guard_int would work).
+  // Most of the time this is true; it is only false when you have
+  // an unbacked SymInt.
+  bool has_hint() const;
+
+  // Insert a guard for the int to be its concrete value, and then return
+  // that value.  This operation always works, even if the int is symbolic,
+  // so long as we know what the underlying value is (e.g., this won't work
+  // if you call it on the size of nonzero output).  Don't blindly put this
+  // everywhere; you can cause overspecialization of PyTorch programs with
+  // this method.
+  //
+  // It should be called as guard_int(__FILE__, __LINE__).  The file and line
+  // number can be used to diagnose overspecialization.
+  int64_t guard_int(const char* file, int64_t line) const;
+
+  // Distinguish actual symbolic values from constants stored on the heap
+  bool is_symbolic() const {
+    return is_heap_allocated() &&
+        !toSymNodeImplUnowned()->constant_int().has_value();
+  }
+
+  // N.B. It's important to keep this definition in the header
+  // as we expect if checks to be folded for mobile builds
+  // where `is_heap_allocated` is always false and optimize dead code paths
+  C10_ALWAYS_INLINE bool is_heap_allocated() const {
+#ifdef C10_MOBILE
+    return false;
+#else
+    return !check_range(data_);
+#endif
+  }
+
+  SymInt operator+(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(*ma + *mb);
+      }
+    }
+    return operator_add_slow_path(sci);
+  }
+
+  SymInt operator-(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(*ma - *mb);
+      }
+    }
+    return operator_sub_slow_path(sci);
+  }
+
+  SymInt operator*(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(*ma * *mb);
+      }
+    }
+    return operator_mul_slow_path(sci);
+  }
+
+  SymInt operator/(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(*ma / *mb);
+      }
+    }
+    return operator_div_slow_path(sci);
+  }
+
+  SymInt operator%(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(*ma % *mb);
+      }
+    }
+    return operator_mod_slow_path(sci);
+  }
+
+  void operator*=(const SymInt& sci) {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        *this = SymInt(*ma * *mb);
+        return;
+      }
+    }
+    operator_imul_slow_path(sci);
+  }
+
+  void operator+=(const SymInt& sci) {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        *this = SymInt(*ma + *mb);
+        return;
+      }
+    }
+    operator_iadd_slow_path(sci);
+  }
+
+  void operator/=(const SymInt& sci) {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        *this = SymInt(*ma / *mb);
+        return;
+      }
+    }
+    operator_idiv_slow_path(sci);
+  }
+
+  SymInt clone() const;
+
+  SymBool sym_eq(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma == *mb);
+      }
+    }
+    return sym_eq_slow_path(sci);
+  }
+
+  SymBool sym_ne(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma != *mb);
+      }
+    }
+    return sym_ne_slow_path(sci);
+  }
+
+  SymBool sym_lt(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma < *mb);
+      }
+    }
+    return sym_lt_slow_path(sci);
+  }
+
+  SymBool sym_le(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma <= *mb);
+      }
+    }
+    return sym_le_slow_path(sci);
+  }
+
+  SymBool sym_gt(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma > *mb);
+      }
+    }
+    return sym_gt_slow_path(sci);
+  }
+
+  SymBool sym_ge(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymBool(*ma >= *mb);
+      }
+    }
+    return sym_ge_slow_path(sci);
+  }
+
+  bool operator==(const SymInt& o) const {
+    return sym_eq(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator!=(const SymInt& o) const {
+    return sym_ne(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<(const SymInt& o) const {
+    return sym_lt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<=(const SymInt& o) const {
+    return sym_le(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>(const SymInt& o) const {
+    return sym_gt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>=(const SymInt& o) const {
+    return sym_ge(o).guard_bool(__FILE__, __LINE__);
+  }
+
+  SymInt min(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(std::min(*ma, *mb));
+      }
+    }
+    return min_slow_path(sci);
+  }
+
+  SymInt max(const SymInt& sci) const {
+    if (auto ma = maybe_as_int()) {
+      if (auto mb = sci.maybe_as_int()) {
+        return SymInt(std::max(*ma, *mb));
+      }
+    }
+    return max_slow_path(sci);
+  }
+
+  // If both are symbolic, this checks if
+  // they share the same node.
+  // If both are not symbolic this just checks normal equality.
+  bool is_same(const SymInt& other) const;
+
+  operator SymFloat() const;
+
+  void unsafe_set_data(size_t nbytes) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_heap_allocated());
+    data_ = static_cast<int64_t>(nbytes);
+  }
+
+  // Don't use this.  Prefer maybe_as_int instead
+  int64_t as_int_unchecked() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_heap_allocated());
+    return data_;
+  }
+
+  std::optional<int64_t> maybe_as_int() const {
+    if (!is_heap_allocated()) {
+      return data_;
+    }
+    return maybe_as_int_slow_path();
+  }
+
+  // Return whether the integer is directly coercible to a SymInt
+  // without requiring heap allocation.  You don't need to use this
+  // to check if you can pass an integer to SymInt; this is guaranteed
+  // to work (it just might heap allocate!)
+  static bool check_range(int64_t i) {
+    return i > MAX_UNREPRESENTABLE_INT;
+  }
+
+  // Return the min representable integer as a SymInt without
+  // heap allocation.  For quantities that count bytes (or larger),
+  // this is still much larger than you need, so you may consider
+  // using this as a more efficient version of MIN_INT
+  static constexpr int64_t min_representable_int() {
+    return MAX_UNREPRESENTABLE_INT + 1;
+  }
+
+ private:
+  void promote_to_negative();
+  SymInt operator_add_slow_path(const SymInt& sci) const;
+  SymInt operator_sub_slow_path(const SymInt& sci) const;
+  SymInt operator_mul_slow_path(const SymInt& sci) const;
+  SymInt operator_div_slow_path(const SymInt& sci) const;
+  SymInt operator_mod_slow_path(const SymInt& sci) const;
+  void operator_imul_slow_path(const SymInt& sci);
+  void operator_iadd_slow_path(const SymInt& sci);
+  void operator_idiv_slow_path(const SymInt& sci);
+  SymBool sym_eq_slow_path(const SymInt& sci) const;
+  SymBool sym_ne_slow_path(const SymInt& sci) const;
+  SymBool sym_lt_slow_path(const SymInt& sci) const;
+  SymBool sym_le_slow_path(const SymInt& sci) const;
+  SymBool sym_gt_slow_path(const SymInt& sci) const;
+  SymBool sym_ge_slow_path(const SymInt& sci) const;
+
+  SymInt min_slow_path(const SymInt& sci) const;
+  SymInt max_slow_path(const SymInt& sci) const;
+
+  std::optional<int64_t> maybe_as_int_slow_path() const;
+
+  // Constraints on the internal representation:
+  //
+  // - Should represent positive and small negative ints
+  // - No conversion necessary for operations on ints
+  // - Must represent valid 64-bit pointers
+  // - Is symbolic test should be FAST (two arithmetic instructions is too
+  // much).
+  //   This code being a hotpath is based on Strobelight profiles of
+  //   is_heap_allocated().  FB only: https://fburl.com/strobelight/5l50ncxd
+  //   (you will need to change the time window).
+  //
+  // So, the scheme is to reserve large negative numbers (assuming
+  // two's complement):
+  //
+  // - 0b0.... means we are a positive int
+  // - 0b11... means we are a small negative int
+  // - 0b10... means we are a pointer. This means that
+  //           [-2^63, -2^62-1] are not representable as ints.
+  //           We don't actually need all of this space as on x86_64
+  //           as the top 16bits aren't used for anything
+  static constexpr uint64_t MASK = 1ULL << 63 | 1ULL << 62 | 1ULL << 61;
+  static constexpr uint64_t IS_SYM = 1ULL << 63 | 1ULL << 61;
+  // We must manually translate the bit pattern test into a greater
+  // than test because compiler doesn't figure it out:
+  // https://godbolt.org/z/356aferaW
+  static constexpr int64_t MAX_UNREPRESENTABLE_INT =
+      -1LL & static_cast<int64_t>(~(1ULL << 62));
+  int64_t data_;
+};
+
+/// Product of a list of SymInt; accumulates into the c10::SymInt expression
+template <
+    typename C,
+    typename std::enable_if_t<
+        std::is_same_v<typename C::value_type, c10::SymInt>,
+        int> = 0>
+inline c10::SymInt multiply_integers(const C& container) {
+  return std::accumulate(
+      container.begin(),
+      container.end(),
+      c10::SymInt(1),
+      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
+}
+
+template <
+    typename Iter,
+    typename = std::enable_if_t<std::is_same_v<
+        typename std::iterator_traits<Iter>::value_type,
+        c10::SymInt>>>
+inline c10::SymInt multiply_integers(Iter begin, Iter end) {
+  return std::accumulate(
+      begin,
+      end,
+      c10::SymInt(1),
+      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
+}
+
+#define DECLARE_SYMINT_OP_INTONLY(scalar_t, RetTy)      \
+  C10_API RetTy operator%(const SymInt& a, scalar_t b); \
+  C10_API RetTy operator%(scalar_t a, const SymInt& b);
+
+#define DECLARE_SYMINT_OP(scalar_t, RetTy)              \
+  C10_API RetTy operator+(const SymInt& a, scalar_t b); \
+  C10_API RetTy operator-(const SymInt& a, scalar_t b); \
+  C10_API RetTy operator*(const SymInt& a, scalar_t b); \
+  C10_API RetTy operator/(const SymInt& a, scalar_t b); \
+  C10_API RetTy operator+(scalar_t a, const SymInt& b); \
+  C10_API RetTy operator-(scalar_t a, const SymInt& b); \
+  C10_API RetTy operator*(scalar_t a, const SymInt& b); \
+  C10_API RetTy operator/(scalar_t a, const SymInt& b); \
+  C10_API bool operator==(const SymInt& a, scalar_t b); \
+  C10_API bool operator!=(const SymInt& a, scalar_t b); \
+  C10_API bool operator<(const SymInt& a, scalar_t b);  \
+  C10_API bool operator<=(const SymInt& a, scalar_t b); \
+  C10_API bool operator>(const SymInt& a, scalar_t b);  \
+  C10_API bool operator>=(const SymInt& a, scalar_t b); \
+  C10_API bool operator==(scalar_t a, const SymInt& b); \
+  C10_API bool operator!=(scalar_t a, const SymInt& b); \
+  C10_API bool operator<(scalar_t a, const SymInt& b);  \
+  C10_API bool operator<=(scalar_t a, const SymInt& b); \
+  C10_API bool operator>(scalar_t a, const SymInt& b);  \
+  C10_API bool operator>=(scalar_t a, const SymInt& b);
+
+DECLARE_SYMINT_OP_INTONLY(int64_t, SymInt)
+DECLARE_SYMINT_OP_INTONLY(int32_t, SymInt)
+DECLARE_SYMINT_OP_INTONLY(uint64_t, SymInt)
+DECLARE_SYMINT_OP_INTONLY(uint32_t, SymInt)
+DECLARE_SYMINT_OP(int64_t, SymInt)
+DECLARE_SYMINT_OP(int32_t, SymInt) // make sure constants work
+DECLARE_SYMINT_OP(uint64_t, SymInt)
+DECLARE_SYMINT_OP(uint32_t, SymInt)
+DECLARE_SYMINT_OP(double, SymFloat)
+DECLARE_SYMINT_OP(float, SymFloat) // just for completeness
+
+// On OSX size_t is different than uint64_t so we have to
+// define it separately
+#if defined(__APPLE__)
+DECLARE_SYMINT_OP_INTONLY(size_t, SymInt)
+DECLARE_SYMINT_OP(size_t, SymInt)
+#endif
+
+#undef DECLARE_SYMINT_OP
+
+C10_API std::ostream& operator<<(std::ostream& os, const SymInt& s);
+C10_API SymInt operator-(const SymInt& s);
+
+inline bool sym_eq(int64_t a, int64_t b) {
+  return a == b;
+}
+
+inline SymBool sym_eq(const SymInt& a, const SymInt& b) {
+  return a.sym_eq(b);
+}
+
+inline bool sym_ne(int64_t a, int64_t b) {
+  return a != b;
+}
+
+inline SymBool sym_ne(const SymInt& a, const SymInt& b) {
+  return a.sym_ne(b);
+}
+
+inline bool sym_lt(int64_t a, int64_t b) {
+  return a < b;
+}
+
+inline SymBool sym_lt(const SymInt& a, const SymInt& b) {
+  return a.sym_lt(b);
+}
+
+inline bool sym_le(int64_t a, int64_t b) {
+  return a <= b;
+}
+
+inline SymBool sym_le(const SymInt& a, const SymInt& b) {
+  return a.sym_le(b);
+}
+
+inline bool sym_gt(int64_t a, int64_t b) {
+  return a > b;
+}
+
+inline SymBool sym_gt(const SymInt& a, const SymInt& b) {
+  return a.sym_gt(b);
+}
+
+inline bool sym_ge(int64_t a, int64_t b) {
+  return a >= b;
+}
+
+inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
+  return a.sym_ge(b);
+}
+
+} // namespace c10
+
+#include <limits>
+
+namespace std {
+
+template <>
+class numeric_limits<c10::SymInt> {
+ public:
+  static constexpr bool is_specialized = true;
+
+  static constexpr int64_t max() noexcept {
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  static constexpr int64_t min() noexcept {
+    return std::numeric_limits<int64_t>::min();
+  }
+
+  static constexpr bool is_signed = true;
+  static constexpr bool is_integer = true;
+};
+
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..b63753b186937f0e6869ee557ca1528bb2d7e340
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymIntArrayRef.h
@@ -0,0 +1,113 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymInt.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/DimVector.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+#include <cstdint>
+#include <optional>
+
+namespace c10 {
+using SymIntArrayRef = ArrayRef<SymInt>;
+
+inline at::IntArrayRef asIntArrayRefUnchecked(c10::SymIntArrayRef ar) {
+  return IntArrayRef(reinterpret_cast<const int64_t*>(ar.data()), ar.size());
+}
+
+// TODO: a SymIntArrayRef containing a heap allocated large negative integer
+// can actually technically be converted to an IntArrayRef... but not with
+// the non-owning API we have here.  We can't reinterpret cast; we have to
+// allocate another buffer and write the integers into it.  If you need it,
+// we can do it.  But I don't think you need it.
+
+inline std::optional<at::IntArrayRef> asIntArrayRefSlowOpt(
+    c10::SymIntArrayRef ar) {
+  for (const c10::SymInt& sci : ar) {
+    if (sci.is_heap_allocated()) {
+      return std::nullopt;
+    }
+  }
+
+  return {asIntArrayRefUnchecked(ar)};
+}
+
+inline at::IntArrayRef asIntArrayRefSlow(
+    c10::SymIntArrayRef ar,
+    const char* file,
+    int64_t line) {
+  for (const c10::SymInt& sci : ar) {
+    TORCH_CHECK(
+        !sci.is_heap_allocated(),
+        file,
+        ":",
+        line,
+        ": SymIntArrayRef expected to contain only concrete integers");
+  }
+  return asIntArrayRefUnchecked(ar);
+}
+
+// Even slower than asIntArrayRefSlow, as it forces an allocation for a
+// destination int, BUT it is able to force specialization (it never errors)
+inline c10::DimVector asIntArrayRefSlowAlloc(
+    c10::SymIntArrayRef ar,
+    const char* file,
+    int64_t line) {
+  c10::DimVector res(ar.size(), 0);
+  for (const auto i : c10::irange(ar.size())) {
+    res[i] = ar[i].guard_int(file, line);
+  }
+  return res;
+}
+
+#define C10_AS_INTARRAYREF_SLOW(a) c10::asIntArrayRefSlow(a, __FILE__, __LINE__)
+#define C10_AS_INTARRAYREF_SLOW_ALLOC(a) \
+  c10::asIntArrayRefSlowAlloc(a, __FILE__, __LINE__)
+
+// Prefer using a more semantic constructor, like
+// fromIntArrayRefKnownNonNegative
+inline SymIntArrayRef fromIntArrayRefUnchecked(IntArrayRef array_ref) {
+  return SymIntArrayRef(
+      reinterpret_cast<const SymInt*>(array_ref.data()), array_ref.size());
+}
+
+inline SymIntArrayRef fromIntArrayRefKnownNonNegative(IntArrayRef array_ref) {
+  return fromIntArrayRefUnchecked(array_ref);
+}
+
+// Checked conversion; iterates as int64_t because `long` is 32-bit on LLP64.
+inline SymIntArrayRef fromIntArrayRefSlow(IntArrayRef array_ref) {
+  for (int64_t i : array_ref) {
+    TORCH_CHECK(
+        SymInt::check_range(i),
+        "IntArrayRef contains an int that cannot be represented as a SymInt: ",
+        i);
+  }
+  return fromIntArrayRefUnchecked(array_ref);
+}
+
+inline c10::SymBool sym_equals(SymIntArrayRef LHS, SymIntArrayRef RHS) {
+  if (LHS.size() != RHS.size()) {
+    return c10::SymBool(false);
+  }
+
+  c10::SymBool result = sym_eq(LHS.size(), RHS.size());
+  for (size_t i = 0; i < RHS.size(); ++i) {
+    c10::SymBool equals = sym_eq(LHS[i], RHS[i]);
+    std::optional<bool> equals_bool = equals.maybe_as_bool();
+
+    if (equals_bool.has_value() && !*equals_bool) {
+      // Early return if element comparison is known to be false
+      return equals;
+    }
+    result = result.sym_and(equals);
+  }
+  return result;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4257684ea150ac4f8f1bda39ab4c1212c1929ed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymNodeImpl.h
@@ -0,0 +1,261 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/intrusive_ptr.h>
+#include <cstdint>
+#include <optional>
+#include <ostream>
+#include <string>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
+
+namespace c10 {
+
+class SymNodeImpl;
+using SymNode = c10::intrusive_ptr<SymNodeImpl>;
+
+// When you add a method, you also need to edit
+// torch/csrc/jit/python/init.cpp
+// torch/csrc/utils/python_symnode.h
+// c10/core/ConstantSymNodeImpl.h
+class C10_API SymNodeImpl : public c10::intrusive_ptr_target { // base of all symbolic nodes
+ public:
+  ~SymNodeImpl() override = default;
+
+  template <typename T>
+  c10::intrusive_ptr<T> dyn_cast() const { // dynamic_cast to T, wrapped via reclaim_copy
+    return c10::intrusive_ptr<T>::reclaim_copy(dynamic_cast<T*>(this));
+  }
+
+  // These could be pure virtual when we implement LTC versions.
+  virtual bool is_int() { // base stub: fails the check with "NYI"
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool is_bool() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool is_float() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool is_nested_int() const { // base default: not a nested int
+    return false;
+  }
+  virtual SymNode add(const SymNode& other) { // arithmetic stubs: all "NYI" here
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sub(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode mul(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  // NB: legacy, prefer float_truediv or int_truediv
+  virtual SymNode truediv(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode float_truediv(const SymNode& other) { // default: legacy truediv
+    return truediv(other);
+  }
+  virtual SymNode int_truediv(const SymNode& other) { // default: legacy truediv
+    return truediv(other);
+  }
+  // NB: legacy, prefer float_pow or pow_by_natural
+  virtual SymNode pow(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode float_pow(const SymNode& other) { // default: legacy pow
+    return pow(other);
+  }
+  virtual SymNode pow_by_natural(const SymNode& other) { // default: legacy pow
+    return pow(other);
+  }
+  // NB: legacy, prefer int_floordiv
+  virtual SymNode floordiv(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode int_floordiv(const SymNode& other) { // default: legacy floordiv
+    return floordiv(other);
+  }
+  virtual SymNode mod(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode eq(const SymNode& other) { // comparison stubs: all "NYI" here
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode ne(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode gt(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode lt(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode le(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode ge(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode ceil() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode floor() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode neg() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_min(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_max(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_or(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_and(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_not() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_ite(const SymNode& then_val, const SymNode& else_val) {
+    TORCH_CHECK(false, "NYI");
+  }
+  // NB: self is ignored here, only the arguments are used
+  virtual SymNode is_contiguous(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode is_channels_last_contiguous_2d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode is_channels_last_contiguous_3d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode is_channels_last_strides_2d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode is_channels_last_strides_3d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode is_non_overlapping_and_dense(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode clone() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_float() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode wrap_int(int64_t num) { // wrap_* stubs: all "NYI" here
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode wrap_float(double num) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode wrap_bool(bool num) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual int64_t guard_int(const char* file, int64_t line) { // guard_* take the call site
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool guard_bool(const char* file, int64_t line) { // fallback target of the guards below
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual double guard_float(const char* file, int64_t line) {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool guard_size_oblivious(const char* file, int64_t line) {
+    // No improvement for unbacked SymBools by default, replace this
+    // with a better implementation!
+    return guard_bool(file, line);
+  }
+  virtual bool guard_or_false(const char* file, int64_t line) {
+    // Note: PT2 primarily uses PythonSymNodeImpl for this functionality.
+    // XLA is currently the main consumer of this fallback path since it uses
+    // ahead-of-time compilation and cannot depend on Python runtime.
+    return guard_bool(file, line);
+  }
+  virtual bool statically_known_true(const char* file, int64_t line) {
+    // Note: PT2 primarily uses PythonSymNodeImpl for this functionality.
+    // XLA is currently the main consumer of this fallback path since it uses
+    // ahead-of-time compilation and cannot depend on Python runtime.
+    return guard_bool(file, line);
+  }
+  virtual bool guard_or_true(const char* file, int64_t line) {
+    // Note: PT2 primarily uses PythonSymNodeImpl for this functionality.
+    // XLA is currently the main consumer of this fallback path since it uses
+    // ahead-of-time compilation and cannot depend on Python runtime.
+    return guard_bool(file, line);
+  }
+  virtual bool expect_true(const char* file, int64_t line) {
+    // No improvement for unbacked SymBools by default, replace this
+    // with a better implementation!
+    return guard_bool(file, line);
+  }
+  virtual int64_t int_() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool bool_() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual bool has_hint() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual std::string str() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual std::string _graph_repr() { // default: same text as str()
+    return str();
+  }
+  virtual std::optional<int64_t> nested_int() { // optional queries default to nullopt
+    return std::nullopt;
+  }
+  virtual std::optional<int64_t> nested_int_coeff() {
+    return std::nullopt;
+  }
+  virtual std::optional<int64_t> constant_int() {
+    return std::nullopt;
+  }
+  virtual std::optional<bool> constant_bool() {
+    return std::nullopt;
+  }
+  virtual std::optional<int64_t> maybe_as_int() {
+    return std::nullopt;
+  }
+  virtual bool is_constant() { // base default: not constant
+    return false;
+  }
+  virtual bool is_symbolic() { // base default: symbolic
+    return true;
+  }
+  std::ostream& operator<<(std::ostream& os) { // member form: node is the left operand
+    os << str();
+    return os;
+  }
+};
+
+} // namespace c10
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h
new file mode 100644
index 0000000000000000000000000000000000000000..411c81a98bac68a34c7c2bafbf78b096bf2bc9cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/SymbolicShapeMeta.h
@@ -0,0 +1,234 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/SymBool.h>
+#include <c10/core/SymInt.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/DimVector.h>
+
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <utility>
+
+namespace c10 {
+
+class C10_API SymbolicShapeMeta { // symbolic sizes/strides plus lazily cached derived values
+ public:
+  // Basic metadata from which other quantities are derived
+  SymDimVector sizes_ = {0};
+  SymDimVector strides_ = {1};
+  SymInt storage_offset_ = 0;
+
+  bool strides_valid_ = true; // e.g. for sparse where there are no strides
+
+  SymbolicShapeMeta() = default;
+  ~SymbolicShapeMeta() = default;
+  SymbolicShapeMeta(const SymbolicShapeMeta& other); // only copy construction is allowed
+  SymbolicShapeMeta(SymbolicShapeMeta&& other) = delete;
+  SymbolicShapeMeta& operator=(const SymbolicShapeMeta& other) = delete;
+  SymbolicShapeMeta& operator=(SymbolicShapeMeta&& other) = delete;
+
+  void refresh_numel() {
+    // Non-const, don't need to hold mutables_ lock
+    available_.fetch_and(~numel_avail); // clear only the numel bit
+    numel_ = 1;
+  }
+
+  void refresh_contiguous() {
+    // Non-const, don't need to hold mutables_ lock
+    available_.fetch_and(numel_avail); // clear every bit except the numel bit
+    is_contiguous_ = false;
+    is_channels_last_contiguous_ = false;
+    is_channels_last_3d_contiguous_ = false;
+    is_channels_last_ = false;
+    is_channels_last_3d_ = false;
+    is_non_overlapping_and_dense_ = false;
+  }
+
+  int64_t dim() const { // number of entries in sizes_
+    return static_cast<int64_t>(sizes_.size());
+  }
+
+  // Accessors for derived quantities, computed lazily on first access
+
+  bool has_numel() const { // each has_* tests one bit of available_
+    return available_.load() & numel_avail;
+  }
+  bool has_is_contiguous() const {
+    return available_.load() & is_contiguous_avail;
+  }
+  bool has_is_channels_last_contiguous() const {
+    return available_.load() & is_channels_last_contiguous_avail;
+  }
+  bool has_is_channels_last_3d_contiguous() const {
+    return available_.load() & is_channels_last_3d_contiguous_avail;
+  }
+  bool has_is_channels_last() const {
+    return available_.load() & is_channels_last_avail;
+  }
+  bool has_is_channels_last_3d() const {
+    return available_.load() & is_channels_last_3d_avail;
+  }
+  bool has_is_non_overlapping_and_dense() const {
+    return available_.load() & is_non_overlapping_and_dense_avail;
+  }
+
+  // Accessors to cached derived properties
+  // DO NOT call with mutables_ lock held
+  const SymInt& numel() const { // runs init_numel() first when the bit is unset
+    if (C10_UNLIKELY(!has_numel())) {
+      init_numel();
+    }
+    return numel_;
+  }
+
+  const SymBool& is_contiguous(at::MemoryFormat memory_format) const {
+    if (memory_format == at::MemoryFormat::ChannelsLast) {
+      return this->is_channels_last_contiguous();
+    } else if (memory_format == at::MemoryFormat::ChannelsLast3d) {
+      return this->is_channels_last_3d_contiguous();
+    }
+    return this->is_contiguous(); // any other format: plain contiguity
+  }
+
+  const SymBool& is_contiguous() const {
+    if (C10_UNLIKELY(!has_is_contiguous())) {
+      init_is_contiguous();
+    }
+    return is_contiguous_;
+  }
+
+  const SymBool& is_channels_last_contiguous() const {
+    if (C10_UNLIKELY(!has_is_channels_last_contiguous())) {
+      init_is_channels_last_contiguous();
+    }
+    return is_channels_last_contiguous_;
+  }
+
+  const SymBool& is_channels_last_3d_contiguous() const {
+    if (C10_UNLIKELY(!has_is_channels_last_3d_contiguous())) {
+      init_is_channels_last_3d_contiguous();
+    }
+    return is_channels_last_3d_contiguous_;
+  }
+
+  const SymBool& is_channels_last() const {
+    if (C10_UNLIKELY(!has_is_channels_last())) {
+      init_is_channels_last();
+    }
+    return is_channels_last_;
+  }
+
+  const SymBool& is_channels_last_3d() const {
+    if (C10_UNLIKELY(!has_is_channels_last_3d())) {
+      init_is_channels_last_3d();
+    }
+    return is_channels_last_3d_;
+  }
+
+  const SymBool& is_non_overlapping_and_dense() const {
+    if (C10_UNLIKELY(!has_is_non_overlapping_and_dense())) {
+      init_is_non_overlapping_and_dense();
+    }
+    return is_non_overlapping_and_dense_;
+  }
+
+  // Assumptions so we can short-circuit computation
+  // NOTE: Don't need to lock mutables_ since these aren't const
+  void assume_contiguous(SymBool val = true) { // store the value, then set its bit
+    is_contiguous_ = std::move(val);
+    available_.fetch_or(is_contiguous_avail);
+  }
+  void assume_channels_last_contiguous(SymBool val = true) {
+    is_channels_last_contiguous_ = std::move(val); // must match the bit set below
+    available_.fetch_or(is_channels_last_contiguous_avail);
+  }
+  void assume_channels_last_3d_contiguous(SymBool val = true) {
+    is_channels_last_3d_contiguous_ = std::move(val);
+    available_.fetch_or(is_channels_last_3d_contiguous_avail);
+  }
+  void assume_channels_last(SymBool val = true) {
+    is_channels_last_ = std::move(val);
+    available_.fetch_or(is_channels_last_avail);
+  }
+  void assume_channels_last_3d(SymBool val = true) {
+    is_channels_last_3d_ = std::move(val);
+    available_.fetch_or(is_channels_last_3d_avail);
+  }
+  void assume_non_overlapping_and_dense(SymBool val = true) {
+    is_non_overlapping_and_dense_ = std::move(val);
+    available_.fetch_or(is_non_overlapping_and_dense_avail);
+  }
+
+ private:
+  SymBool compute_contiguous() const;
+  SymBool compute_channels_last_contiguous_2d() const;
+  SymBool compute_channels_last_contiguous_3d() const;
+  SymBool compute_strides_like_channels_last_2d() const;
+  SymBool compute_strides_like_channels_last_3d() const;
+  SymBool compute_non_overlapping_and_dense() const;
+
+  // These are little wrappers over the real compute_ functions that
+  // can make use of other contiguity fields to short circuit.
+  // They need to be implemented separately for SymBool, as SymBool does
+  // not short circuit.
+  // TODO: should the SymBool cases avoid the short circuit?  Need to reason
+  // if it's correct, and reason if the simpler expressions are better for
+  // analysis (maybe not!)
+
+  SymBool compute_channels_last_contiguous_3d_dim5() const;
+  SymBool compute_channels_last_2d_dim5() const;
+  SymBool compute_channels_last_3d_dim5() const;
+  SymBool compute_is_non_overlapping_and_dense_dim4() const;
+  SymBool compute_is_non_overlapping_and_dense_dim5() const;
+  SymBool compute_is_non_overlapping_and_dense_anydim() const;
+
+  void init_numel() const; // init_* are called by the accessors on a cache miss
+  void init_is_contiguous() const;
+  void init_is_channels_last_contiguous() const;
+  void init_is_channels_last_3d_contiguous() const;
+  void init_is_channels_last() const;
+  void init_is_channels_last_3d() const;
+  void init_is_non_overlapping_and_dense() const;
+
+  // NOTE: These only set if !has_foo()
+  void set_numel(SymInt val) const;
+  void set_is_contiguous(SymBool val) const;
+  void set_is_channels_last_contiguous(SymBool val) const;
+  void set_is_channels_last_3d_contiguous(SymBool val) const;
+  void set_is_channels_last(SymBool val) const;
+  void set_is_channels_last_3d(SymBool val) const;
+  void set_is_non_overlapping_and_dense(SymBool val) const;
+
+  // Lazily initialized variables, with the corresponding available_ flag
+  // indicating whether the value has been initialized
+  mutable std::atomic<int> available_{0};
+
+  enum avail { // one bit per cached value, OR-ed into available_
+    numel_avail = 1 << 0,
+    is_contiguous_avail = 1 << 1,
+    is_channels_last_contiguous_avail = 1 << 2,
+    is_channels_last_3d_contiguous_avail = 1 << 3,
+    is_channels_last_avail = 1 << 4,
+    is_channels_last_3d_avail = 1 << 5,
+    is_non_overlapping_and_dense_avail = 1 << 6,
+  };
+
+  // Mutex to prevent races when initializing the variable from const accessors
+  mutable std::mutex mutables_;
+  mutable SymInt numel_ = 1;
+  mutable SymBool is_contiguous_{true};
+  mutable SymBool is_channels_last_contiguous_{false};
+  mutable SymBool is_channels_last_3d_contiguous_{false};
+  mutable SymBool is_channels_last_{false};
+  mutable SymBool is_channels_last_3d_{false};
+  mutable SymBool is_non_overlapping_and_dense_{true};
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..03faea3fbc70500bda37a8099657e80f38976657
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorImpl.h
@@ -0,0 +1,3333 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/InferenceMode.h>
+#include <c10/core/Layout.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/ScalarTypeToTypeMeta.h>
+#include <c10/core/Storage.h>
+#include <c10/core/SymBool.h>
+#include <c10/core/SymInt.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/core/SymbolicShapeMeta.h>
+#include <c10/core/WrapDimMinimal.h>
+#include <c10/core/impl/PyObjectSlot.h>
+#include <c10/core/impl/SizesAndStrides.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/DimVector.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Flags.h>
+#include <c10/util/accumulate.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/irange.h>
+#include <c10/util/safe_numerics.h>
+#include <c10/util/typeid.h>
+#include <optional>
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+// A global boolean variable to control whether we free memory when a Tensor
+// is shrunk to a smaller size. As a result, a Tensor is always going to
+// keep the memory allocated for its maximum capacity reshaped to so far.
+//
+// This parameter is respected by "upper-case" methods which call Resize()
+// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_
+// or ShrinkTo, both of which guarantee never to free memory.
+C10_DECLARE_bool(caffe2_keep_on_shrink);
+
+// Since we can have high variance in blob memory allocated across different
+// inputs in the same run, we will shrink the blob only if the memory gain
+// is larger than this flag in bytes.  This only applies to functions which
+// respect caffe2_keep_on_shrink.
+C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory);
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default")
+
+namespace at {
+class Tensor;
+class TensorBase;
+} // namespace at
+
+namespace c10 {
+
+/**
+ * A utility function to convert vector<int> to vector<int64_t>.
+ */
+inline std::vector<int64_t> ToVectorint64_t(const ArrayRef<int>& src) {
+  return {src.begin(), src.end()}; // iterator-range copy of every element
+}
+
+/**
+ * Return product of all dimensions starting from k
+ */
+inline int64_t size_from_dim_(int k, IntArrayRef dims) {
+  int64_t product = 1; // stays 1 when the range below is empty
+  for (const auto axis : c10::irange(k, dims.size())) {
+    product *= dims[axis];
+  }
+  return product;
+}
+
+// Product of all dims up to k (not including dims[k])
+inline int64_t size_to_dim_(int k, IntArrayRef dims) {
+  TORCH_CHECK(k >= 0 && static_cast<size_t>(k) <= dims.size());
+  int64_t product = 1; // k == 0 multiplies nothing and yields 1
+  for (const auto axis : c10::irange(k)) {
+    product *= dims[axis];
+  }
+  return product;
+}
+
+// Product of all dims between k and l (not including dims[k] and dims[l])
+inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) {
+  TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size());
+  // The two endpoints may be given in either order; sort them so that one
+  // loop covers both cases. Both endpoints themselves are excluded, and
+  // equal or adjacent endpoints leave the product at 1.
+  const int lower = std::min(k, l);
+  const int upper = std::max(k, l);
+
+  int64_t product = 1;
+  for (int axis = lower + 1; axis < upper; ++axis) {
+    product *= dims[axis];
+  }
+  return product;
+}
+
+// Wrap around axis_index if it is negative, s.t., -1 is the last dim
+inline int canonical_axis_index_(int axis_index, int ndims) {
+  // Accepted inputs lie in [-ndims, ndims); each bound has its own check.
+  TORCH_CHECK(axis_index >= -ndims);
+  TORCH_CHECK(axis_index < ndims);
+  // A negative index counts back from the end, so shift it by ndims;
+  // a non-negative index is returned unchanged.
+  return axis_index < 0 ? axis_index + ndims : axis_index;
+}
+
+using PlacementDtor = void (*)(void*, size_t);
+
+/*
+ * A Context that will call an extra placement deleter during
+ * destruction.
+ *
+ * Accepts an already constructed DataPtr and stores it as a member;
+ * during destruction, the extra deleter is called on the underlying
+ * data pointer before the DataPtr is destructed.
+ * `data_ptr_` owns the memory.
+ */
+struct C10_API PlacementDeleteContext {
+  DataPtr data_ptr_; // wrapped pointer; destroyed after the destructor body runs
+  PlacementDtor placement_dtor_; // called as placement_dtor_(ptr, size_)
+  size_t size_; // second argument passed to placement_dtor_
+
+  PlacementDeleteContext(
+      DataPtr&& data_ptr,
+      PlacementDtor placement_dtor,
+      size_t size)
+      : data_ptr_(std::move(data_ptr)), // takes over the caller's DataPtr
+        placement_dtor_(placement_dtor),
+        size_(size) {}
+
+  PlacementDeleteContext(PlacementDeleteContext&&) noexcept = delete; // neither movable nor copyable
+  PlacementDeleteContext(const PlacementDeleteContext&) = delete;
+  PlacementDeleteContext& operator=(const PlacementDeleteContext&) = delete;
+  PlacementDeleteContext& operator=(PlacementDeleteContext&&) = delete;
+  static DataPtr makeDataPtr( // declared only; the definition is not in this header
+      DataPtr&& data_ptr,
+      PlacementDtor placement_dtor,
+      size_t size,
+      Device device);
+  ~PlacementDeleteContext() {
+    placement_dtor_(data_ptr_.get(), size_); // run the extra deleter first
+    // original memory will be freed when data_ptr_ is destructed
+  }
+};
+
+struct C10_API AutogradMetaInterface { // abstract: every operation is pure virtual
+  virtual void set_requires_grad(
+      bool requires_grad,
+      at::TensorImpl* self_impl) = 0;
+  virtual bool requires_grad() const = 0;
+  virtual at::Tensor& mutable_grad() = 0; // mutable access to the gradient tensor
+  virtual const at::Tensor& grad() const = 0; // read-only counterpart of mutable_grad()
+  virtual const at::Tensor& fw_grad(uint64_t level, const at::TensorBase& self)
+      const = 0;
+  virtual void set_fw_grad(
+      const at::TensorBase& new_grad,
+      const at::TensorBase& self,
+      uint64_t level,
+      bool is_inplace_op) = 0;
+  virtual ~AutogradMetaInterface(); // declared here, defined out of line
+};
+
+namespace impl {
+
+// Unfortunately, the definition of AutogradMeta lives in a separate
+// compilation unit than TensorImpl (libtorch.so versus libc10.so)
+// which means that we cannot construct an AutogradMeta from TensorImpl,
+// not even from the cpp file.  So we have to indirect it through a factory
+// function which will be initialized when we load libtorch.so.
+
+struct C10_API AutogradMetaFactory { // abstract factory; see the comment block above
+  virtual ~AutogradMetaFactory() = default;
+  virtual std::unique_ptr<AutogradMetaInterface> make() const = 0; // caller owns the result
+  // This method is the dumbest method.  But I don't have access
+  // to Tensor (not TensorImpl) which is undefined in this header.
+  virtual const at::Tensor& undefined_tensor() const = 0;
+};
+
+C10_API void SetAutogradMetaFactory(AutogradMetaFactory* factory);
+C10_API AutogradMetaFactory* GetAutogradMetaFactory();
+
+struct C10_API AutogradMetaFactoryRegisterer{ // constructing one registers the factory
+    explicit AutogradMetaFactoryRegisterer(AutogradMetaFactory * factory){
+        SetAutogradMetaFactory(factory);
+} // end of constructor (this brace does not close a namespace)
+}; // struct AutogradMetaFactoryRegisterer
+
+} // namespace impl
+
+struct C10_API NamedTensorMetaInterface { // base class; both operations assert unless overridden
+  virtual ~NamedTensorMetaInterface() = default;
+  virtual std::unique_ptr<NamedTensorMetaInterface> clone() const {
+    TORCH_INTERNAL_ASSERT( // base version always fails this assert
+        false, "Not implemented: NamedTensorMetaInterface::clone");
+  }
+  virtual int64_t slow_dim() const {
+    TORCH_INTERNAL_ASSERT( // base version always fails this assert
+        false, "Not implemented: NamedTensorMetaInterface::slow_dim");
+  }
+};
+
+// For ease of copy pasting
+#if 0
+is_contiguous
+is_channels_last_contiguous
+is_channels_last_3d_contiguous
+is_channels_last
+is_channels_last_3d
+is_non_overlapping_and_dense
+#endif
+
+/**
+ * This structure is intended to hold additional metadata of the specific device
+ * backend.
+ **/
+struct C10_API BackendMeta : intrusive_ptr_target {
+  ~BackendMeta() override = default;
+  virtual intrusive_ptr<BackendMeta> clone( // base version shares rather than copies
+      const intrusive_ptr<BackendMeta>& ptr) const {
+    return ptr; // returns the pointer it was given
+  }
+};
+
+struct C10_API ExtraMeta {
+  std::unique_ptr<c10::SymbolicShapeMeta> symbolic_shape_meta_ = nullptr;
+  std::unique_ptr<c10::NamedTensorMetaInterface> named_tensor_meta_ = nullptr;
+  intrusive_ptr<c10::BackendMeta> backend_meta_ = nullptr;
+  std::optional<std::string> custom_data_ptr_error_msg_ = std::nullopt;
+  std::optional<std::string> custom_storage_error_msg_ = std::nullopt;
+
+  ExtraMeta() = default;
+  ~ExtraMeta() = default;
+  // Copy: each pointer member is cloned only when the source holds one and is
+  // left null otherwise; the optional messages are copied as they are.
+  // Members are initialised in declaration order.
+  ExtraMeta(const ExtraMeta& other)
+      : symbolic_shape_meta_(
+            other.symbolic_shape_meta_
+                ? std::make_unique<c10::SymbolicShapeMeta>(
+                      *other.symbolic_shape_meta_)
+                : nullptr),
+        named_tensor_meta_(
+            other.named_tensor_meta_ ? other.named_tensor_meta_->clone()
+                                     : nullptr),
+        backend_meta_(
+            other.backend_meta_
+                ? other.backend_meta_->clone(other.backend_meta_)
+                : nullptr),
+        custom_data_ptr_error_msg_(other.custom_data_ptr_error_msg_),
+        custom_storage_error_msg_(other.custom_storage_error_msg_) {}
+  ExtraMeta& operator=(const ExtraMeta& other) = delete;
+  ExtraMeta(ExtraMeta&& other) = delete;
+  ExtraMeta& operator=(ExtraMeta&& other) = delete;
+
+  ExtraMeta(
+      std::unique_ptr<c10::SymbolicShapeMeta> symbolic_shape_meta,
+      std::unique_ptr<c10::NamedTensorMetaInterface> named_tensor_meta,
+      intrusive_ptr<c10::BackendMeta> backend_meta,
+      std::optional<std::string> custom_data_ptr_error_msg = std::nullopt,
+      std::optional<std::string> custom_storage_access_error_msg = std::nullopt)
+      : symbolic_shape_meta_(std::move(symbolic_shape_meta)),
+        named_tensor_meta_(std::move(named_tensor_meta)),
+        backend_meta_(std::move(backend_meta)),
+        custom_data_ptr_error_msg_(std::move(custom_data_ptr_error_msg)),
+        custom_storage_error_msg_(std::move(custom_storage_access_error_msg)) {}
+
+  std::unique_ptr<ExtraMeta> clone() const { // heap copy via the copy constructor
+    return std::make_unique<ExtraMeta>(*this);
+  }
+};
+
+// NOTE [ Version Counter Sharing ]
+//
+// Every Tensor has a version counter. Version counters are incremented whenever
+// the data or size of a tensor changes through in-place Variable operations.
+// Version counters are used to detect modifications to saved variables which
+// would result in incorrect gradient calculations. Version counters may be
+// shared between Variables:
+//
+// 1. A view shares the version counter of the base Variable,
+// 2. `x.detach()` shares the version counter of `x`,
+// 3. Unpacked saved variables share the version counter of the source.
+//
+// Version counters are not shared in these scenarios:
+//
+// 1. When we replace a `Variable`'s underlying `Tensor` by calling
+// `set_data(...)`,
+// 2. `x.data` does not share the version counter of `x`. (See discussion at
+// https://github.com/pytorch/pytorch/issues/5396)
+//
+// Question: Why do we put the version counter in TensorImpl instead of
+// AutogradMeta?
+//
+// Answer: After the Variable/Tensor merge, a tensor will not have AutogradMeta
+// when its `requires_grad_` is false, but when we use this tensor in the
+// forward pass of a function that requires saving this tensor for backward, we
+// need to keep track of this tensor's version to make sure it's always valid in
+// the autograd graph.
+//
+// To achieve this goal, we put the version counter in TensorImpl instead of
+// AutogradMeta, and have it always be available. This allows us to have the
+// optimization of not carrying AutogradMeta when a tensor doesn't require
+// gradient.
+//
+// A hypothetical alternative way to achieve this goal is to initialize
+// AutogradMeta and create the version counter for the non-requires-grad tensor
+// only when it's saved for backward. However, since saving a tensor for
+// backward happens in the forward pass, and our invariant is that forward pass
+// needs to be thread-safe, lazy-initializing AutogradMeta when saving a tensor
+// can introduce race conditions when we are running the forward pass in
+// multi-thread scenarios, thus making the forward pass not thread-safe anymore,
+// which breaks the invariant.
+struct C10_API VariableVersion {
+ private:
+  struct VersionCounter : intrusive_ptr_target {
+    VersionCounter(uint32_t version) : version_(version) {}
+    std::atomic<uint32_t> version_;
+  };
+  c10::intrusive_ptr<VersionCounter> version_counter_;
+
+ public:
+  // Note [Disabled VariableVersion]
+  // VariableVersion struct has an intrusive_ptr pointing VersionCounter struct
+  // with an atomic variable. Thus `VariableVersion(/*version=*/0)` is not as
+  // cheap as we expected. In some cases constructing a VariableVersion with
+  // version 0 is not necessary so we add a cheap constructor which
+  // doesn't allocate the intrusive_ptr.
+  // Example use cases are:
+  //  - Inference tensors don't track version counter, so they'll just always
+  //    have disabled VariableVersion.
+  //  - In SavedVariable class we override version_counter_ inside its
+  //  constructor
+  //    so that we can use the cheap constructor there.
+  enum Disabled { DISABLED };
+  // It's okay to return true even for inference tensor which
+  // doesn't have version counter enabled.
+  // We want to be permissive here since in many cases (e.g. make_variable)
+  // we can std::move a TensorImpl if there's no other uses which saves us
+  // an additional TensorImpl allocation.
+  bool unique() const {
+    return version_counter_ ? 1 == version_counter_.use_count() : true;
+  }
+  // NOTE: As of C++11 and 14, default-constructing a std::atomic variable
+  // leaves it in a persistently undefined state. See
+  // https://cplusplus.github.io/LWG/issue2334.
+  VariableVersion(uint32_t version)
+      : version_counter_(c10::make_intrusive<VersionCounter>(version)) {}
+  VariableVersion(Disabled /*unused*/ = DISABLED) {}
+
+  bool enabled() const {
+    return version_counter_;
+  }
+
+  // Note [Inplace update inference tensor]
+  // 1. Inplace update to inference tensor is forbidden in normal mode.
+  //   For example:
+  //     inference_tensor.copy_(normal_tensor_requires_grad)
+  //   This inplace makes inference_tensor have requires_grad=True and
+  //   have a grad_fn.  This is bad because views of `inference_tensor`
+  //   created in InferenceMode won't be able to know the grad_fn since
+  //   their ViewMeta were not recorded. To match NoGradMode behavior
+  //   that "inplace update to a view created in NoGradMode raise an error",
+  //   we just ban inplace update to inference tensor since we can't tell
+  //   if an inference tensor is a view created in InferenceMode.
+  //
+  //   Note that views of normal tensor created in InferenceMode has proper
+  //   ViewMeta so that they're aware of the grad_fn correctly.
+  //
+  // 2. Inplace update to inference tensor in inference tensor doesn't bump
+  //    version counter.
+  //    * It either doesn't call bump() by skipping ADInplaceOrView kernel,
+  //      - e.g. inference_tensor.add_(1)
+  //    * or bump() is a no-op for inference tensor.
+  //      - e.g. inference_tensor.add_(normal_tensor)
+  // Increments the version stored in the counter, if there is one.
+  // Fails the TORCH_CHECK below when this handle has no counter and
+  // InferenceMode is not enabled; with no counter inside InferenceMode the
+  // call does nothing.
+  void bump() {
+    // TODO: Replace the link to the documentation once it's available.
+    // Each literal ends with a space so the concatenated sentences do not run
+    // together (previously "...not allowed.You can ...").
+    TORCH_CHECK(
+        version_counter_ || InferenceMode::is_enabled(),
+        "Inplace update to inference tensor outside InferenceMode is not allowed. "
+        "You can make a clone to get a normal tensor before doing inplace update. "
+        "See https://github.com/pytorch/rfcs/pull/17 for more details.");
+    if (version_counter_) {
+      ++version_counter_->version_;
+    }
+  }
+
+  // Overwrites the stored version with `i`.
+  // Fails the TORCH_CHECKs below when this handle has no version counter or
+  // when `i` is negative.
+  // NOTE(review): current_version() returns uint32_t, so an `i` above that
+  // range is presumably narrowed on assignment -- confirm against the
+  // VersionCounter definition.
+  void set_version(int64_t i) {
+    TORCH_CHECK(
+        version_counter_,
+        "Tried to call torch.autograd._unsafe_set_version() on a tensor "
+        "that does not have a version counter. Was it created in inference mode?");
+    TORCH_CHECK(i >= 0, "Cannot set a version_counter to a value below 0: ", i);
+    version_counter_->version_ = i;
+  }
+
+  // Inference tensor doesn't have version counter so it shouldn't be
+  // accessed.
+  // Returns the version held by the counter; fails the TORCH_CHECK below when
+  // this handle has no counter.
+  uint32_t current_version() const {
+    TORCH_CHECK(
+        version_counter_, "Inference tensors do not track version counter.");
+    return version_counter_->version_;
+  }
+};
+
+// Forward declaration of TensorImpl needed for forward declaration of
+// C10_TensorImpl_Size_Check_Dummy_Class
+struct C10_API TensorImpl;
+
+/**
+ * NOTE: Some TensorImpl methods are small and not overridden in the
+ * PyTorch codebase itself, but may theoretically need to be
+ * overridden by third-party TensorImpl subclasses. This macro allows
+ * users that need maximum performance and don't need these extension
+ * points to disable them with a build-time flag. (In particular,
+ * XLA's XLATensorImpl currently overrides these methods, so we can't
+ * enable this flag by default.)
+ */
+#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
+#define TENSORIMPL_MAYBE_VIRTUAL
+#else
+#define TENSORIMPL_MAYBE_VIRTUAL virtual
+#endif
+
+/**
+ * The low-level representation of a tensor, which contains a pointer
+ * to a storage (which contains the actual data) and metadata (e.g., sizes and
+ * strides) describing this particular view of the data as a tensor.
+ *
+ * Some basic characteristics about our in-memory representation of
+ * tensors:
+ *
+ *  - It contains a pointer to a storage struct (Storage/StorageImpl)
+ *    which contains the pointer to the actual data and records the
+ *    data type and device of the view.  This allows multiple tensors
+ *    to alias the same underlying data, which allows to efficiently
+ *    implement differing *views* on a tensor.
+ *
+ *  - The tensor struct itself records view-specific metadata about
+ *    the tensor, e.g., sizes, strides and offset into storage.
+ *    Each view of a storage can have a different size or offset.
+ *
+ *  - This class is intrusively refcounted.  It is refcounted so that
+ *    we can support prompt deallocation of large tensors; it is
+ *    intrusively refcounted so that we can still perform reference
+ *    counted operations on raw pointers, which is often more convenient
+ *    when passing tensors across language boundaries.
+ *
+ *  - For backwards-compatibility reasons, a tensor may be in an
+ *    uninitialized state.  A tensor may be uninitialized in the following
+ *    two ways:
+ *
+ *      - A tensor may be DTYPE UNINITIALIZED.  A tensor of this
+ *        form has an uninitialized dtype.  This situation most
+ *        frequently arises when a user writes Tensor x(CPU).  The dtype
+ *        is subsequently initialized when mutable_data<T>() is
+ *        invoked for the first time.
+ *
+ *      - A tensor may be STORAGE UNINITIALIZED.  A tensor of this form
+ *        has non-zero size, but has a storage with a null data pointer.
+ *        This situation most frequently arises when a user calls
+ *        Resize() or FreeMemory().  This is because Caffe2 historically
+ *        does lazy allocation: allocation of data doesn't occur until
+ *        mutable_data<T>() is invoked.  A tensor with zero size is
+ *        always storage initialized, because no allocation is necessary
+ *        in this case.
+ *
+ *    All combinations of these two uninitialized states are possible.
+ *    Consider the following transcript in idiomatic Caffe2 API:
+ *
+ *      Tensor x(CPU); // x is storage-initialized, dtype-UNINITIALIZED
+ *      x.Resize(4); // x is storage-UNINITIALIZED, dtype-UNINITIALIZED
+ *      x.mutable_data<float>(); // x is storage-initialized, dtype-initialized
+ *      x.FreeMemory(); // x is storage-UNINITIALIZED, dtype-initialized.
+ *
+ *    All other fields on tensor are always initialized.  In particular,
+ *    size is always valid. (Historically, a tensor declared as Tensor x(CPU)
+ *    also had uninitialized size, encoded as numel == -1, but we have now
+ *    decided to default to zero size, resulting in numel == 0).
+ *
+ *    Uninitialized storages MUST be uniquely owned, to keep our model
+ *    simple.  Thus, we will reject operations which could cause an
+ *    uninitialized storage to become shared (or a shared storage to
+ *    become uninitialized, e.g., from FreeMemory).
+ *
+ *    In practice, tensors which are storage-UNINITIALIZED and
+ *    dtype-UNINITIALIZED are *extremely* ephemeral: essentially,
+ *    after you do a Resize(), you basically always call mutable_data()
+ *    immediately afterwards.  Most functions are not designed to
+ *    work if given a storage-UNINITIALIZED, dtype-UNINITIALIZED tensor.
+ *
+ *    We intend to eliminate all uninitialized states, so that every
+ *    tensor is fully initialized in all fields.  Please do not write new code
+ *    that depends on these uninitialized states.
+ */
+struct C10_API TensorImpl : public c10::intrusive_ptr_target {
+  TensorImpl() = delete;
+  ~TensorImpl() override;
+  // Note [Enum ImplType]
+  // This enum is temporary. In the followup refactor we should
+  // think about how to specialize TensorImpl creation for view
+  // tensors. Currently we only special case its key_set_ but
+  // there's also potential to share version_counter_ directly
+  // without creating first and then override in as_view.
+  enum ImplType { VIEW };
+
+  /**
+   * Construct a 1-dim 0-size tensor backed by the given storage.
+   */
+  TensorImpl(
+      Storage&& storage,
+      DispatchKeySet /*key_set*/,
+      const caffe2::TypeMeta data_type);
+
+  // See Note [Enum ImplType]
+  TensorImpl(
+      ImplType /*unused*/,
+      Storage&& storage,
+      DispatchKeySet /*key_set*/,
+      const caffe2::TypeMeta data_type);
+
+  /**
+   * Construct a 1-dim 0 size tensor that doesn't have a storage.
+   */
+  TensorImpl(
+      DispatchKeySet /*key_set*/,
+      const caffe2::TypeMeta data_type,
+      std::optional<c10::Device> device_opt);
+
+  // Legacy constructors so I don't have to go update call sites.
+  // TODO: When Variable is added, delete these constructors
+  // Single-key convenience overload: wraps `dispatch_key` in a DispatchKeySet
+  // and delegates to the (Storage&&, DispatchKeySet, TypeMeta) constructor.
+  TensorImpl(
+      Storage&& storage,
+      DispatchKey dispatch_key,
+      const caffe2::TypeMeta data_type)
+      : TensorImpl(
+            std::move(storage),
+            DispatchKeySet(dispatch_key),
+            data_type) {}
+  // Single-key convenience overload without storage: wraps `dispatch_key` in a
+  // DispatchKeySet and delegates to the (DispatchKeySet, TypeMeta, device)
+  // constructor.
+  TensorImpl(
+      DispatchKey dispatch_key,
+      const caffe2::TypeMeta data_type,
+      std::optional<c10::Device> device_opt)
+      : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {}
+
+ private:
+  // This constructor is private, because the data_type is redundant with
+  // storage.  Still, we pass it in separately because it's easier to write
+  // the initializer list if we're not worried about storage being moved out
+  // from under us.
+  TensorImpl(
+      Storage&& storage,
+      DispatchKeySet /*key_set*/,
+      const caffe2::TypeMeta data_type,
+      std::optional<c10::Device> /*device_opt*/);
+
+ public:
+  TensorImpl(const TensorImpl&) = delete;
+  TensorImpl& operator=(const TensorImpl&) = delete;
+  TensorImpl(TensorImpl&&) = delete;
+  TensorImpl& operator=(TensorImpl&&) = delete;
+
+  /**
+   * Release (decref) storage, and any other external allocations.  This
+   * override is for `intrusive_ptr_target` and is used to implement weak
+   * tensors.
+   */
+  void release_resources() override;
+
+ public:
+  /**
+   * Return the DispatchKeySet corresponding to this Tensor, specifying
+   * all of the DispatchKeys that this Tensor identifies as.  This is the
+   * information used to dispatch operations on this tensor.
+   */
+  DispatchKeySet key_set() const {
+    // Returned by value: the caller receives a copy of key_set_.
+    return key_set_;
+  }
+
+ private:
+  [[noreturn]] void throw_cannot_call_with_symbolic(const char* meth) const;
+
+  // NOTE: The general recipe for customizable methods is that the fastpath
+  // function (e.g., sizes()) does an unlikely policy test, and if doesn't
+  // trigger, it does the fast path implementation with no checks and going
+  // directly to on-TensorImpl fields.  In particular, you never need to
+  // check ExtraMeta if the policy doesn't trigger, as non-trivial ExtraMeta
+  // implies the policy will always match.
+  //
+  // The default implementations of methods are "safe": they do extra tests
+  // to make sure the internal state is consistent no matter if you are
+  // doing symbolic shapes or not.  If you don't want the tests, directly
+  // override the custom method (e.g., custom_sizes()) to do your preferred
+  // behavior.
+
+ public:
+  /**
+   * Return a reference to the sizes of this tensor.  This reference remains
+   * valid as long as the tensor is live and not resized.
+   */
+  IntArrayRef sizes() const {
+    // Custom-sizes policy routes to the overridable hook; otherwise read the
+    // sizes stored on this object directly.
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))
+        ? sizes_custom()
+        : sizes_and_strides_.sizes_arrayref();
+  }
+
+  // Symbolic-int view of the sizes; defers to sym_sizes_custom() under the
+  // CustomSizes policy.
+  SymIntArrayRef sym_sizes() const {
+    const bool use_custom = matches_policy(SizesStridesPolicy::CustomSizes);
+    if (C10_UNLIKELY(use_custom)) {
+      return sym_sizes_custom();
+    }
+    // Sizes guaranteed to be non-negative, so unchecked cast is OK
+    const IntArrayRef concrete = sizes_and_strides_.sizes_arrayref();
+    return c10::fromIntArrayRefKnownNonNegative(concrete);
+  }
+
+  // Default sizes accessor: refuses (via throw_cannot_call_with_symbolic)
+  // when the tensor has symbolic sizes/strides.
+  IntArrayRef sizes_default() const {
+    const bool is_symbolic = has_symbolic_sizes_strides_;
+    if (C10_UNLIKELY(is_symbolic)) {
+      throw_cannot_call_with_symbolic("sizes");
+    }
+    return sizes_and_strides_.sizes_arrayref();
+  }
+
+  // Default symbolic sizes accessor: reads the symbolic shape metadata when
+  // present, otherwise converts the concrete sizes.
+  SymIntArrayRef sym_sizes_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return symbolic_shape_meta().sizes_;
+    }
+    // Sizes guaranteed to be non-negative, so unchecked cast is OK
+    return c10::fromIntArrayRefKnownNonNegative(sizes_default());
+  }
+
+  // Type-directed sizes accessor: T = int64_t forwards to sizes(),
+  // T = c10::SymInt forwards to sym_sizes(). Any other T is rejected at
+  // compile time.
+  template <typename T>
+  ArrayRef<T> generic_sizes() {
+    static_assert(
+        std::is_same_v<T, int64_t> || std::is_same_v<T, c10::SymInt>,
+        "Only supports int64_t and c10::SymInt.");
+
+    if constexpr (std::is_same_v<T, c10::SymInt>) {
+      return sym_sizes();
+    } else {
+      return sizes();
+    }
+  }
+
+  // Type-directed strides accessor: T = int64_t forwards to strides(),
+  // T = c10::SymInt forwards to sym_strides(). Any other T is rejected at
+  // compile time.
+  template <typename T>
+  ArrayRef<T> generic_strides() {
+    static_assert(
+        std::is_same_v<T, int64_t> || std::is_same_v<T, c10::SymInt>,
+        "Only supports int64_t and c10::SymInt.");
+
+    if constexpr (std::is_same_v<T, c10::SymInt>) {
+      return sym_strides();
+    } else {
+      return strides();
+    }
+  }
+
+  // Type-directed storage offset accessor: T = int64_t forwards to
+  // storage_offset(), T = c10::SymInt forwards to sym_storage_offset(). Any
+  // other T is rejected at compile time.
+  template <typename T>
+  T generic_storage_offset() {
+    static_assert(
+        std::is_same_v<T, int64_t> || std::is_same_v<T, c10::SymInt>,
+        "Only supports int64_t and c10::SymInt.");
+
+    if constexpr (std::is_same_v<T, c10::SymInt>) {
+      return sym_storage_offset();
+    } else {
+      return storage_offset();
+    }
+  }
+
+  /**
+   * The number of elements in a tensor.
+   *
+   * WARNING: Previously, if you were using the Caffe2 API, you could
+   * test numel() == -1 to see if a tensor was uninitialized.  This
+   * is no longer true; numel always accurately reports the product
+   * of sizes of a tensor.
+   */
+  int64_t numel() const {
+    // Under the CustomSizes policy the overridable hook answers; otherwise
+    // the stored element count is returned.
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))
+        ? numel_custom()
+        : numel_;
+  }
+
+  // Symbolic-int flavour of numel(); defers to sym_numel_custom() under the
+  // CustomSizes policy.
+  c10::SymInt sym_numel() const {
+    const bool use_custom = matches_policy(SizesStridesPolicy::CustomSizes);
+    if (C10_UNLIKELY(use_custom)) {
+      return sym_numel_custom();
+    }
+    // Wrap the stored count without the checked-construction path.
+    return c10::SymInt(SymInt::UNCHECKED, numel_);
+  }
+
+  // Default numel accessor: refuses (via throw_cannot_call_with_symbolic)
+  // when the tensor has symbolic sizes/strides.
+  int64_t numel_default() const {
+    const bool is_symbolic = has_symbolic_sizes_strides_;
+    if (C10_UNLIKELY(is_symbolic)) {
+      throw_cannot_call_with_symbolic("numel");
+    }
+    return numel_;
+  }
+
+  // Default symbolic numel accessor: asks the symbolic shape metadata when
+  // present, otherwise wraps the stored count.
+  c10::SymInt sym_numel_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return symbolic_shape_meta().numel();
+    }
+    return c10::SymInt(SymInt::UNCHECKED, numel_);
+  }
+
+  /**
+   * Return the number of dimensions of this tensor.  Note that 0-dimension
+   * represents a Tensor that is a Scalar, e.g., one that has a single element.
+   */
+  int64_t dim() const {
+    // Under the CustomSizes policy the overridable hook answers; otherwise
+    // the rank is the number of stored sizes.
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))
+        ? dim_custom()
+        : static_cast<int64_t>(sizes_and_strides_.size());
+  }
+
+  // Default rank accessor: counts the symbolic sizes when the shape is
+  // symbolic, otherwise the stored concrete sizes.
+  int64_t dim_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return static_cast<int64_t>(symbolic_shape_meta().sizes_.size());
+    }
+    return static_cast<int64_t>(sizes_and_strides_.size());
+  }
+
+  /**
+   * Return the offset in number of elements into the storage that this
+   * tensor points to.  Most tensors have storage_offset() == 0, but,
+   * for example, an index into a tensor will have a non-zero storage_offset().
+   *
+   * WARNING: This is NOT computed in bytes.
+   */
+  int64_t storage_offset() const {
+    // TODO: maybe this should be toggled by strides
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))
+        ? storage_offset_custom()
+        : storage_offset_;
+  }
+
+  // Symbolic-int flavour of storage_offset(); defers to
+  // sym_storage_offset_custom() under the CustomSizes policy.
+  c10::SymInt sym_storage_offset() const {
+    const bool use_custom = matches_policy(SizesStridesPolicy::CustomSizes);
+    if (C10_UNLIKELY(use_custom)) {
+      return sym_storage_offset_custom();
+    }
+    return c10::SymInt(SymInt::UNCHECKED, storage_offset_);
+  }
+
+  // Default storage offset accessor: refuses (via
+  // throw_cannot_call_with_symbolic) when the tensor has symbolic
+  // sizes/strides.
+  int64_t storage_offset_default() const {
+    const bool is_symbolic = has_symbolic_sizes_strides_;
+    if (C10_UNLIKELY(is_symbolic)) {
+      throw_cannot_call_with_symbolic("storage_offset");
+    }
+    return storage_offset_;
+  }
+
+  // Default symbolic storage offset accessor: reads the symbolic shape
+  // metadata when present, otherwise wraps the stored offset.
+  c10::SymInt sym_storage_offset_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return symbolic_shape_meta().storage_offset_;
+    }
+    return c10::SymInt(SymInt::UNCHECKED, storage_offset_);
+  }
+
+  /**
+   * Return a reference to the strides of this tensor.  This reference remains
+   * valid as long as the tensor is live and not restrided.
+   */
+  IntArrayRef strides() const {
+    // Under the CustomStrides policy (or anything above it) the overridable
+    // hook answers; otherwise read the strides stored on this object.
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))
+        ? strides_custom()
+        : sizes_and_strides_.strides_arrayref();
+  }
+
+  // Symbolic-int view of the strides; defers to sym_strides_custom() under
+  // the CustomStrides policy, otherwise converts strides_default().
+  c10::SymIntArrayRef sym_strides() const {
+    const bool use_custom = matches_policy(SizesStridesPolicy::CustomStrides);
+    if (C10_UNLIKELY(use_custom)) {
+      return sym_strides_custom();
+    }
+    const IntArrayRef concrete = strides_default();
+    return c10::fromIntArrayRefKnownNonNegative(concrete);
+  }
+
+  // Default strides accessor: refuses (via throw_cannot_call_with_symbolic)
+  // when the tensor has symbolic sizes/strides.
+  IntArrayRef strides_default() const {
+    const bool is_symbolic = has_symbolic_sizes_strides_;
+    if (C10_UNLIKELY(is_symbolic)) {
+      throw_cannot_call_with_symbolic("strides");
+    }
+    return sizes_and_strides_.strides_arrayref();
+  }
+
+  // Default symbolic strides accessor: reads the symbolic shape metadata
+  // when present, otherwise converts strides_default().
+  c10::SymIntArrayRef sym_strides_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return symbolic_shape_meta().strides_;
+    }
+    return c10::fromIntArrayRefKnownNonNegative(strides_default());
+  }
+
+  // Symbolic contiguity query for `memory_format`; the CustomStrides policy
+  // selects the overridable hook, anything else the default implementation.
+  c10::SymBool sym_is_contiguous(
+      at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const {
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))
+        ? sym_is_contiguous_custom(memory_format)
+        : sym_is_contiguous_default(memory_format);
+  }
+
+  // Shared implementation behind is_contiguous_default() (T = bool) and
+  // sym_is_contiguous_default() (T = c10::SymBool).
+  template <typename T>
+  T is_contiguous_default_impl(at::MemoryFormat memory_format) const {
+    if (has_symbolic_sizes_strides_) {
+      // Handle dynamic shapes.
+      const auto& symbolic = symbolic_shape_meta().is_contiguous(memory_format);
+
+      if constexpr (std::is_same_v<T, bool>) {
+        // A plain bool is requested, so the symbolic answer is guarded.
+        return symbolic.guard_bool(__FILE__, __LINE__);
+      } else {
+        return symbolic;
+      }
+    }
+
+    // Non-symbolic shapes: answer from the flag stored for the requested
+    // format; formats other than the two channels-last ones use is_contiguous_.
+    if (memory_format == at::MemoryFormat::ChannelsLast) {
+      return is_channels_last_contiguous_;
+    }
+    if (memory_format == at::MemoryFormat::ChannelsLast3d) {
+      return is_channels_last_3d_contiguous_;
+    }
+    return is_contiguous_;
+  }
+
+  // bool instantiation of is_contiguous_default_impl().
+  bool is_contiguous_default(at::MemoryFormat memory_format) const {
+    return is_contiguous_default_impl<bool>(memory_format);
+  }
+
+  // c10::SymBool instantiation of is_contiguous_default_impl().
+  c10::SymBool sym_is_contiguous_default(at::MemoryFormat memory_format) const {
+    return is_contiguous_default_impl<c10::SymBool>(memory_format);
+  }
+
+  /**
+   * Whether or not a tensor is laid out in contiguous memory.
+   *
+   * Tensors with non-trivial strides are not contiguous.  See
+   * compute_contiguous() for the exact definition of whether or not
+   * a tensor is contiguous or not.
+   */
+  bool is_contiguous(
+      at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const {
+    // The CustomStrides policy selects the overridable hook; anything else
+    // uses the default implementation.
+    return C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))
+        ? is_contiguous_custom(memory_format)
+        : is_contiguous_default(memory_format);
+  }
+
+  // Reports whether the strides look like `memory_format`. Only ChannelsLast
+  // and ChannelsLast3d can yield true; every other format yields false.
+  bool is_strides_like_default(at::MemoryFormat memory_format) const {
+    const bool wants_2d = memory_format == at::MemoryFormat::ChannelsLast;
+    const bool wants_3d = memory_format == at::MemoryFormat::ChannelsLast3d;
+    if (!wants_2d && !wants_3d) {
+      return false;
+    }
+
+    if (has_symbolic_sizes_strides_) {
+      // Symbolic shapes: guard the symbolic answer down to a plain bool.
+      if (wants_2d) {
+        return symbolic_shape_meta().is_channels_last().guard_bool(
+            __FILE__, __LINE__);
+      }
+      return symbolic_shape_meta().is_channels_last_3d().guard_bool(
+          __FILE__, __LINE__);
+    }
+
+    // Non-symbolic shapes: answer from the stored flags.
+    if (wants_2d) {
+      return is_channels_last_;
+    }
+    return is_channels_last_3d_;
+  }
+
+  // Symbolic non-overlapping-and-dense query: asks the symbolic shape
+  // metadata when present, otherwise returns the stored flag.
+  SymBool sym_is_non_overlapping_and_dense_default() const {
+    if (has_symbolic_sizes_strides_) {
+      return symbolic_shape_meta().is_non_overlapping_and_dense();
+    }
+    return is_non_overlapping_and_dense_;
+  }
+
+  // Plain-bool non-overlapping-and-dense query: the stored flag for
+  // non-symbolic shapes, the guarded symbolic answer otherwise.
+  bool is_non_overlapping_and_dense_default() const {
+    if (!has_symbolic_sizes_strides_) {
+      return is_non_overlapping_and_dense_;
+    }
+    return sym_is_non_overlapping_and_dense_default().guard_bool(
+        __FILE__, __LINE__);
+  }
+
+  // NB: these dim accessor functions don't have _default(), as you can use
+  // sizes_default/strides_default
+  /**
+   * Return the size of a tensor at some dimension, wrapping the dimension if
+   * necessary.
+   *
+   * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will
+   * be faster
+   */
+  int64_t size(int64_t d) const {
+    if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) {
+      // The custom hook receives the dimension exactly as passed in.
+      return size_custom(d);
+    }
+    const int64_t wrapped = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false);
+    return sizes_and_strides_.size_at_unchecked(wrapped);
+  }
+
+  // Symbolic-int flavour of size(d); defers to sym_size_custom() under the
+  // CustomSizes policy, otherwise indexes sym_sizes() at the wrapped dim.
+  c10::SymInt sym_size(int64_t d) const {
+    if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) {
+      return sym_size_custom(d);
+    }
+    const int64_t wrapped = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false);
+    return this->sym_sizes()[wrapped];
+  }
+
+  /**
+   * Return the stride of a tensor at some dimension, wrapping the dimension
+   * if necessary.
+   *
+   * NOTE: if you know wrapping is unnecessary, do strides()[d] instead; it
+   * will be faster
+   */
+  int64_t stride(int64_t d) const {
+    // The dimension is wrapped up front, so the same maybe_wrap_dim call
+    // applies to both the custom and the default path below.
+    const int64_t wrapped = maybe_wrap_dim(d, dim(), false);
+    if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) {
+      // TODO: provide stride_custom, symmetrically with size_custom.
+      // There is presently no user for it; only NestedTensor is using
+      // size_custom overrideability
+      const IntArrayRef custom = strides_custom();
+      return custom[wrapped]; // unchecked (maybe_wrap_dim enforces bounds)
+    }
+    // Intentionally don't call default, which also handles symbolic
+    return sizes_and_strides_.stride_at_unchecked(wrapped);
+  }
+
+  // Levels of sizes/strides customization. The numeric order matters:
+  // matches_policy() tests with >=, so a tensor at CustomSizes also matches
+  // CustomStrides checks.
+  enum class SizesStridesPolicy : uint8_t {
+    // Default behavior, e.g., dense tensor.
+    //
+    // Can override: nothing
+    Default = 0,
+    // Customizable strides behavior, e.g., sparse tensor,
+    // mkldnn tensor.
+    //
+    // Can override: strides(), is_contiguous()
+    CustomStrides = 1,
+    // Customizable sizes behavior, e.g., nested tensor
+    //
+    // Can override: strides(), is_contiguous(), sizes(), dim(), numel()
+    CustomSizes = 2
+  };
+
+ protected:
+  // True when sizes_strides_policy_ is at least as high as `policy`.
+  inline bool matches_policy(SizesStridesPolicy policy) const {
+    const auto threshold = static_cast<uint8_t>(policy);
+    return sizes_strides_policy_ >= threshold;
+  }
+
+  // True when custom_sizes_strides_ is at least as high as `policy`.
+  inline bool matches_custom(SizesStridesPolicy policy) const {
+    const auto threshold = static_cast<uint8_t>(policy);
+    return custom_sizes_strides_ >= threshold;
+  }
+
+  // True when python_custom_sizes_strides_ is at least as high as `policy`.
+  // A positive answer additionally asserts that is_python_dispatch() holds.
+  inline bool matches_python_custom(SizesStridesPolicy policy) const {
+    const auto threshold = static_cast<uint8_t>(policy);
+    if (python_custom_sizes_strides_ < threshold) {
+      return false;
+    }
+    TORCH_INTERNAL_ASSERT(is_python_dispatch());
+    return true;
+  }
+
+  /**
+   * Customization points for the functions above.  sizes_strides_policy_
+   * must be set to enable these.
+   *
+   * NB: dim is overridable separately from sizes because it is possible
+   * for a tensor to have rank, but not well defined sizes.
+   */
+  // sizes_strides_policy_ >= CustomStrides
+
+  virtual bool is_strides_like_custom(at::MemoryFormat memory_format) const;
+
+  virtual c10::SymBool sym_is_non_overlapping_and_dense_custom() const;
+
+  // Plain-bool wrapper: guards the result of the symbolic custom hook.
+  bool is_non_overlapping_and_dense_custom() const {
+    const c10::SymBool answer = sym_is_non_overlapping_and_dense_custom();
+    return answer.guard_bool(__FILE__, __LINE__);
+  }
+
+  virtual c10::SymBool sym_is_contiguous_custom(
+      at::MemoryFormat memory_format) const;
+
+  // Plain-bool wrapper: guards the result of the symbolic custom hook.
+  bool is_contiguous_custom(at::MemoryFormat memory_format) const {
+    const c10::SymBool answer = sym_is_contiguous_custom(memory_format);
+    return answer.guard_bool(__FILE__, __LINE__);
+  }
+
+  // sizes_strides_policy_ >= CustomSizes
+  // Currently this method only exists to be overwritten by subclasses such as
+  // NestedTensorImpl.
+  virtual int64_t size_custom(int64_t d) const {
+    // TODO: We could add support to Python dispatch here.
+    // TODO: We could call into aten::size.int instead of
+    // sizes_custom()[d] and enable use of the dispatcher.
+    const int64_t wrapped = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false);
+    const IntArrayRef custom = sizes_custom();
+    return custom[wrapped]; // unchecked (maybe_wrap_dim enforces bounds)
+  }
+
+  // Symbolic-int counterpart of size_custom(): indexes sym_sizes_custom() at
+  // the wrapped dimension.
+  virtual c10::SymInt sym_size_custom(int64_t d) const {
+    // TODO: We could add support to Python dispatch here.
+    // TODO: We could call into aten::size.int instead of
+    // sym_sizes_custom()[d] and enable use of the dispatcher.
+    const int64_t wrapped = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false);
+    const c10::SymIntArrayRef custom = sym_sizes_custom();
+    return custom[wrapped]; // unchecked (maybe_wrap_dim enforces bounds)
+  }
+
+  virtual IntArrayRef sizes_custom() const;
+  virtual IntArrayRef strides_custom() const;
+  virtual int64_t numel_custom() const;
+  virtual int64_t storage_offset_custom() const;
+  virtual int64_t dim_custom() const;
+  virtual Device device_custom() const;
+  virtual Layout layout_custom() const;
+
+  virtual c10::SymIntArrayRef sym_sizes_custom() const;
+  virtual c10::SymIntArrayRef sym_strides_custom() const;
+  virtual c10::SymInt sym_numel_custom() const;
+  virtual c10::SymInt sym_storage_offset_custom() const;
+
+ public:
+/**
+ * True if this tensor has storage. See storage() for details.
+ */
+#ifdef DEBUG
+  // Allow subclasses to check that their storage_ is never getting set in debug
+  // builds.
+  virtual
+#else
+  // Outside DEBUG builds, virtual-ness follows TENSORIMPL_MAYBE_VIRTUAL.
+  TENSORIMPL_MAYBE_VIRTUAL
+#endif
+      bool
+      has_storage() const
+// NOTE: we devirtualize this because it arguably shouldn't be an
+// error just to ask subclasses if they have storage.
+// This used to throw for most subclasses, but OpaqueTensorImpl
+// wanted it to successfully return false, so we went ahead and made
+// it a non-error.
+#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
+  {
+    // Inline definition: the result is storage_ converted to bool.
+    return storage_;
+  }
+#else
+      // Declaration only in this configuration; no body is provided here.
+      ;
+#endif
+
+  /**
+   * Return the underlying storage of a Tensor.  Multiple tensors may share
+   * a single storage.  A Storage is an impoverished, Tensor-like class
+   * which supports far less operations than Tensor.
+   *
+   * Avoid using this method if possible; try to use only Tensor APIs to perform
+   * operations.
+   */
+  TENSORIMPL_MAYBE_VIRTUAL const Storage& storage() const {
+    // Access is refused through throw_storage_access_error() when
+    // storage_access_should_throw_ is set.
+    const bool must_throw = storage_access_should_throw_;
+    if (C10_UNLIKELY(must_throw)) {
+      throw_storage_access_error();
+    }
+    return storage_;
+  }
+
+  /**
+   * Return the underlying storage, unsafely assuming this is a basic strided
+   * tensor. In cases where `storage` access would throw, this returns a
+   * default-constructed Storage.
+   */
+  inline const Storage& unsafe_storage() const {
+    // No storage_access_should_throw_ test here, unlike storage().
+    return storage_;
+  }
+
+  // Forwards to version_counter_.unique().
+  bool unique_version() const {
+    return version_counter_.unique();
+  }
+
+ protected:
+  // Overridable layout hook. This base version always fails the TORCH_CHECK
+  // below; layout() reaches it on its is_sparse_compressed() branch.
+  virtual Layout layout_impl() const {
+    TORCH_CHECK(
+        false, "layout_impl is only implemented for TensorImpl subclasses.");
+  }
+
+ public:
+  // Whether a tensor is sparse COO or not.
+  bool is_sparse() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons.
+    // True only when key_set_ contains every key in c10::sparse_ks.
+    return key_set_.has_all(c10::sparse_ks);
+  }
+
+  // Whether a tensor is sparse CSR or not.
+  bool is_sparse_csr() const {
+    // Decided from layout(), not from the dispatch keys directly.
+    return layout() == kSparseCsr;
+  }
+
+  // Whether a tensor is sparse CSR/CSC/BSR/BSC or not.
+  bool is_sparse_compressed() const {
+    // True only when key_set_ contains every key in c10::sparse_csr_ks.
+    return key_set_.has_all(c10::sparse_csr_ks);
+  }
+
+  // Whether key_set_ carries the Quantized dispatch key.
+  bool is_quantized() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons.
+    return key_set_.has_all(DispatchKeySet(DispatchKey::Quantized));
+  }
+
+  // Whether the tensor's device is a meta device.
+  bool is_meta() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_meta()
+        : (device_opt_.has_value() && device_opt_->type() == kMeta);
+  }
+
+  // Whether the tensor's device is a CPU device.
+  bool is_cpu() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    //
+    // Note: we cannot rely on dispatch keys to determine the device type
+    // of a tensor, because "wrapper" tensors (like FunctionalTensorWrapper)
+    // don't include backend dispatch keys.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_cpu()
+        : (device_opt_.has_value() && device_opt_->type() == kCPU);
+  }
+
+  // Whether the tensor's device is a CUDA device.
+  bool is_cuda() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_cuda()
+        : (device_opt_.has_value() && device_opt_->type() == kCUDA);
+  }
+
+  // Whether the tensor's device is an XPU device.
+  bool is_xpu() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_xpu()
+        : (device_opt_.has_value() && device_opt_->type() == kXPU);
+  }
+
+  // Whether the tensor's device is an IPU device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_ipu() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_ipu()
+        : (device_opt_.has_value() && device_opt_->type() == kIPU);
+  }
+
+  // Whether the tensor's device is an XLA device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_xla() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_xla()
+        : (device_opt_.has_value() && device_opt_->type() == kXLA);
+  }
+
+  // Whether the tensor's device is an MTIA device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_mtia() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_mtia()
+        : (device_opt_.has_value() && device_opt_->type() == kMTIA);
+  }
+
+  // Whether the tensor's device is an HPU device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_hpu() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_hpu()
+        : (device_opt_.has_value() && device_opt_->type() == kHPU);
+  }
+
+  // Whether the tensor's device is a Lazy device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_lazy() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_lazy()
+        : (device_opt_.has_value() && device_opt_->type() == kLazy);
+  }
+
+  // Whether the tensor's device is a HIP device.
+  bool is_hip() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_hip()
+        : (device_opt_.has_value() && device_opt_->type() == kHIP);
+  }
+
+  // Whether the tensor's device is a VE device.
+  bool is_ve() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_ve()
+        : (device_opt_.has_value() && device_opt_->type() == kVE);
+  }
+
+  // Whether the tensor's device is a PrivateUse1 device.
+  bool is_privateuseone() const {
+    // NB: This method is not virtual and avoids dispatches for performance
+    // reasons. device_custom() is consulted only when device_policy_ is set.
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_privateuseone()
+        : (device_opt_.has_value() && device_opt_->type() == kPrivateUse1);
+  }
+
+  bool is_mkldnn() const {
+    // Decided from dispatch keys: true only when key_set_ contains every key
+    // in c10::mkldnn_ks.
+    return key_set_.has_all(c10::mkldnn_ks);
+  }
+
+  // Whether the tensor's device is a Vulkan device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_vulkan() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_vulkan()
+        : (device_opt_.has_value() && device_opt_->type() == kVulkan);
+  }
+
+  // Whether the tensor's device is a Metal device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_metal() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_metal()
+        : (device_opt_.has_value() && device_opt_->type() == kMetal);
+  }
+
+  // Whether the tensor's device is an MPS device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_mps() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_mps()
+        : (device_opt_.has_value() && device_opt_->type() == kMPS);
+  }
+
+  // Whether the tensor's device is a MAIA device; device_custom() answers
+  // when device_policy_ is set.
+  bool is_maia() const {
+    return C10_UNLIKELY(device_policy_)
+        ? device_custom().is_maia()
+        : (device_opt_.has_value() && device_opt_->type() == kMAIA);
+  }
+
+  // Whether key_set_ carries the NestedTensor dispatch key.
+  bool is_nested() const {
+    return key_set_.has(DispatchKey::NestedTensor);
+  }
+
+  // TODO: remove this once we don't automatically enable Autograd dispatch
+  // keys
+  //       in TensorImpl constructor.
+  // DON'T USE THIS API!! It's only created for testing purpose in
+  // file aten/src/ATen/core/boxing/impl/test_helpers.h
+  void remove_autograd_key() {
+    // Drops every key in autograd_dispatch_keyset from key_set_ in place.
+    key_set_ = key_set_ - autograd_dispatch_keyset;
+  }
+
+  // Inference tensor doesn't have autograd or ADInplaceOrView key.
+  // Invariant:
+  //   Inference tensor has version_counter_.enabled() == false
+  // Returns true when key_set_ has neither an ADInplaceOrView key nor an
+  // Autograd key. Debug builds additionally assert that the two groups are
+  // present or absent together.
+  //
+  // Declared const: the query only reads key_set_, so it is also callable
+  // through a const TensorImpl reference or pointer.
+  bool is_inference() const {
+    bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks);
+    bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        no_ADInplaceOrView == no_Autograd,
+        "ADInplaceOrView and Autograd keys must be on/off at the same time.");
+    return no_ADInplaceOrView && no_Autograd;
+  }
+
+  // Index of the tensor's device. device() already selects between
+  // device_custom() and device_default() based on device_policy_.
+  DeviceIndex get_device() const {
+    return device().index();
+  }
+
+  // The tensor's device: device_custom() when device_policy_ is set,
+  // device_default() otherwise.
+  Device device() const {
+    return C10_UNLIKELY(device_policy_) ? device_custom() : device_default();
+  }
+
+ protected:
+  // Returns the device held in device_opt_; fails the TORCH_CHECK below when
+  // device_opt_ is empty.
+  c10::Device device_default() const {
+    TORCH_CHECK(device_opt_.has_value(), "tensor does not have a device");
+    // See NOTE [std::optional operator usage in CUDA]
+    return *device_opt_;
+  }
+
+ public:
+  // Computes the tensor's layout, from layout_custom() when layout_policy_ is
+  // set and from the dispatch keys otherwise.
+  Layout layout() const {
+    if (C10_UNLIKELY(layout_policy_)) {
+      return layout_custom();
+    }
+
+    // NB: This method is not virtual and avoids dispatches for perf.
+    // strided is also the most common layout type, so we check for
+    // strided case first.
+    // This keyset must also be kept in sync with the logic in
+    // is_sparse() / is_sparse_csr() / is_mkldnn()
+    constexpr auto non_strided_ks =
+        c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks;
+    if (!key_set_.has_any(non_strided_ks)) {
+      return kStrided;
+    }
+    if (is_sparse()) {
+      return kSparse;
+    }
+    if (is_sparse_compressed()) {
+      // Typically, the tensor dispatch keys define the tensor layout
+      // uniquely. This allows using non-virtual layout method for
+      // better performance. However, when tensor's layout depends,
+      // say, on tensor attributes, one must use this execution path
+      // where the corresponding tensor impl class overwrites virtual
+      // layout_impl() method.
+      //
+      // TODO: implement layout() as native function/method so that
+      // __torch_dispatch__ users will be able to redefine the
+      // layout() method.
+      return layout_impl();
+    }
+    // Only mkldnn remains once the sparse cases have been ruled out.
+    TORCH_INTERNAL_ASSERT(
+        is_mkldnn(), "There is an error in the layout calculation logic.");
+    return kMkldnn;
+  }
+
+  /**
+   * True if a tensor was auto-wrapped from a C++ or Python number.
+   * For example, when you write 't + 2', 2 is auto-wrapped into a Tensor
+   * with `is_wrapped_number_` set to true.
+   *
+   * Wrapped numbers do not participate in the result type computation for
+   * mixed-type operations if there are any Tensors that are not wrapped
+   * numbers.  This is useful, because we want 't + 2' to work with
+   * any type of tensor, not just LongTensor (which is what integers
+   * in Python represent).
+   *
+   * Otherwise, they behave like their non-wrapped equivalents.
+   * See [Result type computation] in TensorIterator.h.
+   *
+   * Why did we opt for wrapped numbers, as opposed to just having
+   * an extra function add(Tensor, Scalar)?  This helps greatly reduce
+   * the amount of code we have to write for add, when actually
+   * a Tensor-Scalar addition is really just a Tensor-Tensor
+   * addition when the RHS is 0-dim (except for promotion behavior.)
+   */
+  bool is_wrapped_number() const {
+    // Plain read of the flag; no validation happens here.
+    return is_wrapped_number_;
+  }
+
+  /**
+   * Set whether or not a tensor was auto-wrapped from a C++ or Python
+   * number.  You probably don't want to call this, unless you are
+   * writing binding code.
+   */
+  void set_wrapped_number(bool value) {
+    // The flag may only be changed on a 0-dim tensor; this is asserted
+    // regardless of whether `value` is true or false.
+    TORCH_INTERNAL_ASSERT(dim() == 0);
+    is_wrapped_number_ = value;
+  }
+
+  /**
+   * Returns true if Tensor supports as_strided and as_strided_backward.
+   * This is used in autograd to perform inplace update on view Tensors.
+   * See Note [View + Inplace update for base tensor] and
+   * [View + Inplace update for view tensor] for details.
+   * Note this method only returns true for XLA backend, where it
+   * simulates strided Tensor to support most view ops, but it cannot
+   * fully support general `as_strided` case.
+   * It can be expanded as needed in the future, e.g sparse Tensor.
+   */
+  inline bool support_as_strided() const {
+    // Nested tensors and tensors carrying the Functionalize key never
+    // support as_strided; everything else defers to the device.
+    if (is_nested() || key_set_.has(DispatchKey::Functionalize)) {
+      return false;
+    }
+    return device().supports_as_strided();
+  }
+
+  // ~~~~~ Autograd API ~~~~~
+  // Some methods below are defined in TensorImpl.cpp because Tensor is an
+  // incomplete type.
+
+  /**
+   * Set whether or not a tensor requires gradient.
+   */
+  void set_requires_grad(bool requires_grad);
+
+  /**
+   * True if a tensor requires gradient.  Tensors which require gradient
+   * have history tracked for any operations performed on them, so that
+   * we can automatically differentiate back to them.  A tensor that
+   * requires gradient and has no history is a "leaf" tensor, which we
+   * accumulate gradients into.
+   */
+  bool requires_grad() const;
+
+  /**
+   * Return a mutable reference to the gradient.  This is conventionally
+   * used as `t.grad() = x` to set a gradient to a completely new tensor.
+   */
+  at::Tensor& mutable_grad();
+
+  /**
+   * Return the accumulated gradient of a tensor.  This gradient is written
+   * into when performing backwards, when this tensor is a leaf tensor.
+   */
+  const at::Tensor& grad() const;
+
+  /**
+   * Whether or not the imaginary part of the tensor should be negated
+   */
+  inline bool is_conj() const {
+    // Tests key_set_ against a set holding only the Conjugate key.
+    return key_set_.has_all(DispatchKeySet(DispatchKey::Conjugate));
+  }
+
+  /**
+   * Set whether or not to take the conjugate of the tensor (flip the imaginary
+   * bit).
+   */
+  // Adds (value == true) or removes (value == false) the Conjugate dispatch
+  // key. Setting the key is only valid for complex dtypes; this is asserted.
+  void _set_conj(bool value) {
+    if (value) {
+      // Validate before mutating, so a failed assert does not leave the
+      // Conjugate key set on a non-complex tensor.
+      TORCH_INTERNAL_ASSERT(isComplexType(typeMetaToScalarType(dtype())));
+      key_set_ = key_set_.add(DispatchKey::Conjugate);
+    } else {
+      key_set_ = key_set_.remove(DispatchKey::Conjugate);
+    }
+  }
+
+  /**
+   * XXX: do not use, private api!
+   * Update the backend component related keys to the backend component
+   * corresponding to this device.
+   */
+  void _change_backend_component_keys(c10::Device device);
+
+  /**
+   * Whether or not the tensor is a zerotensor
+   */
+  inline bool _is_zerotensor() const {
+    // Tests key_set_ against a set holding only the ZeroTensor key.
+    return key_set_.has_all(DispatchKeySet(DispatchKey::ZeroTensor));
+  }
+
+  /**
+   Set whether or not the tensor is a zero tensor
+  */
+  void _set_zero(bool value) {
+    // Only clearing the ZeroTensor key is supported through this method.
+    if (!value) {
+      key_set_ = key_set_.remove(DispatchKey::ZeroTensor);
+      return;
+    }
+    // Setting the key here is rejected unconditionally.
+    TORCH_INTERNAL_ASSERT(
+        false,
+        "Please call `torch._efficientzerotensor` if you want to create a tensor with no storage.");
+  }
+
+  /**
+   * Whether or not the tensor should be negated
+   */
+  inline bool is_neg() const {
+    // Tests key_set_ against a set holding only the Negative key.
+    return key_set_.has_all(DispatchKeySet(DispatchKey::Negative));
+  }
+
+  /**
+   * Set whether or not the tensor should be negated (adds or removes the
+   * Negative dispatch key).
+   */
+  void _set_neg(bool value) {
+    // true adds the Negative dispatch key, false removes it.
+    key_set_ = value ? key_set_.add(DispatchKey::Negative)
+                     : key_set_.remove(DispatchKey::Negative);
+  }
+
+  /**
+   * Return the accumulated gradient of a tensor. This gradient is computed
+   * using forward mode AD.
+   *
+   * This is an internal API that should never be used by end users.
+   *
+   * The API is as follows:
+   *   - "level" allows to specify the level of forward AD nesting for which the
+   *     gradient should be returned. Note that since levels are not fully
+   *     supported yet, this argument should be 0. See documentation for
+   *     torch::autograd::enter_dual_level for more details about forward AD
+   * nesting.
+   *   - "self" should represent the Tensor whose forward grad is accessed. It
+   * is required when dealing with view.
+   */
+  const at::Tensor& _fw_grad(uint64_t level, const at::TensorBase& self) const;
+
+  /**
+   * Sets the forward gradient for this Tensor.
+   * The given Tensor might not be used directly and its content will be copied.
+   *
+   * This is an internal API that should never be used by end users.
+   *
+   * The API is as follows:
+   *   - "new_grad" is a Tensor containing the new value of the gradient that
+   * should be set
+   *   - "self" should represent the Tensor whose forward grad is accessed. It
+   * is required when dealing with view.
+   *   - "level" allows to specify the level of forward AD nesting for which the
+   *     gradient should be set. Note that since levels are not fully supported
+   *     yet, this argument should be 0. See documentation for
+   * torch::autograd::enter_dual_level for more details about forward AD
+   * nesting.
+   *   - "is_inplace_op" is a boolean flag that tells if this gradient was
+   * generated by an inplace operation or an out of place one. This allows
+   * better error checking.
+   */
+  void _set_fw_grad(
+      const at::TensorBase& new_grad,
+      const at::TensorBase& self,
+      uint64_t level,
+      bool is_inplace_op);
+
+  /**
+   * Return a typed data pointer to the actual data which this tensor refers to.
+   * This checks that the requested type (from the template parameter) matches
+   * the internal type of the tensor.
+   *
+   * It is invalid to call data() on a dtype-uninitialized tensor, even if
+   * the size is 0.
+   *
+   * WARNING: If a tensor is not contiguous, you MUST use strides when
+   * performing index calculations to determine the location of elements in
+   * the tensor.  We recommend using 'TensorAccessor' to handle this computation
+   * for you; this class is available from 'Tensor'.
+   */
+  template <typename T>
+  const T* data_dtype_initialized() const {
+    // Read-only accessor for the storage base pointer; the shared impl does
+    // the dtype check and applies the storage offset.
+    const auto read_base = [this] {
+      return static_cast<const T*>(storage_.data());
+    };
+    return data_dtype_initialized_impl<const T>(read_base);
+  }
+
+  /**
+   * Return a mutable typed data pointer to the actual data which this
+   * tensor refers to. This checks that the requested type (from the
+   * template parameter) matches the internal type of the tensor.
+   *
+   * It is invalid to call data() on a dtype-uninitialized tensor, even if
+   * the size is 0.
+   *
+   * WARNING: If a tensor is not contiguous, you MUST use strides when
+   * performing index calculations to determine the location of elements in
+   * the tensor.  We recommend using 'TensorAccessor' to handle this computation
+   * for you; this class is available from 'Tensor'.
+   */
+  template <typename T>
+  T* mutable_data_dtype_initialized() {
+    // Mutable accessor for the storage base pointer; the shared impl does
+    // the dtype check and applies the storage offset.
+    const auto write_base = [this] {
+      return static_cast<T*>(storage_.mutable_data());
+    };
+    return data_dtype_initialized_impl<T>(write_base);
+  }
+
+ private:
+  // Shared implementation of data_dtype_initialized() and
+  // mutable_data_dtype_initialized().
+  // T may be const-qualified; the dtype comparison and the type name in the
+  // error message both use T with const removed. `get_data` is forwarded
+  // unchanged to data_ptr_impl_impl<T>().
+  template <typename T, typename Func>
+  T* data_dtype_initialized_impl(const Func& get_data) const {
+    // Reject the access when the requested element type does not match the
+    // tensor's dtype.
+    TORCH_CHECK(
+        data_type_.Match<std::remove_const_t<T>>(),
+        "Tensor type mismatch, caller expects elements to be ",
+        caffe2::TypeMeta::TypeName<std::remove_const_t<T>>(),
+        ", while tensor contains ",
+        data_type_.name(),
+        ". ");
+    return data_ptr_impl_impl<T>(get_data);
+  }
+
+ public:
+  /**
+   * More efficient helper for Tensor::data_ptr(). Like data<T>(), but
+   * does not do a type check. Unlike the untemplated data(), does
+   * check has_storage() and storage_initialized().
+   */
+  template <typename T>
+  inline const T* data_ptr_impl() const {
+    // Read-only accessor for the storage base pointer, handed to the shared
+    // implementation.
+    const auto read_base = [this] {
+      return static_cast<const T*>(storage_.data());
+    };
+    return data_ptr_impl_impl<const T>(read_base);
+  }
+
+  /**
+   * More efficient helper for Tensor::data_ptr(). Like data<T>(), but
+   * does not do a type check. Unlike the untemplated data(), does
+   * check has_storage() and storage_initialized().
+   */
+  template <typename T>
+  inline T* mutable_data_ptr_impl() {
+    // Mutable accessor for the storage base pointer, handed to the shared
+    // implementation.
+    const auto write_base = [this] {
+      return static_cast<T*>(storage_.mutable_data());
+    };
+    return data_ptr_impl_impl<T>(write_base);
+  }
+
+ private:
+  // Shared implementation of data_ptr_impl() and
+  // mutable_data_ptr_impl().
+  // Returns get_data() advanced by storage_offset_ elements of T.
+  // Raises via throw_data_ptr_access_error() when the tensor has no storage,
+  // and fails the TORCH_CHECK when the storage is not initialized.
+  // No dtype check is performed here.
+  template <typename T, typename Func>
+  __ubsan_ignore_pointer_overflow__ T* data_ptr_impl_impl(
+      const Func& get_data) const {
+    if (C10_UNLIKELY(!has_storage())) {
+      throw_data_ptr_access_error();
+    }
+    TORCH_CHECK(
+        storage_initialized(),
+        "The tensor has a non-zero number of elements, but its data is not allocated yet.\n"
+        "If you're using torch.compile/export/fx, it is likely that we are erroneously "
+        "tracing into a custom kernel. To fix this, please wrap the custom kernel into "
+        "an opaque custom op. Please see the following for details: "
+        "https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html\n"
+        "If you're using Caffe2, Caffe2 uses a lazy allocation, so you will need to call "
+        "mutable_data() or raw_mutable_data() to actually allocate memory.");
+    // Caller does the type check.
+    // Note: storage_offset_ can be non-zero even for zero-element tensors
+    // (for example if created as `torch.empty(5)[10:]`), which means a
+    // non-zero offset may be applied to a null pointer; UBSan would flag
+    // that, hence the __ubsan_ignore_pointer_overflow__ annotation above.
+    return get_data() + storage_offset_;
+  }
+
+ public:
+  /**
+   * Return a const void* data pointer to the actual data which this
+   * tensor refers to.
+   *
+   * It is invalid to call data() on a dtype-uninitialized tensor, even if the
+   * size is 0.
+   *
+   * WARNING: The data pointed to by this tensor may not be contiguous; do NOT
+   * assume that itemsize() * numel() is sufficient to compute the bytes that
+   * can be validly read from this tensor.
+   */
+  inline const void* data() const {
+    // data_impl() needs a byte-addressed pointer, hence the cast to
+    // const char*.
+    const auto read_bytes = [this] {
+      return static_cast<const char*>(storage_.data());
+    };
+    return data_impl<const void>(read_bytes);
+  }
+
+  /**
+   * Return a void* data pointer to the actual data which this tensor refers to.
+   *
+   * It is invalid to call mutable_data() on a dtype-uninitialized
+   * tensor, even if the size is 0.
+   *
+   * WARNING: The data pointed to by this tensor may not be contiguous; do NOT
+   * assume that itemsize() * numel() is sufficient to compute the bytes that
+   * can be validly read from this tensor.
+   */
+  inline void* mutable_data() {
+    // data_impl() needs a byte-addressed pointer, hence the cast to char*.
+    const auto write_bytes = [this] {
+      return static_cast<char*>(storage_.mutable_data());
+    };
+    return data_impl<void>(write_bytes);
+  }
+
+ private:
+  /// Shared implementation of data() and mutable_data().
+  ///
+  /// get_data must return a byte-addressed pointer, e.g. char*,
+  /// std::byte const*, etc.
+  // Returns nullptr for an empty tensor; otherwise the byte pointer from
+  // get_data() advanced by itemsize * storage_offset_ bytes.
+  // Raises via throw_data_ptr_access_error() when there is no storage and
+  // fails the TORCH_CHECK when the dtype is not initialized.
+  template <typename Void, typename Func>
+  Void* data_impl(const Func& get_data) const {
+    if (C10_UNLIKELY(!has_storage())) {
+      throw_data_ptr_access_error();
+    }
+    TORCH_CHECK(
+        dtype_initialized(),
+        "Cannot access data pointer of Tensor that doesn't have initialized dtype "
+        "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data<T>() on x)");
+    auto* data = get_data();
+    // Compile-time guard: the offset below is computed in bytes, so the
+    // pointee must be one byte wide.
+    static_assert(
+        sizeof(*data) == 1, "get_data must return a byte-addressed pointer.");
+    // Computing an offset into an empty tensor would be UB, since an empty
+    // tensor's storage will be nullptr, and adding a nonzero offset to nullptr
+    // is UB.  So we skip the offset computation in this case.
+    if (is_empty()) {
+      return nullptr;
+    }
+    return data + data_type_.itemsize() * storage_offset_;
+  }
+
+ public:
+  /**
+   * Returns the TypeMeta of a tensor, which describes what data type
+   * it is (e.g., int, float, ...)
+   */
+  const caffe2::TypeMeta dtype() const {
+    // Returned by value (a copy of data_type_).
+    return data_type_;
+  }
+
+  /**
+   * Return the size of a single element of this tensor in bytes.
+   */
+  size_t itemsize() const {
+    // The element size is only reported once the dtype is initialized.
+    TORCH_CHECK(
+        dtype_initialized(),
+        "Cannot report itemsize of Tensor that doesn't have initialized dtype "
+        "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data<T>() on x)");
+    return data_type_.itemsize();
+  }
+
+  // Stores `backend_meta` in the extra metadata, which get_extra_meta()
+  // allocates if it does not exist yet.
+  void set_backend_meta(intrusive_ptr<c10::BackendMeta> backend_meta) {
+    auto& extra = get_extra_meta();
+    extra.backend_meta_ = std::move(backend_meta);
+  }
+
+  // Raw pointer to the backend metadata, or nullptr when no extra metadata
+  // has been allocated.
+  c10::BackendMeta* get_backend_meta() {
+    return extra_meta_ ? extra_meta_->backend_meta_.get() : nullptr;
+  }
+
+  // Copy of the backend metadata pointer, or a null pointer when no extra
+  // metadata has been allocated.
+  intrusive_ptr<c10::BackendMeta> get_backend_meta_intrusive_ptr() const {
+    if (extra_meta_) {
+      return extra_meta_->backend_meta_;
+    }
+    return nullptr;
+  }
+
+  // Resets storage_ to a default-constructed value, marks storage access as
+  // throwing, and records `s` as both the custom data-pointer and the custom
+  // storage error message.
+  void release_storage_and_set_meta_custom_data_ptr_error_msg_(
+      std::optional<std::string> s) {
+    storage_ = {};
+    set_storage_access_should_throw();
+    auto& extra = get_extra_meta();
+    // First field gets a copy, the second takes ownership of `s`.
+    extra.custom_data_ptr_error_msg_ = s;
+    extra.custom_storage_error_msg_ = std::move(s);
+  }
+
+ protected:
+  /**
+   * Returns the human-readable name of the actual type of this object (e.g.,
+   * TensorImpl, BatchedTensorImpl, etc.). Used for error messages.
+   */
+  virtual const char* tensorimpl_type_name() const {
+    // Base-class name; virtual so that derived classes can report their own.
+    return "TensorImpl";
+  }
+
+ private:
+  [[noreturn]] void throw_storage_access_error() const;
+  [[noreturn]] void throw_data_ptr_access_error() const;
+
+  // Returns the extra metadata, allocating it on first use.
+  ExtraMeta& get_extra_meta() {
+    if (extra_meta_ == nullptr) {
+      extra_meta_ = std::make_unique<ExtraMeta>();
+    }
+    return *extra_meta_;
+  }
+
+  // Mutable access to the symbolic shape metadata. Asserts that both the
+  // extra metadata and its symbolic_shape_meta_ member are set.
+  c10::SymbolicShapeMeta& symbolic_shape_meta() {
+    TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_);
+    return *extra_meta_->symbolic_shape_meta_;
+  }
+
+  // Read-only access to the symbolic shape metadata. Asserts that both the
+  // extra metadata and its symbolic_shape_meta_ member are set.
+  const c10::SymbolicShapeMeta& symbolic_shape_meta() const {
+    TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_);
+    return *extra_meta_->symbolic_shape_meta_;
+  }
+
+ public:
+  /**
+   * True if a tensor has no elements (e.g., numel() == 0).
+   */
+  inline bool is_empty() const {
+    // Empty means zero elements, as reported by numel().
+    return numel() == 0;
+  }
+
+  // if we are going to use sym sizes, we should be setting sym strides at the
+  // same time, otherwise it's very easy to misuse this API
+  void set_sizes_and_strides(
+      c10::SymIntArrayRef sizes,
+      c10::SymIntArrayRef strides,
+      std::optional<c10::SymInt> storage_offset = std::nullopt);
+  // This is renamed to avoid breaking overload BC
+  void generic_set_sizes_contiguous(c10::SymIntArrayRef sizes);
+  // IntArrayRef overload: forwards directly to set_sizes_contiguous().
+  void generic_set_sizes_contiguous(c10::IntArrayRef sizes) {
+    set_sizes_contiguous(sizes);
+  }
+
+  /**
+   * Change the size at some dimension.  This DOES NOT update strides;
+   * thus, most changes to size will not preserve contiguity.  You probably
+   * also want to call set_stride() when you call this.
+   *
+   * TODO: This should be jettisoned in favor of `set_sizes_and_strides`,
+   * which is harder to misuse.
+   */
+  virtual void set_size(int64_t dim, int64_t new_size) {
+    // Preconditions: metadata changes are allowed and the tensor does not
+    // use the CustomSizes policy.
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_size ",
+        err_msg_tensor_metadata_change_not_allowed);
+    TORCH_CHECK(
+        !matches_policy(SizesStridesPolicy::CustomSizes),
+        "set_size() called on tensor with dynamic shapes or customized size behavior");
+
+    auto& size_slot = sizes_and_strides_.size_at(dim);
+    size_slot = new_size;
+
+    // Strides are left untouched; only the cached numel and contiguity
+    // information is recomputed.
+    refresh_numel();
+    refresh_contiguous();
+  }
+
+  /**
+   * Change the stride at some dimension.
+   *
+   * TODO: This should be jettisoned in favor of `set_sizes_and_strides`,
+   * which is harder to misuse.
+   */
+  virtual void set_stride(int64_t dim, int64_t new_stride) {
+    // Preconditions: metadata changes are allowed and the tensor has no
+    // symbolic sizes/strides.
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_stride ",
+        err_msg_tensor_metadata_change_not_allowed);
+    TORCH_CHECK(
+        !has_symbolic_sizes_strides_,
+        "set_stride() called on tensor with symbolic shape");
+
+    // Note: the unchecked accessor is used, as in the original code.
+    auto& stride_slot = sizes_and_strides_.stride_at_unchecked(dim);
+    stride_slot = new_stride;
+
+    refresh_contiguous();
+  }
+
+  /**
+   * Set the offset into the storage of this tensor.
+   *
+   * WARNING: This does NOT check if the tensor is in bounds for the new
+   * location at the storage; the caller is responsible for checking this
+   * (and resizing if necessary.)
+   */
+  virtual void set_storage_offset(int64_t storage_offset) {
+    // Preconditions: metadata changes are allowed and the tensor has no
+    // symbolic sizes/strides. No bounds check against the storage is done.
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_storage_offset ",
+        err_msg_tensor_metadata_change_not_allowed);
+    // TODO: this should probably consult policy
+    TORCH_CHECK(
+        !has_symbolic_sizes_strides_,
+        "set_storage_offset() called on tensor with symbolic shape")
+    storage_offset_ = storage_offset;
+  }
+
+  /**
+   * Like set_sizes_and_strides but assumes contiguous strides.
+   *
+   * WARNING: This function does not check if the requested
+   * sizes/strides are in bounds for the storage that is allocated;
+   * this is the responsibility of the caller
+   */
+  void set_sizes_contiguous(IntArrayRef new_size) {
+    // Preconditions: metadata changes are allowed and the tensor does not
+    // use the CustomStrides policy.
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_sizes_contiguous ",
+        err_msg_tensor_metadata_change_not_allowed);
+    TORCH_CHECK(
+        !matches_policy(SizesStridesPolicy::CustomStrides),
+        "tried to directly modify sizes for customized tensor");
+    // Install the new sizes first; numel and strides are derived from them.
+    sizes_and_strides_.set_sizes(new_size);
+
+    refresh_numel();
+    empty_tensor_restride(
+        MemoryFormat::Contiguous); // calls refresh_contiguous()
+  }
+
+  // Read-only view of the internal sizes/strides container.
+  C10_ALWAYS_INLINE const impl::SizesAndStrides& sizes_and_strides() {
+    return sizes_and_strides_;
+  }
+
+  /**
+   * Set the sizes and strides of a tensor.
+   *
+   * WARNING: This function does not check if the requested
+   * sizes/strides are in bounds for the storage that is allocated;
+   * this is the responsibility of the caller
+   */
+  void set_sizes_and_strides(
+      IntArrayRef new_size,
+      IntArrayRef new_stride,
+      std::optional<int64_t> storage_offset = std::nullopt) {
+    // Preconditions: metadata changes allowed, no symbolic shapes, and one
+    // stride per size.
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_sizes_and_strides ",
+        err_msg_tensor_metadata_change_not_allowed);
+    TORCH_CHECK(
+        !has_symbolic_sizes_strides_,
+        "set_sizes_and_strides() called on tensor with symbolic shape")
+    TORCH_CHECK(
+        new_size.size() == new_stride.size(),
+        "dimensionality of sizes (",
+        new_size.size(),
+        ") must match dimensionality of strides (",
+        new_stride.size(),
+        ")");
+    const auto new_dim = new_size.size();
+    bool overflowed = false;
+    sizes_and_strides_.set_sizes(new_size);
+
+    if (new_dim > 0) {
+      // Walk dimensions from innermost to outermost. `dim` is unsigned, so
+      // the loop has no condition and exits via the explicit `dim == 0`
+      // break at the bottom instead of decrementing below zero.
+      for (size_t dim = new_dim - 1;; dim--) {
+        if (new_stride[dim] >= 0) {
+          // Non-negative strides are taken as given.
+          sizes_and_strides_.stride_at_unchecked(dim) = new_stride[dim];
+        } else {
+          // XXX: This behavior is surprising and may need to be removed to
+          // support negative strides. Some pytorch functions rely on it:
+          // for example, torch.cat (run TestTorch.test_cat_empty).
+          if (dim == new_dim - 1) {
+            // Innermost dimension: a negative stride is replaced by 1.
+            sizes_and_strides_.stride_at_unchecked(dim) = 1;
+          } else {
+            // Keep stride monotonically increasing to match NumPy.
+            // The stride becomes stride[dim + 1] * max(size[dim + 1], 1),
+            // using the already-written stride of the next-inner dimension;
+            // overflow is recorded and reported after the loop.
+            overflowed |= c10::mul_overflows(
+                sizes_and_strides_.stride_at_unchecked(dim + 1),
+                std::max<int64_t>(
+                    sizes_and_strides_.size_at_unchecked(dim + 1), 1),
+                std::addressof(sizes_and_strides_.stride_at_unchecked(dim)));
+          }
+        }
+        if (dim == 0)
+          break;
+      }
+      TORCH_CHECK(!overflowed, "Stride calculation overflowed");
+    }
+
+    refresh_numel();
+    refresh_contiguous();
+
+    // The storage offset is only touched when the caller supplied one.
+    if (storage_offset.has_value()) {
+      storage_offset_ = *storage_offset;
+    }
+  }
+
+  /**
+   * Set whether a tensor allows changes to its metadata (e.g. sizes / strides /
+   * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor
+   * ] for details.
+   */
+  void set_allow_tensor_metadata_change(bool value [[maybe_unused]]) {
+    // NOTE: `value` is ignored; the flag is unconditionally set to true.
+    // TODO: at some point, we should kill this field completely.
+    allow_tensor_metadata_change_ = true;
+  }
+
+  /**
+   * True if a tensor allows changes to its metadata (e.g. sizes / strides /
+   * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor
+   * ] for details.
+   */
+  bool allow_tensor_metadata_change() const {
+    // Plain read of the stored flag.
+    return allow_tensor_metadata_change_;
+  }
+
+  /**
+   * Set the pointer to autograd metadata.
+   */
+  void set_autograd_meta(
+      std::unique_ptr<c10::AutogradMetaInterface> autograd_meta);
+
+  /**
+   * Return the pointer to autograd metadata.  May return nullptr if the
+   * tensor does not track gradients.
+   */
+  c10::AutogradMetaInterface* autograd_meta() const;
+
+  /**
+   * Set the pointer to named tensor metadata.
+   */
+  void set_named_tensor_meta(
+      std::unique_ptr<c10::NamedTensorMetaInterface> named_tensor_meta) {
+    TORCH_WARN_ONCE(
+        "Named tensors and all their associated APIs are an experimental feature ",
+        "and subject to change. Please do not use them for anything important ",
+        "until they are released as stable.");
+#ifdef DEBUG
+    // Debug builds verify that the metadata's dimensionality matches ours.
+    if (named_tensor_meta) {
+      TORCH_INTERNAL_ASSERT(named_tensor_meta->slow_dim() == dim());
+    }
+#endif
+    // Null metadata: clear any stored metadata (without allocating extra
+    // meta) and drop the Named dispatch key.
+    if (!named_tensor_meta) {
+      if (extra_meta_) {
+        extra_meta_->named_tensor_meta_ = nullptr;
+      }
+      key_set_ = key_set_.remove(DispatchKey::Named);
+      return;
+    }
+    // Non-null metadata: take ownership and add the Named dispatch key.
+    get_extra_meta().named_tensor_meta_ = std::move(named_tensor_meta);
+    key_set_ = key_set_.add(DispatchKey::Named);
+  }
+
+  // Adds (k == true) or removes (k == false) the keys in c10::python_ks.
+  void set_python_dispatch(bool k) {
+    key_set_ = k ? key_set_.add(c10::python_ks) : key_set_ - c10::python_ks;
+  }
+
+  // True when the key set contains all keys in c10::python_ks.
+  bool is_python_dispatch() const {
+    return key_set_.has_all(c10::python_ks);
+  }
+
+  /**
+   * Return the pointer to named tensor metadata.
+   */
+  const c10::NamedTensorMetaInterface* named_tensor_meta() const {
+    // nullptr when no extra metadata has been allocated.
+    return extra_meta_ ? extra_meta_->named_tensor_meta_.get() : nullptr;
+  }
+
+  // Mutable counterpart: nullptr when no extra metadata has been allocated.
+  c10::NamedTensorMetaInterface* named_tensor_meta() {
+    return extra_meta_ ? extra_meta_->named_tensor_meta_.get() : nullptr;
+  }
+
+  // True only when extra metadata exists and holds non-null named tensor
+  // metadata.
+  bool has_named_tensor_meta() const {
+    return extra_meta_ && extra_meta_->named_tensor_meta_ != nullptr;
+  }
+
+  // NOTE [ TensorImpl Shallow-Copying ]
+  //
+  // TensorImpl shallow-copying is used when we want to have two Variables share
+  // the same tensor metadata (e.g. sizes / strides / storage pointer /
+  // storage_offset), but each with a different autograd history. Example call
+  // sites:
+  //
+  // 1. `var_detached = var.detach()` uses `shallow_copy_and_detach()` to create
+  // `var_detached` that shares the same tensor metadata with `var`, but with a
+  // completely new autograd history.
+  // 2. `var.set_data(tensor)` uses `shallow_copy_from()` to copy tensor
+  // metadata from `tensor` into `var`, while keeping `var`'s original
+  // AutogradMeta.
+  //
+  // Functions that shallow-copy a TensorImpl (such as
+  // `shallow_copy_and_detach()` / `shallow_copy_from()` /
+  // `copy_tensor_metadata()`) copy the tensor metadata fields (e.g. sizes /
+  // strides / storage pointer / storage_offset) by value. However, the
+  // following fields are not copied:
+  //
+  // 1. the AutogradMeta pointer, because it is unique for each Variable.
+  // 2. the version counter, because the destination TensorImpl's version
+  // counter is either set to the passed-in `version_counter` (in
+  // `shallow_copy_and_detach()` and `copy_tensor_metadata()`), or it is kept
+  // intact (in `shallow_copy_from()`). See NOTE [ Version Counter Sharing ] for
+  // details.
+  //
+  // In `shallow_copy_and_detach()` and `copy_tensor_metadata()`, the passed-in
+  // `allow_tensor_metadata_change` determines whether the TensorImpl
+  // shallow-copy allows changes to its metadata (e.g. sizes / strides / storage
+  // / storage_offset). See NOTE [ Metadata Change for a Detached Tensor ] for
+  // details.
+  //
+  // In `shallow_copy_from()`, we don't check the destination TensorImpl's
+  // `allow_tensor_metadata_change_`, because `shallow_copy_from()` is used for
+  // implementing functions such as `var.set_data(tensor)`, which changes
+  // `var`'s tensor metadata and expects its `allow_tensor_metadata_change_` to
+  // be ignored.
+
+  /**
+   * One TensorImpl can be copied to another TensorImpl if they have the same
+   * DispatchKeySet. The only two special cases (for legacy reason) are:
+   * CPU is compatible with CUDA and SparseCPU is
+   * compatible with SparseCUDA.
+   */
+  inline bool has_compatible_shallow_copy_type(DispatchKeySet from) {
+    // Identical key sets are always compatible.
+    if (key_set_ == from) {
+      return true;
+    }
+
+    // Dense functionality key together with one of the listed backends.
+    const auto dense_like = [](DispatchKeySet ks) {
+      constexpr auto backends = DispatchKeySet(
+          {BackendComponent::CPUBit,
+           BackendComponent::CUDABit,
+           BackendComponent::MPSBit,
+           BackendComponent::HIPBit,
+           BackendComponent::XPUBit,
+           BackendComponent::HPUBit,
+           BackendComponent::MTIABit});
+      constexpr auto functionality = DispatchKeySet(DispatchKey::Dense);
+      return ks.has_any(functionality) && ks.has_any(backends);
+    };
+    // Sparse functionality key together with one of the listed backends.
+    const auto sparse_like = [](DispatchKeySet ks) {
+      constexpr auto backends = DispatchKeySet(
+          {BackendComponent::CPUBit,
+           BackendComponent::CUDABit,
+           BackendComponent::MPSBit,
+           BackendComponent::HIPBit,
+           BackendComponent::XPUBit});
+      constexpr auto functionality = DispatchKeySet(DispatchKey::Sparse);
+      return ks.has_any(functionality) && ks.has_any(backends);
+    };
+    // SparseCsr functionality key, regardless of backend.
+    const auto compressed_like = [](DispatchKeySet ks) {
+      constexpr auto functionality = DispatchKeySet(DispatchKey::SparseCsr);
+      return ks.has_any(functionality);
+    };
+
+    // Otherwise both sides must fall into the same category.
+    if (dense_like(key_set_) && dense_like(from)) {
+      return true;
+    }
+    if (sparse_like(key_set_) && sparse_like(from)) {
+      return true;
+    }
+    return compressed_like(key_set_) && compressed_like(from);
+  }
+
+ private:
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+ public:
+  /**
+   * Return a TensorImpl that is a shallow-copy of this TensorImpl.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`,
+   * see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  virtual c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  /**
+   * Return a TensorImpl that is a shallow-copy of this TensorImpl.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`,
+   * see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  virtual c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  /**
+   * Shallow-copies data from another TensorImpl into this TensorImpl.
+   *
+   * For why this function doesn't check this TensorImpl's
+   * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  virtual void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) {
+    // This tensor's own version counter and metadata-change flag are passed
+    // through, so the copy keeps them as they were.
+    auto* source = impl.get();
+    const c10::VariableVersion& own_counter = version_counter();
+    const bool own_allow_change = allow_tensor_metadata_change();
+    copy_tensor_metadata(
+        /*src_impl=*/source,
+        /*dest_impl=*/this,
+        /*version_counter=*/own_counter,
+        /*allow_tensor_metadata_change=*/own_allow_change);
+  }
+
+  // Inference tensors do not have a version counter: attempting to give one
+  // an enabled version counter fails the TORCH_CHECK below.
+  void set_version_counter(const c10::VariableVersion& version_counter) {
+    // Rejected only when this is an inference tensor AND the incoming
+    // counter is enabled; every other combination is assigned.
+    TORCH_CHECK(
+        !(is_inference() && version_counter.enabled()),
+        "Cannot set version_counter for inference tensor");
+    version_counter_ = version_counter;
+  }
+
+  // Rvalue overload: same check as the const& overload, but the counter is
+  // moved into place instead of copied.
+  void set_version_counter(c10::VariableVersion&& version_counter) {
+    TORCH_CHECK(
+        !(is_inference() && version_counter.enabled()),
+        "Cannot set version_counter for inference tensor");
+    version_counter_ = std::move(version_counter);
+  }
+
+  // Read-only reference to this tensor's version counter.
+  const c10::VariableVersion& version_counter() const noexcept {
+    return version_counter_;
+  }
+
+  // Forwards to version_counter_.bump().
+  void bump_version() {
+    version_counter_.bump();
+  }
+
+  // Mutable pointer to the embedded PyObjectSlot member (never null).
+  impl::PyObjectSlot* pyobj_slot() {
+    return &pyobj_slot_;
+  }
+
+  // Read-only pointer to the embedded PyObjectSlot member (never null).
+  const impl::PyObjectSlot* pyobj_slot() const {
+    return &pyobj_slot_;
+  }
+
+  void incref_pyobject() const noexcept override final;
+
+  void decref_pyobject() const noexcept override final;
+
+  bool try_incref_pyobject() const noexcept override final;
+
+ private:
+  // See NOTE [std::optional operator usage in CUDA]
+  // We probably don't want to expose this publicly until
+  // the note is addressed.
+  // Returns a copy of the optional device; it may hold no value.
+  std::optional<c10::Device> device_opt() const {
+    return device_opt_;
+  }
+
+ public:
+  /**
+   * The device type of a Tensor, e.g., DeviceType::CPU or DeviceType::CUDA.
+   */
+  DeviceType device_type() const {
+    // TODO: A useful internal assert would be to show that device_opt_ is null
+    // only if you are an undefined tensor
+    // Fails when no device is stored.
+    TORCH_CHECK(
+        device_opt_.has_value(),
+        "device_type cannot be run on undefined Tensor");
+    // See NOTE [std::optional operator usage in CUDA]
+    return (*device_opt_).type();
+  }
+
+  /**
+   * @brief Extends the outer-most dimension of this tensor by num elements,
+   * preserving the existing data.
+   *
+   * The underlying data may be reallocated in order to accommodate the new
+   * elements, in which case this tensor's capacity is grown at a factor of
+   * growthPct. This ensures that Extend runs in amortized O(1) time.
+   *
+   * This op is auto-asynchronous if the underlying device (CUDA) supports it.
+   *
+   * Only declared here; the definition lives out of line.
+   */
+  void Extend(int64_t num, float growthPct);
+
+  /**
+   * @brief Reserve space for the underlying tensor.
+   *
+   * This must be called after Resize(), since we only specify the first
+   * dimension. This does not copy over the old data to the newly allocated
+   * space.
+   */
+  void ReserveSpace(int64_t outer_dim);
+
+  /**
+   * @brief Resizes a tensor.
+   *
+   * Resize takes in a vector of ints specifying the dimensions of the tensor.
+   * You can pass in an empty vector to specify that it is a scalar (i.e.
+   * containing one single item).
+   *
+   * The underlying storage may be deleted after calling Resize: if the new
+   * shape leads to a different number of items in the tensor, the old memory
+   * is deleted and new memory will be allocated next time you call
+   * mutable_data(). However, if the shape is different but the total number of
+   * items is the same, the underlying storage is kept.
+   *
+   * This method respects caffe2_keep_on_shrink.  Consult the internal logic
+   * of this method to see exactly under what circumstances this flag matters.
+   */
+  template <typename... Ts>
+  void Resize(Ts... dim_source) {
+    // SetDims() reports whether the element count changed; HandleResize()
+    // only runs in that case.
+    const bool numel_changed = SetDims(dim_source...);
+    if (!numel_changed) {
+      return;
+    }
+    HandleResize();
+  }
+
+  template <typename T>
+  void Resize(const std::vector<T>& dim_source) {
+    // Wrap the vector in a non-owning view and forward it to the variadic
+    // overload.
+    const ArrayRef<T> dims_view(dim_source);
+    Resize(dims_view);
+  }
+
+  /**
+   * Resizes the tensor without touching underlying storage.
+   * This requires the total size of the tensor to remain constant.
+   */
+  void Reshape(const std::vector<int64_t>& dims);
+
+  /**
+   * Release whatever memory the tensor was holding but keep size and type
+   * information. A subsequent call to mutable_data() will trigger a new
+   * memory allocation.
+   */
+  void FreeMemory();
+
+  /**
+   * @brief Shares the data with another tensor.
+   *
+   * To share data between two tensors, the sizes of the two tensors must be
+   * equal already. The reason we do not implicitly do a Resize to make the two
+   * tensors have the same shape is that we want to allow tensors of different
+   * shapes but the same number of items to still be able to share data. This
+   * allows one to e.g. have an n-dimensional Tensor and a flattened version
+   * sharing the same underlying storage.
+   *
+   * The source tensor should already have its data allocated.
+   */
+  // To be deprecated
+  void ShareData(const TensorImpl& src);
+
+  // Only declared here; the definition lives out of line.
+  // NOTE(review): presumably makes this tensor use the externally provided
+  // buffer `data_ptr` (taken by rvalue reference), interpreted as elements of
+  // `data_type` and spanning `size_bytes` bytes -- confirm against the
+  // implementation.
+  void ShareExternalPointer(
+      DataPtr&& data_ptr,
+      const caffe2::TypeMeta data_type,
+      size_t size_bytes);
+
+  /**
+   * Returns a mutable raw pointer of the underlying storage. Since we will need
+   * to know the type of the data for allocation, a TypeMeta object is passed in
+   * to specify the necessary information. This is conceptually equivalent of
+   * calling mutable_data<T>() where the TypeMeta parameter meta is derived from
+   * the type T. This function differs from mutable_data<T>() in the sense that
+   * the type T can be specified during runtime via the TypeMeta object.
+   *
+   * If the existing data does not match the desired type, it will be deleted
+   * and a new storage will be created.
+   */
+  inline void* raw_mutable_data(const caffe2::TypeMeta& meta) {
+    // Fast path: the requested dtype is already the current one and the
+    // storage is initialized, so return the data pointer advanced by
+    // storage_offset_ elements of that dtype.
+    // For 0-size tensors it's fine to return any pointer (including nullptr)
+    if (data_type_ == meta && storage_initialized()) {
+      return static_cast<void*>(
+          static_cast<char*>(storage_.mutable_data()) +
+          storage_offset_ * meta.itemsize());
+    } else {
+      // Slow path: the dtype changes and/or the storage is uninitialized.
+      // Capture whether the OLD dtype has a placement-delete before
+      // data_type_ is overwritten below.
+      bool had_special_dtor = data_type_.placementDelete() != nullptr;
+      storage_offset_ = 0;
+      data_type_ = meta;
+      // NB: device is not changed
+
+      // We can reuse the existing buffer if the current data does not have
+      // a special destructor and the new data doesn't have a special
+      // constructor.
+      // (The buffer must also be at least numel_ * itemsize bytes large;
+      // an empty tensor always takes this branch.)
+      if (numel_ == 0 ||
+          (meta.placementNew() == nullptr && !had_special_dtor &&
+           (storage_.nbytes() >= (numel_ * data_type_.itemsize())))) {
+        TORCH_INTERNAL_ASSERT(
+            storage_offset_ == 0); // the offset was reset to 0 above
+        return storage_.mutable_data();
+      }
+      Allocator* allocator = storage_.allocator();
+      // Storage might have nullptr allocator in rare cases, for example, if
+      // an external memory segment has been wrapped with Tensor and we don't
+      // know how to reallocate it. However, in order to preserve legacy C2
+      // behavior, we allow reallocating the memory using default allocator.
+      if (allocator == nullptr) {
+        allocator = GetAllocator(storage_.device_type());
+      }
+      if (meta.placementNew()) {
+        // For types that need placement new, we will call it, as well as
+        // making sure that when the data is freed, it calls the right
+        // destruction procedure.
+        auto size = numel_;
+        auto dtor = data_type_.placementDelete();
+        auto data_ptr = allocator->allocate(numel_ * data_type_.itemsize());
+        // Wrap the fresh allocation so that `dtor` is associated with its
+        // `size` elements, then construct the elements in place.
+        storage_.set_data_ptr_noswap(PlacementDeleteContext::makeDataPtr(
+            std::move(data_ptr), dtor, size, storage_.device()));
+        data_type_.placementNew()(storage_.mutable_data(), numel_);
+      } else {
+        // For fundamental type, new and delete is easier.
+        storage_.set_data_ptr_noswap(
+            allocator->allocate(numel_ * data_type_.itemsize()));
+      }
+      // Record the byte size of the new allocation on the storage.
+      storage_.set_nbytes(numel_ * data_type_.itemsize());
+      TORCH_INTERNAL_ASSERT(
+          storage_offset_ == 0); // because we just reallocated
+      // After a reallocation the cached device is refreshed from the storage.
+      device_opt_ = storage_.device();
+      return storage_.mutable_data();
+    }
+  }
+
+  /**
+   * Returns a typed pointer of the underlying storage.
+   *
+   * For fundamental types, we reuse possible existing storage if there
+   * is sufficient capacity.
+   */
+  template <typename T>
+  inline T* mutable_data() {
+    // Check it here statically - otherwise TypeMeta would throw the runtime
+    // error in attempt to invoke TypeMeta::ctor()
+    static_assert(
+        std::is_default_constructible_v<T>,
+        "Tensor can't hold non-default-constructable types");
+    // Existing storage is handed out directly when it is initialized and
+    // already typed as T; otherwise defer to raw_mutable_data().
+    const bool reuse_existing =
+        storage_initialized() && data_type_.Match<T>();
+    if (!reuse_existing) {
+      return static_cast<T*>(raw_mutable_data(caffe2::TypeMeta::Make<T>()));
+    }
+    return static_cast<T*>(storage_.mutable_data()) + storage_offset_;
+  }
+
+  /**
+   * True if a tensor is storage initialized.  A tensor may become
+   * storage UNINITIALIZED after a Resize() or FreeMemory()
+   */
+  bool storage_initialized() const {
+    TORCH_CHECK(
+        has_storage(),
+        "cannot call storage_initialized on tensor that does not have storage");
+    // A non-null data pointer means initialized; a zero-element tensor
+    // counts as initialized even without one.
+    if (storage_.data()) {
+      return true;
+    }
+    return numel_ == 0;
+  }
+
+  /**
+   * True if a tensor is dtype initialized.  A tensor allocated with
+   * Caffe2-style constructors is dtype uninitialized until the
+   * first time mutable_data<T>() is called.
+   */
+  bool dtype_initialized() const noexcept {
+    // The dtype counts as initialized once it differs from a
+    // default-constructed TypeMeta.
+    const caffe2::TypeMeta uninitialized_dtype = caffe2::TypeMeta();
+    return data_type_ != uninitialized_dtype;
+  }
+
+  // Replaces the storage of this tensor while leaving data_type_ untouched.
+  // Raises if metadata changes are not allowed on this tensor.
+  void set_storage_keep_dtype(at::Storage storage) {
+    TORCH_CHECK(
+        allow_tensor_metadata_change(),
+        "set_storage ",
+        err_msg_tensor_metadata_change_not_allowed);
+    // Take ownership of the new storage, then mirror its device locally.
+    this->storage_ = std::move(storage);
+    this->device_opt_ = this->storage_.device();
+  }
+
+  // Replaces both the storage and the dtype of this tensor.
+  void set_storage_and_dtype(
+      at::Storage storage,
+      const caffe2::TypeMeta data_type) {
+    // Storage (and the cached device) first, through the shared helper;
+    // the dtype is overwritten afterwards.
+    this->set_storage_keep_dtype(std::move(storage));
+    this->data_type_ = data_type;
+  }
+
+  // Out-of-line counterpart of empty_tensor_restride(): that function
+  // delegates here when has_symbolic_sizes_strides_ is set.
+  void empty_tensor_restride_symint(MemoryFormat memory_format);
+
+  /**
+   * Set the strides of the tensor to match memory_format
+   *
+   * WARNING: This function doesn't rearrange data and assumes the tensor is
+   * memory contiguous
+   */
+  void empty_tensor_restride(MemoryFormat memory_format) {
+    // Tensors with symbolic sizes/strides are handled entirely by the symint
+    // variant.
+    if (has_symbolic_sizes_strides_) {
+      empty_tensor_restride_symint(memory_format);
+      return;
+    }
+#ifdef DEBUG
+    // Debug-only sanity check that the cached numel_ matches the sizes.
+    TORCH_INTERNAL_ASSERT(
+        compute_numel() == numel_,
+        "If you are seeing this error, that means empty_tensor_restride was "
+        "called before setting correct numel");
+#endif
+    switch (memory_format) {
+      case MemoryFormat::Contiguous: {
+        // dim_ is a virtual call, don't repeat it
+        const auto dim_ = dim();
+        sizes_and_strides_.resize(dim_);
+        if (dim_ > 0) {
+          // Fill strides from the innermost dimension outwards:
+          // stride[last] = 1, stride[i] = stride[i+1] * max(size[i+1], 1).
+          // Overflow is accumulated and reported once after the loop.
+          bool overflowed = false;
+          const auto last_idx = dim_ - 1;
+          sizes_and_strides_.stride_at_unchecked(last_idx) = 1;
+          for (auto i = last_idx - 1; i >= 0; --i) {
+            overflowed |= c10::mul_overflows(
+                sizes_and_strides_.stride_at_unchecked(i + 1),
+                std::max<int64_t>(
+                    sizes_and_strides_.size_at_unchecked(i + 1), 1),
+                std::addressof(sizes_and_strides_.stride_at_unchecked(i)));
+          }
+          TORCH_CHECK(!overflowed, "Stride calculation overflowed");
+        }
+        break;
+      }
+      case MemoryFormat::ChannelsLast: {
+        // Channels-last (2d) strides are only defined for rank-4 tensors.
+        TORCH_CHECK(
+            dim() == 4, "required rank 4 tensor to use channels_last format");
+        set_sizes_and_strides(sizes(), get_channels_last_strides_2d(sizes()));
+        break;
+      }
+      case MemoryFormat::ChannelsLast3d: {
+        // Channels-last-3d strides are only defined for rank-5 tensors.
+        TORCH_CHECK(
+            dim() == 5,
+            "required rank 5 tensor to use channels_last_3d format");
+        set_sizes_and_strides(sizes(), get_channels_last_strides_3d(sizes()));
+        break;
+      }
+      case MemoryFormat::Preserve:
+        TORCH_CHECK(false, "unsupported memory format ", memory_format);
+        // Cleaning warning messages, no need to break as TORCH_CHECK(false)
+        // terminates flow.
+        // break;
+      case MemoryFormat::NumOptions:
+        TORCH_INTERNAL_ASSERT(false, "invalid memory format ", memory_format);
+    }
+    // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually
+    // exclusive see #24090
+    refresh_contiguous();
+  }
+
+  // Reports whether the strides look like `memory_format`, dispatching to
+  // the *_custom variant under the CustomStrides policy.
+  bool is_strides_like(at::MemoryFormat memory_format) const {
+    const bool custom_strides =
+        matches_policy(SizesStridesPolicy::CustomStrides);
+    if (C10_UNLIKELY(custom_strides)) {
+      return is_strides_like_custom(memory_format);
+    }
+    return is_strides_like_default(memory_format);
+  }
+
+  // Convenience wrapper: is_strides_like() for MemoryFormat::ChannelsLast.
+  bool is_strides_like_channels_last() const {
+    constexpr auto format = at::MemoryFormat::ChannelsLast;
+    return is_strides_like(format);
+  }
+
+  // Convenience wrapper: is_strides_like() for MemoryFormat::ChannelsLast3d.
+  bool is_strides_like_channels_last_3d() const {
+    constexpr auto format = at::MemoryFormat::ChannelsLast3d;
+    return is_strides_like(format);
+  }
+
+  // Evaluates sym_is_non_overlapping_and_dense() through guard_or_false(),
+  // passing this source location (__FILE__ / __LINE__) along.
+  // NOTE(review): presumably yields false when the symbolic answer cannot
+  // be decided -- confirm against SymBool::guard_or_false.
+  bool is_non_overlapping_and_dense_or_false() const {
+    return sym_is_non_overlapping_and_dense().guard_or_false(
+        __FILE__, __LINE__);
+  }
+
+  // Non-overlapping-and-dense query, dispatching to the *_custom variant
+  // under the CustomStrides policy.
+  bool is_non_overlapping_and_dense() const {
+    const bool custom_strides =
+        matches_policy(SizesStridesPolicy::CustomStrides);
+    if (C10_UNLIKELY(custom_strides)) {
+      return is_non_overlapping_and_dense_custom();
+    }
+    return is_non_overlapping_and_dense_default();
+  }
+
+  // SymBool flavour of is_non_overlapping_and_dense(), with the same
+  // CustomStrides dispatch.
+  SymBool sym_is_non_overlapping_and_dense() const {
+    const bool custom_strides =
+        matches_policy(SizesStridesPolicy::CustomStrides);
+    if (C10_UNLIKELY(custom_strides)) {
+      return sym_is_non_overlapping_and_dense_custom();
+    }
+    return sym_is_non_overlapping_and_dense_default();
+  }
+
+  // if this returns true, then it is guaranteed that this tensor has symbolic
+  // sizes/strides
+  bool has_symbolic_sizes_strides() const {
+    // Plain read of the cached flag.
+    return this->has_symbolic_sizes_strides_;
+  }
+
+ private:
+  // Invoked by Resize() when SetDims() reports that the element count
+  // changed. Only declared here; the definition lives out of line.
+  void HandleResize();
+
+  // The Caffe2 Resize() method supports being called both as Resize({2,2}) as
+  // well as variadic with Resize(2, 2).  These overloads provide all of the
+  // supported calling configurations, while being overloads (and not templates)
+  // so that implicit conversions still work.
+  //
+  // SetDims on ArrayRef is internally implemented as a template, so we can
+  // handle both ArrayRefs of different types (there are some uses of
+  // Resize in Caffe2 which pass in int, not int64_t.)
+
+  // Shared implementation behind the ArrayRef-taking SetDims() overloads.
+  //
+  // Overwrites the sizes with `src`, recomputes numel_ as the product of the
+  // new sizes and restrides the tensor as MemoryFormat::Contiguous.
+  // Raises (TORCH_CHECK) if the tensor has symbolic sizes/strides.
+  //
+  // Returns true iff numel_ differs from its value before the call.
+  template <
+      typename T,
+      typename = typename std::enable_if_t<std::is_integral_v<T>>>
+  bool SetDimsTemplate(ArrayRef<T> src) {
+    // Terminated with ';' like every other TORCH_CHECK in this class, so the
+    // statement does not depend on how the macro happens to expand.
+    TORCH_CHECK(
+        !has_symbolic_sizes_strides_,
+        "SetDims() called on tensor with symbolic shape");
+
+    auto old_numel = numel_;
+    sizes_and_strides_.resize(src.size());
+    // NOTE(review): the product below is accumulated without an overflow
+    // check.
+    int64_t new_numel = 1;
+    for (const auto i : c10::irange(src.size())) {
+      new_numel *= src[i];
+      sizes_and_strides_.size_at_unchecked(i) = src[i];
+    }
+    numel_ = new_numel;
+    // numel_ must be up to date before restriding.
+    empty_tensor_restride(MemoryFormat::Contiguous);
+    return numel_ != old_numel;
+  }
+
+  // int64_t sizes: forwards to the shared template implementation.
+  bool SetDims(ArrayRef<int64_t> dims) {
+    return SetDimsTemplate<int64_t>(dims);
+  }
+
+  // int sizes: forwards to the shared template implementation.
+  bool SetDims(ArrayRef<int> dims) {
+    return SetDimsTemplate<int>(dims);
+  }
+
+  // size_t sizes: forwards to the shared template implementation.
+  bool SetDims(ArrayRef<size_t> dims) {
+    return SetDimsTemplate<size_t>(dims);
+  }
+
+  // No dimensions: forwards an empty size list.
+  bool SetDims() {
+    return SetDims(IntArrayRef());
+  }
+
+  // One dimension. The IntArrayRef is built inline so that it only has to
+  // live for the duration of the call expression.
+  bool SetDims(const int64_t dim0) {
+    return SetDims(IntArrayRef{dim0});
+  }
+
+  // Two dimensions, forwarded as an inline IntArrayRef.
+  bool SetDims(const int64_t dim0, const int64_t dim1) {
+    return SetDims(IntArrayRef{dim0, dim1});
+  }
+
+  // Three dimensions, forwarded as an inline IntArrayRef.
+  bool SetDims(const int64_t dim0, const int64_t dim1, const int64_t dim2) {
+    return SetDims(IntArrayRef{dim0, dim1, dim2});
+  }
+
+  // Four dimensions, forwarded as an inline IntArrayRef.
+  bool SetDims(
+      const int64_t dim0,
+      const int64_t dim1,
+      const int64_t dim2,
+      const int64_t dim3) {
+    return SetDims(IntArrayRef{dim0, dim1, dim2, dim3});
+  }
+
+  /**
+   * Compute the number of elements based on the sizes of a tensor.
+   */
+  // NB: This is ONLY called when sizes_and_strides_ is used directly; if
+  // we are virtualizing, then numel calls are virtualized as well, and this
+  // should never get called
+  int64_t compute_numel() const {
+    // Debug-only guard: this path is for concrete (non-symbolic) sizes.
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_);
+#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE)
+    // Use overflow checks if supported by the compiler
+    return safe_compute_numel();
+#else
+    // Otherwise fall back to a plain product of the sizes.
+    return c10::multiply_integers(sizes_and_strides_.sizes_arrayref());
+#endif
+  }
+
+  /**
+   * Compute the number of elements based on the sizes of a
+   * tensor. Catches integer overflow that may occur when a tensor
+   * using a sparse layout has multiple dimensions with large sizes.
+   */
+  int64_t safe_compute_numel() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_);
+    // Largest element count representable as both int64_t and size_t.
+    constexpr auto numel_max = std::min(
+        static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
+        static_cast<uint64_t>(std::numeric_limits<size_t>::max()));
+
+    // Multiply all sizes as unsigned 64-bit, tracking wrap-around.
+    uint64_t product = 1;
+    bool overflows = c10::safe_multiplies_u64(
+        sizes_and_strides_.sizes_arrayref(), &product);
+    // A product that fits in uint64_t but exceeds numel_max is also an error.
+    if (product > numel_max) {
+      overflows = true;
+    }
+    TORCH_CHECK(!overflows, "numel: integer multiplication overflow");
+    return static_cast<int64_t>(product);
+  }
+
+  /**
+   * Compute whether or not a tensor is contiguous based on the sizes and
+   * strides of a tensor.
+   */
+  bool compute_contiguous() const;
+
+  // The declarations below are the uncached computations whose results
+  // _refresh_contiguous() stores into the corresponding is_*_ bit-fields.
+  // All of them are defined out of line.
+
+  // Feeds is_channels_last_contiguous_.
+  bool compute_channels_last_contiguous_2d() const;
+
+  // Feeds is_channels_last_3d_contiguous_ (via the *_dim5 wrapper).
+  bool compute_channels_last_contiguous_3d() const;
+
+  // Feeds is_channels_last_.
+  bool compute_strides_like_channels_last_2d() const;
+
+  // Feeds is_channels_last_3d_ (via the *_dim5 wrapper).
+  bool compute_strides_like_channels_last_3d() const;
+
+  // Feeds is_non_overlapping_and_dense_ when no cached contiguity flag
+  // already short-circuits the answer.
+  bool compute_non_overlapping_and_dense() const;
+
+ protected:
+  /**
+   * Recompute the cached numel of a tensor.  Call this if you modify
+   * sizes.
+   *
+   * For tensors with sparse layouts, use safe_refresh_numel() instead
+   * because it will catch integer overflow that may occur for tensors
+   * with sparse layouts and large dimensions.
+   *
+   * NB: We may uselessly recompute cached numel even in situations where
+   * it is completely never used (e.g., if CustomSizes for Python).  However,
+   * we still must keep it up to date in case the Python overload
+   * returns None (in which case we will consult the field here).  This also
+   * implies that sizes/strides will never be complete garbage; in the
+   * very worst case scenario, it will reflect a 1-dim zero size tensor.
+   */
+  void refresh_numel() {
+    // Concrete shapes update the numel_ field; symbolic shapes refresh the
+    // numel kept in their symbolic shape metadata instead.
+    if (!has_symbolic_sizes_strides_) {
+      numel_ = compute_numel();
+      return;
+    }
+    symbolic_shape_meta().refresh_numel();
+  }
+
+  /**
+   * Recompute the cached numel of a tensor.  Call this if you modify
+   * sizes. Use only for tensors with sparse layouts because only
+   * sparse tensors are likely to have sizes that may lead to integer
+   * overflow when computing numel.
+   */
+  void safe_refresh_numel() {
+    // Concrete shapes go through the overflow-checked computation.
+    if (!has_symbolic_sizes_strides_) {
+      numel_ = safe_compute_numel();
+      return;
+    }
+    // NB: sym numel is done with symbolic integers, which handle overflow
+    // checking
+    symbolic_shape_meta().refresh_numel();
+  }
+
+ private:
+  // Stores the cached "contiguous" flag.
+  void _set_is_contiguous(bool value) {
+    this->is_contiguous_ = value;
+  }
+
+  // Stores the cached "channels-last contiguous" flag.
+  void _set_is_channels_last_contiguous(bool value) {
+    this->is_channels_last_contiguous_ = value;
+  }
+
+  // Stores the cached "channels-last-3d contiguous" flag.
+  void _set_is_channels_last_3d_contiguous(bool value) {
+    this->is_channels_last_3d_contiguous_ = value;
+  }
+
+  // Stores the cached "strides like channels-last" flag.
+  void _set_is_channels_last(bool value) {
+    this->is_channels_last_ = value;
+  }
+
+  // Stores the cached "strides like channels-last-3d" flag.
+  void _set_is_channels_last_3d(bool value) {
+    this->is_channels_last_3d_ = value;
+  }
+
+  // Stores the cached "non-overlapping and dense" flag.
+  void _set_is_non_overlapping_and_dense(bool value) {
+    this->is_non_overlapping_and_dense_ = value;
+  }
+
+  // These are little wrappers over the real compute_ functions that
+  // can make use of other contiguity fields to short circuit.
+
+  bool compute_is_non_overlapping_and_dense_dim4() {
+    // Either cached contiguity flag settles the answer; the full
+    // computation only runs when neither is set.
+    if (is_contiguous_ || is_channels_last_contiguous_) {
+      return true;
+    }
+    return compute_non_overlapping_and_dense();
+  }
+
+  bool compute_channels_last_contiguous_3d_dim5() {
+    // Already channels-last (2d) contiguous: answer is false without
+    // running the 3d computation.
+    if (is_channels_last_contiguous_) {
+      return false;
+    }
+    return compute_channels_last_contiguous_3d();
+  }
+
+  bool compute_channels_last_2d_dim5() {
+    // Already channels-last-3d contiguous: answer is false without running
+    // the 2d strides check.
+    if (is_channels_last_3d_contiguous_) {
+      return false;
+    }
+    return compute_strides_like_channels_last_2d();
+  }
+
+  bool compute_channels_last_3d_dim5() {
+    // Already flagged as channels-last (2d): answer is false without
+    // running the 3d strides check.
+    if (is_channels_last_) {
+      return false;
+    }
+    return compute_strides_like_channels_last_3d();
+  }
+
+  bool compute_is_non_overlapping_and_dense_dim5() {
+    // Any of the three cached contiguity flags settles the answer; the full
+    // computation only runs when none is set.
+    const bool known_contiguous = is_contiguous_ ||
+        is_channels_last_contiguous_ || is_channels_last_3d_contiguous_;
+    if (known_contiguous) {
+      return true;
+    }
+    return compute_non_overlapping_and_dense();
+  }
+
+  bool compute_is_non_overlapping_and_dense_anydim() {
+    // A cached contiguous flag settles the answer; otherwise compute it.
+    if (is_contiguous_) {
+      return true;
+    }
+    return compute_non_overlapping_and_dense();
+  }
+
+  // Recomputes every cached contiguity / memory-format bit-field for the
+  // concrete (non-symbolic) case. The order of the calls inside each case
+  // matters: the *_dim4 / *_dim5 / *_anydim wrappers read flags that were
+  // stored by the statements just before them.
+  void _refresh_contiguous() {
+    // Note:
+    // Dim 0, 1, 2 will never be a channels last 2d/3d format.
+    // Dim 3+ may be a channels last 2d format (Dim 4 only at this point).
+    // Dim 4+ may be a channels last 3d format (Dim 5 only at this point).
+    switch (dim()) {
+      case 4: {
+        // Rank 4: only the 2d channels-last variants can apply; the 3d
+        // flags are forced to false.
+        _set_is_contiguous(compute_contiguous());
+        _set_is_channels_last_contiguous(compute_channels_last_contiguous_2d());
+        _set_is_channels_last_3d_contiguous(false);
+        _set_is_channels_last(compute_strides_like_channels_last_2d());
+        _set_is_channels_last_3d(false);
+        _set_is_non_overlapping_and_dense(
+            compute_is_non_overlapping_and_dense_dim4());
+        break;
+      }
+      case 5: {
+        // Rank 5: each *_dim5 wrapper consults a flag stored by an earlier
+        // statement in this case before doing its own computation.
+        _set_is_contiguous(compute_contiguous());
+        _set_is_channels_last_contiguous(compute_channels_last_contiguous_2d());
+        _set_is_channels_last_3d_contiguous(
+            compute_channels_last_contiguous_3d_dim5());
+        _set_is_channels_last(compute_channels_last_2d_dim5());
+        _set_is_channels_last_3d(compute_channels_last_3d_dim5());
+        _set_is_non_overlapping_and_dense(
+            compute_is_non_overlapping_and_dense_dim5());
+        break;
+      }
+      default:
+        // is_channels_last_ and is_channels_last_3d_ are suggested
+        // memory_format. Being channels_last_contiguous doesn't necessarily
+        // mean the tensor is strided like channels_last: for strides on channel
+        // dimension could suggest desired memory_layout, but it doesn't affect
+        // memory storage
+        _set_is_contiguous(compute_contiguous());
+        _set_is_channels_last_contiguous(false);
+        _set_is_channels_last_3d_contiguous(false);
+        _set_is_channels_last(false);
+        _set_is_channels_last_3d(false);
+        _set_is_non_overlapping_and_dense(
+            compute_is_non_overlapping_and_dense_anydim());
+        break;
+    }
+  }
+
+ protected:
+  /**
+   * Recompute the cached contiguity of a tensor.  Call this if you modify sizes
+   * or strides.
+   */
+  void refresh_contiguous() {
+    // Concrete shapes recompute the bit-field caches on this object;
+    // symbolic shapes delegate to their symbolic shape metadata.
+    if (!has_symbolic_sizes_strides_) {
+      _refresh_contiguous();
+      return;
+    }
+    symbolic_shape_meta().refresh_contiguous();
+  }
+
+  /**
+   * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer /
+   * storage_offset) from one TensorImpl to another TensorImpl.
+   *
+   * This overload takes `version_counter` by const reference.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE
+   * [ TensorImpl Shallow-Copying ].
+   */
+  static void copy_tensor_metadata(
+      const TensorImpl* src_impl,
+      TensorImpl* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change);
+
+  /**
+   * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer /
+   * storage_offset) from one TensorImpl to another TensorImpl.
+   *
+   * This overload takes `version_counter` by rvalue reference.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE
+   * [ TensorImpl Shallow-Copying ].
+   */
+  static void copy_tensor_metadata(
+      const TensorImpl* src_impl,
+      TensorImpl* dest_impl,
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change);
+
+ private:
+  // Only declared here; the definition lives out of line.
+  // NOTE(review): presumably the part shared by both copy_tensor_metadata()
+  // overloads, i.e. everything except assigning the version counter --
+  // confirm against the implementation.
+  static void copy_tensor_metadata_except_version_counter(
+      const TensorImpl* src_impl,
+      TensorImpl* dest_impl,
+      bool allow_tensor_metadata_change);
+
+ protected:
+  // Error message to show when the user tries to change tensor metadata on
+  // Tensor created from .data or .detach().
+  //
+  // It is appended to the failing operation's name in TORCH_CHECK calls (for
+  // example in set_storage_keep_dtype()). Only declared here; the definition
+  // lives out of line.
+  //
+  // See NOTE [ Metadata Change for a Detached Tensor ] for details.
+  static const char* const err_msg_tensor_metadata_change_not_allowed;
+
+  // Only declared here; the definition lives out of line.
+  // NOTE(review): presumably copies the metadata fields that do not depend
+  // on the version counter or the metadata-change flag from `src_impl` to
+  // `dest_impl` -- confirm against the implementation.
+  static void copy_generic_tensor_metadata(
+      const TensorImpl* src_impl,
+      TensorImpl* dest_impl);
+
+ public:
+  // Sets the storage_access_should_throw_ flag on this tensor; this function
+  // never clears it.
+  void set_storage_access_should_throw() {
+    this->storage_access_should_throw_ = true;
+  }
+
+ public:
+  // Stores `policy` in custom_sizes_strides_ and recomputes the combined
+  // sizes_strides_policy_.
+  void set_custom_sizes_strides(SizesStridesPolicy policy) {
+    const auto raw_policy = static_cast<uint8_t>(policy);
+    custom_sizes_strides_ = raw_policy;
+    refresh_sizes_strides_policy();
+  }
+
+  // Stores `policy` in python_custom_sizes_strides_ and recomputes the
+  // combined sizes_strides_policy_.
+  void set_python_custom_sizes_strides(SizesStridesPolicy policy) {
+    const auto raw_policy = static_cast<uint8_t>(policy);
+    python_custom_sizes_strides_ = raw_policy;
+    refresh_sizes_strides_policy();
+  }
+
+  // Stores the custom_device_ flag and recomputes device_policy_.
+  void set_custom_device(bool custom_device) {
+    this->custom_device_ = custom_device;
+    this->refresh_device_policy();
+  }
+
+  // Stores the custom_layout_ flag and recomputes layout_policy_.
+  void set_custom_layout(bool custom_layout) {
+    this->custom_layout_ = custom_layout;
+    this->refresh_layout_policy();
+  }
+
+  // Stores the python_custom_device_ flag and recomputes device_policy_.
+  void set_python_custom_device(bool custom_device) {
+    this->python_custom_device_ = custom_device;
+    this->refresh_device_policy();
+  }
+
+  // Stores the python_custom_layout_ flag and recomputes layout_policy_.
+  void set_python_custom_layout(bool custom_layout) {
+    this->python_custom_layout_ = custom_layout;
+    this->refresh_layout_policy();
+  }
+
+ protected:
+  // Recomputes sizes_strides_policy_ from its inputs.
+  void refresh_sizes_strides_policy() {
+    // Without symbolic shapes, the numerically larger of the two requested
+    // policies wins.
+    if (!has_symbolic_sizes_strides_) {
+      sizes_strides_policy_ =
+          std::max(custom_sizes_strides_, python_custom_sizes_strides_);
+      return;
+    }
+    // Symbolic shapes always force the CustomSizes policy.
+    sizes_strides_policy_ =
+        static_cast<uint8_t>(SizesStridesPolicy::CustomSizes);
+  }
+
+  // device_policy_ is set when either device customization flag is set.
+  void refresh_device_policy() {
+    const bool any_custom = custom_device_ || python_custom_device_;
+    device_policy_ = any_custom;
+  }
+
+  // layout_policy_ is set when either layout customization flag is set.
+  void refresh_layout_policy() {
+    const bool any_custom = custom_layout_ || python_custom_layout_;
+    layout_policy_ = any_custom;
+  }
+
+ protected:
+  // The storage backing this tensor. set_storage_keep_dtype() replaces it
+  // and mirrors its device into device_opt_.
+  Storage storage_;
+
+ private:
+  // This pointer points to an AutogradMeta struct that stores autograd-specific
+  // fields (such as grad_ / grad_fn_ / grad_accumulator_). This pointer always
+  // has unique ownership (meaning only one TensorImpl can own it at a time).
+  //
+  // autograd_meta_ can be nullptr, as an optimization.  When this occurs, it is
+  // equivalent to having an autograd_meta_ pointing to a default constructed
+  // AutogradMeta; intuitively, tensors which don't require grad will have this
+  // field set to null.
+  //
+  // This means accessors on autograd_meta_ have to be careful to test if they
+  // got a nullptr, and handle default behavior appropriately in that case.
+  //
+  // Note that we don't enforce the invariant that if the AutogradMeta is
+  // default constructed, it is nullptr (to do this, we'd have to continuously
+  // check if an AutogradMeta became, by mutation, equal to the default
+  // constructed form.  (This might be useful, but it seems rare enough that
+  // a requires_grad=True variable will turn back into the requires_grad=False
+  // version.)  So there are three representable states:
+  //
+  //    1. autograd_meta_ == nullptr
+  //    2. autograd_meta_ is default constructed (semantically, same as (1))
+  //    3. autograd_meta_ has nontrivial information content
+  //
+  std::unique_ptr<c10::AutogradMetaInterface> autograd_meta_ = nullptr;
+
+ protected:
+  // Optional, uniquely owned extra metadata; null by default.
+  std::unique_ptr<c10::ExtraMeta> extra_meta_ = nullptr;
+
+  // Version counter exposed through version_counter() and advanced by
+  // bump_version().
+  c10::VariableVersion version_counter_;
+
+  // Slot handed out by pyobj_slot().
+  impl::PyObjectSlot pyobj_slot_;
+
+  // Concrete sizes and strides (used when has_symbolic_sizes_strides_ is
+  // false).
+  c10::impl::SizesAndStrides sizes_and_strides_;
+
+  // Offset into the storage, counted in elements of data_type_ (it is
+  // multiplied by the item size in raw_mutable_data()).
+  int64_t storage_offset_ = 0;
+  // If sizes and strides are empty, the numel is 1!!  However, most of the
+  // time, we will immediately set sizes to {0} and reset numel to 0.
+  // (Can't do that in the default initializers, because there's no way to
+  // spell "allocate a one-element array" for strides_).
+  int64_t numel_ = 1;
+
+  // INVARIANT: When storage is non-null, this type meta must
+  // agree with the type meta in storage
+  caffe2::TypeMeta data_type_;
+
+  // NOTE [std::optional operator usage in CUDA]
+  // Our optional definition doesn't compile in .cu file if `value()` or
+  // `operator->` are used.  Instead, we always use `operator*`.
+  // See https://github.com/pytorch/pytorch/issues/18496 for more info.
+  // If this is too burdensome to maintain, we can just
+  // manually implement this with an additional bool.
+
+  // INVARIANT: When storage is non-null, this Device must
+  // agree with the device in storage.
+  //
+  // INVARIANT: device_opt_ is only nullopt for undefined tensors
+  // (which do not have a device.)
+  std::optional<c10::Device> device_opt_;
+
+  // default member initializers for bit-fields only available with -std=c++2a
+  // or -std=gnu++2a
+  inline void init_bitfields() {
+    // Contiguity caches: a fresh tensor is plain contiguous and
+    // non-overlapping/dense, and not channels-last in any form.
+    is_contiguous_ = true;
+    is_non_overlapping_and_dense_ = true;
+    is_channels_last_ = false;
+    is_channels_last_contiguous_ = false;
+    is_channels_last_3d_ = false;
+    is_channels_last_3d_contiguous_ = false;
+
+    // General per-tensor flags.
+    is_wrapped_number_ = false;
+    allow_tensor_metadata_change_ = true;
+    reserved_ = false;
+    storage_access_should_throw_ = false;
+    has_symbolic_sizes_strides_ = false;
+
+    // Sizes/strides dispatch: everything starts at the Default policy.
+    constexpr auto default_policy =
+        static_cast<uint8_t>(SizesStridesPolicy::Default);
+    sizes_strides_policy_ = default_policy;
+    custom_sizes_strides_ = default_policy;
+    python_custom_sizes_strides_ = default_policy;
+
+    // Device/layout dispatch: no customization requested yet.
+    custom_device_ = false;
+    custom_layout_ = false;
+    python_custom_device_ = false;
+    python_custom_layout_ = false;
+    device_policy_ = false;
+    layout_policy_ = false;
+  }
+
+  // Tensor is contiguous
+  bool is_contiguous_ : 1;
+
+  // Tensor is a subclass that does not permit storage access.
+  bool storage_access_should_throw_ : 1;
+
+  // Tensor is stored in the channels last 2d memory format, when dimensions
+  // order is (N)CHW and C-strides < W-strides < H-strides (< N-strides)
+  // (If size of any dimension is equal to 1, this dimension strides value
+  // is not taken into account).
+  bool is_channels_last_ : 1;
+
+  // Channels last contiguous tensor is channel last tensor which occupies
+  // contiguous memory block.
+  bool is_channels_last_contiguous_ : 1;
+
+  // Tensor is stored in the channels last 3d memory format, when dimensions
+  // order is (N)CDHW and C-strides < W-strides < H-strides < D - strides (<
+  // N-strides) (If size of any dimension is equal to 1, this dimension strides
+  // value is not taken into account).
+  bool is_channels_last_3d_ : 1;
+
+  // Channels last 3d contiguous tensor is channel last 3d tensor which occupies
+  // contiguous memory block.
+  bool is_channels_last_3d_contiguous_ : 1;
+
+  // A dense tensor is a tensor that stores its values in a contiguous block
+  // of memory. A non-overlapping tensor is a tensor in which each element
+  // occupies its own distinct memory location.
+  bool is_non_overlapping_and_dense_ : 1;
+
+  bool is_wrapped_number_ : 1;
+
+  // NOTE [ Metadata Change for a Detached Tensor ]
+  //
+  // Normally, a user is allowed to change the tensor metadata
+  // (e.g. sizes / strides / storage / storage_offset) of a tensor.
+  // However, if the tensor is created by `t1_detached = t1.data` in Python
+  // or `t1_detached = t1.detach()` in Python/C++, those changes to the
+  // tensor metadata of `t1_detached` will not be propagated back to the
+  // original tensor `t1`. In order to make such changes explicitly illegal,
+  // we created the `allow_tensor_metadata_change_` flag, to prevent users
+  // from changing metadata of the detached tensor and expecting the original
+  // tensor to also be updated.
+  //
+  // NOTE: For a full list of tensor metadata fields, please see
+  // `copy_tensor_metadata()` in TensorImpl and its subclasses to find
+  // which fields are copied by value.
+  bool allow_tensor_metadata_change_ : 1;
+
+  // we decide to keep reserved_ and it will
+  // live in Tensor after the split
+  // The logic is that if Extend() or ReserveSpace() were ever called,
+  // then subsequent Resize()s will not free up Storage.
+  bool reserved_ : 1;
+
+  // Call _custom() virtual methods for
+  // strides()/is_contiguous()/sizes()/dim()/numel()
+  // This is a combination of custom_sizes_strides_,
+  // python_custom_sizes_strides_ and has_symbolic_sizes_strides_
+  // (see refresh_sizes_strides_policy()).
+  uint8_t sizes_strides_policy_ : 2;
+
+  // Whether or not sizes_and_strides_ contains a symbolic value.
+  bool has_symbolic_sizes_strides_ : 1;
+
+  // Call _custom() virtual method for
+  // strides()/is_contiguous()/sizes()/dim()/numel()
+  uint8_t custom_sizes_strides_ : 2;
+
+  // Combo of custom_ and python_custom_
+  bool device_policy_ : 1;
+  bool layout_policy_ : 1;
+
+  // Call _custom() virtual method for device()
+  bool custom_device_ : 1;
+
+  // Call _custom() virtual method for layout()
+  bool custom_layout_ : 1;
+
+  // Call into Python for
+  // strides()/is_contiguous()/sizes()/dim()/numel()
+  uint8_t python_custom_sizes_strides_ : 2;
+
+  // Call into Python for device()
+  bool python_custom_device_ : 1;
+
+  // Call into Python for layout()
+  bool python_custom_layout_ : 1;
+
+  // The set of DispatchKeys which describe this tensor.  NB: this
+  // does NOT include Autograd (historically, it did, but
+  // not anymore!)
+  //
+  // INVARIANT: extra_meta_->named_tensor_meta_ != nullptr  <==>
+  // key_set_.has(DispatchKey::Named)
+  DispatchKeySet key_set_;
+
+ private:
+  // C10_TensorImpl_Size_Check_Dummy_Class needs to be friends with
+  // TensorImpl so it can inspect the size of private fields
+  template <
+      size_t cplusplus,
+      size_t clang_ver_major,
+      size_t gcc_ver,
+      size_t gcc_ver_minor,
+      size_t nvcc,
+      size_t cuda_version,
+      size_t cuda_version_major,
+      size_t ptr_size>
+  friend class C10_TensorImpl_Size_Check_Dummy_Class;
+};
+
+namespace detail {
+
+#ifndef C10_MOBILE
+// Partial specialization of TargetTraits selected for c10::TensorImpl and
+// every type derived from it (cv-qualifiers on T are ignored): such targets
+// are marked as able to have an associated PyObject. The specialization is
+// compiled out when C10_MOBILE is defined.
+template <class T>
+struct TargetTraits<
+    T,
+    std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
+  static constexpr bool can_have_pyobject = true;
+};
+#endif
+
+} // namespace detail
+
+// Note [TensorImpl size constraints]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Changed the size of TensorImpl?  If the size went down, good for
+// you!  Adjust the documentation below and the expected size.
+// Did it go up?  Read on...
+//
+// Struct size matters.  In some production systems at Facebook, we have
+// 400M live tensors during a training run.  Do the math: every 64-bit
+// word you add to Tensor is an extra 3.2 gigabytes in RAM.
+//
+// If you are a Facebook employee, you can check if the run in question
+// has tipped you over the point using the command here:
+// https://fburl.com/q5enpv98
+//
+// For reference, we OOMed at 160 bytes (20 words) per TensorImpl.
+// This is not counting overhead from strides out-of-line allocation and
+// StorageImpl space and this is from before we inlined sizes and strides
+// directly into TensorImpl as SmallVectors.
+//
+// Our memory usage on 32-bit systems is suboptimal, but we're not checking
+// for it at the moment (to help avoid rage inducing cycles when the
+// 32-bit number is wrong).
+//
+// Current breakdown:
+//
+//    vtable pointer
+//    strong refcount           TODO: pack these into one word
+//    weak refcount
+//    storage pointer
+//    autograd metadata pointer
+//    named tensor metadata pointer
+//    version counter pointer
+//    PyObjectSlot
+//    SizesAndStrides size/pointer
+//    SizesAndStrides sizes (pre-allocated 0)
+//    SizesAndStrides sizes (pre-allocated 1)
+//    SizesAndStrides sizes (pre-allocated 2)
+//    SizesAndStrides sizes (pre-allocated 3)
+//    SizesAndStrides sizes (pre-allocated 4)
+//    SizesAndStrides strides (pre-allocated 0)
+//    SizesAndStrides strides (pre-allocated 1)
+//    SizesAndStrides strides (pre-allocated 2)
+//    SizesAndStrides strides (pre-allocated 3)
+//    SizesAndStrides strides (pre-allocated 4)
+//    storage offset
+//    numel
+//    data type, device, is_contiguous, storage_access_should_throw_, bitfields
+//    DispatchKeySet
+//
+
+// Various preprocessor macros we use to check that the
+// TensorImpl size hasn't changed unexpectedly. We undef
+// these later.
+#ifndef __NVCC__
+#define C10_NVCC 0
+#else
+#define C10_NVCC __NVCC__
+#endif
+
+#ifndef __CUDA_VER_MAJOR__
+#define C10_CUDA_VERSION_MAJOR 0
+#else
+#define C10_CUDA_VERSION_MAJOR __CUDA_VER_MAJOR__
+#endif
+
+#ifndef CUDA_VERSION
+#define C10_CUDA_VERSION 0
+#else
+#define C10_CUDA_VERSION CUDA_VERSION
+#endif
+
+#ifndef __clang_major__
+#define C10_CLANG_MAJOR_VERSION 0
+#else
+#define C10_CLANG_MAJOR_VERSION __clang_major__
+#endif
+
+#ifndef __GNUC__
+#define C10_GCC_VERSION 0
+#else
+#define C10_GCC_VERSION __GNUC__
+#endif
+
+#ifndef __GNUC_MINOR__
+#define C10_GCC_VERSION_MINOR 0
+#else
+#define C10_GCC_VERSION_MINOR __GNUC_MINOR__
+#endif
+
+// We use a templatized class to both contain the logic of checking the sizes
+// as well as to provide compile-time information that might be useful in
+// figuring out why sizes may have changed.
+// All the compile time information is given by the template fields that are
+// always printed by the compiler when the static_assert fails.
+template <
+    size_t cplusplus = __cplusplus,
+    size_t clang_ver_major = C10_CLANG_MAJOR_VERSION,
+    size_t gcc_ver = C10_GCC_VERSION,
+    size_t gcc_ver_minor = C10_GCC_VERSION_MINOR,
+    size_t nvcc = C10_NVCC,
+    size_t cuda_version = C10_CUDA_VERSION,
+    size_t cuda_version_major = C10_CUDA_VERSION_MAJOR,
+    size_t ptr_size = sizeof(void*)>
+class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
+  // Names of (non-bitfield) fields in TensorImpl; used to provide
+  // compile-time info about fields whose size changes unexpectedly.
+  enum class FieldNameEnum {
+    storage_,
+    autograd_meta_,
+    extra_meta_,
+    version_counter_,
+    pyobj_slot_,
+    sizes_and_strides_,
+    storage_offset_,
+    numel_,
+    data_type_,
+    device_opt_,
+    key_set_,
+    TOTAL_SIZE
+  };
+
+  // Compile-time equality check.  Actual, Expected and FieldName are template
+  // parameters (rather than function arguments) so that the compiler prints
+  // the concrete numbers and the offending field when the static_assert
+  // below fires.  FieldName is intentionally unused in the body.
+  // Returns true whenever it compiles.
+  template <size_t Actual, size_t Expected, FieldNameEnum FieldName>
+  constexpr static bool are_equal() {
+    static_assert(
+        Actual == Expected,
+        "Actual and Expected sizes of a field did not match!");
+    return true;
+  }
+
+  // Compile-time upper-bound check (Actual <= Expected).  Actual, Expected
+  // and FieldName are template parameters (rather than function arguments)
+  // so that the compiler prints the concrete numbers and the offending field
+  // when the static_assert below fires.  FieldName is intentionally unused in
+  // the body.  Returns true whenever it compiles.
+  template <size_t Actual, size_t Expected, FieldNameEnum FieldName>
+  constexpr static bool is_le() {
+    // The message describes the <= relation actually being checked; the
+    // previous wording ("did not match") was copied from are_equal().
+    static_assert(
+        Actual <= Expected,
+        "Actual size of a field exceeded the Expected upper bound!");
+    return true;
+  }
+
+ public:
+  // Compile-time check that TensorImpl field sizes are as expected
+  //
+  // Observed total sizes and associated versions
+  // If you find a flag that predicts when unique_ptr has 16 bytes
+  // on 64-bit systems or when sizes_and_strides_ is 84 vs 88 bytes
+  // on 32-bit systems you get a cookie!
+  // Length | LLVM | GCC  |    C++ |  CUDA
+  //    192 |    ? | 11.2 | 201703 | 11040
+  //    208 |    ? | 11.2 | 201703 | 11040
+  //    208 |    ? | 11.2 | 201402 | 11040
+  //    192 |    ? | 11.2 | 201402 | 11040
+  //    160 |   12 |  4.2 | 201703 |     0
+  //
+  // To keep things clean, we split on systems here.
+
+#if UINTPTR_MAX == 0xFFFFFFFF
+  // This is a 32-bit system
+  // Verifies, entirely at compile time, the size of each listed TensorImpl
+  // field and of TensorImpl as a whole for 32-bit targets (this branch is
+  // selected when UINTPTR_MAX == 0xFFFFFFFF).  Every helper call below
+  // either trips its own static_assert or returns true, so this function
+  // returns true whenever it compiles.
+  static constexpr bool check_sizes() {
+    // Upper bound for sizeof(TensorImpl): 20 64-bit words.
+    constexpr size_t tsize = 20 * sizeof(int64_t);
+
+    // clang-format off
+    are_equal<sizeof(storage_),            4,  FieldNameEnum::storage_>();
+    are_equal<sizeof(autograd_meta_),      4,  FieldNameEnum::autograd_meta_>();
+    are_equal<sizeof(extra_meta_),         4,  FieldNameEnum::extra_meta_>();
+    are_equal<sizeof(version_counter_),    4,  FieldNameEnum::version_counter_>();
+    are_equal<sizeof(pyobj_slot_),    8,  FieldNameEnum::pyobj_slot_>();
+    // <= rather than ==: the table comment above notes sizes_and_strides_ has
+    // been observed at both 84 and 88 bytes on 32-bit systems.
+    is_le<sizeof(sizes_and_strides_),     88, FieldNameEnum::sizes_and_strides_>();
+    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
+    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
+    are_equal<sizeof(data_type_),          2,  FieldNameEnum::data_type_>();
+    are_equal<sizeof(device_opt_),         3,  FieldNameEnum::device_opt_>();
+    are_equal<sizeof(key_set_),            8,  FieldNameEnum::key_set_>();
+    // The total is only bounded from above, so shrinking TensorImpl passes.
+    is_le<sizeof(TensorImpl),          tsize,  FieldNameEnum::TOTAL_SIZE>();
+    // clang-format on
+
+    return true;
+  }
+#else
+  // This is a 64-bit system
+  // Verifies, entirely at compile time, the size of each listed TensorImpl
+  // field and of TensorImpl as a whole for targets that did not take the
+  // 32-bit branch of the surrounding #if.  Every helper call below either
+  // trips its own static_assert or returns true, so this function returns
+  // true whenever it compiles.
+  static constexpr bool check_sizes() {
+    // Upper bound for sizeof(TensorImpl): 26 64-bit words.
+    constexpr size_t tsize = 26 * sizeof(int64_t);
+
+    // clang-format off
+    are_equal<sizeof(storage_),            8,  FieldNameEnum::storage_>();
+    // On some systems involving NVCC the size of unique_ptr is 16 bytes. We haven't
+    // figured out how to detect those via macro preprocessors yet, so we use <=
+    // comparisons for the relevant fields.
+    is_le<sizeof(autograd_meta_),         16,  FieldNameEnum::autograd_meta_>();
+    is_le<sizeof(extra_meta_),            16,  FieldNameEnum::extra_meta_>();
+    are_equal<sizeof(version_counter_),    8,  FieldNameEnum::version_counter_>();
+    are_equal<sizeof(pyobj_slot_),   16,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(sizes_and_strides_), 88,  FieldNameEnum::sizes_and_strides_>();
+    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
+    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
+    are_equal<sizeof(data_type_),          2,  FieldNameEnum::data_type_>();
+    are_equal<sizeof(device_opt_),         3,  FieldNameEnum::device_opt_>();
+    are_equal<sizeof(key_set_),            8,  FieldNameEnum::key_set_>();
+    // The total is only bounded from above, so shrinking TensorImpl passes.
+    is_le<sizeof(TensorImpl),          tsize,  FieldNameEnum::TOTAL_SIZE>();
+    // clang-format on
+
+    return true;
+  }
+#endif
+};
+
+// We use a class to encapsulate size-checking logic with
+// templates to capture sizes and flags. We call this within
+// a static assert to prove there is no run-time behaviour.
+// Since the methods we call return either true or fail their
+// own static_asserts, we should never see the error messages
+// below. We have to provide it though for c++ <17.
+static_assert(
+    C10_TensorImpl_Size_Check_Dummy_Class<>::check_sizes(),
+    "You should not see this message.");
+
+// Clean up after ourselves
+#undef C10_NVCC
+#undef C10_CUDA_VERSION_MAJOR
+#undef C10_CUDA_VERSION
+#undef C10_CLANG_MAJOR_VERSION
+#undef C10_GCC_VERSION
+#undef C10_GCC_VERSION_MINOR
+
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7add8edc4361ab3c38675d8565ad13b4d1ed48b3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/TensorOptions.h
@@ -0,0 +1,791 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Backend.h>
+#include <c10/core/DefaultDtype.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/Layout.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/ScalarTypeToTypeMeta.h>
+
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <optional>
+
+#include <cstdint>
+#include <iosfwd>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
+
+namespace c10 {
+
+/// Resolve an optional ScalarType: return the contained value when present,
+/// otherwise whatever get_default_dtype_as_scalartype() reports.
+inline ScalarType dtype_or_default(std::optional<ScalarType> dtype) {
+  if (dtype.has_value()) {
+    return *dtype;
+  }
+  return get_default_dtype_as_scalartype();
+}
+
+/// Resolve an optional TypeMeta: return the contained value when present,
+/// otherwise whatever get_default_dtype() reports.
+inline caffe2::TypeMeta dtype_or_default(
+    std::optional<caffe2::TypeMeta> dtype) {
+  if (dtype.has_value()) {
+    return *dtype;
+  }
+  return get_default_dtype();
+}
+
+/// Resolve an optional Layout, falling back to kStrided when none is given.
+inline Layout layout_or_default(std::optional<Layout> layout) {
+  return layout.has_value() ? *layout : kStrided;
+}
+
+/// Resolve an optional Device, falling back to Device(kCPU) when none is
+/// given.
+inline Device device_or_default(std::optional<Device> device) {
+  if (device.has_value()) {
+    return *device;
+  }
+  return Device(kCPU);
+}
+
+/// Resolve an optional pinned-memory flag; an unspecified flag counts as
+/// false.
+inline bool pinned_memory_or_default(std::optional<bool> pinned_memory) {
+  return pinned_memory.has_value() && *pinned_memory;
+}
+
+/// A class to encapsulate construction axes of a Tensor.  TensorOptions was
+/// designed to support the Python style API for specifying construction options
+/// on factory functions, e.g.,
+///
+///     torch.zeros(2, 3, dtype=torch.int32)
+///
+/// Because C++ doesn't natively support keyword arguments, there must be
+/// another way of specifying keyword-like arguments.  TensorOptions is a
+/// builder class which can be used to construct this "dictionary" of keyword
+/// arguments: functions which support TensorOptions conventionally take this
+/// argument optionally as their last argument.
+///
+/// WARNING: In PyTorch, there are `torch::` variants of factory functions,
+/// e.g., torch::zeros for at::zeros.  These return Variables (while the
+/// stock ATen functions return plain Tensors).  If you mix these functions
+/// up, you WILL BE SAD.
+///
+/// Rather than use the constructor of this class directly, you should prefer to
+/// use the constructor functions, and then chain setter methods on top of them.
+///
+///     at::device(at::kCUDA).dtype(kInt)
+///     at::dtype(at::kInt)
+///
+/// Additionally, anywhere a TensorOptions is expected, you can directly
+/// pass at::kCUDA / at::kInt, and it will implicitly convert to a
+/// TensorOptions.
+///
+/// Here are some recommended ways to create a 2x2 tensor of zeros
+/// with certain properties.  These all *implicitly* make use of
+/// TensorOptions, even if they don't mention the class explicitly:
+///
+///     at::zeros({2,2}, at::kCUDA);
+///     at::zeros({2,2}, at::kLong);
+///     at::zeros({2,2}, at::device(at::kCUDA).dtype(at::kLong));
+///     at::zeros({2,2}, at::device({at::kCUDA, 1})); // place on device 1
+///     at::zeros({2,2}, at::requires_grad());
+///
+
+/// NOTE [ TensorOptions Constructors ]
+///
+/// TensorOptions is like a dictionary with entries from the set:
+/// {requires_grad, device, dtype, layout}, where each entry may be
+/// unspecified (i.e., is optional). It is used to specify the properties of
+/// tensors in many places both in C++ internal and API, e.g., tensor factory
+/// methods like `at::empty({10}, options)`, tensor conversions like
+/// `tensor.to(...)`, etc.
+///
+/// To provide a simple API that is consistent with Python, where one can do
+/// `torch.empty(sizes, X)` with `X` being a `torch.device`, `torch.dtype`, or a
+/// `torch.layout`, we want TensorOptions to be implicitly convertible from
+/// `ScalarType dtype`, `Layout layout` and `Device device`. Therefore, we have
+/// three implicit constructors from each of these three types.
+///
+/// This is sufficient for `ScalarType` and `Layout` as they are simple Enum
+/// classes. However, `Device` is an ordinary class with implicit constructors
+/// `Device(DeviceType, DeviceIndex = -1)` and `Device(std::string)` to be
+/// consistent with Python API, where strings are treated as equivalent with a
+/// `torch.device` object (e.g., "cuda:1" can be passed to everywhere a
+/// `torch.device("cuda:1")` is accepted). To support the syntax
+/// `at::empty({10}, {kCUDA, 1})` and `tensor.to(kCUDA)`, we need to make sure
+/// that `TensorOptions` is implicitly constructible with any arguments that a
+/// `Device` can be constructed from. So we have,
+///
+///    /* implicit */ TensorOptions(T&& device) : TensorOptions() {
+///      this->set_device(device);
+///    }
+///
+///    template <typename... Args,
+///             typename = std::enable_if_t<std::is_constructible<Device,
+///             Args&&...>::value>>
+///    /* implicit */  TensorOptions(Args&&... args)
+///     : TensorOptions(Device(std::forward<Args>(args)...)) {}
+///
+///
+/// But this will be problematic. Consider this: `TensorOptions({kCUDA, 1})`.
+/// Compiler will complain about ambiguity between the copy constructor and the
+/// `Device` constructor because `{kCUDA, 1}` can be converted to both a
+/// `TensorOptions` and a `Device`.
+///
+/// To get around this, we templatize the `Device` constructor. Since overload
+/// resolution is done before template resolution, our problem is solved.
+
+DispatchKey computeDispatchKey(
+    std::optional<ScalarType> dtype,
+    std::optional<Layout> layout,
+    std::optional<Device> device);
+
+struct C10_API TensorOptions {
+  TensorOptions()
+      : requires_grad_(false),
+        pinned_memory_(false),
+        has_device_(false),
+        has_dtype_(false),
+        has_layout_(false),
+        has_requires_grad_(false),
+        has_pinned_memory_(false),
+        has_memory_format_(false) {}
+
+  /// Constructs a `TensorOptions` object with the given layout.
+  /* implicit */ TensorOptions(Layout layout) : TensorOptions() {
+    this->set_layout(layout);
+  }
+
+  /// Constructs a `TensorOptions` object with the given device.
+  /// See NOTE [ TensorOptions Constructors ] on why this is templatized.
+  template <
+      typename T,
+      typename = std::enable_if_t<std::is_same_v<std::decay_t<T>, Device>>>
+  /* implicit */ TensorOptions(T&& device) : TensorOptions() {
+    this->set_device(std::forward<T>(device));
+  }
+
+  /// Constructs a `TensorOptions` object from arguments allowed in `Device`
+  /// constructors.
+  ///
+  /// See NOTE [ TensorOptions Constructors ].
+  ///
+  /// NB: Ideally we only allow implicit constructors here. But there is no easy
+  ///     way to detect them. So we have this one that allows explicit
+  ///     constructors too.
+  template <
+      typename... Args,
+      typename = std::enable_if_t<std::is_constructible_v<Device, Args&&...>>>
+  /* implicit */ TensorOptions(Args&&... args)
+      : TensorOptions(Device(std::forward<Args>(args)...)) {}
+
+  /// Constructs a `TensorOptions` object with the given dtype.
+  /* implicit */ TensorOptions(caffe2::TypeMeta dtype) : TensorOptions() {
+    this->set_dtype(dtype);
+  }
+
+  /// legacy constructor to support ScalarType
+  /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() {
+    this->set_dtype(dtype);
+  }
+
+  /// Constructs a `TensorOptions` object with the given memory format.
+  /* implicit */ TensorOptions(MemoryFormat memory_format) : TensorOptions() {
+    set_memory_format(memory_format);
+  }
+
+  /// Return a copy of `TensorOptions` with `device` set to the given one, or
+  /// cleared if `device` is `nullopt`.
+  [[nodiscard]] TensorOptions device(
+      std::optional<Device> device) const noexcept {
+    TensorOptions r = *this;
+    r.set_device(device);
+    return r;
+  }
+
+  /// Return a copy of `TensorOptions` with `device` set to the given one.
+  /// (This overload ensures that variadic template std::optional constructor
+  /// for Device work correctly.)
+  template <typename... Args>
+  [[nodiscard]] TensorOptions device(Args&&... args) const noexcept {
+    return device(
+        std::optional<Device>(std::in_place, std::forward<Args>(args)...));
+  }
+
+  /// Return a copy of `TensorOptions`, but with device set to CUDA, and the
+  /// device index set to the given one.
+  ///
+  /// TODO: This function encourages bad behavior (assuming CUDA is
+  /// the only device that matters).  Get rid of it / rename it.
+  [[nodiscard]] TensorOptions device_index(
+      c10::DeviceIndex device_index) const noexcept {
+    return device(Device::Type::CUDA, device_index);
+  }
+
+  /// Return a copy of `TensorOptions` with `dtype` set to the given one.
+  [[nodiscard]] TensorOptions dtype(
+      std::optional<caffe2::TypeMeta> dtype) const noexcept {
+    TensorOptions r = *this;
+    r.set_dtype(dtype);
+    return r;
+  }
+
+  // legacy function to support ScalarType
+  [[nodiscard]] TensorOptions dtype(
+      std::optional<ScalarType> dtype) const noexcept {
+    TensorOptions r = *this;
+    r.set_dtype(dtype);
+    return r;
+  }
+
+  // Since dtype is taken...
+  template <typename T>
+  TensorOptions& dtype() {
+    dtype_ = caffe2::TypeMeta::Make<T>();
+    has_dtype_ = true;
+    return *this;
+  }
+
+  /// Return a copy of `TensorOptions` with `layout` set to the given one, or
+  /// cleared if `layout` is `nullopt`.
+  [[nodiscard]] TensorOptions layout(
+      std::optional<Layout> layout) const noexcept {
+    TensorOptions r = *this;
+    r.set_layout(layout);
+    return r;
+  }
+
+  /// Return a copy of `TensorOptions` with `requires_grad` set to the given
+  /// one, or cleared if `requires_grad` is `nullopt`.
+  [[nodiscard]] TensorOptions requires_grad(
+      std::optional<bool> requires_grad) const noexcept {
+    TensorOptions r = *this;
+    r.set_requires_grad(requires_grad);
+    return r;
+  }
+
+  /// Return a copy of `TensorOptions` with `pinned_memory` set to the given
+  /// one, or cleared if `pinned_memory` is `nullopt`.
+  [[nodiscard]] TensorOptions pinned_memory(
+      std::optional<bool> pinned_memory) const noexcept {
+    TensorOptions r = *this;
+    r.set_pinned_memory(pinned_memory);
+    return r;
+  }
+
+  /// Return a copy of `TensorOptions` with `memory_format` set to the given
+  /// one, or cleared if `memory_format` is `nullopt`.
+  [[nodiscard]] TensorOptions memory_format(
+      std::optional<MemoryFormat> memory_format) const noexcept {
+    TensorOptions r = *this;
+    r.set_memory_format(memory_format);
+    return r;
+  }
+
+  /// Returns the device of the `TensorOptions`.
+  Device device() const noexcept {
+    return device_or_default(device_opt());
+  }
+
+  /// Returns whether the device is specified.
+  bool has_device() const noexcept {
+    return has_device_;
+  }
+
+  /// Returns the device of the `TensorOptions`, or `std::nullopt` if
+  /// device is not specified.
+  std::optional<Device> device_opt() const noexcept {
+    return has_device_ ? std::make_optional(device_) : std::nullopt;
+  }
+
+  /// Returns the device index of the `TensorOptions`.
+  c10::DeviceIndex device_index() const noexcept {
+    return device().index();
+  }
+
+  /// Returns the dtype of the `TensorOptions`.
+  caffe2::TypeMeta dtype() const noexcept {
+    return dtype_or_default(dtype_opt());
+  }
+
+  /// Returns whether the dtype is specified.
+  bool has_dtype() const noexcept {
+    return has_dtype_;
+  }
+
+  /// Returns the dtype of the `TensorOptions`, or `std::nullopt` if
+  /// dtype is not specified.
+  std::optional<caffe2::TypeMeta> dtype_opt() const noexcept {
+    return has_dtype_ ? std::make_optional(dtype_) : std::nullopt;
+  }
+
+  /// Returns the layout of the `TensorOptions`.
+  Layout layout() const noexcept {
+    return layout_or_default(layout_opt());
+  }
+
+  /// Returns whether the layout is specified.
+  bool has_layout() const noexcept {
+    return has_layout_;
+  }
+
+  /// Returns the layout of the `TensorOptions`, or `std::nullopt` if
+  /// layout is not specified.
+  std::optional<Layout> layout_opt() const noexcept {
+    return has_layout_ ? std::make_optional(layout_) : std::nullopt;
+  }
+
+  /// Returns the `requires_grad` property of the `TensorOptions`.
+  bool requires_grad() const noexcept {
+    return has_requires_grad_ ? requires_grad_ : false;
+  }
+
+  /// Returns whether the `requires_grad` is specified.
+  bool has_requires_grad() const noexcept {
+    return has_requires_grad_;
+  }
+
+  /// Returns the `requires_grad` property of the `TensorOptions`, or
+  /// `std::nullopt` if `requires_grad` is not specified.
+  std::optional<bool> requires_grad_opt() const noexcept {
+    return has_requires_grad_ ? std::make_optional(requires_grad_)
+                              : std::nullopt;
+  }
+
+  /// Returns the `pinned_memory` property of the `TensorOptions`.
+  bool pinned_memory() const noexcept {
+    return pinned_memory_or_default(pinned_memory_opt());
+  }
+
+  /// Returns whether the `pinned_memory` is specified.
+  bool has_pinned_memory() const noexcept {
+    return has_pinned_memory_;
+  }
+
+  /// Returns if the layout is sparse.  The check goes through layout() so
+  /// that an unspecified layout is treated as the default layout instead of
+  /// whatever value layout_ last held (set_layout(nullopt) only clears
+  /// has_layout_ and leaves layout_ untouched).
+  bool is_sparse() const {
+    return layout() == c10::Layout::Sparse;
+  }
+
+  /// Returns if the layout is sparse CSR, deprecated, use
+  /// is_sparse_compressed() instead.  The check goes through layout() so
+  /// that an unspecified layout is treated as the default layout instead of
+  /// whatever value layout_ last held.
+  bool is_sparse_csr() const {
+    return layout() == c10::Layout::SparseCsr;
+  }
+
+  /// Returns if the layout is one of the compressed sparse layouts
+  /// (SparseCsr, SparseCsc, SparseBsr or SparseBsc).  The check goes through
+  /// layout() so that an unspecified layout is treated as the default layout
+  /// instead of whatever value layout_ last held.
+  bool is_sparse_compressed() const {
+    const auto effective_layout = layout();
+    return effective_layout == c10::Layout::SparseCsr ||
+        effective_layout == c10::Layout::SparseCsc ||
+        effective_layout == c10::Layout::SparseBsr ||
+        effective_layout == c10::Layout::SparseBsc;
+  }
+
+  // For compatibility with legacy tensor.type() comparisons.
+  // Returns true when both options compute the same DispatchKey and resolve
+  // to the same ScalarType.  Both sides use the dtype() getter, so an
+  // unspecified dtype is resolved identically for *this and for `other`
+  // (previously *this used the raw dtype_ field while `other` used the
+  // defaulted getter, which made the comparison asymmetric).
+  bool type_equal(const TensorOptions& other) const {
+    return computeDispatchKey() == other.computeDispatchKey() &&
+        typeMetaToScalarType(dtype()) == typeMetaToScalarType(other.dtype());
+  }
+
+  /// Returns the `pinned_memory` property of the `TensorOptions`, or
+  /// `std::nullopt` if `pinned_memory` is not specified.
+  std::optional<bool> pinned_memory_opt() const noexcept {
+    return has_pinned_memory_ ? std::make_optional(pinned_memory_)
+                              : std::nullopt;
+  }
+
+  /// Returns whether the `memory_format` is specified
+  bool has_memory_format() const noexcept {
+    return has_memory_format_;
+  }
+
+  // NB: memory_format() getter is PURPOSELY not defined, as the default
+  // behavior of memory_format varies from function to function.
+
+  /// Returns the `memory_format` property of `TensorOptions`, or
+  /// `std::nullopt` if `memory_format` is not specified.
+  std::optional<MemoryFormat> memory_format_opt() const noexcept {
+    return has_memory_format_ ? std::make_optional(memory_format_)
+                              : std::nullopt;
+  }
+
+  // Resolves the ATen backend specified by the current construction axes.
+  // TODO: Deprecate this
+  Backend backend() const {
+    return at::dispatchKeyToBackend(computeDispatchKey());
+  }
+
+  /// Return the right-biased merge of two TensorOptions.  This has the
+  /// effect of overwriting settings from self with those settings that are
+  /// specified in `options`.
+  ///
+  /// NB: This merging operation does NOT respect device merges.
+  /// For example, if you device({kCUDA, 1}).merge_in(kCUDA)
+  /// you will get kCUDA in the end!  Functions like Tensor.new_empty
+  /// ensure the right device is selected anyway by way of a
+  /// device guard.
+  ///
+  TensorOptions merge_in(TensorOptions options) const noexcept {
+    // Start from a copy of *this and overwrite only the fields that
+    // `options` explicitly specifies.
+    TensorOptions result = *this;
+    if (options.has_device()) {
+      result.set_device(options.device_opt());
+    }
+    if (options.has_dtype()) {
+      result.set_dtype(options.dtype_opt());
+    }
+    if (options.has_layout()) {
+      result.set_layout(options.layout_opt());
+    }
+    // NB: requires grad is right biased; not a logical AND/OR!
+    if (options.has_requires_grad()) {
+      result.set_requires_grad(options.requires_grad_opt());
+    }
+    if (options.has_pinned_memory()) {
+      result.set_pinned_memory(options.pinned_memory_opt());
+    }
+    if (options.has_memory_format()) {
+      result.set_memory_format(options.memory_format_opt());
+    }
+    return result;
+  }
+
+  // TODO remove after TensorOptions rationalization
+  // Returns a copy of *this; when `optional_memory_format` holds a value the
+  // copy's memory format is set to it, otherwise the copy is unchanged.
+  TensorOptions merge_memory_format(
+      std::optional<MemoryFormat> optional_memory_format) const noexcept {
+    if (!optional_memory_format.has_value()) {
+      return *this;
+    }
+    TensorOptions result = *this;
+    result.set_memory_format(optional_memory_format);
+    return result;
+  }
+
+  // INVARIANT: computeDispatchKey returns only the subset of dispatch keys for
+  // which dispatchKeyToBackend is injective, if it is defined at all  (for
+  // the most part, this just means that this function never returns an
+  // Autograd key)
+  DispatchKey computeDispatchKey() const {
+    return c10::computeDispatchKey(
+        optTypeMetaToScalarType(dtype_opt()), layout_opt(), device_opt());
+  }
+
+ private:
+  // These methods are currently private because I'm not sure if it's wise
+  // to actually publish them.  They are methods because I need them in
+  // the constructor and the functional API implementation.
+  //
+  // If you really, really need it, you can make these public, but check if you
+  // couldn't just do what you need with the functional API.  Similarly, these
+  // methods are not chainable, because if you wanted chaining, you probably
+  // want to use the functional API instead.  (It's probably OK to make
+  // these chainable, because these functions are all explicitly annotated
+  // with a ref-qualifier, the trailing &, that makes them illegal to call
+  // on temporaries.)
+
+  /// Mutably set the device of `TensorOptions`.  Passing `std::nullopt`
+  /// marks the device as unspecified; the stored value is left untouched.
+  void set_device(std::optional<Device> device) & noexcept {
+    has_device_ = device.has_value();
+    if (has_device_) {
+      device_ = *device;
+    }
+  }
+
+  /// Mutably set the dtype of `TensorOptions`.  Passing `std::nullopt`
+  /// marks the dtype as unspecified; the stored value is left untouched.
+  void set_dtype(std::optional<caffe2::TypeMeta> dtype) & noexcept {
+    has_dtype_ = dtype.has_value();
+    if (has_dtype_) {
+      dtype_ = *dtype;
+    }
+  }
+
+  // Legacy overload to support ScalarType: the value is converted with
+  // scalarTypeToTypeMeta() before being stored.  Passing `std::nullopt`
+  // marks the dtype as unspecified; the stored value is left untouched.
+  void set_dtype(std::optional<ScalarType> dtype) & noexcept {
+    has_dtype_ = dtype.has_value();
+    if (has_dtype_) {
+      dtype_ = scalarTypeToTypeMeta(*dtype);
+    }
+  }
+
+  /// Mutably set the layout of `TensorOptions`.  Passing `std::nullopt`
+  /// marks the layout as unspecified; the stored value is left untouched.
+  void set_layout(std::optional<Layout> layout) & noexcept {
+    has_layout_ = layout.has_value();
+    if (has_layout_) {
+      layout_ = *layout;
+    }
+  }
+
+  /// Mutably set the `requires_grad` property of `TensorOptions`.  Passing
+  /// `std::nullopt` marks it as unspecified; the stored value is left
+  /// untouched.
+  void set_requires_grad(std::optional<bool> requires_grad) & noexcept {
+    has_requires_grad_ = requires_grad.has_value();
+    if (has_requires_grad_) {
+      requires_grad_ = *requires_grad;
+    }
+  }
+
+  /// Mutably set the `pinned_memory` property of `TensorOptions`.  Passing
+  /// `std::nullopt` marks it as unspecified; the stored value is left
+  /// untouched.
+  void set_pinned_memory(std::optional<bool> pinned_memory) & noexcept {
+    has_pinned_memory_ = pinned_memory.has_value();
+    if (has_pinned_memory_) {
+      pinned_memory_ = *pinned_memory;
+    }
+  }
+
+  /// Mutably set the `memory_format` property of `TensorOptions`.  Passing
+  /// `std::nullopt` marks it as unspecified; the stored value is left
+  /// untouched.
+  void set_memory_format(std::optional<MemoryFormat> memory_format) & noexcept {
+    has_memory_format_ = memory_format.has_value();
+    if (has_memory_format_) {
+      memory_format_ = *memory_format;
+    }
+  }
+
+  // WARNING: If you edit TensorOptions to add more options, you
+  // may need to adjust the implementation of Tensor::options.
+  // The criteria for whether or not Tensor::options must be adjusted
+  // is whether or not the new option you added should be preserved
+  // by functions such as empty_like(); if it should be preserved,
+  // you must adjust options().
+  //
+  // TODO: MemoryFormat is not implemented in this way
+
+  // NB: We didn't use std::optional here, because then we can't pack
+  // the has_***_ boolean fields.
+
+  Device device_ = at::kCPU; // 16-bit
+  caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 16-bit
+  Layout layout_ = at::kStrided; // 8-bit
+  MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit
+
+  // Bitmask required here to get this to fit inside 32 bits (or even 64 bits,
+  // for that matter)
+
+  bool requires_grad_ : 1;
+  bool pinned_memory_ : 1;
+
+  bool has_device_ : 1;
+  bool has_dtype_ : 1;
+  bool has_layout_ : 1;
+  bool has_requires_grad_ : 1;
+  bool has_pinned_memory_ : 1;
+  bool has_memory_format_ : 1;
+};
+
+// We should aspire to fit in one machine-size word; but a size greater than two
+// words is too much.  (We are doing terribly on 32-bit archs, where we require
+// three machine size words to store tensor options.  Eek!)
+static_assert(
+    sizeof(TensorOptions) <= sizeof(int64_t) * 2,
+    "TensorOptions must fit in 128-bits");
+
+/// Convenience function that returns a `TensorOptions` object with the `dtype`
+/// set to the given one.
+inline TensorOptions dtype(caffe2::TypeMeta dtype) {
+  return TensorOptions().dtype(dtype);
+}
+
+// legacy function to support ScalarType
+inline TensorOptions dtype(ScalarType dtype) {
+  return TensorOptions().dtype(scalarTypeToTypeMeta(dtype));
+}
+
+/// Convenience function that returns a `TensorOptions` object with the `layout`
+/// set to the given one.
+inline TensorOptions layout(Layout layout) {
+  return TensorOptions().layout(layout);
+}
+
+/// Convenience function that returns a `TensorOptions` object with the `device`
+/// set to the given one.
+inline TensorOptions device(Device device) {
+  return TensorOptions().device(device);
+}
+
+/// Convenience function that returns a `TensorOptions` object with the
+/// `device` set to CUDA and the `device_index` set to the given one.
+inline TensorOptions device_index(c10::DeviceIndex device_index) {
+  return TensorOptions().device_index(device_index);
+}
+
+/// Convenience function that returns a `TensorOptions` object with the
+/// `requires_grad` set to the given one.
+inline TensorOptions requires_grad(bool requires_grad = true) {
+  return TensorOptions().requires_grad(requires_grad);
+}
+
+/// Convenience function that returns a `TensorOptions` object with the
+/// `memory_format` set to the given one.
+inline TensorOptions memory_format(MemoryFormat memory_format) {
+  return TensorOptions().memory_format(memory_format);
+}
+
+C10_API std::ostream& operator<<(
+    std::ostream& stream,
+    const TensorOptions& options);
+
+template <typename T>
+inline TensorOptions dtype() {
+  return dtype(caffe2::TypeMeta::Make<T>());
+}
+
+// Renders `options` into a std::string by streaming it through the
+// operator<< overload for TensorOptions declared above.
+inline std::string toString(const TensorOptions& options) {
+  std::ostringstream stream;
+  stream << options;
+  return stream.str();
+}
+
+// Central mapping from an (optional) dtype/layout/device triple to the
+// DispatchKey of a tensor; unset arguments go through the *_or_default helpers.
+inline DispatchKey computeDispatchKey(
+    std::optional<ScalarType> dtype,
+    std::optional<Layout> layout,
+    std::optional<Device> device) {
+  const auto layout_ = layout_or_default(layout);
+  const auto device_ = device_or_default(device);
+  switch (layout_) {
+    case Layout::Jagged: // shares the dense mapping with Strided
+    case Layout::Strided: {
+      const auto dtype_ = dtype_or_default(dtype); // only this path uses dtype
+      switch (device_.type()) {
+#define DO_CASE(device, _)                   \
+  case c10::DeviceType::device: {            \
+    if (isQIntType(dtype_)) {                \
+      return DispatchKey::Quantized##device; \
+    }                                        \
+    return DispatchKey::device;              \
+  }
+        C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused)
+#undef DO_CASE
+        case c10::DeviceType::FPGA:
+          return DispatchKey::FPGA;
+        case c10::DeviceType::MAIA:
+          return DispatchKey::MAIA;
+        case c10::DeviceType::Vulkan:
+          return DispatchKey::Vulkan;
+        case c10::DeviceType::Metal:
+          return DispatchKey::Metal;
+        case c10::DeviceType::MKLDNN:
+        case c10::DeviceType::OPENGL:
+        case c10::DeviceType::OPENCL:
+        case c10::DeviceType::IDEEP:
+          TORCH_INTERNAL_ASSERT(
+              0,
+              "This is a grandfathered Caffe2 device type ",
+              device_.type(),
+              ", it shouldn't ever convert to a DispatchKey.  File a bug describing what you were doing if you think this is in error.");
+        default:
+          TORCH_CHECK_NOT_IMPLEMENTED(
+              false,
+              "Unsupported device type for dense layout: ",
+              device_.type());
+      }
+    }
+    case Layout::Sparse:
+      switch (device_.type()) {
+#define DO_CASE(device, _)              \
+  case c10::DeviceType::device: {       \
+    return DispatchKey::Sparse##device; \
+  }
+        C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused)
+#undef DO_CASE
+        default:
+          TORCH_CHECK_NOT_IMPLEMENTED(
+              false,
+              "Unsupported device type for sparse layout: ",
+              device_.type());
+      }
+    case Layout::Mkldnn:
+      switch (device_.type()) {
+        case c10::DeviceType::CPU:
+          return DispatchKey::MkldnnCPU;
+        default:
+          TORCH_CHECK_NOT_IMPLEMENTED(
+              false,
+              "Unsupported device type for mkldnn layout: ",
+              device_.type());
+      }
+    case Layout::SparseCsr:
+    case Layout::SparseCsc:
+    case Layout::SparseBsr:
+    case Layout::SparseBsc:
+      switch (device_.type()) {
+#define DO_CASE(device, _)                 \
+  case c10::DeviceType::device: {          \
+    return DispatchKey::SparseCsr##device; \
+  }
+        C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused)
+#undef DO_CASE
+        default:
+          TORCH_CHECK_NOT_IMPLEMENTED(
+              false,
+              "Unsupported device type for ",
+              layout_,
+              " layout: ",
+              device_.type());
+      }
+    default:
+      TORCH_CHECK(false, "Unsupported layout: ", layout_);
+  }
+}
+
+inline Layout dispatchKeyToLayout(DispatchKey dispatch_key) {
+  switch (dispatch_key) {
+#define DO_CASE(bc, _) case DispatchKey::Sparse##bc:
+    C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused)
+#undef DO_CASE
+    return Layout::Sparse; // shared by every generated Sparse<backend> label
+#define DO_CASE(bc, _) case DispatchKey::SparseCsr##bc:
+    C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused)
+#undef DO_CASE
+    TORCH_CHECK( // computeDispatchKey uses SparseCsr* for CSR/CSC/BSR/BSC
+        false, "Cannot map DispatchKey ", dispatch_key, " to a unique layout.");
+    case DispatchKey::MkldnnCPU:
+      return Layout::Mkldnn;
+    default: // every other key is reported as strided
+      return Layout::Strided;
+  }
+}
+
+inline c10::DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
+  switch (dispatch_key) {
+    // Generated cases: each <functionality prefix><backend device> key -> device
+#define DO_CASE(suffix, prefix)     \
+  case DispatchKey::prefix##suffix: \
+    return c10::DeviceType::suffix;
+#define DO_CASES(_, prefix) C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, prefix)
+    C10_FORALL_FUNCTIONALITY_KEYS(DO_CASES)
+#undef DO_CASES
+#undef DO_CASE
+
+    case DispatchKey::MkldnnCPU:
+      return c10::DeviceType::CPU;
+    case DispatchKey::Vulkan:
+      return c10::DeviceType::Vulkan;
+
+    case DispatchKey::MAIA:
+      return c10::DeviceType::MAIA;
+    default: // NOTE(review): FPGA/Metal keys seem to land here - confirm intent
+      TORCH_CHECK(
+          false,
+          "DispatchKey ",
+          dispatch_key,
+          " doesn't correspond to a device");
+  }
+}
+
+inline TensorOptions dispatchKeyToTensorOptions(DispatchKey dispatch_key) {
+  const Layout layout_ = dispatchKeyToLayout(dispatch_key); // resolved first
+  const c10::DeviceType device_type_ = dispatchKeyToDeviceType(dispatch_key);
+  return TensorOptions().layout(layout_).device(device_type_);
+}
+
+namespace detail {
+inline bool backend_supports_empty_operator(const TensorOptions& options) {
+  // Quantized backends don't support at::empty(): they have separate
+  // operators like at::empty_quantized() that take in extra information
+  // about how to quantize the tensor.  So any QInt dtype yields false here.
+  return !isQIntType(typeMetaToScalarType(options.dtype()));
+}
+
+} // namespace detail
+
+} // namespace c10
+
+C10_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a8381e887f90556b66f8b654bb5376e16afe074
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/UndefinedTensorImpl.h
@@ -0,0 +1,54 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/util/ArrayRef.h>
+#include <cstdint>
+
+namespace c10 {
+
+struct C10_API UndefinedTensorImpl final : public TensorImpl {
+ public:
+  // On Windows the accessor is non-constexpr and goes through getInstance().
+  // With the constexpr form, the build fails with:
+  //  error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in
+  //  device code
+  // (ostensibly constexpr tricks MSVC into compiling it for device as well).
+#ifdef _WIN32
+  static inline TensorImpl* singleton() {
+    return &getInstance();
+  }
+#else
+  static constexpr inline TensorImpl* singleton() {
+    return &_singleton;
+  }
+#endif
+
+#ifdef DEBUG
+  bool has_storage() const override;
+#endif
+  void set_storage_offset(int64_t offset) override;
+
+ protected:
+  c10::SymBool sym_is_contiguous_custom(MemoryFormat format) const override;
+  IntArrayRef strides_custom() const override;
+  SymIntArrayRef sym_strides_custom() const override;
+
+ private:
+  UndefinedTensorImpl(); // private: reached only through singleton()
+#ifdef _WIN32
+  static UndefinedTensorImpl& getInstance();
+#else
+  static UndefinedTensorImpl _singleton;
+#endif
+  const char* tensorimpl_type_name() const override;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..02570ae84ffdb64c1b2c8b20deb52178c606f57d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/WrapDimMinimal.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymInt.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <cstdint>
+#include <utility>
+
+namespace c10 {
+
+namespace detail {
+// This template can only be specialized at int64_t and c10::SymInt;
+// you'll get linker errors otherwise
+template <typename T>
+C10_API T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar);
+} // namespace detail
+
+template <typename T>
+T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
+  // Inlined fast path: dim already lies in [-dim_post_expr, dim_post_expr)
+  if (C10_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) {
+    // For SymInts, we want an explicit control flow to trigger a guard, so we
+    // may as well branch too.
+    if (dim < 0) {
+      return dim + dim_post_expr; // negative dim is shifted by dim_post_expr
+    }
+    return dim;
+  }
+  // Edge cases (wrapping scalars, out-of-bounds errors) are checked out-of-line
+  return c10::detail::maybe_wrap_dim_slow<T>(
+      std::move(dim), std::move(dim_post_expr), wrap_scalar);
+}
+
+inline int64_t maybe_wrap_dim( // int64_t entry point for _maybe_wrap_dim
+    int64_t dim,
+    int64_t dim_post_expr,
+    bool wrap_scalar = true) {
+  return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
+}
+
+inline c10::SymInt maybe_wrap_dim( // SymInt entry point; arguments are moved
+    c10::SymInt dim,
+    c10::SymInt dim_post_expr,
+    bool wrap_scalar = true) {
+  return _maybe_wrap_dim(std::move(dim), std::move(dim_post_expr), wrap_scalar);
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ef01f7bfa99c473ebb6612a83f0cdde53eeec6b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/alignment.h
@@ -0,0 +1,35 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstddef>
+#include <new>
+
+namespace c10 {
+
+#ifdef C10_MOBILE
+// Use 16-byte alignment on mobile
+// - ARM NEON AArch32 and AArch64
+// - x86[-64] < AVX
+constexpr size_t gAlignment = 16;
+#else
+// 64-byte alignment should be enough for computation up to AVX512.
+constexpr size_t gAlignment = 64;
+#endif
+
+constexpr size_t gPagesize = 4096;
+// since the default thp pagesize is 2MB, enable thp only
+// for buffers of size 2MB or larger to avoid memory bloating
+constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024;
+
+// Cache line size used to avoid false sharing between threads. Falls back to 64
+// bytes if C++17 feature is unavailable.
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
+#endif
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ef394e6e3536530af4a6427f16f0a383c39c5be
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COW.h
@@ -0,0 +1,37 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/intrusive_ptr.h>
+
+namespace c10 {
+struct StorageImpl;
+class DataPtr;
+} // namespace c10
+
+namespace c10::impl::cow {
+
+// Creates a Copy-on-write (COW) clone of the given storage. This will also
+// convert the given storage into a COW storage if it is not COW already.
+//
+// Converting the storage into a COW storage will not be successful if the
+// storage's DataPtr has some context (`DataPtr::get_context()`) which is not
+// equal to the data pointer (`DataPtr::get()`). In this case, a nullptr is
+// returned.
+C10_API c10::intrusive_ptr<StorageImpl> lazy_clone_storage(
+    StorageImpl& storage);
+
+// Check if a storage has a simple DataPtr with no abnormal context
+C10_API bool has_simple_data_ptr(const c10::StorageImpl& storage);
+
+// Check if a DataPtr is COW
+C10_API bool is_cow_data_ptr(const c10::DataPtr& data_ptr);
+
+// Eagerly copies a COW storage's data, turning it into a non-COW storage.
+C10_API void materialize_cow_storage(StorageImpl& storage);
+
+} // namespace c10::impl::cow
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h
new file mode 100644
index 0000000000000000000000000000000000000000..90a618003c995ce6fe949b8f0ea5110a8a47b74a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/COWDeleter.h
@@ -0,0 +1,71 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/util/UniqueVoidPtr.h>
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <shared_mutex>
+#include <variant>
+
+namespace c10::impl::cow {
+
+// A COWDeleterContext object is used as the `ctx` argument for DataPtr
+// to implement a Copy-on-write (COW) DataPtr.
+class C10_API COWDeleterContext {
+ public:
+  // Creates an instance, holding the pair of data and original
+  // deleter.
+  //
+  // Note that the deleter will only be called in our destructor if
+  // the last reference to this goes away without getting
+  // materialized.
+  explicit COWDeleterContext(std::unique_ptr<void, DeleterFnPtr> data);
+
+  // Increments the current refcount.
+  void increment_refcount();
+
+  // See README.md in this directory to understand the locking
+  // strategy.
+
+  // Represents a reference to the context.
+  //
+  // This is returned by decrement_refcount to allow the caller to
+  // copy the data under the shared lock.
+  using NotLastReference = std::shared_lock<std::shared_mutex>;
+
+  // Represents the last reference to the context.
+  //
+  // This will be returned by decrement_refcount when it is the last
+  // reference remaining and after any pending copies have completed.
+  using LastReference = std::unique_ptr<void, DeleterFnPtr>;
+
+  // Decrements the refcount, returning a handle indicating what to
+  // do with it.
+  std::variant<NotLastReference, LastReference> decrement_refcount();
+
+ private:
+  // The destructor is private; this should only ever be used within
+  // UniqueVoidPtr with cow::cow_deleter (declared below) as the deleter.
+  ~COWDeleterContext();
+
+  std::shared_mutex mutex_;
+  std::unique_ptr<void, DeleterFnPtr> data_;
+  std::atomic<std::int64_t> refcount_ = 1; // starts at one reference
+};
+
+// `cow_deleter` is used as the `ctx_deleter` for DataPtr to implement a COW
+// DataPtr.
+//
+// Warning: This should only be called on a pointer to a COWDeleterContext that
+// was allocated on the heap with `new`, because when the refcount reaches 0,
+// the context is deleted with `delete`.
+C10_API void cow_deleter(void* ctx);
+
+} // namespace c10::impl::cow
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8b12a993a2a82c4b09b74e5c26ca48bcff3f4bf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h
@@ -0,0 +1,417 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceCapability.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+#include <c10/util/Exception.h>
+
+// Just for C10_ANONYMOUS_VARIABLE
+#include <c10/core/impl/TorchDispatchModeTLS.h>
+#include <c10/util/Registry.h>
+
+#include <array>
+#include <atomic>
+
+namespace c10 {
+
+// Forward declaration
+class DataPtr;
+
+/**
+ * Note [Flags defining the behavior of events]
+ *
+ * PYTORCH_DEFAULT and BACKEND_DEFAULT are valid for all backends. The
+ * BACKEND_DEFAULT is what a particular backend would select if no
+ * flags were given. PYTORCH_DEFAULT is the PyTorch framework's default
+ * choice for events on that backend, which may not be the same.
+ *
+ * The mapping of PYTORCH_DEFAULT and BACKEND_DEFAULT is done by each
+ * backend implementation.
+ */
+enum class EventFlag {
+  // Disable timing
+  PYTORCH_DEFAULT,
+  // Enable timing
+  BACKEND_DEFAULT,
+  // FOR TESTING ONLY
+  INVALID
+};
+
+namespace impl {
+
+/**
+ * DeviceGuardImplInterface represents the virtual interface which provides
+ * functionality to provide an RAII class for device and stream switching,
+ * via DeviceGuard.  Every distinct device type, e.g., CUDA and HIP, is
+ * expected to implement and register an implementation of this interface.
+ * All classes which inherit from DeviceGuardImplInterface should be declared
+ * 'final'.
+ *
+ * This class exists because we provide a unified interface for performing
+ * device guards via DeviceGuard, but we cannot assume that we have actually
+ * compiled against the, e.g., CUDA library, which actually implements
+ * this guard functionality.  In this case, a dynamic dispatch is required
+ * to cross the library boundary.
+ *
+ * If possible, you should directly use implementations of this interface;
+ * those uses will be devirtualized.
+ */
+struct C10_API DeviceGuardImplInterface {
+  DeviceGuardImplInterface() = default;
+  DeviceGuardImplInterface(const DeviceGuardImplInterface&) = default;
+  DeviceGuardImplInterface& operator=(const DeviceGuardImplInterface&) =
+      default;
+  DeviceGuardImplInterface(DeviceGuardImplInterface&&) noexcept = default;
+  DeviceGuardImplInterface& operator=(DeviceGuardImplInterface&&) noexcept =
+      default;
+
+  /**
+   * Return the type of device managed by this guard implementation.
+   */
+  virtual DeviceType type() const = 0;
+
+  /**
+   * Set the current device to Device, and return the previous Device.
+   */
+  virtual Device exchangeDevice(Device) const = 0;
+  // NB: Implementations of exchangeDevice can be a bit boilerplatey.  You might
+  // consider replacing exchangeDevice with a non-virtual function with a baked
+  // in implementation; however, note that this will triple the number of
+  // virtual calls (when you implement exchangeDevice in a final subclass,
+  // the compiler gets to devirtualize everything; it won't do that if you don't
+  // define it in the subclass!)  A common way to solve this problem is to use
+  // some sort of CRTP; however, we can't template DeviceGuardImplInterface since
+  // we really *do* need it to be virtual.  A little boilerplate seems easiest
+  // to explain.  (Another way around this problem is to provide inline
+  // functions that provide the default implementations, but this seems a little
+  // hard to explain.  In any case, we're only going to have on order of ten
+  // implementations of this anyway.)
+
+  /**
+   * Get the current device.
+   */
+  virtual Device getDevice() const = 0;
+
+  /**
+   * Set the current device to Device.
+   */
+  virtual void setDevice(Device) const = 0;
+
+  /**
+   * Set the current device to Device, without checking for errors
+   * (so, e.g., this can be called from a destructor).
+   */
+  virtual void uncheckedSetDevice(Device) const noexcept = 0;
+
+  /**
+   * Get the current stream for a given device.
+   */
+  virtual Stream getStream(Device) const = 0;
+
+  /**
+   * Get the default stream for a given device.
+   */
+  virtual Stream getDefaultStream(Device /*unused*/) const {
+    TORCH_CHECK(false, "Backend doesn't support acquiring a default stream.")
+  }
+
+  /**
+   * Get a stream from the global pool for a given device.
+   */
+  virtual Stream getStreamFromGlobalPool(
+      Device /*unused*/,
+      bool isHighPriority = false) const {
+    (void)isHighPriority; // Suppress unused variable warning
+    TORCH_CHECK(false, "Backend doesn't support acquiring a stream from pool.")
+  }
+
+  /**
+   * Return a new stream for a given device and priority. The stream will be
+   * copied and shared around, device backend should be able to correctly handle
+   * the lifetime of the stream.
+   */
+  virtual Stream getNewStream(Device /*unused*/, int priority = 0) const {
+    (void)priority;
+    TORCH_CHECK(false, "Backend doesn't support create a new Stream.")
+  }
+
+  /**
+   * Set a stream to be the thread local current stream for its device.
+   * Return the previous stream for that device. You are NOT required
+   * to set the current device to match the device of this stream.
+   */
+  virtual Stream exchangeStream(Stream) const = 0;
+
+  /**
+   * Destroys the given event.
+   */
+  virtual void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/)
+      const noexcept {}
+
+  /**
+   * Increments the event's version and enqueues a job with this version
+   * in the stream's work queue. When the stream processes that job
+   * it notifies all streams waiting on / blocked by that version of the
+   * event to continue and marks that version as recorded.
+   * */
+  virtual void record(
+      void** /*event*/,
+      const Stream& /*stream*/,
+      const DeviceIndex /*device_index*/,
+      const c10::EventFlag /*flag*/) const {
+    TORCH_CHECK(false, "Backend doesn't support events.");
+  }
+
+  /**
+   * Does nothing if the event has not been scheduled to be recorded.
+   * If the event was previously enqueued to be recorded, a command
+   * to wait for the version of the event that exists at the time of this call
+   * is inserted in the stream's work queue.
+   * When the stream reaches this command it will stop processing
+   * additional commands until that version of the event is marked as recorded.
+   */
+  virtual void block(void* /*event*/, const Stream& /*stream*/) const {
+    TORCH_CHECK(false, "Backend doesn't support events.");
+  }
+
+  /**
+   * Returns true if (and only if)
+   *  (1) the event has never been scheduled to be recorded
+   *  (2) the current version is marked as recorded.
+   * Returns false otherwise.
+   */
+  virtual bool queryEvent(void* /*event*/) const {
+    TORCH_CHECK(false, "Backend doesn't support events.");
+  }
+
+  /**
+   * Get the number of devices.  WARNING: This is REQUIRED to not raise
+   * an exception.  If there is some sort of problem, e.g., driver error,
+   * you should report that there are zero available devices.
+   */
+  virtual DeviceIndex deviceCount() const noexcept = 0;
+
+  /**
+   * Get the following capabilities of the current device:
+   * (1) Data type support
+   * Returns DeviceCapability object.
+   */
+  virtual DeviceCapability getDeviceCapability(Device /*unused*/) const {
+    TORCH_CHECK(false, "Backend doesn't support getting device capabilities.");
+  }
+
+  /**
+   * Return true if all the work previously enqueued on the stream for
+   * asynchronous execution has completed running on the device.
+   */
+  virtual bool queryStream(const Stream& /*stream*/) const {
+    TORCH_CHECK(false, "Backend doesn't support querying streams.");
+  }
+
+  /**
+   * Wait (by blocking the calling thread) until all the work previously
+   * enqueued on the stream has completed running on the device.
+   */
+  virtual void synchronizeStream(const Stream& /*stream*/) const {
+    TORCH_CHECK(false, "Backend doesn't support synchronizing streams.");
+  }
+
+  /**
+   * Wait (by blocking the calling thread) until all the work previously
+   * recorded on the event has completed running on the device.
+   */
+  virtual void synchronizeEvent(void* /*event*/) const {
+    TORCH_CHECK(false, "Backend doesn't support synchronizing events.");
+  }
+
+  /**
+   * Wait (by blocking the calling thread) until all the work previously
+   * enqueued on the device has been completed.
+   */
+  virtual void synchronizeDevice(const DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(
+        false, "Backend doesn't support synchronizing all streams on device.");
+  }
+
+  /**
+   * Ensure the caching allocator (if any) is aware that the given DataPtr is
+   * being used on the given stream, and that it should thus avoid recycling the
+   * DataPtr until all work on that stream is done.
+   */
+  virtual void recordDataPtrOnStream(
+      const c10::DataPtr& /*unused*/,
+      const Stream& /*unused*/) const {}
+
+  /**
+   * Fetch the elapsed time between two recorded events.
+   */
+  virtual double elapsedTime(
+      void* /*event1*/,
+      void* /*event2*/,
+      const DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Backend doesn't support elapsedTime.");
+  }
+
+  /**
+   * Intended use of this class is to leak the DeviceGuardImpl at program end.
+   * So you better not call the destructor, buster!
+   */
+  virtual ~DeviceGuardImplInterface() = default;
+};
+
+// A no-op device guard impl that doesn't do anything interesting.  Useful
+// for devices that don't actually have a concept of device index.  Prominent
+// examples are CPU and Meta.
+template <DeviceType D>
+struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
+  NoOpDeviceGuardImpl() = default;
+  DeviceType type() const override {
+    return D;
+  }
+  Device exchangeDevice(Device /*unused*/) const override {
+    return Device(D, -1); // no-op
+  }
+  Device getDevice() const override {
+    return Device(D, -1);
+  }
+  void setDevice(Device /*unused*/) const override {
+    // no-op
+  }
+  void uncheckedSetDevice(Device /*unused*/) const noexcept override {
+    // no-op
+  }
+  Stream getStream(Device /*unused*/) const noexcept override {
+    // no-op
+    return Stream(Stream::DEFAULT, Device(D, -1));
+  }
+
+  Stream getNewStream(Device /*unused*/, int priority = 0) const override {
+    // no-op
+    (void)priority;
+    return Stream(Stream::DEFAULT, Device(D, -1));
+  }
+
+  // NB: These do NOT set the current device
+  Stream exchangeStream(Stream /*unused*/) const noexcept override {
+    // no-op
+    return Stream(Stream::DEFAULT, Device(D, -1));
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    return 1;
+  }
+
+  DeviceCapability getDeviceCapability(Device /*unused*/) const override {
+    DeviceCapability cap; // non-Meta devices return it default-constructed
+    if constexpr (D == DeviceType::Meta) {
+      // Meta only supports basic types for shape inference:
+      // Byte, Char, Short, Int, Long, Float, Double,
+      // Bool, ComplexFloat, ComplexDouble.
+      // The mask is assigned outright, so no prior reset to 0 is needed.
+      cap.capability_data.capability_bits = (1ULL << kIndex_Byte) |
+          (1ULL << kIndex_Char) | (1ULL << kIndex_Short) |
+          (1ULL << kIndex_Int) | (1ULL << kIndex_Long) |
+          (1ULL << kIndex_Float) | (1ULL << kIndex_Double) |
+          (1ULL << kIndex_ComplexFloat) | (1ULL << kIndex_ComplexDouble) |
+          (1ULL << kIndex_Bool);
+    }
+    return cap;
+  }
+
+  // Event-related functions
+  void record(
+      void** /*event*/,
+      const Stream& /*stream*/,
+      const DeviceIndex /*device_index*/,
+      const EventFlag /*flag*/) const override {
+    TORCH_CHECK(false, D, " backend doesn't support events.");
+  }
+  void block(void* /*event*/, const Stream& /*stream*/) const override {
+    TORCH_CHECK(false, D, " backend doesn't support events.")
+  }
+  bool queryEvent(void* /*event*/) const override {
+    TORCH_CHECK(false, D, " backend doesn't support events.")
+  }
+  void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/)
+      const noexcept override {}
+
+  // Stream-related functions
+  bool queryStream(const Stream& /*stream*/) const override {
+    return true;
+  }
+  void synchronizeStream(const Stream& /*stream*/) const override {
+    // Don't wait for anything.
+  }
+};
+
+// The registry is NON-owning.  Each stored pointer is std::atomic so
+// that under all interleavings of registry calls the structure is
+// race-free.  This doesn't cost us anything on reads in X86.  (An
+// unsynchronized implementation probably is OK too, but I didn't want
+// to prove that we never read from device_guard_impl_registry at the
+// same time some registration is occurring.  Shiver.)
+//
+// I'd like this registry to be valid even at program destruction time
+// (in case someone uses a DeviceGuard in a destructor to do some cleanup
+// in the CUDA API.)  Since there are no direct accesses of the underlying
+// owning objects which I can use to enforce initialization order (unlike
+// in a Meyer singleton), it implies that you must *leak* objects when
+// putting them in the registry.  (The destructor of DeviceGuardImplInterface
+// is not deleted; registered implementations are simply never destroyed.)
+extern C10_API std::array<
+    std::atomic<const DeviceGuardImplInterface*>,
+    static_cast<size_t>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)>
+    device_guard_impl_registry;
+
+// I can't conveniently use c10/util/Registry.h for the following reason:
+// c10/util/Registry.h gives me a slow way of Create'ing a object of some
+// interface from the registry, but no way of quickly accessing an already
+// created object.  I'll be banging on getDeviceGuardImpl every time we do a
+// DeviceGuard, so I really don't want to be doing an unordered_map lookup.
+// Better if the registration mechanism directly drops its implementation
+// into device_guard_impl_registry.
+
+class C10_API DeviceGuardImplRegistrar {
+ public:
+  DeviceGuardImplRegistrar(
+      DeviceType /*type*/,
+      const DeviceGuardImplInterface* /*impl*/);
+};
+
+#define C10_REGISTER_GUARD_IMPL(DevType, DeviceGuardImpl)              \
+  static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE( \
+      g_##DevType)(::c10::DeviceType::DevType, new DeviceGuardImpl());
+
+inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
+  // Two adjacent small integer fields, DeviceType and DeviceIndex, have had
+  // field access miscompiled on NVCC. To work around this issue, we mask the
+  // DeviceType to its low 8 bits after checking that DeviceType is 8-bit.
+  // FB employees can see
+  //   https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/
+  // for more details
+  static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
+  auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();
+
+  // This seems to be the first place where you make use of a device
+  // when you pass devices to factory functions.  Give a nicer error
+  // message in this case.
+  TORCH_CHECK(p, "PyTorch is not linked with support for ", type, " devices");
+  return p;
+}
+
+void C10_API
+registerDeviceGuard(DeviceType type, const DeviceGuardImplInterface* impl);
+
+inline bool hasDeviceGuardImpl(DeviceType type) { // masked like getDeviceGuardImpl
+  return device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();
+}
+
+void C10_API ensureCUDADeviceGuardSet();
+
+} // namespace impl
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..902a4d3febafc5d9ea5c5695c428d25be7c171c2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h
@@ -0,0 +1,107 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+
+#include <array>
+
+namespace c10::impl {
+
+// FakeGuardImpl is hardcoded to have eight devices.  Not for
+// any good reason, just to simplify code.
+constexpr DeviceIndex kFakeGuardImplMaxDevices = 8;
+
+/**
+ * A fake implementation of DeviceGuardImplInterface suitable for testing.
+ * The current device and the per-device current stream ids are thread_local
+ * static fields shared by all instances.  See DeviceGuard_test.cpp for a use.
+ */
+template <DeviceType T>
+struct FakeGuardImpl final : public DeviceGuardImplInterface {
+  static constexpr DeviceType static_type = T;
+  // Runtime device type is not used; type() always reports T
+  FakeGuardImpl(DeviceType /*unused*/) {}
+  FakeGuardImpl() = default;
+  DeviceType type() const override {
+    return T;
+  }
+  Device exchangeDevice(Device d) const override { // returns the previous device
+    AT_ASSERT(d.type() == type());
+    AT_ASSERT(d.index() >= 0 && d.index() < kFakeGuardImplMaxDevices);
+    Device old_device = getDevice();
+    if (old_device.index() != d.index()) {
+      current_device_ = d.index();
+    }
+    return old_device;
+  }
+  Device getDevice() const override {
+    return Device(type(), current_device_);
+  }
+  void setDevice(Device d) const override {
+    AT_ASSERT(d.type() == type());
+    AT_ASSERT(d.index() >= 0);
+    AT_ASSERT(d.index() < kFakeGuardImplMaxDevices);
+    current_device_ = d.index();
+  }
+  void uncheckedSetDevice(Device d) const noexcept override { // no validation
+    current_device_ = d.index();
+  }
+  Stream getStream(Device d) const noexcept override { // d.index() is not range-checked
+    return Stream(Stream::UNSAFE, d, current_streams_[d.index()]);
+  }
+  Stream exchangeStream(Stream s) const noexcept override { // returns the previous stream
+    auto old_id = current_streams_[s.device_index()];
+    current_streams_[s.device_index()] = s.id();
+    return Stream(Stream::UNSAFE, s.device(), old_id);
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    return kFakeGuardImplMaxDevices;
+  }
+
+  // Event-related functions: all no-ops; queryEvent always returns true
+  void record(
+      void** /*event*/,
+      const Stream& /*stream*/,
+      const DeviceIndex /*device_index*/,
+      const EventFlag /*flag*/) const override {}
+  void block(void* /*event*/, const Stream& /*stream*/) const override {}
+  bool queryEvent(void* /*event*/) const override {
+    return true;
+  }
+  void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/)
+      const noexcept override {}
+
+  // Convenience methods for testing; they read/write the shared static state
+  static DeviceIndex getDeviceIndex() {
+    return current_device_;
+  }
+  static void setDeviceIndex(DeviceIndex i) {
+    AT_ASSERT(i >= 0);
+    AT_ASSERT(i < kFakeGuardImplMaxDevices);
+    current_device_ = i;
+  }
+  static StreamId getCurrentStreamIdFor(DeviceIndex i) {
+    return current_streams_.at(i);
+  }
+  static void resetStreams() {
+    current_streams_.fill(0);
+  }
+
+ private:
+  thread_local static DeviceIndex current_device_;
+  thread_local static std::array<StreamId, kFakeGuardImplMaxDevices>
+      current_streams_;
+};
+
+template <DeviceType T>
+thread_local DeviceIndex FakeGuardImpl<T>::current_device_{0};
+
+template <DeviceType T>
+thread_local std::array<StreamId, kFakeGuardImplMaxDevices>
+    FakeGuardImpl<T>::current_streams_{}; // value-initialized: every stream id starts at 0
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..57761cff9bc254158816d43451ed5bc01f60411f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/GPUTrace.h
@@ -0,0 +1,33 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+
+namespace c10::impl {
+
+struct C10_API GPUTrace {
+  // On the x86 architecture the atomic operations are lock-less.
+  static std::atomic<const PyInterpreter*> gpuTraceState;
+
+  // Gate consulted by get_trace() before it touches gpuTraceState.  Access is
+  // not synchronized, on the basis that the flag is flipped only once, by the
+  // first interpreter that accesses it.  When PyTorch migrates to C++20, this
+  // should be changed to an atomic flag.
+  static bool haveState;
+
+  // Registers only the first interpreter that tries to invoke it; for every
+  // later caller it is a no-op.
+  static void set_trace(const PyInterpreter* /*trace*/);
+
+  static const PyInterpreter* get_trace() {
+    // nullptr until haveState is set; afterwards an acquire-load of the state.
+    return haveState ? gpuTraceState.load(std::memory_order_acquire)
+                     : nullptr;
+  }
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h
new file mode 100644
index 0000000000000000000000000000000000000000..032b90a20bd297b742711ada1d9d5ed1501a5e7e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h
@@ -0,0 +1,67 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <atomic>
+
+namespace c10::impl {
+
+// This TLS controls whether or not we permanently associate PyObject
+// with Tensor the first time it is allocated.  When hermetic PyObject
+// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor,
+// meaning you get a distinct PyObject whenever you execute the code in
+// question.
+struct C10_API HermeticPyObjectTLS {
+  static void set_state(bool state);
+  static bool get_state() {
+    // Hypothetical fastpath if torchdeploy/multipy // codespell:ignore multipy
+    // isn't used. Per
+    // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+    // this qualifies relaxed access because it is a single-location data
+    // structure (only the boolean here).
+    //
+    // Forgetting about data races for a moment, is there a logical race?
+    //
+    //  - Boolean only ever transitions from false to true.  So the
+    //    critical situation is when one interpreter is already running
+    //    when a second interpreter switches haveState_ from false to true.
+    //
+    //  - The first interpreter is indifferent whether or not it sees
+    //    haveState_ true/false; obviously false works (this is what the
+    //    interpreter was previously using; more directly, the interpreter
+    //    calls into itself as the handler, so being hermetic is not
+    //    required), and true simply means serviced python operator calls will
+    //    be hermetic; in these cases it is expected to be functionally
+    //    equivalent.
+    //
+    //  - The second interpreter MUST see haveState_ true (as its requests will
+    //    be forwarded to the first interpreter), but it is assumed that there
+    //    is a synchronization between the interpreter initialization, and
+    //    when we actually perform operations, so it is guaranteed to see
+    //    haveState_ true.
+    //
+    // QED.
+    //
+    // This fastpath is currently disabled (note the `false &&` below) so that
+    // we can more easily test that hermetic mode works on stock PyTorch builds.
+    if (false && !haveState_.load(std::memory_order_relaxed))
+      return false;
+    return get_tls_state();
+  }
+  // Call this from the multipy/torchdeploy // codespell:ignore multipy
+  // top level
+  static void init_state();
+
+ private:
+  // This is only flipped once from false to true during
+  // torchdeploy/multipy initialization, // codespell:ignore multipy
+  // and never again.
+  static std::atomic<bool> haveState_;
+  static bool get_tls_state();
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..34d6dff97654888cd12d52ce1f44441f30247e44
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h
@@ -0,0 +1,438 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// This file provides implementations of InlineDeviceGuard and
+// InlineOptionalDeviceGuard.
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/VirtualGuardImpl.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Optional.h>
+#include <type_traits>
+#include <utility>
+
+namespace c10::impl {
+
+/**
+ * A DeviceGuard is an RAII class that sets a device to some value
+ * on construction, and resets the device to its original value on
+ * destruction.
+ *
+ * InlineDeviceGuard is a helper class for implementing DeviceGuards.
+ * It is templated over a DeviceGuardImpl (anything that implements
+ * DeviceGuardImplInterface).  There are two primary ways to instantiate
+ * InlineDeviceGuard:
+ *
+ *  - With a concrete implementation of DeviceGuardImpl, e.g., CUDAGuardImpl.
+ *    This is the best way to use InlineDeviceGuard, as all calls are
+ *    devirtualized, giving you code as efficient as straight line
+ *    calls to cudaGetDevice/cudaSetDevice.
+ *
+ *  - With VirtualGuardImpl, which does a virtual dispatch to a DeviceGuardImpl
+ *    retrieved from a DeviceType registry.  We have explicitly instantiated
+ *    InlineDeviceGuard this way as c10::DeviceGuard.
+ *
+ * If you are in a hurry, you can use InlineDeviceGuard directly:
+ *
+ *    using CUDAGuard = impl::InlineDeviceGuard<CUDAGuardImpl>;
+ *
+ * However, you can provide a better user experience if you explicitly write a
+ * wrapper class that itself contains the template instantiation:
+ *
+ *    class CUDAGuard {
+ *    public:
+ *      // ... the API ...
+ *    private:
+ *      impl::InlineDeviceGuard<CUDAGuardImpl> guard_;
+ *    }
+ *
+ * The wrapper class provides a good place to write documentation, and helps
+ * avoid weird template instantiation errors when a user incorrectly uses the
+ * class.
+ *
+ * If you need to test this class, consider instantiating it with FakeGuardImpl.
+ */
+template <typename T>
+class InlineDeviceGuard {
+ public:
+  // Note [Omitted default constructor from RAII]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // In principle, we could add a default constructor to
+  // DeviceGuard which reads the current device and promises to
+  // restore to that device on exit.  However, most cases where you
+  // would have written this, you probably meant to actually just
+  // use OptionalDeviceGuard (since you don't actually need the
+  // restore to happen if you don't ever actually set the device).
+  // We remove the constructor here to encourage you to think about
+  // what you actually want to happen.
+  explicit InlineDeviceGuard() = delete;
+
+  /// Set the current device to the passed Device.  An index of -1 leaves the
+  /// current device unchanged; the guard then records it as both original and current.
+  explicit InlineDeviceGuard(Device device)
+      : impl_(device.type()),
+        original_device_(
+            device.index() == -1 ? impl_.getDevice()
+                                 : impl_.exchangeDevice(device)),
+        current_device_(device.index() == -1 ? original_device_ : device) {}
+
+  /// Set the current device index to the passed DeviceIndex.  (The
+  /// device type is inferred from the template parameter T).
+  template <
+      typename U = T,
+      typename =
+          typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>>>
+  explicit InlineDeviceGuard(DeviceIndex device_index)
+      : InlineDeviceGuard(Device(U::static_type, device_index)) {}
+
+  /// Construct an InlineDeviceGuard using VirtualGuardImpl with an explicit
+  /// DeviceGuardImplInterface pointer (nullptr falls back to the registry).
+  template <
+      typename U = T,
+      typename = typename std::enable_if_t<std::is_same_v<U, VirtualGuardImpl>>>
+  explicit InlineDeviceGuard(
+      Device device,
+      const DeviceGuardImplInterface* impl)
+      : impl_(
+            VirtualGuardImpl(impl ? impl : getDeviceGuardImpl(device.type()))),
+        original_device_(
+            device.index() == -1 ? impl_.getDevice()
+                                 : impl_.exchangeDevice(device)),
+        current_device_(device.index() == -1 ? original_device_ : device) {}
+
+  /// Copy is disallowed
+  InlineDeviceGuard(const InlineDeviceGuard<T>&) = delete;
+  InlineDeviceGuard<T>& operator=(const InlineDeviceGuard<T>&) = delete;
+
+  /// Move is disallowed, as DeviceGuard does not have an uninitialized state,
+  /// which is required for moves on types with nontrivial destructors.
+  InlineDeviceGuard(InlineDeviceGuard<T>&& other) = delete;
+  InlineDeviceGuard& operator=(InlineDeviceGuard<T>&& other) = delete;
+
+  ~InlineDeviceGuard() {
+    impl_.uncheckedSetDevice(original_device_);
+  }
+
+  /// Sets the device to the given one.  A device index of -1 is a no-op.
+  template <
+      typename U = T,
+      typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>, int> = 0>
+  void set_device(at::Device device) {
+    AT_ASSERT(
+        (U::static_type == DeviceType::HIP && device.is_cuda()) ||
+        device.type() == U::static_type);
+    auto index = device.index();
+    if (index == -1)
+      return;
+    impl_.setDevice(device);
+    current_device_ = device;
+  }
+
+  /// Resets the currently set device to its original device, and then sets the
+  /// current device to the passed device.  This is effectively equivalent to
+  /// set_device when a guard supports only a single device type.
+  template <typename U = T>
+  typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>> reset_device(
+      at::Device device) {
+    set_device(device);
+  }
+
+  /// Resets the currently set device to its original device, and then sets the
+  /// current device to the passed device (for a possibly different device
+  /// type).
+  ///
+  /// This method is named reset_device to highlight the fact that previous
+  /// device settings from this guard are NOT preserved, even if the device
+  /// has a different device type.  For example:
+  ///
+  ///   // CUDA device is 0
+  ///   DeviceGuard g(Device(kCUDA, 1));
+  ///   g.reset_device(Device(kHIP, 2));
+  ///   // CUDA device is 0 (!!)
+  ///
+  /// NOTE: this implementation may skip some device setting if it can prove
+  /// that it is unnecessary.
+  ///
+  /// Optional argument is for testing only.
+  template <typename U = T>
+  typename std::enable_if_t<std::is_same_v<U, VirtualGuardImpl>> reset_device(
+      at::Device device,
+      const impl::DeviceGuardImplInterface* impl = nullptr) {
+    auto index = device.index();
+    if (index == -1)
+      return;
+    if (device.type() == original_device_.type()) {
+      AT_ASSERT(impl == nullptr || impl->type() == device.type());
+      impl_.setDevice(device);
+      current_device_ = device;
+    } else {
+      // Restore the old device type's device, then re-seat the guard on the new type
+      impl_.setDevice(original_device_);
+      impl_ = !impl ? VirtualGuardImpl(device.type()) : VirtualGuardImpl(impl);
+      original_device_ = impl_.exchangeDevice(device);
+      current_device_ = device;
+    }
+  }
+
+  /// Sets the device index to the given one.  The device type is inferred
+  /// from the original device type.
+  void set_index(DeviceIndex index) {
+    reset_device(Device(original_device_.type(), index));
+  }
+
+  /// Returns the device that was current immediately before construction, or
+  /// before the most recent reset_device() that changed device type.
+  Device original_device() const {
+    return original_device_;
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device/reset_device/set_index.
+  Device current_device() const {
+    return current_device_;
+  }
+
+ protected:
+  T impl_;
+
+ private:
+  Device original_device_;
+  Device current_device_;
+};
+
+/**
+ * An OptionalDeviceGuard is an RAII class that sets a device to some value on
+ * initialization, and resets the device to its original value on destruction.
+ *
+ * InlineOptionalDeviceGuard is a helper class for implementing
+ * OptionalDeviceGuards.  See guidance in InlineDeviceGuard on how to
+ * use this.  See OptionalDeviceGuard for user-oriented usage notes.
+ */
+template <typename T>
+class InlineOptionalDeviceGuard {
+ public:
+  // Note [Explicit initialization of optional fields]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Explicit initialization of optional fields is
+  // required to work around an nvcc bug; see
+  // https://github.com/pytorch/pytorch/issues/12117
+
+  /// Creates an uninitialized OptionalDeviceGuard.
+  explicit InlineOptionalDeviceGuard()
+      : guard_() // See Note [Explicit initialization of optional fields]
+  {}
+  ~InlineOptionalDeviceGuard() = default;
+
+  /// Set the current device to the passed Device, if it is not nullopt.
+  explicit InlineOptionalDeviceGuard(std::optional<Device> device_opt)
+      : guard_() { // See Note [Explicit initialization of optional fields]
+    if (device_opt.has_value()) {
+      guard_.emplace(device_opt.value());
+    }
+  }
+
+  /// Set the current device to the passed DeviceIndex, if it is not nullopt.
+  template <
+      typename U = T,
+      typename =
+          typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>>>
+  explicit InlineOptionalDeviceGuard(
+      std::optional<DeviceIndex> device_index_opt)
+      : guard_() { // See Note [Explicit initialization of optional fields]
+    if (device_index_opt.has_value()) {
+      guard_.emplace(device_index_opt.value());
+    }
+  }
+
+  /// All constructors of DeviceGuard are valid for OptionalDeviceGuard
+  /// and result in initialized OptionalDeviceGuard.
+  template <typename... Args>
+  explicit InlineOptionalDeviceGuard(Args&&... args)
+      : guard_(std::in_place, std::forward<Args>(args)...) {}
+
+  // TODO: Consider re-adding Tensor and TensorList constructors here, when
+  // Tensor moves to c10.  (These are only valid on OptionalDeviceGuard,
+  // because a Tensor may be undefined, in which case we need an uninitialized
+  // tensor guard.)
+
+  // Note [Move construction for RAII guards is tricky]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // In principle, move construction is useful for terminating
+  // the lifetime of a `OptionalDeviceGuard` early; for example:
+  //
+  //     // current device is d0
+  //     OptionalDeviceGuard g1(d1);
+  //     // current device is d1
+  //     {
+  //       OptionalDeviceGuard g2(std::move(g1));
+  //     }
+  //     // current device is d0!!
+  //
+  // However, it's difficult to implement the move constructor
+  // in a way that works in all situations.  For example, consider
+  // the following example:
+  //
+  //     OptionalDeviceGuard g1(d1);
+  //     {
+  //       OptionalDeviceGuard g2(d2);
+  //       {
+  //         OptionalDeviceGuard g3(std::move(g1)); // !!!
+  //       }
+  //     }
+  //
+  // What should the current device be while g3 is in scope... and what
+  // should it be after it goes out of scope?  What about g2?
+  // There don't seem to be satisfactory answers for these questions.
+  //
+  // It's in principle possible to raise an error when this occurs
+  // by doing some extra thread-local bookkeeping.  But why bother?
+  // Just don't provide the constructor.
+  InlineOptionalDeviceGuard(const InlineOptionalDeviceGuard<T>& other) = delete;
+  InlineOptionalDeviceGuard(InlineOptionalDeviceGuard<T>&& other) = delete;
+
+  // Note [Move assignment for RAII guards is tricky]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Move assignment is deleted, because you need to know which guard was
+  // defined "first", as that guard's original_device_ wins--with the current
+  // representation, we have no way of telling which is the case.  (Move
+  // construction does not have this problem, as one guard is always
+  // uninitialized.)
+  //
+  // We can make this clear by way of a pair of examples:
+  //
+  // Example 1:
+  //
+  //  // initial device is n0
+  //  {
+  //    CUDAGuard g1(n1);
+  //    {
+  //      CUDAGuard g2(n2);
+  //      // current device should be n2
+  //      g1 = std::move(g2);
+  //      // current device should still be n2
+  //    }
+  //    // current device should still be n2
+  //  }
+  //  // current device should be n0
+  //
+  //  Example 2 (flip the order of the two guards):
+  //
+  //  // initial device is n0
+  //  {
+  //    CUDAGuard g2(n2);
+  //    {
+  //      CUDAGuard g1(n1);
+  //      // current device should be n1
+  //      g1 = std::move(g2);
+  //      // current device should be n2
+  //    }
+  //    // current device should be n0 (since g2 has been vacated)
+  //  }
+  //
+  // In both examples, we need g1 to restore to n0 after move assignment.
+  // However, in example 1, this is determined by the restore value of g1
+  // (prior to the move). In example 2, however, it is determined by the
+  // restore value of g2(!!). We don't know which one should win, without having
+  // a way of telling which guard was allocated first.
+  //
+  // We could solve this with an extra thread-local variable.  But no one is
+  // actually using move-assignment.  So just get rid of it.
+  InlineOptionalDeviceGuard& operator=(const InlineOptionalDeviceGuard& other) =
+      delete;
+  InlineOptionalDeviceGuard& operator=(InlineOptionalDeviceGuard&& other) =
+      delete;
+
+  /// Sets the device to the given one.  Initializes OptionalDeviceGuard if it
+  /// is not already initialized.
+  template <
+      typename U = T,
+      typename =
+          typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>>>
+  void set_device(at::Device device) {
+    if (!guard_.has_value()) {
+      guard_.emplace(device);
+    } else {
+      guard_->set_device(device);
+    }
+  }
+
+  /// Resets the currently set device to its original device, and then sets the
+  /// current device to the passed device (for a possibly different device
+  /// type).  Initializes OptionalDeviceGuard if it is not already initialized.
+  ///
+  /// See notes on why this is called reset_device on InlineDeviceGuard.
+  ///
+  /// Optional argument is for testing only.
+  template <
+      typename U = T,
+      typename = typename std::enable_if_t<std::is_same_v<U, VirtualGuardImpl>>>
+  void reset_device(
+      at::Device device,
+      const DeviceGuardImplInterface* impl = nullptr) {
+    if (!guard_.has_value()) {
+      guard_.emplace(device, impl);
+    } else {
+      guard_->reset_device(device, impl);
+    }
+  }
+
+  /// Resets the currently set device to its original device, and then sets the
+  /// current device to the passed device.  Initializes the guard if it is
+  /// not already initialized.  This is effectively equivalent to set_device
+  /// when a guard supports only a single device type.
+  template <
+      typename U = T,
+      typename =
+          typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>>>
+  void reset_device(at::Device device) {
+    if (!guard_.has_value()) {
+      guard_.emplace(device);
+    } else {
+      guard_->reset_device(device);
+    }
+  }
+
+  /// Sets the device index to the given one.  The device type is statically
+  /// known.  Initializes the guard if it is not already initialized.
+  template <
+      typename U = T,
+      typename =
+          typename std::enable_if_t<!std::is_same_v<U, VirtualGuardImpl>>>
+  void set_index(DeviceIndex index) {
+    if (!guard_.has_value()) {
+      guard_.emplace(index);
+    } else {
+      guard_->set_index(index);
+    }
+  }
+
+  /// Returns the device that was set immediately prior to initialization of
+  /// the guard, or nullopt if the guard is uninitialized.
+  std::optional<Device> original_device() const {
+    return guard_.has_value() ? std::make_optional(guard_->original_device())
+                              : std::nullopt;
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device, if the guard is initialized,
+  /// or nullopt if the guard is uninitialized.
+  std::optional<Device> current_device() const {
+    return guard_.has_value() ? std::make_optional(guard_->current_device())
+                              : std::nullopt;
+  }
+
+  /// Restore the original device, resetting this guard to uninitialized state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  std::optional<InlineDeviceGuard<T>> guard_;
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..15d4083daab7439295a132ca3b157eae1ba6745d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineEvent.h
@@ -0,0 +1,152 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/util/Exception.h>
+
+namespace c10::impl {
+
+template <typename T>
+struct InlineEvent final {
+  InlineEvent() = delete;
+  InlineEvent(
+      const DeviceType _device_type,
+      const EventFlag _flag = EventFlag::PYTORCH_DEFAULT)
+      : backend_{_device_type}, device_type_{_device_type}, flag_{_flag} {}
+
+  // Copy constructor and copy assignment operator (deleted)
+  InlineEvent(const InlineEvent&) = delete;
+  InlineEvent& operator=(const InlineEvent&) = delete;
+
+  // Move constructor (steals the event handle) and move assignment (swaps)
+  InlineEvent(InlineEvent&& other) noexcept
+      : event_(other.event_),
+        backend_(std::move(other.backend_)),
+        device_type_(other.device_type_),
+        device_index_(other.device_index_),
+        flag_(other.flag_),
+        was_marked_for_recording_(other.was_marked_for_recording_) {
+    other.event_ = nullptr; // so other's destructor skips destroyEvent
+  }
+  InlineEvent& operator=(InlineEvent&& other) noexcept {
+    swap(other); // our previous event now lives in `other` and dies with it
+    return *this;
+  }
+
+  void swap(InlineEvent& other) noexcept {
+    std::swap(event_, other.event_);
+    std::swap(backend_, other.backend_);
+    std::swap(device_type_, other.device_type_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(flag_, other.flag_);
+    std::swap(was_marked_for_recording_, other.was_marked_for_recording_);
+  }
+
+  ~InlineEvent() noexcept {
+    if (event_)
+      backend_.destroyEvent(event_, device_index_);
+  }
+
+  DeviceType device_type() const noexcept {
+    return device_type_;
+  }
+  DeviceIndex device_index() const noexcept {
+    return device_index_;
+  }
+  EventFlag flag() const noexcept {
+    return flag_;
+  }
+  bool was_marked_for_recording() const noexcept {
+    return was_marked_for_recording_;
+  }
+
+  void recordOnce(const Stream& stream) { // no-op once the event was recorded
+    if (!was_marked_for_recording_)
+      record(stream);
+  }
+
+  void record(const Stream& stream) {
+    TORCH_CHECK(
+        stream.device_type() == device_type_,
+        "Event device type ",
+        DeviceTypeName(device_type_),
+        " does not match recording stream's device type ",
+        DeviceTypeName(stream.device_type()),
+        ".");
+
+    backend_.record(&event_, stream, device_index_, flag_); // gets the index held before this call
+    was_marked_for_recording_ = true;
+    device_index_ = stream.device_index();
+  }
+
+  void block(const Stream& stream) const { // no-op if never recorded
+    if (!was_marked_for_recording_)
+      return;
+
+    TORCH_CHECK(
+        stream.device_type() == device_type_,
+        "Event device type ",
+        DeviceTypeName(device_type_),
+        " does not match blocking stream's device type ",
+        DeviceTypeName(stream.device_type()),
+        ".");
+
+    backend_.block(event_, stream);
+  }
+
+  bool query() const { // an event that was never recorded counts as complete
+    if (!was_marked_for_recording_)
+      return true;
+    return backend_.queryEvent(event_);
+  }
+
+  void* eventId() const {
+    return event_;
+  }
+
+  double elapsedTime(const InlineEvent& other) const {
+    TORCH_CHECK(
+        other.device_type() == device_type_,
+        "Event device type ",
+        DeviceTypeName(device_type_),
+        " does not match other's device type ",
+        DeviceTypeName(other.device_type()),
+        ".");
+    TORCH_CHECK_VALUE(
+        (flag_ == EventFlag::BACKEND_DEFAULT) &&
+            (other.flag_ == EventFlag::BACKEND_DEFAULT),
+        "Both events must be created with argument 'enable_timing=True'.");
+    TORCH_CHECK_VALUE(
+        was_marked_for_recording() && other.was_marked_for_recording(),
+        "Both events must be recorded before calculating elapsed time.");
+    // elapsedTime in MPS can wait for the event to complete if it is not ready,
+    // which is a little different from CUDA, so MPS skips the completion check
+    TORCH_CHECK(
+        (query() && other.query()) || device_type_ == DeviceType::MPS,
+        "Both events must be completed before calculating elapsed time.");
+
+    return backend_.elapsedTime(event_, other.event_, device_index_);
+  }
+
+  void synchronize() const { // no-op if never recorded
+    if (!was_marked_for_recording_)
+      return;
+    backend_.synchronizeEvent(event_);
+  }
+
+ private:
+  void* event_ = nullptr;
+  T backend_;
+  DeviceType device_type_;
+  DeviceIndex device_index_ = -1;
+  EventFlag flag_ = EventFlag::PYTORCH_DEFAULT;
+  bool was_marked_for_recording_ = false;
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ce87a9a8eb55a30e8e6fb0ab6e5a38bc065dab9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h
@@ -0,0 +1,265 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/irange.h>
+
+namespace c10::impl {
+
+/**
+ * A StreamGuard is an RAII class that changes the current device
+ * to the device corresponding to some stream, and changes the
+ * default stream on that device to be this stream.
+ *
+ * InlineStreamGuard is a helper class for implementing StreamGuards.
+ * See InlineDeviceGuard for guidance on how to use this class.
+ */
+template <typename T>
+class InlineStreamGuard : private InlineDeviceGuard<T> {
+ public:
+  /// No default constructor, see Note [Omitted default constructor from RAII]
+  explicit InlineStreamGuard() = delete;
+
+  /// Set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  explicit InlineStreamGuard(Stream stream)
+      : InlineDeviceGuard<T>(stream.device()),
+        original_stream_of_original_device_(
+            this->impl_.getStream(original_device())),
+        original_stream_of_current_device_(this->impl_.exchangeStream(stream)),
+        current_stream_(stream) {}
+
+  /// This constructor exists purely for testing
+  template <
+      typename U = T,
+      typename = typename std::enable_if_t<std::is_same_v<U, VirtualGuardImpl>>>
+  explicit InlineStreamGuard(
+      Stream stream,
+      const DeviceGuardImplInterface* impl)
+      : InlineDeviceGuard<T>(
+            stream.device(),
+            impl ? impl : getDeviceGuardImpl(stream.device_type())),
+        original_stream_of_original_device_(
+            this->impl_.getStream(original_device())),
+        original_stream_of_current_device_(this->impl_.exchangeStream(stream)),
+        current_stream_(stream) {}
+
+  /// Copy is disallowed
+  InlineStreamGuard(const InlineStreamGuard<T>&) = delete;
+  InlineStreamGuard<T>& operator=(const InlineStreamGuard<T>&) = delete;
+
+  /// Move is disallowed, as StreamGuard does not have an uninitialized state,
+  /// which is required for moves on types with nontrivial destructors.
+  InlineStreamGuard(InlineStreamGuard<T>&& other) = delete;
+  InlineStreamGuard& operator=(InlineStreamGuard<T>&& other) = delete;
+  /// Re-installs original_stream_of_current_device_ via exchangeStream.
+  ~InlineStreamGuard() {
+    this->impl_.exchangeStream(original_stream_of_current_device_);
+  }
+
+  /// Resets the currently set stream to the original stream and
+  /// the currently set device to the original device.  Then,
+  /// set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  ///
+  /// NOTE: this implementation may skip some stream/device setting if
+  /// it can prove that it is unnecessary.
+  ///
+  /// WARNING: reset_stream does NOT preserve previously set streams on
+  /// different devices.  If you need to set streams on multiple devices
+  /// use MultiStreamGuard instead.
+  void reset_stream(Stream stream) {
+    // TODO: make a version that takes an impl argument.  Unfortunately,
+    // that will require SFINAE because impl is only valid for the
+    // VirtualGuardImpl specialization.
+    if (stream.device() == this->current_device()) {
+      this->impl_.exchangeStream(stream);
+      current_stream_ = stream;
+    } else {
+      // Destruct and reconstruct the StreamGuard in-place
+      this->impl_.exchangeStream(original_stream_of_current_device_);
+      this->reset_device(stream.device());
+      original_stream_of_current_device_ = this->impl_.exchangeStream(stream);
+      current_stream_ = stream;
+    }
+  }
+
+  // It's not clear if set_device should also reset the current stream
+  // if the device is unchanged; therefore, we don't provide it.
+  // The situation is somewhat clearer with reset_device, but it's still
+  // a pretty weird thing to do, so haven't added this either.
+
+  /// Returns the stream of the original device prior to this guard.  Subtly,
+  /// the stream returned here is the original stream of the *original*
+  /// device; i.e., it's the stream that your computation *would* have
+  /// been put on, if it hadn't been for this meddling stream guard.
+  /// This is usually what you want.
+  Stream original_stream() const {
+    return original_stream_of_original_device_;
+  }
+
+  /// Returns the most recent stream that was set using this stream guard,
+  /// either from construction, or via reset_stream.
+  Stream current_stream() const {
+    return current_stream_;
+  }
+
+  /// Returns the most recent device that was set using this stream guard,
+  /// either from construction, or by reset_stream switching devices.
+  Device current_device() const {
+    return InlineDeviceGuard<T>::current_device();
+  }
+
+  /// Returns the device that was set at the most recent reset_stream(),
+  /// or otherwise the device at construction time.
+  Device original_device() const {
+    return InlineDeviceGuard<T>::original_device();
+  }
+
+ private:
+  Stream
+      original_stream_of_original_device_; // what the user probably cares about
+  Stream original_stream_of_current_device_; // what we need to restore
+  Stream current_stream_;
+};
+
+/**
+ * An OptionalStreamGuard is an RAII class that may or may not hold a
+ * StreamGuard; while it holds one, that guard's device and stream are set.
+ * See InlineOptionalDeviceGuard for more guidance on how to use this class.
+ */
+template <typename T>
+class InlineOptionalStreamGuard {
+ public:
+  /// Builds a guard that holds no stream yet.
+  explicit InlineOptionalStreamGuard()
+      : guard_() // See Note [Explicit initialization of optional fields]
+  {}
+  ~InlineOptionalStreamGuard() = default;
+
+  /// If stream_opt holds a stream, make that stream's device current and
+  /// make the stream current on that device; if it is nullopt, the guard
+  /// is left uninitialized.
+  explicit InlineOptionalStreamGuard(std::optional<Stream> stream_opt)
+      : guard_() {
+    if (stream_opt) {
+      guard_.emplace(*stream_opt);
+    }
+  }
+
+  /// Forwards any StreamGuard constructor arguments to the wrapped guard.
+  template <typename... Args>
+  explicit InlineOptionalStreamGuard(Args&&... args)
+      : guard_(std::in_place, std::forward<Args>(args)...) {}
+
+  InlineOptionalStreamGuard(const InlineOptionalStreamGuard<T>& other) = delete;
+  InlineOptionalStreamGuard& operator=(const InlineOptionalStreamGuard& other) =
+      delete;
+  // See Note [Move construction for RAII guards is tricky]
+  InlineOptionalStreamGuard(InlineOptionalStreamGuard<T>&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  InlineOptionalStreamGuard& operator=(InlineOptionalStreamGuard&& other) =
+      delete;
+
+  /// Points the guard at a new stream.  An uninitialized guard is
+  /// initialized with the stream; an initialized one delegates to
+  /// InlineStreamGuard::reset_stream, which restores the original stream
+  /// and device before switching to the device and stream of the passed
+  /// stream.
+  void reset_stream(Stream stream) {
+    if (!guard_) {
+      guard_.emplace(stream);
+      return;
+    }
+    guard_->reset_stream(stream);
+  }
+
+  /// The wrapped guard's original_stream(), or nullopt while the guard
+  /// is uninitialized.
+  std::optional<Stream> original_stream() const {
+    using MaybeStream = std::optional<Stream>;
+    return guard_ ? MaybeStream(guard_->original_stream()) : MaybeStream();
+  }
+
+  /// The wrapped guard's current_stream(), i.e. the stream most recently
+  /// set through construction or reset_stream, or nullopt while the guard
+  /// is uninitialized.
+  std::optional<Stream> current_stream() const {
+    using MaybeStream = std::optional<Stream>;
+    return guard_ ? MaybeStream(guard_->current_stream()) : MaybeStream();
+  }
+
+  /// Destroys the wrapped guard (restoring the original device and stream)
+  /// and returns this object to the uninitialized state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  std::optional<InlineStreamGuard<T>> guard_;
+};
+
+template <typename T>
+class InlineMultiStreamGuard {
+ public:
+  /// Exchanges in each stream of the list, remembering the stream it
+  /// replaces.  This may be useful if you need to set different streams
+  /// for different devices.
+  explicit InlineMultiStreamGuard(ArrayRef<Stream> streams) {
+    // An empty list leaves impl_ disengaged, so the destructor does nothing.
+    if (streams.empty()) return;
+    impl_.emplace(getDeviceTypeOfStreams(streams));
+    original_streams_.reserve(streams.size());
+    for (const Stream& stream : streams) {
+      original_streams_.push_back(impl_->exchangeStream(stream));
+    }
+  }
+
+  /// Copy is disallowed
+  InlineMultiStreamGuard(const InlineMultiStreamGuard&) = delete;
+  InlineMultiStreamGuard<T>& operator=(const InlineMultiStreamGuard&) = delete;
+
+  /// Move is disallowed, as StreamGuard does not have an uninitialized state,
+  /// which is required for moves on types with nontrivial destructors.
+  InlineMultiStreamGuard(InlineMultiStreamGuard&& other) = delete;
+  InlineMultiStreamGuard& operator=(InlineMultiStreamGuard&& other) = delete;
+
+  ~InlineMultiStreamGuard() noexcept {
+    // impl_ is engaged only when the constructor saw at least one stream.
+    if (!impl_) return;
+    for (const Stream& original : original_streams_) {
+      impl_->exchangeStream(original);
+    }
+  }
+
+ protected:
+  std::optional<T> impl_;
+
+ private:
+  /// Streams returned by exchangeStream in the constructor, in list order.
+  std::vector<Stream> original_streams_;
+
+  static DeviceType getDeviceTypeOfStreams(ArrayRef<Stream> streams) {
+    TORCH_INTERNAL_ASSERT(!streams.empty());
+    const DeviceType type = streams[0].device_type();
+    for (const auto idx : c10::irange(1, streams.size())) {
+      TORCH_CHECK_VALUE(
+          streams[idx].device_type() == type,
+          "Streams have a mix of device types: stream 0 is on ",
+          streams[0].device(),
+          " while stream ",
+          idx,
+          " is on device ",
+          streams[idx].device());
+    }
+    return type;
+  }
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h
new file mode 100644
index 0000000000000000000000000000000000000000..123a288a0834468abc2e8bc7dc90b6e775506621
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h
@@ -0,0 +1,174 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DispatchKeySet.h>
+#include <c10/macros/Export.h>
+
+// TLS management for DispatchKeySet (the "local" DispatchKeySet(s))
+//
+// This manages two thread-local DispatchKeySets:
+//
+//  - The included type set, which adds a tensor type for consideration
+//    in dispatch.  (For example, you might add Profiling to
+//    the included type set to turn on profiling on all tensor operations.)
+//
+//  - The excluded type set, which disqualifies a tensor type from dispatch.
+//    (For example, after redispatching on variable, we disqualify
+//    Autograd so we don't attempt to handle variable again.)
+//    (Exclusion wins over inclusion.)
+//
+// NB: Originally, I implemented the excluded type set as storing the inverted
+// set, but TLS is defined to be zero-initialized, so this doesn't actually work
+// (if it's inverted, you want the set to be -1 initialized).
+
+namespace c10::impl {
+
+// POD version of LocalDispatchKeySet.  Declared here just so that
+// we can put it in the guards.
+// Each field stores its set XOR-ed with the matching default set; the
+// accessors below translate between that stored form and the real value.
+// If you want to create PODLocalDispatchKeySet with non-zero state,
+// use set_included() instead of default constructor.
+struct C10_API PODLocalDispatchKeySet {
+  uint64_t included_;
+  uint64_t excluded_;
+
+  // See Note [TLS Initialization]
+  DispatchKeySet included() const {
+    DispatchKeySet stored(DispatchKeySet::RAW, included_);
+    return stored ^ c10::default_included_set;
+  }
+  DispatchKeySet excluded() const {
+    DispatchKeySet stored(DispatchKeySet::RAW, excluded_);
+    return stored ^ c10::default_excluded_set;
+  }
+
+  void set_included(DispatchKeySet x) { // keep x in XOR-ed form
+    included_ = (x ^ c10::default_included_set).raw_repr();
+  }
+  void set_excluded(DispatchKeySet x) { // keep x in XOR-ed form
+    excluded_ = (x ^ c10::default_excluded_set).raw_repr();
+  }
+};
+static_assert(
+    std::is_trivial_v<PODLocalDispatchKeySet>,
+    "PODLocalDispatchKeySet must be a POD type.");
+
+struct C10_API LocalDispatchKeySet { // decoded counterpart of the POD form
+  /* implicit */ LocalDispatchKeySet(PODLocalDispatchKeySet x)
+      : included_(x.included()), excluded_(x.excluded()) {}
+  DispatchKeySet included_; // value of x.included() at construction
+  DispatchKeySet excluded_; // value of x.excluded() at construction
+};
+
+// thread_local variables cannot be C10_API on Windows.
+// Inlining this seems to break AutoDispatchBelowAutograd on Android.
+#if defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE)
+C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); // declaration only
+#else // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE)
+extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set;
+// Returns a converted copy of the thread-local state, not a reference to it.
+inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() {
+  // Don't let people fiddle with the thread_local directly just
+  // because they include this header.
+  return raw_local_dispatch_key_set; // implicit POD -> LocalDispatchKeySet
+}
+#endif // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE)
+
+// Internal, use ThreadLocalStateGuard
+C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set);
+
+// RAII API for manipulating the thread-local dispatch state.
+
+class C10_API IncludeDispatchKeyGuard { // neither copyable nor movable
+ public:
+  IncludeDispatchKeyGuard(DispatchKeySet /*include*/); // defined out of line
+  IncludeDispatchKeyGuard(DispatchKey k) // single key -> one-key set
+      : IncludeDispatchKeyGuard(DispatchKeySet(k)) {}
+  IncludeDispatchKeyGuard(const IncludeDispatchKeyGuard&) = delete;
+  IncludeDispatchKeyGuard& operator=(const IncludeDispatchKeyGuard&) = delete;
+  IncludeDispatchKeyGuard(IncludeDispatchKeyGuard&&) = delete;
+  IncludeDispatchKeyGuard& operator=(IncludeDispatchKeyGuard&&) = delete;
+  ~IncludeDispatchKeyGuard(); // defined out of line
+  // NOTE(review): presumably undoes the inclusion on destruction -- confirm.
+ private:
+  // A little micro-optimization to save us from tls_get_addr call
+  // on destruction
+  PODLocalDispatchKeySet* tls_;
+  DispatchKeySet include_;
+};
+
+class C10_API ExcludeDispatchKeyGuard { // neither copyable nor movable
+ public:
+  ExcludeDispatchKeyGuard(DispatchKeySet /*exclude*/); // defined out of line
+  ExcludeDispatchKeyGuard(DispatchKey k) // single key -> one-key set
+      : ExcludeDispatchKeyGuard(DispatchKeySet(k)) {}
+  ExcludeDispatchKeyGuard(const ExcludeDispatchKeyGuard&) = delete;
+  ExcludeDispatchKeyGuard& operator=(const ExcludeDispatchKeyGuard&) = delete;
+  ExcludeDispatchKeyGuard(ExcludeDispatchKeyGuard&&) = delete;
+  ExcludeDispatchKeyGuard& operator=(ExcludeDispatchKeyGuard&&) = delete;
+  ~ExcludeDispatchKeyGuard(); // defined out of line
+  // NOTE(review): presumably undoes the exclusion on destruction -- confirm.
+ private:
+  // A little micro-optimization to save us from tls_get_addr call
+  // on destruction
+  PODLocalDispatchKeySet* tls_;
+  DispatchKeySet exclude_;
+};
+
+struct C10_API ForceDispatchKeyGuard {
+ public:
+  ForceDispatchKeyGuard() // snapshot only; the TLS state is left untouched
+      : saved_keyset_(tls_local_dispatch_key_set()) {}
+  ForceDispatchKeyGuard(LocalDispatchKeySet key_set) // snapshot, then force
+      : ForceDispatchKeyGuard() {
+    _force_tls_local_dispatch_key_set(key_set);
+  }
+  ForceDispatchKeyGuard(DispatchKeySet include, DispatchKeySet exclude)
+      : ForceDispatchKeyGuard() {
+    // Start from a copy of the snapshot taken by the delegated constructor,
+    // overwrite both of its key sets, and force the result.
+    LocalDispatchKeySet forced = saved_keyset_;
+    forced.included_ = include;
+    forced.excluded_ = exclude;
+    _force_tls_local_dispatch_key_set(forced);
+  }
+
+  ForceDispatchKeyGuard(const ForceDispatchKeyGuard&) = delete;
+  ForceDispatchKeyGuard(ForceDispatchKeyGuard&&) noexcept = delete;
+  ForceDispatchKeyGuard& operator=(const ForceDispatchKeyGuard&) = delete;
+  ForceDispatchKeyGuard& operator=(ForceDispatchKeyGuard&&) = delete;
+  ~ForceDispatchKeyGuard() { // force the snapshot back
+    _force_tls_local_dispatch_key_set(saved_keyset_);
+  }
+
+ private:
+  LocalDispatchKeySet saved_keyset_;
+};
+
+// Non-RAII API for manipulating the thread-local dispatch state.
+// Please prefer the RAII API.  The non-RAII API may be useful when
+// the included/excluded state of a given DispatchKey must span
+// many calls from the Python to the C++, so you cannot conveniently
+// use an RAII guard.
+//
+// Example use case:  a Python context manager that includes a certain
+// DispatchKey, to ensure ops running under the context manager dispatch
+// through that DispatchKey's registered overrides.
+//
+// The non-RAII API is less efficient than the RAII guards because both the
+// getter and setter will do a tls_getaddr lookup (the RAII struct only needs
+// one!)
+
+C10_API bool tls_is_dispatch_key_excluded(DispatchKey x);
+C10_API void tls_set_dispatch_key_excluded(DispatchKey x, bool desired_state);
+C10_API bool tls_is_dispatch_key_included(DispatchKey x);
+C10_API void tls_set_dispatch_key_included(DispatchKey x, bool desired_state);
+C10_API bool tls_is_dispatch_keyset_excluded(DispatchKeySet ks);
+C10_API bool tls_is_dispatch_keyset_included(DispatchKeySet ks);
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce74e9b9050b3db0db196ff4ef9f3cad198c9beb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreter.h
@@ -0,0 +1,257 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/Layout.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/macros/Export.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/python_stub.h>
+#include <string>
+#include <vector>
+
+// Forward declarations
+
+namespace c10 {
+struct IValue;
+class OperatorHandle;
+struct TensorImpl;
+namespace impl {
+struct PyObjectSlot;
+} // namespace impl
+} // namespace c10
+
+namespace torch::jit {
+using Stack = std::vector<c10::IValue>;
+}
+
+// Actual implementation
+
+namespace c10::impl {
+
+struct C10_API PyInterpreter;
+
+// Note [Python interpreter tag]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Traditionally, PyTorch is layered such that our Python library
+// (libtorch_python) references our pure C++ library (libtorch) as the
+// natural order of things.  However, sometimes this natural order is
+// subverted: C++ objects refer to Python objects (for example, we
+// store a PyObject* pointer on TensorImpl so that converting from a
+// C++ Tensor to a Python Tensor is just a memory dereference).
+//
+// These unusual orderings must be treated with care.  To start, you need to
+// virtualize the destructor so that the PyObject can be decref'ed on
+// destruction (because the C++ object itself doesn't know anything about
+// Python--remember, layering!).  This process itself is fraught, since
+// acquiring the GIL could lead to deadlocks if someone is blocking on you
+// while holding the GIL.  Furthermore, if the C++ objects outlive the
+// interpreter (which can happen if you stash them in a static global
+// variable defined in libtorch), you may attempt to decref the object when
+// the Python interpreter has already been shutdown.
+//
+// BUT WAIT, IT GETS WORSE.  With torchdeploy, there may be multiple Python
+// interpreters in a single process. If a C++ object is accessible from
+// multiple interpreters, we must take care not to accidentally pass a
+// PyObject from one interpreter to another interpreter.
+//
+// To prevent these mixups, we introduce a PyInterpreter "tag" (object with
+// a vtable), which specifies a specific Python interpreter.
+//
+//  - Any given object can be associated with AT MOST one Python interpreter.
+//    We represent the interpreter tag as a memory address to an instance of
+//    a virtual class that is allocated once per interpreter (this is so that
+//    we can request the interpreter to perform operations for us, if
+//    necessary).
+//
+//  - It can be recorded with a PyObject (PyInterpreterObject) so that
+//    we know what interpreter the object is associated with, and we can
+//    raise an error if you try to use the PyObject from the wrong
+//    interpreter context.
+//
+//  - It contains a vtable that can be used to perform various Python
+//    operations from ordinary C++ code that ordinarily wouldn't be accessible
+//    from libtorch.
+//
+// A simple use case is when a C++ object must be associated with a PyObject.
+// However, for TensorImpl, we lazily allocate a PyObject the first time the
+// object passes into Python.  The invariants for this situation are more
+// subtle:
+//
+//  - A given TensorImpl's interpreter tag can only go from uninitialized to
+//    tagged; once tagged, this is a quiescent state (once tagged to an
+//    interpreter, ALWAYS tagged to that interpreter)
+//
+//  - A thread may mutate the PyObject field of a TensorImpl if and only if it
+//    holds the GIL for the interpreter tagged on the TensorImpl.  (If the
+//    TensorImpl is not tagged, it must first atomically claim its tag before it
+//    can validly write)
+//
+// WARNING: This class has to be written very carefully, because it may be
+// possible for a Tensor to have a reference to an interpreter corresponding to
+// a shared library that has ALREADY BEEN UNLOADED.  This makes blindly calling
+// virtual methods very dangerous, because the vtable may be garbage at that
+// point (on a good day, you might get "pure virtual method called").
+//
+// The idea to solve this problem is we always leak PyInterpreters (so they
+// always stay live even after dlclose), and make sure we can disarm their
+// virtual methods by indirecting through a separate PyInterpreterVTable
+// object.  This can be replaced with a no-op vtable from libc10.so, which
+// is guaranteed to stick around until the bitter end.
+//
+// NB: The downside with representing PyInterpreter tags as full objects is that
+// it takes an extra word on TensorImpl.  If tags were instead just integer
+// indices, on 64-bit architectures we could pack the tag and PyObject together
+// into a single atomic word.  On 32-bit architectures we could simply say that
+// only one Python interpreter is supported (erroring if a nontrivial
+// interpreter tag is attempted to be set).
+//
+// The difficulty with this scheme is we need to maintain an out-of-line table
+// to get at the PyInterpreters so that we can do virtual method calls on them,
+// and registration/deregistration to this table must be done in a thread safe
+// manner.  This can be easily done if the number of possible PyInterpreters is
+// small enough (e.g., 8-bit integer) by simply preallocating an array of
+// sufficient size to hold all possible interpreters.  Surely 128
+// interpreters is more than enough for anyone!
+//
+// I didn't decide to do this technique at the moment, because the extra word
+// added by the PyInterpreter tag takes us to 24 words, which means that we
+// still fit inside three eight word cache lines.  If you need to penny pinch
+// another word consider doing this!
+
+struct C10_API PyInterpreterVTable {
+  virtual ~PyInterpreterVTable() = default;
+
+  // Report the name of this interpreter
+  virtual std::string name() const = 0;
+
+  // Run Py_INCREF on a PyObject.
+  virtual void incref(PyObject* pyobj) const = 0;
+  // Run Py_DECREF on a PyObject.  We DO NOT assume the GIL is held on call.
+  virtual void decref(PyObject* pyobj) const = 0;
+  // Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
+  virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
+  // Run Py_REFCNT on a PyObject.
+  virtual size_t refcnt(PyObject* pyobj) const = 0;
+
+  // Perform a detach by deferring to the __torch_dispatch__ implementation of
+  // detach, which will also arrange for the PyObject to get copied in this
+  // situation
+  virtual c10::intrusive_ptr<TensorImpl> detach(
+      const TensorImpl* self) const = 0;
+
+  // Invoke the Python boxed fallback dispatch to go back into Python
+  virtual void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack)
+      const = 0;
+
+  virtual void reportErrorCallback(PyObject* callback, DispatchKey key)
+      const = 0;
+
+  // This is only invoked in the multipy/torchdeploy // codespell:ignore multipy
+  // situation from pythonOpRegistrationTrampoline; this lets us get to the
+  // Python interpreter to actually find the appropriate Python op registration
+  // entry to call.
+  virtual void python_op_registration_trampoline(
+      const c10::OperatorHandle& op,
+      c10::DispatchKey,
+      c10::DispatchKeySet keyset,
+      torch::jit::Stack* stack,
+      bool with_keyset,
+      bool with_op) const = 0;
+  // NOTE(review): the name suggests this raises an error -- confirm in impl.
+  virtual void throw_abstract_impl_not_imported_error(
+      std::string opname,
+      const char* pymodule,
+      const char* context) const = 0;
+
+  // Invoke the Python dispatcher to handle this call
+  virtual void python_dispatcher(
+      const c10::OperatorHandle& op,
+      c10::DispatchKeySet,
+      torch::jit::Stack* stack) const = 0;
+  // Per-tensor queries: each method below takes the TensorImpl in question.
+  virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat)
+      const = 0;
+  virtual c10::SymBool sym_is_contiguous(
+      const TensorImpl* self,
+      at::MemoryFormat) const = 0;
+  virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat)
+      const = 0;
+  virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0;
+  virtual c10::Device device(const TensorImpl* self) const = 0;
+  virtual int64_t dim(const TensorImpl* self) const = 0;
+  virtual c10::IntArrayRef strides(const TensorImpl* self) const = 0;
+  virtual c10::IntArrayRef sizes(const TensorImpl* self) const = 0;
+  virtual c10::SymIntArrayRef sym_sizes(const TensorImpl* self) const = 0;
+  virtual c10::Layout layout(const TensorImpl* self) const = 0;
+  virtual int64_t numel(const TensorImpl* self) const = 0;
+  virtual c10::SymInt sym_numel(const TensorImpl* self) const = 0;
+  virtual c10::SymIntArrayRef sym_strides(const TensorImpl* self) const = 0;
+  virtual c10::SymInt sym_storage_offset(const TensorImpl* self) const = 0;
+  // trace_gpu_* hooks; events, streams and pointers arrive as uintptr_t.
+  virtual void trace_gpu_event_creation(
+      c10::DeviceType device_type,
+      uintptr_t event) const = 0;
+  virtual void trace_gpu_event_deletion(
+      c10::DeviceType device_type,
+      uintptr_t event) const = 0;
+  virtual void trace_gpu_event_record(
+      c10::DeviceType device_type,
+      uintptr_t event,
+      uintptr_t stream) const = 0;
+  virtual void trace_gpu_event_wait(
+      c10::DeviceType device_type,
+      uintptr_t event,
+      uintptr_t stream) const = 0;
+  virtual void trace_gpu_memory_allocation(
+      c10::DeviceType device_type,
+      uintptr_t ptr) const = 0;
+  virtual void trace_gpu_memory_deallocation(
+      c10::DeviceType device_type,
+      uintptr_t ptr) const = 0;
+  virtual void trace_gpu_stream_creation(
+      c10::DeviceType device_type,
+      uintptr_t stream) const = 0;
+  virtual void trace_gpu_device_synchronization(
+      c10::DeviceType device_type) const = 0;
+  virtual void trace_gpu_stream_synchronization(
+      c10::DeviceType device_type,
+      uintptr_t stream) const = 0;
+  virtual void trace_gpu_event_synchronization(
+      c10::DeviceType device_type,
+      uintptr_t event) const = 0;
+
+  virtual void reset_backward_hooks(const TensorImpl* self) const = 0;
+};
+
+struct C10_API PyInterpreter {
+  const PyInterpreterVTable* vtable_;
+  // Stores the pointer as given; this struct declares no destructor.
+  PyInterpreter(const PyInterpreterVTable* vtable) : vtable_(vtable) {}
+  // Both operators forward to vtable_, so interp->method() reaches the vtable.
+  const PyInterpreterVTable& operator*() const noexcept {
+    return *vtable_;
+  }
+  const PyInterpreterVTable* operator->() const noexcept {
+    return vtable_;
+  }
+
+  // Disarm this PyInterpreter, making all of its methods noops.
+  // The vtable pointer is not an atomic at the moment, which means
+  // a disarm() invocation that is concurrent with active destructors
+  // is not thread safe and will trigger TSAN.  My hope is that this
+  // situation doesn't ever actually happen; tensor destruction should
+  // quiesce when a dlclose happens, and any long lived tensors whose
+  // destructors would be disarmed here only begin the destruction process
+  // on process shutdown (long after the dlclose has occurred).
+  void disarm() noexcept;
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..acd2003569302cffcce5a907bd7fd506ac984a7b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyInterpreterHooks.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Registry.h>
+#include <memory>
+
+namespace c10::impl {
+
+// Minimal interface for PyInterpreter hooks
+struct C10_API PyInterpreterHooksInterface {
+  virtual ~PyInterpreterHooksInterface() = default;
+
+  // Get the PyInterpreter instance.
+  // This base-class stub always fails the TORCH_CHECK below and never returns.
+  virtual PyInterpreter* getPyInterpreter() const {
+    TORCH_CHECK(
+        false,
+        "PyTorch was compiled without Python support. "
+        "Cannot access Python interpreter from C++.");
+  }
+};
+
+struct C10_API PyInterpreterHooksArgs{};
+
+C10_DECLARE_REGISTRY(
+    PyInterpreterHooksRegistry,
+    PyInterpreterHooksInterface,
+    PyInterpreterHooksArgs);
+
+#define REGISTER_PYTHON_HOOKS(clsname) \
+  C10_REGISTER_CLASS(PyInterpreterHooksRegistry, clsname, clsname)
+
+// Get the global PyInterpreter hooks instance
+C10_API const PyInterpreterHooksInterface& getPyInterpreterHooks();
+
+// Helper function to get the global interpreter
+C10_API PyInterpreter* getGlobalPyInterpreter();
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ba0688f66e597d4398d4a7d0407b2683ceb30aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PyObjectSlot.h
@@ -0,0 +1,70 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/HermeticPyObjectTLS.h>
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/core/impl/PyInterpreterHooks.h>
+#include <c10/util/python_stub.h>
+#include <optional>
+
+#include <atomic>
+
+namespace torch::utils {
+class PyObjectPreservation;
+}
+
+namespace c10::impl {
+
+struct C10_API PyObjectSlot { // atomic PyObject* plus atomic PyInterpreter*
+ public:
+  PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
+
+  // Query the PyObject interpreter. This may return null if there is no
+  // interpreter; use load_pyobj_interpreter() for the asserting variant.
+  PyInterpreter* pyobj_interpreter() const {
+    return pyobj_interpreter_.load(std::memory_order_acquire);
+  }
+
+  PyInterpreter& load_pyobj_interpreter() const { // asserts non-null
+    auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
+    TORCH_INTERNAL_ASSERT(
+        interpreter, "cannot access PyObject for Tensor - no interpreter set");
+    return *interpreter;
+  }
+
+  PyObject* load_pyobj() const { // may return nullptr when no PyObject is set
+    return pyobj_.load(std::memory_order_acquire);
+  }
+
+  void store_pyobj(PyObject* obj) { // release store; interpreter is untouched
+    pyobj_.store(obj, std::memory_order_release);
+  }
+
+  bool has_unique_reference() const { // PyObject set and refcnt() reports 1
+    PyObject* pyobj = load_pyobj();
+    return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
+  }
+
+  void clear() { // nulls both fields; relaxed stores impose no ordering
+    pyobj_.store(nullptr, std::memory_order_relaxed);
+    pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
+  }
+
+ private:
+  // This is now always the global interpreter if the PyObject is set.
+  // Maybe we can remove this field some day...
+  std::atomic<PyInterpreter*> pyobj_interpreter_;
+
+  // The PyObject representing this Tensor or nullptr. Ownership is managed
+  // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
+  // reference is already dead.
+  std::atomic<PyObject*> pyobj_;
+
+  friend class torch::utils::PyObjectPreservation;
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h
new file mode 100644
index 0000000000000000000000000000000000000000..cffb7fc31e3d18b4544027b261b98c686f81274a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h
@@ -0,0 +1,34 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/macros/Export.h>
+
+namespace c10::impl {
+
+struct C10_API PythonDispatcherTLS {
+  static void set_state(PyInterpreter* state);
+  static PyInterpreter* get_state();
+  static void reset_state();
+};
+
+struct C10_API DisablePythonDispatcher { // scoped save/clear/restore guard
+  DisablePythonDispatcher() : old_(PythonDispatcherTLS::get_state()) {
+    PythonDispatcherTLS::set_state({}); // {} is a null PyInterpreter*
+  }
+
+  DisablePythonDispatcher(DisablePythonDispatcher&& other) = delete;
+  DisablePythonDispatcher(const DisablePythonDispatcher&) = delete;
+  DisablePythonDispatcher& operator=(const DisablePythonDispatcher&) = delete;
+  DisablePythonDispatcher& operator=(DisablePythonDispatcher&&) = delete;
+  ~DisablePythonDispatcher() {
+    PythonDispatcherTLS::set_state(old_); // put back what the ctor captured
+  }
+  PyInterpreter* old_; // state seen at construction; may itself be null
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h
new file mode 100644
index 0000000000000000000000000000000000000000..da3a9a0c4abacf6165ca946e62257771cf2790ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/SizesAndStrides.h
@@ -0,0 +1,336 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include <c10/macros/Macros.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/SmallVector.h>
+
+#define C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE 5
+
+namespace c10::impl {
+
+// Packed container for TensorImpl sizes and strides.
+// This design improves on the previous approach of using a pair of
+// c10::SmallVector<int64_t, 5> by specializing for the operations we
+// actually use and enforcing that the number of sizes is the same as
+// the number of strides. The memory layout is as follows:
+//
+// 1 size_t for the size
+// 5 eightbytes of inline sizes and 5 eightbytes of inline strides, OR pointer
+// to out-of-line array
+class C10_API SizesAndStrides {
+ public:
+  // TODO: different iterator types for sizes & strides to prevent
+  // mixing the two accidentally.
+  using sizes_iterator = int64_t*;
+  using sizes_const_iterator = const int64_t*;
+  using strides_iterator = int64_t*;
+  using strides_const_iterator = const int64_t*;
+
+  SizesAndStrides() {
+    size_at_unchecked(0) = 0;
+    stride_at_unchecked(0) = 1;
+  }
+
+  ~SizesAndStrides() {
+    if (C10_UNLIKELY(!isInline())) {
+      // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+      free(outOfLineStorage_);
+    }
+  }
+
+  SizesAndStrides(const SizesAndStrides& rhs) : size_(rhs.size_) {
+    if (C10_LIKELY(rhs.isInline())) {
+      copyDataInline(rhs);
+    } else {
+      allocateOutOfLineStorage(size_);
+      copyDataOutline(rhs);
+    }
+  }
+
+  // Equal iff both hold the same number of dims with identical sizes and
+  // strides. Only the first size() entries of each array are compared:
+  // inline slots past size() can keep stale values after a shrinking
+  // resize(), so comparing the whole inline buffer byte-wise could call
+  // two logically equal objects different.
+  bool operator==(const SizesAndStrides& other) const {
+    if (size_ != other.size_) {
+      return false;
+    }
+    return std::equal(sizes_begin(), sizes_end(), other.sizes_begin()) &&
+        std::equal(
+               strides_begin(), strides_end(), other.strides_begin());
+  }
+
+  bool operator!=(const SizesAndStrides& other) const {
+    return !(*this == other);
+  }
+
+  SizesAndStrides& operator=(const SizesAndStrides& rhs) {
+    if (this == &rhs) { // self-assignment: nothing to do
+      return *this;
+    }
+    if (C10_LIKELY(rhs.isInline())) {
+      if (C10_UNLIKELY(!isInline())) { // drop our heap buffer before going inline
+        // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+        free(outOfLineStorage_);
+      }
+      copyDataInline(rhs);
+    } else {
+      if (isInline()) { // no heap buffer yet: allocate one sized for rhs
+        allocateOutOfLineStorage(rhs.size_);
+      } else { // already out-of-line: resize the existing buffer to fit rhs
+        resizeOutOfLineStorage(rhs.size_);
+      }
+      copyDataOutline(rhs);
+    }
+    size_ = rhs.size_; // updated last; the isInline() checks above use the old value
+    return *this;
+  }
+
+  // Move from rhs. rhs.size() == 0 afterwards.
+  SizesAndStrides(SizesAndStrides&& rhs) noexcept : size_(rhs.size_) {
+    if (C10_LIKELY(isInline())) { // size_ == rhs.size_ here, so this tests rhs's layout
+      memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_));
+    } else { // take over rhs's heap buffer instead of copying it
+      outOfLineStorage_ = rhs.outOfLineStorage_;
+      rhs.outOfLineStorage_ = nullptr;
+    }
+
+    rhs.size_ = 0; // rhs now reports zero dims and counts as inline
+  }
+
+  // Move from rhs. rhs.size() == 0 afterwards.
+  SizesAndStrides& operator=(SizesAndStrides&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+    if (C10_LIKELY(rhs.isInline())) {
+      if (C10_UNLIKELY(!isInline())) {
+        // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+        free(outOfLineStorage_);
+      }
+      copyDataInline(rhs);
+    } else {
+      // They're outline. We're going to steal their vector.
+      if (!isInline()) {
+        // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+        free(outOfLineStorage_);
+      }
+      outOfLineStorage_ = rhs.outOfLineStorage_;
+      rhs.outOfLineStorage_ = nullptr;
+    }
+    size_ = rhs.size_;
+    rhs.size_ = 0;
+
+    return *this;
+  }
+
+  size_t size() const noexcept {
+    return size_;
+  }
+
+  const int64_t* sizes_data() const noexcept {
+    if (C10_LIKELY(isInline())) {
+      return &inlineStorage_[0];
+    } else {
+      return &outOfLineStorage_[0];
+    }
+  }
+
+  int64_t* sizes_data() noexcept {
+    if (C10_LIKELY(isInline())) {
+      return &inlineStorage_[0];
+    } else {
+      return &outOfLineStorage_[0];
+    }
+  }
+
+  sizes_const_iterator sizes_begin() const noexcept {
+    return sizes_data();
+  }
+
+  sizes_iterator sizes_begin() noexcept {
+    return sizes_data();
+  }
+
+  sizes_const_iterator sizes_end() const noexcept {
+    return sizes_begin() + size();
+  }
+
+  sizes_iterator sizes_end() noexcept {
+    return sizes_begin() + size();
+  }
+
+  IntArrayRef sizes_arrayref() const noexcept {
+    return IntArrayRef{sizes_data(), size()};
+  }
+
+  void set_sizes(IntArrayRef newSizes) {
+    resize(newSizes.size());
+    std::copy(newSizes.begin(), newSizes.end(), sizes_begin());
+  }
+
+  void set_strides(IntArrayRef strides) {
+    TORCH_INTERNAL_ASSERT(strides.size() == size());
+    std::copy(strides.begin(), strides.end(), strides_begin());
+  }
+
+  const int64_t* strides_data() const noexcept {
+    if (C10_LIKELY(isInline())) {
+      return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE];
+    } else {
+      return &outOfLineStorage_[size()];
+    }
+  }
+
+  int64_t* strides_data() noexcept {
+    if (C10_LIKELY(isInline())) {
+      return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE];
+    } else {
+      return &outOfLineStorage_[size()];
+    }
+  }
+
+  strides_const_iterator strides_begin() const noexcept {
+    // Same address as strides_data(): slot
+    // C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE of the inline buffer, or slot
+    // size() of the heap buffer. Delegating keeps that layout rule in one
+    // place, mirroring how sizes_begin() forwards to sizes_data().
+    return strides_data();
+  }
+
+  strides_iterator strides_begin() noexcept {
+    // Mutable counterpart of the const overload. The start-of-strides
+    // address is computed by strides_data() (inline slot
+    // C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE, or heap slot size()), so
+    // forward to it rather than repeating the branch here.
+    return strides_data();
+  }
+
+  strides_const_iterator strides_end() const noexcept {
+    return strides_begin() + size();
+  }
+
+  strides_iterator strides_end() noexcept {
+    return strides_begin() + size();
+  }
+
+  IntArrayRef strides_arrayref() const noexcept {
+    return IntArrayRef{strides_data(), size()};
+  }
+
+  // Size accessors.
+  int64_t size_at(size_t idx) const noexcept {
+    assert(idx < size());
+    return sizes_data()[idx];
+  }
+
+  int64_t& size_at(size_t idx) noexcept {
+    assert(idx < size());
+    return sizes_data()[idx];
+  }
+
+  int64_t size_at_unchecked(size_t idx) const noexcept {
+    return sizes_data()[idx];
+  }
+
+  int64_t& size_at_unchecked(size_t idx) noexcept {
+    return sizes_data()[idx];
+  }
+
+  // Stride accessors.
+  int64_t stride_at(size_t idx) const noexcept {
+    assert(idx < size());
+    return strides_data()[idx];
+  }
+
+  int64_t& stride_at(size_t idx) noexcept {
+    assert(idx < size());
+    return strides_data()[idx];
+  }
+
+  int64_t stride_at_unchecked(size_t idx) const noexcept {
+    return strides_data()[idx];
+  }
+
+  int64_t& stride_at_unchecked(size_t idx) noexcept {
+    return strides_data()[idx];
+  }
+
+  void resize(size_t newSize) { // change the number of dims to newSize
+    const auto oldSize = size();
+    if (newSize == oldSize) {
+      return;
+    }
+    if (C10_LIKELY(
+            newSize <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE && isInline())) {
+      if (oldSize < newSize) { // growing in place: zero the newly exposed slots
+        const auto bytesToZero =
+            (newSize - oldSize) * sizeof(inlineStorage_[0]);
+        memset(&inlineStorage_[oldSize], 0, bytesToZero); // new sizes
+        memset( // new strides, stored in the second half of the inline buffer
+            &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE + oldSize],
+            0,
+            bytesToZero);
+      } // shrinking in place leaves the dropped slots untouched
+      size_ = newSize;
+    } else { // the old or the new size does not fit inline
+      resizeSlowPath(newSize, oldSize);
+    }
+  }
+
+  void resizeSlowPath(size_t newSize, size_t oldSize);
+
+ private:
+  bool isInline() const noexcept {
+    return size_ <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE;
+  }
+
+  void copyDataInline(const SizesAndStrides& rhs) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.isInline());
+    memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_));
+  }
+
+  static size_t storageBytes(size_t size) noexcept {
+    return size * 2 * sizeof(int64_t);
+  }
+
+  void allocateOutOfLineStorage(size_t size) {
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+    outOfLineStorage_ = static_cast<int64_t*>(malloc(storageBytes(size)));
+    TORCH_CHECK(
+        outOfLineStorage_,
+        "Could not allocate memory for Tensor SizesAndStrides!");
+  }
+
+  void resizeOutOfLineStorage(size_t newSize) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isInline());
+    // Commit only on success: a failed realloc leaves the old block valid.
+    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
+    auto* grown = realloc(outOfLineStorage_, storageBytes(newSize));
+    TORCH_CHECK(
+        grown, "Could not allocate memory for Tensor SizesAndStrides!");
+    outOfLineStorage_ = static_cast<int64_t*>(grown);
+  }
+
+  void copyDataOutline(const SizesAndStrides& rhs) noexcept {
+    memcpy(outOfLineStorage_, rhs.outOfLineStorage_, storageBytes(rhs.size_));
+  }
+
+  size_t size_{1};
+  union {
+    int64_t* outOfLineStorage_;
+    // NOLINTNEXTLINE(*c-array*)
+    int64_t inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE * 2]{};
+  };
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h
new file mode 100644
index 0000000000000000000000000000000000000000..002bf4283806448b0cf9470116758b21fa5499e6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h
@@ -0,0 +1,72 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SafePyObject.h>
+#include <c10/macros/Export.h>
+
+namespace c10::impl {
+
+enum class TorchDispatchModeKey : int8_t { // keys for the infra mode slots
+  FAKE,
+  PROXY,
+  FUNCTIONAL,
+  NUM_MODE_KEYS // not a mode: the count of keys above, used as an array size
+};
+
+using PyObject_TorchDispatchMode = SafePyObjectT<TorchDispatchModeKey>;
+
+struct C10_API TorchDispatchModeTLS {
+  // This API is NOT invariant safe.
+  // It must not take in an infra mode that uses TorchDispatchModeKey.
+  // If you're pushing an infra mode onto the stack, we expect
+  // you to use set_mode instead.
+  static void push_non_infra_mode_onto_stack(
+      std::shared_ptr<PyObject_TorchDispatchMode> mode);
+  // Pops the top mode of the stack,
+  // giving precedence to user modes before attempting to pop
+  // any infra modes.
+  static const std::shared_ptr<PyObject_TorchDispatchMode> pop_stack();
+  // Returns the highest-priority infra mode along with its mode key.
+  // NOTE(review): the name suggests it is also removed - confirm in the .cpp.
+  static const std::
+      tuple<std::shared_ptr<PyObject_TorchDispatchMode>, TorchDispatchModeKey>
+      pop_highest_infra_mode();
+
+  static const std::shared_ptr<PyObject_TorchDispatchMode>& get_stack_at(
+      int64_t idx);
+  static int64_t stack_len();
+
+  static const std::optional<std::shared_ptr<PyObject_TorchDispatchMode>>
+  get_mode(TorchDispatchModeKey mode_key);
+  static const std::optional<std::shared_ptr<PyObject_TorchDispatchMode>>
+  unset_mode(TorchDispatchModeKey mode_key);
+  static void set_mode(
+      const std::shared_ptr<PyObject_TorchDispatchMode>& mode,
+      TorchDispatchModeKey mode_key);
+
+  static const TorchDispatchModeTLS& get_state();
+  static void set_state(TorchDispatchModeTLS state);
+
+  static bool any_modes_set(bool skip_infra_modes = false);
+
+ private:
+  std::vector<std::shared_ptr<PyObject_TorchDispatchMode>> stack_;
+  // Users are allowed to push multiple ProxyTorchDispatchMode objects onto the
+  // stack.
+  // However, we only allow a single FakeTensorMode onto the stack at a time
+  // (pushing additional FakeTensorModes onto the stack is a no-op).
+  std::array<
+      std::optional<std::shared_ptr<PyObject_TorchDispatchMode>>,
+      static_cast<size_t>(TorchDispatchModeKey::NUM_MODE_KEYS)>
+      infra_modes_; // one optional slot per TorchDispatchModeKey value
+};
+
+C10_API bool dispatch_mode_enabled();
+
+C10_API std::string to_string(TorchDispatchModeKey mode_key);
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..16b1970bfa1bbc7d6dc9c1a0463d17f3cb08b9fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h
@@ -0,0 +1,117 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+
+namespace c10::impl {
+
+/**
+ * An implementation of DeviceGuardImplInterface which delegates
+ * to virtual dispatch on the DeviceGuardImpl registry.
+ */
+class VirtualGuardImpl final : public DeviceGuardImplInterface {
+ public:
+  VirtualGuardImpl(DeviceType device_type) // looks the impl up by device type
+      : impl_(getDeviceGuardImpl(device_type)) {}
+  // This constructor exists purely for testing
+  VirtualGuardImpl(const DeviceGuardImplInterface* impl) : impl_(impl) {}
+
+  // Copying and moving is OK! (the only state is the non-owning impl_ pointer)
+  VirtualGuardImpl(const VirtualGuardImpl&) = default;
+  VirtualGuardImpl& operator=(const VirtualGuardImpl&) = default;
+  VirtualGuardImpl(VirtualGuardImpl&&) noexcept = default;
+  VirtualGuardImpl& operator=(VirtualGuardImpl&&) noexcept = default;
+  ~VirtualGuardImpl() override = default;
+
+  DeviceType type() const override { // every override below forwards to impl_
+    return impl_->type();
+  }
+  Device exchangeDevice(Device d) const override {
+    return impl_->exchangeDevice(d);
+  }
+  Device getDevice() const override {
+    return impl_->getDevice();
+  }
+  void setDevice(Device d) const override {
+    impl_->setDevice(d);
+  }
+  void uncheckedSetDevice(Device d) const noexcept override {
+    impl_->uncheckedSetDevice(d);
+  }
+  Stream getStream(Device d) const override {
+    return impl_->getStream(d);
+  }
+  Stream getNewStream(Device d, int priority = 0) const override {
+    return impl_->getNewStream(d, priority);
+  }
+  Stream getDefaultStream(Device d) const override {
+    return impl_->getDefaultStream(d);
+  }
+  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
+      const override {
+    return impl_->getStreamFromGlobalPool(d, isHighPriority);
+  }
+  Stream exchangeStream(Stream s) const override {
+    return impl_->exchangeStream(s);
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    return impl_->deviceCount();
+  }
+
+  DeviceCapability getDeviceCapability(Device d) const override {
+    return impl_->getDeviceCapability(d);
+  }
+
+  // Event functions (events are passed through as opaque void* handles)
+  void record(
+      void** event,
+      const Stream& stream,
+      const DeviceIndex device_index,
+      const EventFlag flag) const override {
+    impl_->record(event, stream, device_index, flag);
+  }
+  void block(void* event, const Stream& stream) const override {
+    impl_->block(event, stream);
+  }
+  bool queryEvent(void* event) const override {
+    return impl_->queryEvent(event);
+  }
+  void destroyEvent(void* event, const DeviceIndex device_index)
+      const noexcept override {
+    impl_->destroyEvent(event, device_index);
+  }
+
+  bool queryStream(const Stream& stream) const override {
+    return impl_->queryStream(stream);
+  }
+  void synchronizeStream(const Stream& stream) const override {
+    impl_->synchronizeStream(stream);
+  }
+
+  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
+      const override {
+    impl_->recordDataPtrOnStream(data_ptr, stream);
+  }
+
+  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
+      const override {
+    return impl_->elapsedTime(event1, event2, device_index);
+  }
+
+  void synchronizeEvent(void* event) const override {
+    impl_->synchronizeEvent(event);
+  }
+
+  void synchronizeDevice(const DeviceIndex device_index) const override {
+    impl_->synchronizeDevice(device_index);
+  }
+
+ private:
+  const DeviceGuardImplInterface* impl_ = nullptr; // never deleted by this class
+};
+
+} // namespace c10::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef28ed469f010d3aedeb5d68ad5405c2ffdaa055
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/impl/alloc_cpu.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+
+#include <cstddef>
+
+namespace c10 {
+
+C10_API void* alloc_cpu(size_t nbytes);
+C10_API void free_cpu(void* data);
+
+#if defined(__linux__) && !defined(__ANDROID__)
+C10_API size_t c10_compute_alignment(size_t nbytes);
+#endif
+
+#ifdef USE_MIMALLOC_ON_MKL
+namespace mi_malloc_wrapper {
+C10_API void* c10_mi_malloc(size_t size);
+C10_API void* c10_mi_calloc(size_t count, size_t size);
+C10_API void* c10_mi_realloc(void* p, size_t newsize);
+C10_API void* c10_mi_malloc_aligned(size_t size, size_t alignment);
+C10_API void c10_mi_free(void* p);
+} // namespace mi_malloc_wrapper
+#endif
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..85b9a73d6bfa7bdf5a815c6e659f0c4af6bd8ef8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/core/thread_pool.h
@@ -0,0 +1,125 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include <c10/macros/Export.h>
+#include <c10/util/Registry.h>
+#include <c10/util/numa.h>
+#include <c10/util/thread_name.h>
+
+namespace c10 {
+
+class C10_API TaskThreadPoolBase { // abstract interface for task thread pools
+ public:
+  virtual void run(std::function<void()> func) = 0; // submit func to the pool
+
+  virtual size_t size() const = 0; // presumably the pool's thread count
+
+  /**
+   * The number of available (i.e. idle) threads in this thread pool.
+   */
+  virtual size_t numAvailable() const = 0;
+
+  /**
+   * Check if the current thread is from the thread pool.
+   */
+  virtual bool inThreadPool() const = 0;
+
+  virtual ~TaskThreadPoolBase() noexcept = default;
+
+  static size_t defaultNumThreads(); // defined out of line; not visible here
+};
+
+class C10_API ThreadPool : public c10::TaskThreadPoolBase {
+ protected:
+  struct task_element_t { // one queued task; exactly one callable is set
+    bool run_with_id; // true when with_id holds the task, false for no_id
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+    const std::function<void()> no_id;
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+    const std::function<void(std::size_t)> with_id;
+
+    explicit task_element_t(std::function<void()> f)
+        : run_with_id(false), no_id(std::move(f)), with_id(nullptr) {}
+    explicit task_element_t(std::function<void(std::size_t)> f)
+        : run_with_id(true), no_id(nullptr), with_id(std::move(f)) {}
+  };
+
+  std::queue<task_element_t> tasks_; // pending tasks, FIFO
+  std::vector<std::thread> threads_;
+  mutable std::mutex mutex_; // locked by runTaskWithID() before touching tasks_
+  std::condition_variable condition_; // notified when a task is queued
+  std::condition_variable completed_;
+  std::atomic_bool running_;
+  bool complete_; // set to false whenever a task is queued
+  std::size_t available_;
+  std::size_t total_;
+  int numa_node_id_;
+
+ public:
+  ThreadPool() = delete;
+
+  explicit ThreadPool(
+      int pool_size,
+      int numa_node_id = -1,
+      const std::function<void()>& init_thread = nullptr);
+
+  ~ThreadPool() override;
+
+  size_t size() const override;
+
+  size_t numAvailable() const override;
+
+  bool inThreadPool() const override;
+
+  void run(std::function<void()> func) override;
+
+  template <typename Task>
+  void runTaskWithID(Task task) { // Task must convert to void(std::size_t)
+    std::unique_lock<std::mutex> lock(mutex_); // held until this function returns
+
+    // Queue the task and signal the condition variable so that one waiting
+    // worker thread will wake up and use the task.
+    tasks_.emplace(static_cast<std::function<void(std::size_t)>>(task));
+    complete_ = false;
+    condition_.notify_one();
+  }
+
+  /// @brief Wait for queue to be empty
+  void waitWorkComplete();
+
+ private:
+  // @brief Entry point for pool threads.
+  void main_loop(std::size_t index);
+};
+
+class C10_API TaskThreadPool : public c10::ThreadPool {
+ public:
+  explicit TaskThreadPool(int pool_size, int numa_node_id = -1)
+      : ThreadPool(pool_size, numa_node_id, [numa_node_id]() { // init_thread hook
+          setThreadName("CaffeTaskThread"); // name the calling thread
+          NUMABind(numa_node_id); // bind using the captured NUMA node id
+        }) {}
+};
+
+C10_DECLARE_SHARED_REGISTRY(
+    ThreadPoolRegistry,
+    TaskThreadPoolBase,
+    int,
+    int,
+    bool);
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h
new file mode 100644
index 0000000000000000000000000000000000000000..62995e142a3e84bf83e2e7143cdc6bc8eb67f91f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h
@@ -0,0 +1,42 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
+#include <thrust/binary_search.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#endif
+namespace c10::cuda {
+#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
+template <typename Iter, typename Scalar>
+__forceinline__ __device__ Iter
+lower_bound(Iter start, Iter end, Scalar value) {
+  return thrust::lower_bound(thrust::device, start, end, value);
+}
+#else
+// thrust::lower_bound is broken on device, see
+// https://github.com/NVIDIA/thrust/issues/1734 Implementation inspired by
+// https://github.com/pytorch/pytorch/blob/805120ab572efef66425c9f595d9c6c464383336/aten/src/ATen/native/cuda/Bucketization.cu#L28
+//
+// Binary search over [start, end): returns the first position whose
+// element is not less than value, or end if there is none. The range must
+// already be ordered with respect to operator<.
+template <typename Iter, typename Scalar>
+__device__ Iter lower_bound(Iter start, Iter end, Scalar value) {
+  while (start < end) {
+    auto mid = start + ((end - start) >> 1);
+    if (*mid < value) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+  return end;
+}
+#endif // THRUST_DEVICE_LOWER_BOUND_WORKS
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..286eb3daecb5aa73711392c839776ab5e0444275
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h
@@ -0,0 +1,211 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/AllocatorConfig.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Exception.h>
+#include <c10/util/env.h>
+
+namespace c10::cuda::CUDACachingAllocator {
+
+enum class Expandable_Segments_Handle_Type : int {
+  UNSPECIFIED = 0,
+  POSIX_FD = 1,
+  FABRIC_HANDLE = 2,
+};
+
+// Environment config parser
+class C10_CUDA_API CUDAAllocatorConfig {
+ public:
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
+  static size_t max_split_size() {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
+  }
+
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
+  static double garbage_collection_threshold() {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        garbage_collection_threshold();
+  }
+
+  static bool expandable_segments() {
+    bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
+        use_expandable_segments();
+#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
+    if (enabled) {
+      TORCH_WARN_ONCE("expandable_segments not supported on this platform")
+    }
+    return false;
+#else
+    return enabled;
+#endif
+  }
+
+  static Expandable_Segments_Handle_Type expandable_segments_handle_type() {
+    return instance().m_expandable_segments_handle_type;
+  }
+
+  static void set_expandable_segments_handle_type(
+      Expandable_Segments_Handle_Type handle_type) {
+    instance().m_expandable_segments_handle_type = handle_type;
+  }
+
+  static bool release_lock_on_cudamalloc() {
+    return instance().m_release_lock_on_cudamalloc;
+  }
+
+  static bool graph_capture_record_stream_reuse() {
+    return instance().m_graph_capture_record_stream_reuse;
+  }
+
+  static double per_process_memory_fraction() {
+    return instance().m_per_process_memory_fraction;
+  }
+
+  /** Pinned memory allocator settings */
+  static bool pinned_use_cuda_host_register() {
+    return instance().m_pinned_use_cuda_host_register;
+  }
+
+  static size_t pinned_num_register_threads() {
+    return instance().m_pinned_num_register_threads;
+  }
+
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
+  static bool pinned_use_background_threads() {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        pinned_use_background_threads();
+  }
+
+  static size_t pinned_reserve_segment_size_mb() {
+    return instance().m_pinned_reserve_segment_size_mb;
+  }
+
+  static size_t pinned_max_register_threads() {
+    // Based on the benchmark results, we see better allocation performance
+    // with 8 threads. However, future systems may need more threads, so the
+    // upper limit is set to 128 threads.
+    return 128;
+  }
+
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
+  static size_t roundup_power2_divisions(size_t size) {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        roundup_power2_divisions(size);
+  }
+
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
+  static std::vector<size_t> roundup_power2_divisions() {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        roundup_power2_divisions();
+  }
+
+  static size_t max_non_split_rounding_size() {
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        max_non_split_rounding_size();
+  }
+
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
+  static std::string last_allocator_settings() {
+    return c10::CachingAllocator::getAllocatorSettings();
+  }
+
+  static CUDAAllocatorConfig& instance() {
+    static CUDAAllocatorConfig* s_instance = ([]() {
+      auto inst = new CUDAAllocatorConfig();
+      auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
+#ifdef USE_ROCM
+      // convenience for ROCm users, allow alternative HIP token
+      if (!env.has_value()) {
+        env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
+      }
+#endif
+      // Note: keep the parsing order and logic stable to avoid potential
+      // performance regressions in internal tests.
+      if (!env.has_value()) {
+        env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
+      }
+      if (env.has_value()) {
+        inst->parseArgs(env.value());
+      }
+      return inst;
+    })();
+    return *s_instance;
+  }
+
+  // Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
+  // issue.
+  static const std::unordered_set<std::string>& getKeys() {
+    static std::unordered_set<std::string> keys{
+        "backend",
+        // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
+        // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_cud"
+        "amalloc",
+        "pinned_use_cud"
+        "a_host_register",
+        // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_hipmalloc",
+        "pinned_use_hip_host_register",
+        "graph_capture_record_stream_reuse",
+        "pinned_reserve_segment_size_mb",
+        "pinned_num_register_threads",
+        "per_process_memory_fraction"};
+    return keys;
+  }
+
+  void parseArgs(const std::string& env);
+
+ private:
+  CUDAAllocatorConfig() = default;
+
+  size_t parseAllocatorConfig(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i,
+      bool& used_cudaMallocAsync);
+  size_t parsePinnedUseCudaHostRegister(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);
+  size_t parsePinnedNumRegisterThreads(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);
+  size_t parsePinnedReserveSegmentSize(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);
+  size_t parseGraphCaptureRecordStreamReuse(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);
+  double parsePerProcessMemoryFraction(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);
+
+  std::atomic<size_t> m_pinned_num_register_threads{1};
+  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
+  std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
+#if CUDA_VERSION >= 12030
+      {Expandable_Segments_Handle_Type::UNSPECIFIED};
+#else
+      {Expandable_Segments_Handle_Type::POSIX_FD};
+#endif
+  std::atomic<bool> m_release_lock_on_cudamalloc{false};
+  std::atomic<bool> m_pinned_use_cuda_host_register{false};
+  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
+  std::atomic<double> m_per_process_memory_fraction{1.0};
+};
+
+// Keep this for backwards compatibility
+using c10::CachingAllocator::setAllocatorSettings;
+
+} // namespace c10::cuda::CUDACachingAllocator
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..b425157814aa15296d38633501e47035e2804130
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h
@@ -0,0 +1,582 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/AllocatorConfig.h>
+#include <c10/core/CachingDeviceAllocator.h>
+#include <c10/cuda/CUDAAllocatorConfig.h>
+#include <c10/cuda/CUDAGraphsC10Utils.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/util/ApproximateClock.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Registry.h>
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+namespace c10 {
+
+// Caching allocator will execute every registered callback if it is unable to
+// find a block inside the already allocated area.
+class C10_CUDA_API FreeMemoryCallback {
+ public:
+  virtual ~FreeMemoryCallback() = default;
+  virtual bool Execute() = 0;
+};
+
+C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
+#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__)
+} // namespace c10
+  //
+// TODO: Turn this into an honest to goodness class. I briefly attempted to do
+// this, but it was a bit irritating to figure out how to also correctly
+// apply pimpl pattern so I didn't have to leak any internal implementation
+// details in the header (CUDACachingAllocator could be made a pimpl, but
+// you also need to appropriately define a class which is a subclass
+// of Allocator. Not impossible, but required a bit more surgery than
+// I wanted to do at the time.)
+//
+// Why is this using a namespace rather than old-style THCCachingAllocator_
+// prefix?  Mostly because it made the HIPify rules easier to write; _ is
+// not counted as a word boundary, so you would otherwise have to list each
+// of these functions.
+
+namespace c10::cuda::CUDACachingAllocator {
+
+// Preserved only for BC reasons
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using c10::CachingAllocator::kLargeBuffer;
+using c10::CachingDeviceAllocator::DeviceStats;
+
+typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
+
+// Struct containing info of an allocation block (i.e. a fractional part of a
+// cudaMalloc).
+struct BlockInfo {
+  size_t size = 0;
+  size_t requested_size = 0;
+  int32_t gc_counter = 0;
+  bool allocated = false;
+  bool active = false;
+  std::shared_ptr<GatheredContext>
+      context_when_allocated; // per-watcher context
+};
+
+// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
+struct SegmentInfo {
+  c10::DeviceIndex device = 0;
+  size_t address = 0;
+  size_t total_size = 0;
+  size_t requested_size = 0; // unrounded, actually requested size
+  size_t allocated_size = 0;
+  size_t active_size = 0;
+  cudaStream_t stream = nullptr;
+  bool is_large = false;
+  bool is_expandable = false;
+  MempoolId_t owner_private_pool_id = {0, 0};
+  std::vector<BlockInfo> blocks;
+  std::shared_ptr<GatheredContext> context_when_allocated;
+};
+
+struct AllocatorState {
+  virtual ~AllocatorState() = default;
+};
+
+union trace_time_ {
+  time_t t_;
+  approx_time_t approx_t_;
+};
+
+struct TraceEntry {
+  enum Action {
+    ALLOC, // API call made to the caching allocator for new memory
+    FREE_REQUESTED, // API call made to the caching allocator to free memory
+    FREE_COMPLETED, // The allocator might have to delay a free because
+                    // it is still in use on another stream via record_stream
+                    // This event is generated when a free actually completes.
+    SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
+    SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
+                  // defragment or empty_caches)
+    SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments)
+    SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments)
+    SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
+              // events
+    OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
+        // bytes reported by cuda)
+  };
+  TraceEntry(
+      Action action,
+      c10::DeviceIndex device,
+      size_t addr,
+      size_t size,
+      cudaStream_t stream,
+      MempoolId_t mempool,
+      approx_time_t time,
+      std::shared_ptr<GatheredContext> context = nullptr,
+      std::string compile_context = "",
+      std::string user_metadata = "")
+      : action_(action),
+        device_(device),
+        addr_(addr),
+        context_(std::move(context)),
+        stream_(stream),
+        size_(size),
+        mempool_(std::move(mempool)),
+        compile_context_(std::move(compile_context)),
+        user_metadata_(std::move(user_metadata)) {
+    time_.approx_t_ = time;
+  }
+  Action action_;
+  c10::DeviceIndex device_;
+  size_t addr_; // for OOM, this is the amount of free bytes reported by cuda
+  std::shared_ptr<GatheredContext> context_;
+  cudaStream_t stream_{};
+  size_t size_;
+  MempoolId_t mempool_;
+  trace_time_ time_{};
+  std::string compile_context_;
+  std::string user_metadata_;
+};
+
+// Calls made by record_function will save annotations
+struct AnnotationEntry {
+  AnnotationEntry(c10::DeviceIndex device, approx_time_t time)
+      : device_(device) {
+    time_.approx_t_ = time;
+  }
+
+  void recordUserMetadata(const std::string& name, std::string value) {
+    metadata_[name] = std::move(value);
+  }
+
+  c10::DeviceIndex device_;
+  trace_time_ time_{};
+  std::unordered_map<std::string, std::string> metadata_;
+};
+
+struct AllocatorConfigInfo {
+  double garbage_collection_threshold;
+  size_t max_split_size;
+  size_t pinned_num_register_threads;
+  bool expandable_segments;
+  bool release_lock_on_malloc;
+  bool pinned_use_host_register;
+  bool graph_capture_record_stream_reuse;
+  std::string last_allocator_settings;
+  std::vector<size_t> roundup_power2_divisions;
+};
+
+struct SnapshotInfo {
+  std::vector<SegmentInfo> segments;
+  std::vector<std::vector<TraceEntry>> device_traces;
+  std::vector<AnnotationEntry> external_annotations;
+  AllocatorConfigInfo config_metadata;
+};
+
+// returns the pointers freed in the pool
+// and the pointers allocated. Note: a pointer
+// may appear in both freed and allocated
+struct CheckpointDelta {
+  std::vector<void*> ptrs_freed;
+  std::vector<at::DataPtr> dataptrs_allocd;
+};
+
+enum struct RecordContext {
+  NEVER = 0,
+  STATE = 1, // only keep stacks for active allocations
+  ALLOC = 2, // additionally keep stacks for allocations in the trace history
+  ALL = 3, // additionally record stacks for when something is freed
+};
+
+using OutOfMemoryObserver = std::function<void(
+    int64_t device,
+    size_t allocated,
+    size_t device_total,
+    size_t device_free)>;
+
+using AllocatorTraceTracker = std::function<void(const TraceEntry&)>;
+
+struct ShareableHandle {
+  ptrdiff_t offset;
+  std::string handle;
+};
+
+struct StreamSegmentSize {
+  StreamSegmentSize(cudaStream_t s, bool small, size_t sz)
+      : stream(s), is_small_pool(small), total_size(sz) {}
+  cudaStream_t stream;
+  bool is_small_pool;
+  size_t total_size;
+};
+
+class CUDAAllocator : public DeviceAllocator {
+ public:
+  virtual void* raw_alloc(size_t nbytes) = 0;
+  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
+  virtual void raw_delete(void* ptr) = 0;
+  virtual void init(int device_count) = 0;
+  virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
+  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
+  virtual std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+      c10::DeviceIndex device) = 0;
+  virtual void enable(bool value) = 0;
+  virtual bool isEnabled() const = 0;
+  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
+  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
+  // Keep for BC only
+  virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0;
+  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
+    CUDAStream cuda_stream = CUDAStream(stream);
+    recordStream(ptr, cuda_stream);
+  }
+  virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
+  virtual void beginAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      std::function<bool(cudaStream_t)> filter) = 0;
+  virtual void endAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id) = 0;
+  virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
+  virtual int getPoolUseCount(
+      c10::DeviceIndex /*device*/,
+      MempoolId_t /*mempool_id*/) {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support getPoolUseCount. "
+        "If you need it, please file an issue describing your use case.");
+  }
+  virtual void createOrIncrefPool(
+      c10::DeviceIndex /*device*/,
+      MempoolId_t /*mempool_id*/,
+      CUDAAllocator* allocator = nullptr) {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support createOrIncrefPool. "
+        "If you need it, please file an issue describing your use case.");
+  }
+  virtual void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support setUseOnOOM. "
+        "If you need it, please file an issue describing your use case.");
+  }
+  virtual void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support setNoSplit. "
+        "If you need it, please file an issue describing your use case.");
+  }
+
+  // returns true if the allocated blocks are equal to expected live allocations
+  virtual bool checkPoolLiveAllocations(
+      c10::DeviceIndex /*device*/,
+      MempoolId_t /*mempool_id*/,
+      const std::unordered_set<void*>& /*expected_live_allocations*/) {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support checkPoolLiveAllocations. "
+        "If you need it, please file an issue describing your use case.");
+  }
+  virtual ShareableHandle shareIpcHandle(void* ptr) = 0;
+  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
+  virtual bool isHistoryEnabled() {
+    TORCH_CHECK(
+        false,
+        name(),
+        " does not yet support recordHistory. "
+        "If you need it, please file an issue describing your use case.");
+  }
+  virtual void recordHistory(
+      bool enabled,
+      CreateContextFn context_recorder,
+      size_t alloc_trace_max_entries,
+      RecordContext when,
+      bool clearHistory) = 0;
+  virtual void recordAnnotation(
+      const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
+  virtual void pushCompileContext(std::string& md) {}
+  virtual void popCompileContext() {}
+  virtual void setUserMetadata(const std::string& metadata) {}
+  virtual std::string getUserMetadata() {
+    return "";
+  }
+  virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
+
+  // Attached AllocatorTraceTracker callbacks will be called while the
+  // per-device allocator lock is held. Any additional locks taken from within
+  // the callback must be proven to always have the lock order that never
+  // triggers a deadlock. In particular, Python's GIL may be held when
+  // calling the allocator so it is unsafe to try to acquire the GIL in this
+  // callback.
+  virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0;
+
+  virtual void enablePeerAccess(
+      c10::DeviceIndex dev,
+      c10::DeviceIndex dev_to_access) = 0;
+
+  // memory not allocated from cudaMalloc cannot be copied
+  // across devices using cudaMemcpyAsync if peer to peer access is disabled.
+  // instead it requires cudaMemcpyAsyncPeer
+  //  with P2P Enabled, all combinations work
+  //  with P2P Disabled:
+  //                       cudaMalloc cudaMallocAsync/cuMemMap
+  // cudaMemcpyAsyncPeer   works      works
+  // cudaMemcpyAsync       works      error
+
+  // This function chooses to use the Peer version of memcpy if
+  // required, based on where the allocator put dst/src.
+  virtual cudaError_t memcpyAsync(
+      void* dst,
+      int dstDevice,
+      const void* src,
+      int srcDevice,
+      size_t count,
+      cudaStream_t stream,
+      bool p2p_enabled) = 0;
+  virtual std::shared_ptr<AllocatorState> getCheckpointState(
+      c10::DeviceIndex device,
+      MempoolId_t id) = 0;
+  virtual CheckpointDelta setCheckpointPoolState(
+      c10::DeviceIndex device,
+      std::shared_ptr<AllocatorState> pps) = 0;
+  virtual std::string name() = 0;
+  std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
+    c10::DeviceGuard device_guard({at::kCUDA, device});
+    size_t free = 0;
+    size_t total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    return {free, total};
+  }
+};
+
+// Allocator object, statically initialized
+// See BackendInitializer in CUDACachingAllocator.cpp.
+// Atomic loads on x86 are just normal loads,
+// (atomic stores are different), so reading this value
+// is no different than loading a pointer.
+C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;
+
+inline CUDAAllocator* get() {
+  return allocator.load();
+}
+
+// Called directly by clients.
+inline void* raw_alloc(size_t nbytes) {
+  return get()->raw_alloc(nbytes);
+}
+
+inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
+  return get()->raw_alloc_with_stream(nbytes, stream);
+}
+
+inline void raw_delete(void* ptr) {
+  get()->raw_delete(ptr);
+}
+
+inline void init(int device_count) {
+  get()->init(device_count);
+}
+
+inline double getMemoryFraction(c10::DeviceIndex device) {
+  return get()->getMemoryFraction(device);
+}
+
+inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
+  get()->setMemoryFraction(fraction, device);
+}
+
+inline std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+    c10::DeviceIndex device) {
+  return get()->getExpandableSegmentSizes(device);
+}
+
+inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
+  get()->emptyCache(mempool_id);
+}
+
+inline void enable(bool value) {
+  get()->enable(value);
+}
+
+inline bool isEnabled() {
+  return get()->isEnabled();
+}
+
+inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
+  get()->cacheInfo(device, largestBlock);
+}
+
+inline void* getBaseAllocation(void* ptr, size_t* size) {
+  return get()->getBaseAllocation(ptr, size);
+}
+
+inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
+  get()->recordStream(dataPtr, stream);
+}
+
+inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
+    c10::DeviceIndex device) {
+  return get()->getDeviceStats(device);
+}
+
+inline void resetAccumulatedStats(c10::DeviceIndex device) {
+  get()->resetAccumulatedStats(device);
+}
+
+inline void resetPeakStats(c10::DeviceIndex device) {
+  get()->resetPeakStats(device);
+}
+
+inline SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) {
+  return get()->snapshot(mempool_id);
+}
+
+inline std::shared_ptr<AllocatorState> getCheckpointState(
+    c10::DeviceIndex device,
+    MempoolId_t id) {
+  return get()->getCheckpointState(device, id);
+}
+
+inline CheckpointDelta setCheckpointPoolState(
+    c10::DeviceIndex device,
+    std::shared_ptr<AllocatorState> pps) {
+  return get()->setCheckpointPoolState(device, std::move(pps));
+}
+
+// CUDAGraph interactions
+inline void beginAllocateToPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    std::function<bool(cudaStream_t)> filter) {
+  get()->beginAllocateToPool(device, mempool_id, std::move(filter));
+}
+
+inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->endAllocateToPool(device, mempool_id);
+}
+
+inline void recordHistory(
+    bool enabled,
+    CreateContextFn context_recorder,
+    size_t alloc_trace_max_entries,
+    RecordContext when,
+    bool clearHistory) {
+  get()->recordHistory(
+      enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
+}
+
+inline void recordAnnotation(
+    const std::vector<std::pair<std::string, std::string>>& md) {
+  get()->recordAnnotation(md);
+}
+
+inline void pushCompileContext(std::string& md) {
+  get()->pushCompileContext(md);
+}
+
+inline void popCompileContext() {
+  get()->popCompileContext();
+}
+
+inline bool isHistoryEnabled() {
+  return get()->isHistoryEnabled();
+}
+
+inline bool checkPoolLiveAllocations(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    const std::unordered_set<void*>& expected_live_allocations) {
+  return get()->checkPoolLiveAllocations(
+      device, mempool_id, expected_live_allocations);
+}
+
+inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
+  get()->attachOutOfMemoryObserver(std::move(observer));
+}
+
+inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
+  get()->attachAllocatorTraceTracker(std::move(tracker));
+}
+
+inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->releasePool(device, mempool_id);
+}
+inline void createOrIncrefPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    CUDAAllocator* allocator_ptr = nullptr) {
+  get()->createOrIncrefPool(device, mempool_id, allocator_ptr);
+}
+inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->setUseOnOOM(device, mempool_id);
+}
+inline void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  get()->setNoSplit(device, mempool_id);
+}
+inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return get()->getPoolUseCount(device, mempool_id);
+}
+
+// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
+inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
+  return get()->getIpcDevPtr(std::move(handle));
+}
+
+inline ShareableHandle shareIpcHandle(void* ptr) {
+  return get()->shareIpcHandle(ptr);
+}
+
+inline std::string name() {
+  return get()->name();
+}
+
+inline cudaError_t memcpyAsync(
+    void* dst,
+    int dstDevice,
+    const void* src,
+    int srcDevice,
+    size_t count,
+    cudaStream_t stream,
+    bool p2p_enabled) {
+  return get()->memcpyAsync(
+      dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
+}
+
+inline void enablePeerAccess(
+    c10::DeviceIndex dev,
+    c10::DeviceIndex dev_to_access) {
+  get()->enablePeerAccess(dev, dev_to_access);
+}
+
+inline void setUserMetadata(const std::string& metadata) {
+  get()->setUserMetadata(metadata);
+}
+
+inline std::string getUserMetadata() {
+  return get()->getUserMetadata();
+}
+
+} // namespace c10::cuda::CUDACachingAllocator
+
+namespace c10::cuda {
+// Keep BC only
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h
new file mode 100644
index 0000000000000000000000000000000000000000..294734601cb78d68aff50da939b3452c948adb80
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/cuda/CUDAException.h>
+#include <c10/macros/Macros.h>
+
+namespace c10::cuda {
+
+#ifdef TORCH_USE_CUDA_DSA
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
+// Copy the NUL-terminated string `src` to `dst`, truncating it if needed
+static __device__ void dstrcpy(char* dst, const char* src) {
+  int i = 0;
+  // Copy at most `C10_CUDA_DSA_MAX_STR_LEN-1` characters from source to
+  // destination, leaving room for the terminator written after the loop
+  while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
+    *dst++ = *src++;
+  }
+  *dst = '\0'; // always terminate, including when the copy was cut short
+}
+
+static __device__ void dsa_add_new_assertion_failure(
+    DeviceAssertionsData* assertions_data,
+    const char* assertion_msg,
+    const char* filename,
+    const char* function_name,
+    const int line_number,
+    const uint32_t caller,
+    const dim3 block_id,
+    const dim3 thread_id) {
+  // `assertions_data` may be nullptr if device-side assertion checking
+  // is disabled at run-time. If it is disabled at compile time this
+  // function will never be called
+  if (!assertions_data) {
+    return;
+  }
+
+  // Atomically increment so other threads can fail at the same time
+  // Note that incrementing this means that the CPU can observe that
+  // a failure has happened and can begin to respond before we've
+  // written information about that failure out to the buffer.
+  const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
+
+  if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
+    // At this point we've run out of assertion buffer space.
+    // We could print a message about this, but that'd get
+    // spammy if a lot of threads did it, so we just silently
+    // ignore any other assertion failures. In most cases the
+    // failures will all probably be analogous anyway.
+    return;
+  }
+
+  // Write information about the assertion failure to memory.
+  // Note that this occurs only after the `assertion_count`
+  // increment broadcasts that there's been a problem.
+  auto& self = assertions_data->assertions[nid];
+  dstrcpy(self.assertion_msg, assertion_msg);
+  dstrcpy(self.filename, filename);
+  dstrcpy(self.function_name, function_name);
+  self.line_number = line_number;
+  self.caller = caller;
+  self.block_id[0] = block_id.x;
+  self.block_id[1] = block_id.y;
+  self.block_id[2] = block_id.z;
+  self.thread_id[0] = thread_id.x;
+  self.thread_id[1] = thread_id.y;
+  self.thread_id[2] = thread_id.z;
+}
+C10_CLANG_DIAGNOSTIC_POP()
+
+// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
+// so you should assume everything the kernel produces is garbage if there's an
+// assertion failure.
+// NOTE: This assumes that `assertions_data` and  `assertion_caller_id` are
+//       arguments of the kernel and therefore accessible.
+#define CUDA_KERNEL_ASSERT2(condition)                                   \
+  do {                                                                   \
+    if (C10_UNLIKELY(!(condition))) {                                    \
+      /* Has an atomic element so threads can fail at the same time */   \
+      c10::cuda::dsa_add_new_assertion_failure(                          \
+          assertions_data,                                               \
+          C10_STRINGIZE(condition),                                      \
+          __FILE__,                                                      \
+          __FUNCTION__,                                                  \
+          __LINE__,                                                      \
+          assertion_caller_id,                                           \
+          blockIdx,                                                      \
+          threadIdx);                                                    \
+      /* Now that the kernel has failed we early exit the kernel, but */ \
+      /* otherwise keep going and rely on the host to check UVM and */   \
+      /* determine we've had a problem */                                \
+      return;                                                            \
+    }                                                                    \
+  } while (false)
+#else
+#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
+#endif
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d4921a100a1c73e2fd5a69284cd92435b7f70f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h
@@ -0,0 +1,169 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/cuda/CUDAMacros.h>
+
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef USE_CUDA
+#define TORCH_USE_CUDA_DSA
+#endif
+
+/// Number of assertion failure messages we can store. If this is too small
+/// threads will fail silently.
+constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
+constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
+
+namespace c10::cuda {
+
+/// Holds information about any device-side assertions that fail.
+/// Held in managed memory and accessed by both the CPU and the GPU.
+struct DeviceAssertionData {
+  /// Stringification of the assertion
+  // NOLINTNEXTLINE(*-c-arrays)
+  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
+  /// File the assertion was in
+  // NOLINTNEXTLINE(*-c-arrays)
+  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
+  /// Name of the function the assertion was in
+  // NOLINTNEXTLINE(*-c-arrays)
+  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
+  /// Line number the assertion was at
+  int line_number{};
+  /// Number uniquely identifying the kernel launch that triggered the assertion
+  uint32_t caller{};
+  /// block_id of the thread that failed the assertion
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t block_id[3]{};
+  /// thread_id of the thread that failed the assertion
+  // NOLINTNEXTLINE(*-c-arrays)
+  int32_t thread_id[3]{};
+};
+
+/// Used to hold assertions generated by the device.
+/// Held in managed memory and accessed by both the CPU and the GPU.
+struct DeviceAssertionsData {
+  /// Total number of assertions found; a subset of these will be recorded
+  /// in `assertions`
+  int32_t assertion_count{};
+  /// An array of assertions that will be written to in a race-free manner
+  // NOLINTNEXTLINE(*-c-arrays)
+  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
+};
+
+/// Used to hold info about kernel launches so that we can run kernels
+/// asynchronously and still associate launches with device-side
+/// assertion failures
+struct CUDAKernelLaunchInfo {
+  /// Filename of the code where the kernel was launched from
+  const char* launch_filename;
+  /// Function from which the kernel was launched
+  const char* launch_function;
+  /// Line number of where the code was launched from
+  uint32_t launch_linenum;
+  /// Backtrace of where the kernel was launched from, only populated if
+  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
+  std::string launch_stacktrace;
+  /// Kernel that was launched
+  const char* kernel_name;
+  /// Device the kernel was launched on
+  int device;
+  /// Stream the kernel was launched on
+  int32_t stream;
+  /// A number that uniquely identifies the kernel launch
+  uint64_t generation_number;
+};
+
+/// Circular buffer used to hold information about kernel launches;
+/// this is later used to reconstruct how a device-side kernel assertion failure
+/// occurred. CUDAKernelLaunchRegistry is used as a singleton.
+class C10_CUDA_API CUDAKernelLaunchRegistry {
+ private:
+  /// Assume that this is the max number of kernel launches that might ever be
+  /// enqueued across all streams on a single device
+  static constexpr int max_kernel_launches = 1024;
+  /// How many kernel launch infos we've inserted. Used to ensure that the
+  /// circular queue doesn't provide false information by always increasing,
+  /// but also to mark where we are inserting into the queue
+#ifdef TORCH_USE_CUDA_DSA
+  uint64_t generation_number = 0;
+#endif
+  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
+  mutable std::mutex read_write_mutex;
+  /// Used to prevent race conditions in GPU memory allocation
+  mutable std::mutex gpu_alloc_mutex;
+  /// Pointer to managed memory keeping track of device-side assertions. There
+  /// is one entry for each possible device the process might work with. Unused
+  /// entries are nullptrs. We could also use an unordered_set here, but this
+  /// vector design will be faster and the wasted memory is small since we
+  /// expect the number of GPUs per node will always be small
+  std::vector<
+      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
+      uvm_assertions;
+  /// A single circular buffer holds information about every kernel launch the
+  /// process makes across all devices.
+  std::vector<CUDAKernelLaunchInfo> kernel_launches;
+  bool check_env_for_enable_launch_stacktracing() const;
+  bool check_env_for_dsa_enabled() const;
+
+ public:
+  CUDAKernelLaunchRegistry();
+  /// Register a new kernel launch and obtain a generation number back to be
+  /// passed to the kernel
+  uint32_t insert(
+      const char* launch_filename,
+      const char* launch_function,
+      const uint32_t launch_linenum,
+      const char* kernel_name,
+      const int32_t stream_id);
+  /// Get copies of the kernel launch registry and each device's assertion
+  /// failure buffer so they can be inspected without raising race conditions
+  std::
+      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
+      snapshot() const;
+  /// Get a pointer to the current device's assertion failure buffer. If no such
+  /// buffer exists then one is created. This means that the first kernel launch
+  /// made on each device will be slightly slower because memory allocations are
+  /// required
+  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
+  /// Gets the global singleton of the registry
+  static CUDAKernelLaunchRegistry& get_singleton_ref();
+  /// If not all devices support DSA, we disable it
+  const bool do_all_devices_support_managed_memory = false;
+  /// Whether or not to gather stack traces when launching kernels
+  bool gather_launch_stacktrace = false;
+  /// Whether or not host-side DSA is enabled at run-time
+  /// Note: Device-side code cannot be enabled/disabled at run-time
+  bool enabled_at_runtime = false;
+  /// Whether or not a device has indicated a failure
+  bool has_failed() const;
+#ifdef TORCH_USE_CUDA_DSA
+  const bool enabled_at_compile_time = true;
+#else
+  const bool enabled_at_compile_time = false;
+#endif
+};
+
+C10_CUDA_API std::string c10_retrieve_device_side_assertion_info();
+
+} // namespace c10::cuda
+
+// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
+// requires the same input arguments. We introduce the following macro to
+// standardize these.
+#define TORCH_DSA_KERNEL_ARGS                                              \
+  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
+      [[maybe_unused]] uint32_t assertion_caller_id
+
+// This macro can be used to pass the DSA arguments onward to another
+// function
+#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h
new file mode 100644
index 0000000000000000000000000000000000000000..71a5a9b86d8833ca28adad37f36061b201b2d5d5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAException.h
@@ -0,0 +1,102 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/cuda/CUDADeviceAssertionHost.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <c10/cuda/CUDAMiscFunctions.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+#include <cuda.h>
+
+// Note [CHECK macro]
+// ~~~~~~~~~~~~~~~~~~
+// This is a macro so that AT_ERROR can get accurate __LINE__
+// and __FILE__ information.  We could split this into a short
+// macro and a function implementation if we pass along __LINE__
+// and __FILE__, but no one has found this worth doing.
+
+// Used to denote errors from CUDA framework.
+// This needs to be declared here instead of util/Exception.h for proper conversion
+// during hipify.
+namespace c10 {
+class C10_CUDA_API CUDAError : public c10::Error {
+  using Error::Error;
+};
+} // namespace c10
+
+#define C10_CUDA_CHECK(EXPR)                                        \
+  do {                                                              \
+    const cudaError_t __err = EXPR;                                 \
+    c10::cuda::c10_cuda_check_implementation(                       \
+        static_cast<int32_t>(__err),                                \
+        __FILE__,                                                   \
+        __func__, /* Line number data type not well-defined between \
+                      compilers, so we perform an explicit cast */  \
+        static_cast<uint32_t>(__LINE__),                            \
+        true);                                                      \
+  } while (0)
+
+#define C10_CUDA_CHECK_WARN(EXPR)                              \
+  do {                                                         \
+    const cudaError_t __err = EXPR;                            \
+    if (C10_UNLIKELY(__err != cudaSuccess)) {                  \
+      [[maybe_unused]] auto error_unused = cudaGetLastError(); \
+      TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
+    }                                                          \
+  } while (0)
+
+// Indicates that a CUDA error is handled in a non-standard way
+#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
+
+// Intentionally ignore a CUDA error
+#define C10_CUDA_IGNORE_ERROR(EXPR)                                   \
+  do {                                                                \
+    const cudaError_t __err = EXPR;                                   \
+    if (C10_UNLIKELY(__err != cudaSuccess)) {                         \
+      [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
+    }                                                                 \
+  } while (0)
+
+// Clear the last CUDA error
+#define C10_CUDA_CLEAR_ERROR()                                      \
+  do {                                                              \
+    [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
+  } while (0)
+
+// This should be used directly after every kernel launch to ensure
+// the launch happened correctly and provide an early, close-to-source
+// diagnostic if it didn't.
+#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
+
+/// Launches a CUDA kernel appending to it all the information needed to handle
+/// device-side assertion failures. Checks that the launch was successful.
+#define TORCH_DSA_KERNEL_LAUNCH(                                      \
+    kernel, blocks, threads, shared_mem, stream, ...)                 \
+  do {                                                                \
+    auto& launch_registry =                                           \
+        c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();     \
+    kernel<<<blocks, threads, shared_mem, stream>>>(                  \
+        __VA_ARGS__,                                                  \
+        launch_registry.get_uvm_assertions_ptr_for_current_device(),  \
+        launch_registry.insert(                                       \
+            __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \
+    C10_CUDA_KERNEL_LAUNCH_CHECK();                                   \
+  } while (0)
+
+namespace c10::cuda {
+
+/// In the event of a CUDA failure, formats a nice error message about that
+/// failure and also checks for device-side assertion failures
+C10_CUDA_API void c10_cuda_check_implementation(
+    const int32_t err,
+    const char* filename,
+    const char* function_name,
+    const uint32_t line_number,
+    const bool include_device_assertions);
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..a97b3d89401a64afc834bbb3c573a4f1b2f21c22
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAFunctions.h
@@ -0,0 +1,131 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// This header provides C++ wrappers around commonly used CUDA API functions.
+// The benefit of using C++ here is that we can raise an exception in the
+// event of an error, rather than explicitly pass around error codes.  This
+// leads to more natural APIs.
+//
+// The naming convention used here matches the naming convention of torch.cuda
+
+#include <c10/core/Device.h>
+#include <c10/core/impl/GPUTrace.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <cuda_runtime_api.h>
+namespace c10::cuda {
+
+// NB: In the past, we were inconsistent about whether or not this reported
+// an error if there were driver problems or not.  Based on experience
+// interacting with users, it seems that people basically ~never want this
+// function to fail; it should just return zero if things are not working.
+// Oblige them.
+// It still might log a warning for the user the first time it's invoked.
+C10_CUDA_API DeviceIndex device_count() noexcept;
+
+// Version of device_count that throws if no devices are detected
+C10_CUDA_API DeviceIndex device_count_ensure_non_zero();
+
+C10_CUDA_API DeviceIndex current_device();
+
+C10_CUDA_API void set_device(DeviceIndex device, const bool force = false);
+
+C10_CUDA_API void device_synchronize();
+
+C10_CUDA_API void warn_or_error_on_sync();
+
+// Raw CUDA device management functions
+C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count);
+
+C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device);
+
+C10_CUDA_API cudaError_t
+SetDevice(DeviceIndex device, const bool force = false);
+
+C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device);
+
+C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device);
+
+C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device);
+
+C10_CUDA_API void SetTargetDevice();
+
+enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
+
+// this is a holder for c10 global state (similar to at GlobalContext)
+// currently it's used to store cuda synchronization warning state,
+// but can be expanded to hold other related global state, e.g. to
+// record stream usage
+class WarningState {
+ public:
+  void set_sync_debug_mode(SyncDebugMode l) {
+    sync_debug_mode = l;
+  }
+
+  SyncDebugMode get_sync_debug_mode() {
+    return sync_debug_mode;
+  }
+
+ private:
+  SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED;
+};
+
+C10_CUDA_API __inline__ WarningState& warning_state() {
+  static WarningState warning_state_;
+  return warning_state_;
+}
+// the subsequent functions are defined in the header because for performance
+// reasons we want them to be inline
+C10_CUDA_API void __inline__ memcpy_and_sync(
+    void* dst,
+    const void* src,
+    int64_t nbytes,
+    cudaMemcpyKind kind,
+    cudaStream_t stream) {
+  if (C10_UNLIKELY(
+          warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
+    warn_or_error_on_sync();
+  }
+  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+  if (C10_UNLIKELY(interp)) {
+    (*interp)->trace_gpu_stream_synchronization(
+        c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
+  }
+#if defined(USE_ROCM) && USE_ROCM
+  // As of ROCm 6.4.1, HIP runtime does not raise an error during capture of
+  // hipMemcpyWithStream which is a synchronous call. Thus, we add a check
+  // here explicitly.
+  hipStreamCaptureStatus captureStatus;
+  C10_CUDA_CHECK(hipStreamGetCaptureInfo(stream, &captureStatus, nullptr));
+  if (C10_LIKELY(captureStatus == hipStreamCaptureStatusNone)) {
+    C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
+  } else {
+    C10_CUDA_CHECK(hipErrorStreamCaptureUnsupported);
+  }
+#else
+  C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
+  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
+}
+
+C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) {
+  if (C10_UNLIKELY(
+          warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
+    warn_or_error_on_sync();
+  }
+  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+  if (C10_UNLIKELY(interp)) {
+    (*interp)->trace_gpu_stream_synchronization(
+        c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
+  }
+  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
+}
+
+C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index);
+C10_CUDA_API std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext();
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..176c9290c3906815228faf0bdb502c50260eb1e9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h
@@ -0,0 +1,81 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/cuda/CUDAStream.h>
+#include <iostream>
+#include <utility>
+
+// CUDA Graphs utils used by c10 and aten.
+// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only.
+
+namespace c10::cuda {
+
+// RAII guard for "cudaStreamCaptureMode", a thread-local value
+// that controls the error-checking strictness of a capture.
+struct C10_CUDA_API CUDAStreamCaptureModeGuard {
+  CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
+      : strictness_(desired) {
+    C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
+  }
+  CUDAStreamCaptureModeGuard(const CUDAStreamCaptureModeGuard&) = delete;
+  CUDAStreamCaptureModeGuard(CUDAStreamCaptureModeGuard&&) = delete;
+  CUDAStreamCaptureModeGuard& operator=(const CUDAStreamCaptureModeGuard&) =
+      delete;
+  CUDAStreamCaptureModeGuard& operator=(CUDAStreamCaptureModeGuard&&) = delete;
+  ~CUDAStreamCaptureModeGuard() {
+    C10_CUDA_CHECK_WARN(cudaThreadExchangeStreamCaptureMode(&strictness_));
+  }
+
+ private:
+  cudaStreamCaptureMode strictness_;
+};
+
+// Protects against enum cudaStreamCaptureStatus implementation changes.
+// Some compilers seem not to like static_assert without the messages.
+static_assert(
+    int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0,
+    "unexpected int(cudaStreamCaptureStatusNone) value");
+static_assert(
+    int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1,
+    "unexpected int(cudaStreamCaptureStatusActive) value");
+static_assert(
+    int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2,
+    "unexpected int(cudaStreamCaptureStatusInvalidated) value");
+
+enum class CaptureStatus : int {
+  None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone),
+  Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive),
+  Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated)
+};
+
+inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
+  switch (status) {
+    case CaptureStatus::None:
+      os << "cudaStreamCaptureStatusNone";
+      break;
+    case CaptureStatus::Active:
+      os << "cudaStreamCaptureStatusActive";
+      break;
+    case CaptureStatus::Invalidated:
+      os << "cudaStreamCaptureStatusInvalidated";
+      break;
+    default:
+      TORCH_INTERNAL_ASSERT(
+          false, "Unknown CUDA graph CaptureStatus", int(status));
+  }
+  return os;
+}
+
+// Use this version where you're sure a CUDA context exists already.
+inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
+  cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
+  C10_CUDA_CHECK(
+      cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
+  return CaptureStatus(is_capturing);
+}
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6ce4be26c07d3869fb4c7d7242fc220128fe8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAGuard.h
@@ -0,0 +1,311 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/core/impl/InlineStreamGuard.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <c10/cuda/impl/CUDAGuardImpl.h>
+
+namespace c10::cuda {
+
+// This code is kind of boilerplatey.  See Note [Whither the DeviceGuard
+// boilerplate]
+
+/// A variant of DeviceGuard that is specialized for CUDA.  It accepts
+/// integer indices (interpreting them as CUDA devices) and is a little
+/// more efficient than DeviceGuard (it compiles to straight line
+/// cudaSetDevice/cudaGetDevice calls); however, it can only be used
+/// from code that links against CUDA directly.
+struct CUDAGuard {
+  /// No default constructor; see Note [Omitted default constructor from RAII]
+  explicit CUDAGuard() = delete;
+
+  /// Set the current CUDA device to the passed device index.
+  explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {}
+
+  /// Sets the current CUDA device to the passed device.  Errors if the passed
+  /// device is not a CUDA device.
+  explicit CUDAGuard(Device device) : guard_(device) {}
+
+  // Copy is not allowed
+  CUDAGuard(const CUDAGuard&) = delete;
+  CUDAGuard& operator=(const CUDAGuard&) = delete;
+
+  // Move is not allowed (there is no uninitialized state)
+  CUDAGuard(CUDAGuard&& other) = delete;
+  CUDAGuard& operator=(CUDAGuard&& other) = delete;
+  ~CUDAGuard() = default;
+
+  /// Sets the CUDA device to the given device.  Errors if the given device
+  /// is not a CUDA device.
+  void set_device(Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the CUDA device to the given device.  Errors if the given device
+  /// is not a CUDA device.  (This method is provided for uniformity with
+  /// DeviceGuard).
+  void reset_device(Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the CUDA device to the given device index.
+  void set_index(DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set upon construction of the guard
+  Device original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the last device that was set via `set_device`, if any, otherwise
+  /// the device passed during construction.
+  Device current_device() const {
+    return guard_.current_device();
+  }
+
+ private:
+  /// The guard for the current device.
+  c10::impl::InlineDeviceGuard<impl::CUDAGuardImpl> guard_;
+};
+
+/// A variant of OptionalDeviceGuard that is specialized for CUDA.  See
+/// CUDAGuard for when you can use this.
+struct OptionalCUDAGuard {
+  /// Create an uninitialized OptionalCUDAGuard.
+  explicit OptionalCUDAGuard() = default;
+
+  /// Set the current CUDA device to the passed Device, if it is not nullopt.
+  explicit OptionalCUDAGuard(std::optional<Device> device_opt)
+      : guard_(device_opt) {}
+
+  /// Set the current CUDA device to the passed device index, if it is not
+  /// nullopt
+  explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt)
+      : guard_(device_index_opt) {}
+
+  // Copy is not allowed
+  OptionalCUDAGuard(const OptionalCUDAGuard&) = delete;
+  OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete;
+  ~OptionalCUDAGuard() = default;
+
+  /// Sets the CUDA device to the given device, initializing the guard if it
+  /// is not already initialized.  Errors if the given device is not a CUDA
+  /// device.
+  void set_device(Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the CUDA device to the given device, initializing the guard if it is
+  /// not already initialized.  Errors if the given device is not a CUDA device.
+  /// (This method is provided for uniformity with OptionalDeviceGuard).
+  void reset_device(Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the CUDA device to the given device index, initializing the guard if
+  /// it is not already initialized.
+  void set_index(DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set immediately prior to initialization of the
+  /// guard, or nullopt if the guard is uninitialized.
+  std::optional<Device> original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device, if the guard is initialized,
+  /// or nullopt if the guard is uninitialized.
+  std::optional<Device> current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Restore the original CUDA device, resetting this guard to uninitialized
+  /// state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalDeviceGuard<impl::CUDAGuardImpl> guard_;
+};
+
+/// A variant of StreamGuard that is specialized for CUDA.  See CUDAGuard
+/// for when you can use this.
+struct CUDAStreamGuard {
+  /// No default constructor, see Note [Omitted default constructor from RAII]
+  explicit CUDAStreamGuard() = delete;
+
+  /// Set the current CUDA device to the device associated with the passed
+  /// stream, and set the current CUDA stream on that device to the passed
+  /// stream. Errors if the Stream is not a CUDA stream.
+  explicit CUDAStreamGuard(Stream stream) : guard_(stream) {}
+  ~CUDAStreamGuard() = default;
+
+  /// Copy is disallowed
+  CUDAStreamGuard(const CUDAStreamGuard&) = delete;
+  CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete;
+
+  /// Move is disallowed, as CUDAStreamGuard does not have an uninitialized
+  /// state, which is required for moves on types with nontrivial destructors.
+  CUDAStreamGuard(CUDAStreamGuard&& other) = delete;
+  CUDAStreamGuard& operator=(CUDAStreamGuard&& other) = delete;
+
+  /// Resets the currently set stream to the original stream and
+  /// the currently set device to the original device.  Then,
+  /// set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  /// Errors if the stream passed is not a CUDA stream.
+  ///
+  /// NOTE: this implementation may skip some stream/device setting if
+  /// it can prove that it is unnecessary.
+  ///
+  /// WARNING: reset_stream does NOT preserve previously set streams on
+  /// different devices.  If you need to set streams on multiple devices
+  /// on CUDA, use CUDAMultiStreamGuard instead.
+  void reset_stream(Stream stream) {
+    guard_.reset_stream(stream);
+  }
+
+  /// Returns the CUDA stream that was set at the time the guard was
+  /// constructed.
+  CUDAStream original_stream() const {
+    return CUDAStream(CUDAStream::UNCHECKED, guard_.original_stream());
+  }
+
+  /// Returns the most recent CUDA stream that was set using this stream guard,
+  /// either from construction, or via reset_stream.
+  CUDAStream current_stream() const {
+    return CUDAStream(CUDAStream::UNCHECKED, guard_.current_stream());
+  }
+
+  /// Returns the most recent CUDA device that was set using this stream guard,
+  /// either from construction, or via reset_stream.
+  Device current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Returns the CUDA device that was set at the most recent reset_stream(),
+  /// or otherwise the device at construction time.
+  Device original_device() const {
+    return guard_.original_device();
+  }
+
+ private:
+  c10::impl::InlineStreamGuard<impl::CUDAGuardImpl> guard_;
+};
+
+/// A variant of OptionalStreamGuard that is specialized for CUDA.  See
+/// CUDAGuard for when you can use this.
+struct OptionalCUDAStreamGuard {
+  /// Create an uninitialized guard.
+  explicit OptionalCUDAStreamGuard() = default;
+
+  /// Set the current CUDA device to the device associated with the passed
+  /// stream, and set the current CUDA stream on that device to the passed
+  /// stream. Errors if the Stream is not a CUDA stream.
+  explicit OptionalCUDAStreamGuard(Stream stream) : guard_(stream) {}
+
+  /// Set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream,
+  /// if the passed stream is not nullopt.
+  explicit OptionalCUDAStreamGuard(std::optional<Stream> stream_opt)
+      : guard_(stream_opt) {}
+
+  /// Copy is disallowed
+  OptionalCUDAStreamGuard(const OptionalCUDAStreamGuard&) = delete;
+  OptionalCUDAStreamGuard& operator=(const OptionalCUDAStreamGuard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  OptionalCUDAStreamGuard(OptionalCUDAStreamGuard&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  OptionalCUDAStreamGuard& operator=(OptionalCUDAStreamGuard&& other) = delete;
+  ~OptionalCUDAStreamGuard() = default;
+
+  /// Resets the currently set CUDA stream to the original stream and
+  /// the currently set device to the original device.  Then,
+  /// set the current device to the device associated with the passed stream,
+  /// and set the current stream on that device to the passed stream.
+  /// Initializes the guard if it was not previously initialized.
+  void reset_stream(Stream stream) {
+    guard_.reset_stream(stream);
+  }
+
+  /// Returns the CUDA stream that was set at the time the guard was most
+  /// recently initialized, or nullopt if the guard is uninitialized.
+  std::optional<CUDAStream> original_stream() const {
+    auto r = guard_.original_stream();
+    if (r.has_value()) {
+      return CUDAStream(CUDAStream::UNCHECKED, r.value());
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  /// Returns the most recent CUDA stream that was set using this stream guard,
+  /// either from construction, or via reset_stream, if the guard is
+  /// initialized, or nullopt if the guard is uninitialized.
+  std::optional<CUDAStream> current_stream() const {
+    auto r = guard_.current_stream();
+    if (r.has_value()) {
+      return CUDAStream(CUDAStream::UNCHECKED, r.value());
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  /// Restore the original CUDA device and stream, resetting this guard to
+  /// uninitialized state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalStreamGuard<impl::CUDAGuardImpl> guard_;
+};
+
+/// A variant of MultiStreamGuard that is specialized for CUDA.
+struct CUDAMultiStreamGuard {
+  explicit CUDAMultiStreamGuard(ArrayRef<CUDAStream> streams)
+      : guard_(unwrapStreams(streams)) {}
+
+  /// Copy is disallowed
+  CUDAMultiStreamGuard(const CUDAMultiStreamGuard&) = delete;
+  CUDAMultiStreamGuard& operator=(const CUDAMultiStreamGuard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  CUDAMultiStreamGuard(CUDAMultiStreamGuard&& other) = delete;
+
+  // See Note [Move assignment for RAII guards is tricky]
+  CUDAMultiStreamGuard& operator=(CUDAMultiStreamGuard&& other) = delete;
+  ~CUDAMultiStreamGuard() = default;
+
+ private:
+  c10::impl::InlineMultiStreamGuard<impl::CUDAGuardImpl> guard_;
+
+  static std::vector<Stream> unwrapStreams(ArrayRef<CUDAStream> cudaStreams) {
+    std::vector<Stream> streams;
+    streams.reserve(cudaStreams.size());
+    for (const CUDAStream& cudaStream : cudaStreams) {
+      streams.push_back(cudaStream);
+    }
+    return streams;
+  }
+};
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h
new file mode 100644
index 0000000000000000000000000000000000000000..93b371ce6ee854d074f6d47d0481c2a193e07d69
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMacros.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+
+// We have not yet modified the AMD HIP build to generate this file so
+// we add an extra option to specifically ignore it.
+#ifndef C10_CUDA_NO_CMAKE_CONFIGURE_FILE
+#include <c10/cuda/impl/cuda_cmake_macros.h>
+#endif // C10_CUDA_NO_CMAKE_CONFIGURE_FILE
+
+#endif
+
+// See c10/macros/Export.h for a detailed explanation of what the function
+// of these macros are.  We need one set of macros for every separate library
+// we build.
+
+#ifdef _WIN32
+#if defined(C10_CUDA_BUILD_SHARED_LIBS)
+#define C10_CUDA_EXPORT __declspec(dllexport)
+#define C10_CUDA_IMPORT __declspec(dllimport)
+#else
+#define C10_CUDA_EXPORT
+#define C10_CUDA_IMPORT
+#endif
+#else // _WIN32
+#if defined(__GNUC__)
+#define C10_CUDA_EXPORT __attribute__((__visibility__("default")))
+#else // defined(__GNUC__)
+#define C10_CUDA_EXPORT
+#endif // defined(__GNUC__)
+#define C10_CUDA_IMPORT C10_CUDA_EXPORT
+#endif // _WIN32
+
+// This one is being used by libc10_cuda.so
+#ifdef C10_CUDA_BUILD_MAIN_LIB
+#define C10_CUDA_API C10_CUDA_EXPORT
+#else
+#define C10_CUDA_API C10_CUDA_IMPORT
+#endif
+
+/**
+ * The maximum number of GPUs that we recognize. Increasing this beyond the
+ * initial limit of 16 broke Caffe2 testing, hence the ifdef guards.
+ * This value cannot be more than 128 because our DeviceIndex is an int8_t.
+ */
+#ifdef FBCODE_CAFFE2
+// fbcode depends on this value being 16
+#define C10_COMPILE_TIME_MAX_GPUS 16
+#else
+#define C10_COMPILE_TIME_MAX_GPUS 120
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec08cde0c1b71c9a0c8dd586e4fa7f6760e230f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMathCompat.h
@@ -0,0 +1,157 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+/* This file defines math functions compatible across different gpu
+ * platforms (currently CUDA and HIP).
+ */
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+
+#ifdef __HIPCC__
+#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE
+#else /* __HIPCC__ */
+#ifdef __CUDACC_RTC__
+#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE
+#endif /* __CUDACC_RTC__ */
+#endif /* __HIPCC__ */
+
+namespace c10::cuda::compat {
+
+__MATH_FUNCTIONS_DECL__ float abs(float x) {
+  return ::fabsf(x);
+}
+__MATH_FUNCTIONS_DECL__ double abs(double x) {
+  return ::fabs(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float exp(float x) {
+  return ::expf(x);
+}
+__MATH_FUNCTIONS_DECL__ double exp(double x) {
+  return ::exp(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float ceil(float x) {
+  return ::ceilf(x);
+}
+__MATH_FUNCTIONS_DECL__ double ceil(double x) {
+  return ::ceil(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // device pass, or any HIP build
+  return ::copysignf(x, y);
+#else // host pass of a CUDA build
+  // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
+  // (e.g. Jetson), see PyTorch PR #51834
+  // This host stub must exist for the compiler but must never be called;
+  TORCH_INTERNAL_ASSERT( // it has no return statement and only asserts
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // device pass, or any HIP build
+  return ::copysign(x, y);
+#else // host pass of a CUDA build
+  // Host stub only: same rationale as the float overload above.
+  TORCH_INTERNAL_ASSERT( // no return statement on this path, only the assert
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ float floor(float x) {
+  return ::floorf(x);
+}
+__MATH_FUNCTIONS_DECL__ double floor(double x) {
+  return ::floor(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float log(float x) {
+  return ::logf(x);
+}
+__MATH_FUNCTIONS_DECL__ double log(double x) {
+  return ::log(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float log1p(float x) {
+  return ::log1pf(x);
+}
+
+__MATH_FUNCTIONS_DECL__ double log1p(double x) {
+  return ::log1p(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float max(float x, float y) {
+  return ::fmaxf(x, y);
+}
+__MATH_FUNCTIONS_DECL__ double max(double x, double y) {
+  return ::fmax(x, y);
+}
+
+__MATH_FUNCTIONS_DECL__ float min(float x, float y) {
+  return ::fminf(x, y);
+}
+__MATH_FUNCTIONS_DECL__ double min(double x, double y) {
+  return ::fmin(x, y);
+}
+
+__MATH_FUNCTIONS_DECL__ float pow(float x, float y) {
+  return ::powf(x, y);
+}
+__MATH_FUNCTIONS_DECL__ double pow(double x, double y) {
+  return ::pow(x, y);
+}
+
+__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) {
+  return ::sincosf(x, sptr, cptr);
+}
+__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) {
+  return ::sincos(x, sptr, cptr);
+}
+
+__MATH_FUNCTIONS_DECL__ float sqrt(float x) {
+  return ::sqrtf(x);
+}
+__MATH_FUNCTIONS_DECL__ double sqrt(double x) {
+  return ::sqrt(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(float x) {
+  return ::rsqrtf(x);
+}
+__MATH_FUNCTIONS_DECL__ double rsqrt(double x) {
+  return ::rsqrt(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float tan(float x) {
+  return ::tanf(x);
+}
+__MATH_FUNCTIONS_DECL__ double tan(double x) {
+  return ::tan(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float tanh(float x) {
+  return ::tanhf(x);
+}
+__MATH_FUNCTIONS_DECL__ double tanh(double x) {
+  return ::tanh(x);
+}
+
+__MATH_FUNCTIONS_DECL__ float normcdf(float x) {
+  return ::normcdff(x);
+}
+__MATH_FUNCTIONS_DECL__ double normcdf(double x) {
+  return ::normcdf(x);
+}
+
+} // namespace c10::cuda::compat
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c44105fa61281b2d06f02524b789d7c7554374f9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h
@@ -0,0 +1,20 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// this file is to avoid circular dependency between CUDAFunctions.h and
+// CUDAExceptions.h
+
+#include <c10/cuda/CUDAMacros.h>
+#include <cuda_runtime.h>
+
+#include <mutex>
+#include <string>
+
+namespace c10::cuda {
+C10_CUDA_API std::string get_cuda_error_help(cudaError_t /*error*/) noexcept;
+C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
+C10_CUDA_API std::mutex* getFreeMutex();
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0e616f584c5a41e40e75586c4e3d3ae8b381feb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/CUDAStream.h
@@ -0,0 +1,273 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cuda_runtime_api.h>
+
+#include <c10/core/DeviceGuard.h>
+#include <c10/core/Stream.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/util/Exception.h>
+
+/*
+ * Stream pool note.
+ *
+ * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
+ * are backed by cuStreams, but they use several pools to minimize the costs
+ * associated with creating, retaining, and destroying cuStreams.
+ *
+ * There are three pools per device, and a device's pools are lazily created.
+ *
+ * The first pool contains only the default stream. When the default stream
+ * is requested it's returned.
+ *
+ * The second pool is the "low priority" or "default priority" streams. In
+ * HIP builds there is no distinction between streams in this pool and streams
+ * in the third pool (below). There are 32 of these streams per device, and
+ * when a stream is requested one of these streams is returned round-robin.
+ * That is, the first stream requested is at index 0, the second at index 1...
+ * to index 31, then index 0 again.
+ *
+ * This means that if 33 low priority streams are requested, the first and
+ * last streams requested are actually the same stream (under the covers)
+ * and kernels enqueued on them cannot run concurrently.
+ *
+ * The third pool is the "high priority" streams. The third pool acts like
+ * the second pool except the streams are created with a higher priority.
+ *
+ * These pools suggest that stream users should prefer many short-lived streams,
+ * as the cost of acquiring and releasing streams is effectively zero. If
+ * many longer-lived streams are required in performance critical scenarios
+ * then the functionality here may need to be extended to allow, for example,
+ * "reserving" a subset of the pool so that other streams do not accidentally
+ * overlap the performance critical streams.
+ *
+ * Note: although the notion of "current stream for device" is thread local
+ * (every OS thread has a separate current stream, as one might expect),
+ * the stream pool is global across all threads; stream 0 is always stream 0
+ * no matter which thread you use it on.  Multiple threads can synchronize
+ * on the same stream.  Although the CUDA documentation is not very clear
+ * on the matter, streams are thread safe; e.g., it is safe to enqueue
+ * a kernel on the same stream from two different threads.
+ */
+
+namespace c10::cuda {
+
+static constexpr int max_compile_time_stream_priorities = 4;
+
+// Value object representing a CUDA stream.  This is just a wrapper
+// around c10::Stream, but it comes with a little extra CUDA-specific
+// functionality (conversion to cudaStream_t), and a guarantee that
+// the wrapped c10::Stream really is a CUDA stream.
+class C10_CUDA_API CUDAStream {
+ public:
+  enum Unchecked { UNCHECKED };
+
+  /// Construct a CUDAStream from a Stream.  This construction is checked,
+  /// and will raise an error if the Stream is not, in fact, a CUDA stream.
+  explicit CUDAStream(Stream stream) : stream_(stream) {
+    TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
+  }
+
+  /// Construct a CUDAStream from a Stream with no error checking.
+  /// This constructor uses the "named" constructor idiom, and can
+  /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
+  explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}
+
+  bool operator==(const CUDAStream& other) const noexcept {
+    return unwrap() == other.unwrap();
+  }
+
+  bool operator!=(const CUDAStream& other) const noexcept {
+    return unwrap() != other.unwrap();
+  }
+
+  /// Implicit conversion to cudaStream_t.
+  operator cudaStream_t() const {
+    return stream();
+  }
+
+  /// Implicit conversion to Stream (a.k.a., forget that the stream is a
+  /// CUDA stream).
+  operator Stream() const {
+    return unwrap();
+  }
+
+  /// Used to avoid baking in device type explicitly to Python-side API.
+  DeviceType device_type() const {
+    return DeviceType::CUDA;
+  }
+
+  /// Get the CUDA device index that this stream is associated with.
+  DeviceIndex device_index() const {
+    return stream_.device_index();
+  }
+
+  /// Get the full Device that this stream is associated with.  The Device
+  /// is guaranteed to be a CUDA device.
+  Device device() const {
+    return Device(DeviceType::CUDA, device_index());
+  }
+
+  /// Return the stream ID corresponding to this particular stream.
+  StreamId id() const {
+    return stream_.id();
+  }
+
+  bool query() const { // true iff cudaStreamQuery reports cudaSuccess
+    DeviceGuard guard{stream_.device()}; // guard scoped to the stream's device
+    cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
+    // cudaSuccess -> true; cudaErrorNotReady -> false; other errors are checked
+    if (err == cudaSuccess) {
+      return true;
+    } else if (err != cudaErrorNotReady) {
+      C10_CUDA_CHECK(err);
+    } else {
+      // ignore and clear the error if not ready
+      (void)cudaGetLastError();
+    }
+    // reached for cudaErrorNotReady (and if C10_CUDA_CHECK returns normally)
+    return false;
+  }
+
+  void synchronize() const {
+    DeviceGuard guard{stream_.device()};
+    c10::cuda::stream_synchronize(stream());
+  }
+
+  int priority() const {
+    DeviceGuard guard{stream_.device()};
+    int priority = 0;
+    C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
+    return priority;
+  }
+
+  /// Explicit conversion to cudaStream_t.
+  cudaStream_t stream() const;
+
+  /// Explicit conversion to Stream.
+  Stream unwrap() const {
+    return stream_;
+  }
+
+  /// Reversibly pack a CUDAStream into a struct representation.
+  /// Previously the stream's data was packed into a single int64_t,
+  /// as it was assumed the fields would not require more than
+  /// 64 bits of storage in total.
+  /// See https://github.com/pytorch/pytorch/issues/75854
+  /// for more information regarding newer platforms that may violate
+  /// this assumption.
+  ///
+  /// The CUDAStream can be unpacked again using unpack3().
+  struct c10::StreamData3 pack3() const {
+    return stream_.pack3(); // forwards to the wrapped c10::Stream
+  }
+
+  // Unpack a CUDAStream from the 3 fields generated by pack3().
+  static CUDAStream unpack3(
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
+    // Uses the checked constructor, which TORCH_CHECKs the device type is CUDA.
+    return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+  static std::tuple<int, int> priority_range() {
+    // Note: this returns the range of priority **supported by PyTorch**, not
+    // the range of priority **supported by CUDA**. The former is a subset of
+    // the latter. Returns (least, greatest); greatest is the more negative one.
+    int least_priority = 0, greatest_priority = 0;
+    C10_CUDA_CHECK(
+        cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
+#ifdef USE_ROCM
+    // See Note [HIP stream priorities]
+    TORCH_INTERNAL_ASSERT(
+        least_priority == 1, "Unexpected HIP stream priority range");
+    least_priority = 0; // report 0, the value asserted in the non-ROCm branch
+#else
+    TORCH_INTERNAL_ASSERT(
+        least_priority == 0, "Unexpected CUDA stream priority range");
+#endif
+    TORCH_INTERNAL_ASSERT(
+        greatest_priority <= -1, "Unexpected CUDA stream priority range");
+    greatest_priority = std::max( // clamp to the compile-time number of levels
+        -c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
+    return std::make_tuple(least_priority, greatest_priority);
+  }
+
+  // Deleted for now; use CUDAEvent::block instead
+  // void synchronize_with(const CUDAEvent& event) const;
+
+ private:
+  Stream stream_;
+};
+
+/**
+ * Get a new stream from the CUDA stream pool.  You can think of this
+ * as "creating" a new stream, but no such creation actually happens;
+ * instead, streams are preallocated from the pool and returned in a
+ * round-robin fashion.
+ *
+ * You can request a stream from the high priority pool by setting
+ * isHighPriority to true, or a stream for a specific device by setting device
+ * (defaulting to the current CUDA device).
+ */
+C10_API CUDAStream
+getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
+// no default priority to disambiguate overloads
+C10_API CUDAStream
+getStreamFromPool(const int priority, DeviceIndex device = -1);
+
+/**
+ * Get a CUDAStream from a externally allocated one.
+ *
+ * This is mainly for interoperability with different libraries where we
+ * want to operate on a non-torch allocated stream for data exchange or similar
+ * purposes
+ */
+C10_API CUDAStream
+getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
+
+/**
+ * Get the default CUDA stream, for the passed CUDA device, or for the
+ * current device if no device index is passed.  The default stream is
+ * where most computation occurs when you aren't explicitly using
+ * streams.
+ */
+C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
+
+/**
+ * Get the current CUDA stream, for the passed CUDA device, or for the
+ * current device if no device index is passed.  The current CUDA stream
+ * will usually be the default CUDA stream for the device, but it may
+ * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
+ * or 'CUDAStreamGuard'.
+ */
+C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
+
+/**
+ * Set the current stream on the device of the passed in stream to be
+ * the passed in stream.  Yes, you read that right: this function
+ * has *nothing* to do with the current device: it toggles the current
+ * stream of the device of the passed stream.
+ *
+ * Confused?  Avoid using this function; prefer using 'CUDAStreamGuard' instead
+ * (which will switch both your current device and current stream in the way you
+ * expect, and reset it back to its original state afterwards).
+ */
+C10_API void setCurrentCUDAStream(CUDAStream stream);
+
+C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
+
+} // namespace c10::cuda
+
+namespace std {
+template <> // std::hash specialization for c10::cuda::CUDAStream
+struct hash<c10::cuda::CUDAStream> {
+  size_t operator()(c10::cuda::CUDAStream s) const noexcept {
+    return std::hash<c10::Stream>{}(s.unwrap()); // hash the wrapped Stream
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..49a5a131d4888f5f8f422bc07b74065db9315397
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/driver_api.h
@@ -0,0 +1,124 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cuda.h>
+#define NVML_NO_UNVERSIONED_FUNC_DEFS
+#include <nvml.h>
+
+#include <c10/util/Exception.h>
+
+#define C10_CUDA_DRIVER_CHECK(EXPR)                                        \
+  do {                                                                     \
+    CUresult __err = EXPR;                                                 \
+    if (__err != CUDA_SUCCESS) {                                           \
+      const char* err_str;                                                 \
+      CUresult get_error_str_err [[maybe_unused]] =                        \
+          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
+      if (get_error_str_err != CUDA_SUCCESS) {                             \
+        TORCH_CHECK(false, "CUDA driver error: unknown error");            \
+      } else {                                                             \
+        TORCH_CHECK(false, "CUDA driver error: ", err_str);                \
+      }                                                                    \
+    }                                                                      \
+  } while (0)
+
+#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT)                             \
+  do {                                                                     \
+    CUresult __err = EXPR;                                                 \
+    if (__err != CUDA_SUCCESS) {                                           \
+      const char* err_str;                                                 \
+      CUresult get_error_str_err [[maybe_unused]] =                        \
+          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
+      if (get_error_str_err != CUDA_SUCCESS) {                             \
+        TORCH_WARN("CUDA driver error: unknown error");                    \
+      } else {                                                             \
+        TORCH_WARN("CUDA driver error: ", err_str);                        \
+      }                                                                    \
+      goto NEXT;                                                           \
+    }                                                                      \
+  } while (0)
+
+// The integer in the second column specifies the requested CUDA Driver API
+// version. The dynamic loader will accept a driver with a newer version, but it
+// ensures that the requested symbol exists in *at least* the specified version
+// or earlier.
+
+// Keep these requested versions as low as possible to maximize compatibility
+// across different driver versions.
+
+// Why do we pin to an older version instead of using the latest?
+// If a user installs a newer driver, blindly resolving the symbol may bind to a
+// newer version of the function with different behavior, potentially breaking
+// PyTorch.
+
+#define C10_LIBCUDA_DRIVER_API_REQUIRED(_) \
+  _(cuDeviceGetAttribute, 12000)           \
+  _(cuMemAddressReserve, 12000)            \
+  _(cuMemRelease, 12000)                   \
+  _(cuMemMap, 12000)                       \
+  _(cuMemAddressFree, 12000)               \
+  _(cuMemSetAccess, 12000)                 \
+  _(cuMemUnmap, 12000)                     \
+  _(cuMemCreate, 12000)                    \
+  _(cuMemGetAllocationGranularity, 12000)  \
+  _(cuMemExportToShareableHandle, 12000)   \
+  _(cuMemImportFromShareableHandle, 12000) \
+  _(cuMemsetD32Async, 12000)               \
+  _(cuStreamWriteValue32, 12000)           \
+  _(cuGetErrorString, 12000)
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
+#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \
+  _(cuCtxFromGreenCtx, 12080)              \
+  _(cuCtxGetCurrent, 12080)                \
+  _(cuCtxPopCurrent, 12080)                \
+  _(cuCtxPushCurrent, 12080)               \
+  _(cuCtxSetCurrent, 12080)                \
+  _(cuGreenCtxCreate, 12080)               \
+  _(cuGreenCtxDestroy, 12080)              \
+  _(cuDevSmResourceSplitByCount, 12080)    \
+  _(cuDeviceGet, 12080)                    \
+  _(cuDeviceGetDevResource, 12080)         \
+  _(cuDevResourceGenerateDesc, 12080)      \
+  _(cuMulticastAddDevice, 12030)           \
+  _(cuMulticastBindMem, 12030)             \
+  _(cuMulticastCreate, 12030)              \
+  _(cuMulticastUnbind, 12030)
+#else
+#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_)
+#endif
+
+#define C10_NVML_DRIVER_API(_)            \
+  _(nvmlInit_v2)                          \
+  _(nvmlDeviceGetHandleByPciBusId_v2)     \
+  _(nvmlDeviceGetNvLinkRemoteDeviceType)  \
+  _(nvmlDeviceGetNvLinkRemotePciInfo_v2)  \
+  _(nvmlDeviceGetComputeRunningProcesses) \
+  _(nvmlSystemGetCudaDriverVersion_v2)
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040)
+#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
+#else
+#define C10_NVML_DRIVER_API_OPTIONAL(_)
+#endif
+
+namespace c10::cuda {
+
+struct DriverAPI { // one function-pointer member `<symbol>_` per listed API
+#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
+#define CREATE_MEMBER(name) decltype(&name) name##_;
+  C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED) // version unused here
+  C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED) // may expand to nothing
+  C10_NVML_DRIVER_API(CREATE_MEMBER)
+  C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER) // may expand to nothing
+#undef CREATE_MEMBER_VERSIONED
+#undef CREATE_MEMBER
+  // Accessors are only declared here; their definitions are not in this header.
+  static DriverAPI* get();
+  static void* get_nvml_handle();
+};
+
+} // namespace c10::cuda
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..24cb643a0599072f52eb1188bf53fc236368e957
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h
@@ -0,0 +1,270 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/GPUTrace.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+#include <c10/core/impl/PyInterpreter.h>
+#include <cuda_runtime_api.h>
+#include <cstdint>
+#include <optional>
+
+namespace c10::cuda::impl {
+
+struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr DeviceType static_type = DeviceType::CUDA;
+
+  CUDAGuardImpl() = default;
+  explicit CUDAGuardImpl(DeviceType t) {
+    TORCH_CHECK(
+        t == DeviceType::CUDA,
+        "CUDAGuardImpl initialized with non-CUDA DeviceType: ",
+        t);
+  }
+  DeviceType type() const override {
+    return DeviceType::CUDA;
+  }
+  Device exchangeDevice(Device d) const override {
+    TORCH_CHECK(d.is_cuda(), "Expected a CUDA device, but got ", d);
+    auto old_device_index = c10::cuda::ExchangeDevice(d.index());
+    return Device(DeviceType::CUDA, old_device_index);
+  }
+  Device getDevice() const override {
+    DeviceIndex device = 0;
+    C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    return Device(DeviceType::CUDA, device);
+  }
+  std::optional<Device> uncheckedGetDevice() const noexcept {
+    DeviceIndex device{-1};
+    const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
+    C10_CUDA_CHECK_WARN(err);
+    if (err != cudaSuccess) {
+      return std::nullopt;
+    }
+    return Device(DeviceType::CUDA, device);
+  }
+  void setDevice(Device d) const override {
+    TORCH_CHECK(d.is_cuda(), "Expected a CUDA device, but got ", d);
+    C10_CUDA_CHECK(c10::cuda::SetDevice(d.index()));
+  }
+  void uncheckedSetDevice(Device d) const noexcept override {
+    C10_CUDA_CHECK_WARN(c10::cuda::MaybeSetDevice(d.index()));
+  }
+  Stream getStream(Device d) const override {
+    return getCurrentCUDAStream(d.index()).unwrap();
+  }
+  Stream getDefaultStream(Device d) const override {
+    return getDefaultCUDAStream(d.index());
+  }
+  Stream getNewStream(Device d, int priority = 0) const override {
+    return getStreamFromPool(priority, d.index());
+  }
+  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
+      const override {
+    return getStreamFromPool(isHighPriority, d.index());
+  }
+  // NB: These do NOT set the current device
+  Stream exchangeStream(Stream s) const override {
+    CUDAStream cs(s);
+    auto old_stream = getCurrentCUDAStream(s.device().index());
+    setCurrentCUDAStream(cs);
+    return old_stream.unwrap();
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    return device_count();
+  }
+
+  // Event-related functions
+  void createEvent(cudaEvent_t* cuda_event, const EventFlag flag) const {
+    // Creates *cuda_event; maps PyTorch's Event::Flag to the CUDA event flag.
+    auto cuda_flag = cudaEventDefault;
+    switch (flag) {
+      case EventFlag::PYTORCH_DEFAULT:
+        cuda_flag = cudaEventDisableTiming;
+        break;
+      case EventFlag::BACKEND_DEFAULT:
+        cuda_flag = cudaEventDefault;
+        break;
+      default: // any other flag value is rejected
+        TORCH_CHECK(false, "CUDA event received unknown flag");
+    }
+    // Create the event, then trace its handle (not the out-pointer's address).
+    C10_CUDA_CHECK(cudaEventCreateWithFlags(cuda_event, cuda_flag));
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_creation( // same id as record()/destroyEvent()
+          c10::kCUDA, reinterpret_cast<uintptr_t>(*cuda_event));
+    }
+  }
+
+  void destroyEvent(void* event, const DeviceIndex device_index)
+      const noexcept override { // noexcept: only the _WARN checks are used below
+    if (!event)
+      return; // null handle: nothing to destroy
+    auto cuda_event = static_cast<cudaEvent_t>(event);
+    DeviceIndex orig_device{-1};
+    C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
+    C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index)); // event's device
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_deletion( // traced before the destroy call
+          c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
+    }
+    C10_CUDA_CHECK_WARN(cudaEventDestroy(cuda_event));
+    C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(orig_device)); // switch back
+  }
+
+  void record( // records *event on `stream`, creating the event if it is null
+      void** event,
+      const Stream& stream,
+      const DeviceIndex device_index,
+      const EventFlag flag) const override {
+    TORCH_CHECK( // -1 is accepted as "no device index to compare"
+        device_index == -1 || device_index == stream.device_index(),
+        "Event device index ",
+        device_index,
+        " does not match recording stream's device index ",
+        stream.device_index(),
+        ".");
+    // *event may still be null here; it is then created lazily below.
+    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(*event);
+    CUDAStream cuda_stream{stream};
+
+    // Moves to stream's device to record
+    const auto orig_device = getDevice();
+    setDevice(stream.device());
+
+    // Creates the event (lazily)
+    if (!cuda_event)
+      createEvent(&cuda_event, flag);
+    C10_CUDA_CHECK(cudaEventRecord(cuda_event, cuda_stream));
+    // Makes the void* point to the (possibly just allocated) CUDA event
+    *event = cuda_event;
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_record(
+          c10::kCUDA,
+          reinterpret_cast<uintptr_t>(cuda_event),
+          reinterpret_cast<uintptr_t>(cuda_stream.stream()));
+    }
+    // NOTE(review): not reached if a check above throws - confirm acceptable.
+    // Resets device
+    setDevice(orig_device);
+  }
+
+  void block(void* event, const Stream& stream) const override {
+    if (!event)
+      return; // null event: nothing to wait on
+    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
+    CUDAStream cuda_stream{stream};
+    const auto orig_device = getDevice();
+    setDevice(stream.device()); // issue the wait from the stream's device
+    C10_CUDA_CHECK(cudaStreamWaitEvent(
+        cuda_stream,
+        cuda_event,
+        /*flags (must be zero)=*/0));
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_wait(
+          c10::kCUDA,
+          reinterpret_cast<uintptr_t>(cuda_event),
+          reinterpret_cast<uintptr_t>(cuda_stream.stream()));
+    }
+    setDevice(orig_device); // NOTE(review): skipped if the check above throws
+  }
+
+  // May be called from any device. Returns true iff the query is cudaSuccess.
+  bool queryEvent(void* event) const override {
+    if (!event)
+      return true; // a null event is reported as complete
+    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
+    // Note: cudaEventQuery can be safely called from any device
+    const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event));
+    if (err != cudaErrorNotReady) {
+      C10_CUDA_CHECK(err); // also runs for cudaSuccess
+    } else {
+      // ignore and clear the error if not ready
+      (void)cudaGetLastError();
+    }
+    return (err == cudaSuccess);
+  }
+
+  // Stream-related functions
+  bool queryStream(const Stream& stream) const override {
+    CUDAStream cuda_stream{stream};
+    return cuda_stream.query();
+  }
+
+  void synchronizeStream(const Stream& stream) const override {
+    CUDAStream cuda_stream{stream};
+    cuda_stream.synchronize();
+  }
+
+  void synchronizeEvent(void* event) const override {
+    if (!event)
+      return;
+    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_synchronization(
+          c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
+    }
+    // Note: cudaEventSynchronize can be safely called from any device
+    C10_CUDA_CHECK(cudaEventSynchronize(cuda_event));
+  }
+
+  // Note: synchronizeDevice can be safely called from any device
+  void synchronizeDevice(const c10::DeviceIndex device_index) const override {
+    DeviceIndex orig_device{-1};
+    C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device_index));
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_device_synchronization(c10::kCUDA);
+    }
+    C10_CUDA_CHECK(cudaDeviceSynchronize());
+    C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device));
+  }
+
+  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
+      const override {
+    CUDAStream cuda_stream{stream};
+    CUDACachingAllocator::recordStream(data_ptr, cuda_stream);
+  }
+
+  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
+      const override { // returns cudaEventElapsedTime's float widened to double
+    TORCH_CHECK(
+        event1 && event2,
+        "Both events must be recorded before calculating elapsed time.");
+    // Even though cudaEventElapsedTime can be safely called from any device, if
+    // the current device is not initialized, it will create a new cuda context,
+    // which will consume a lot of memory.
+    DeviceIndex orig_device{-1};
+    C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device_index)); // run on device_index
+    cudaEvent_t cuda_event1 = static_cast<cudaEvent_t>(event1);
+    cudaEvent_t cuda_event2 = static_cast<cudaEvent_t>(event2);
+    float time_ms = 0;
+    // raise cudaErrorNotReady if either event is recorded but not yet completed
+    C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2));
+    C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device)); // switch back
+    return static_cast<double>(time_ms);
+  }
+};
+
+} // namespace c10::cuda::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h
new file mode 100644
index 0000000000000000000000000000000000000000..3edcfe6d88a72a94120bf95d82a6bbc0a0798500
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/CUDATest.h
@@ -0,0 +1,14 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/cuda/CUDAMacros.h>
+
+namespace c10::cuda::impl {
+
+C10_CUDA_API int c10_cuda_test();
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2fb43f54676972b1df12b2be146786465a1b403
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/cuda/impl/cuda_cmake_macros.h
@@ -0,0 +1,11 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// Automatically generated header file for the C10 CUDA library.  Do not
+// include this file directly.  Instead, include c10/cuda/CUDAMacros.h
+
+#define C10_CUDA_BUILD_SHARED_LIBS
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfc4378c482c621ce05179900c719510e59ee8d0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Export.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/macros/Export.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..02fdbd4df99eaed11dfdc5dc190378156ea30177
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/Macros.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/macros/Macros.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d89f61f37a9db44fc7bbe5df20ce372e37dff4c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/macros/cmake_macros.h
@@ -0,0 +1,10 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// This file exists for backwards compatibility and has been moved to
+// torch/headeronly/macros/cmake_macros.h.in. No end user library should be
+// including this file directly anyway (cuz they should be including
+// Macros.h instead).
+#include <torch/headeronly/macros/cmake_macros.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bec87d32d3efa5badc79d2b85d2cb018fe9c9a1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/atomic.h
@@ -0,0 +1,182 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <metal_atomic>
+namespace c10 {
+namespace metal {
+
+// Atomic operations helper: each specialization supplies `type` and `atomic_add`
+template <typename T>
+struct AtomicType {}; // primary template is empty, so an unspecialized T has no `type`
+template <typename T>
+using AtomicType_t = typename AtomicType<T>::type; // shorthand for the atomic storage type of T
+
+template <>
+struct AtomicType<float> {
+  using type = ::metal::atomic<float>; // one atomic object per element
+  static inline void atomic_add(device type* data, long offset, float value) {
+    ::metal::atomic_fetch_add_explicit( // adds value to element `offset`, relaxed ordering
+        data + offset, value, ::metal::memory_order_relaxed);
+  }
+};
+
+template <>
+struct AtomicType<int> {
+  using type = ::metal::atomic<int>; // one atomic object per element
+  static inline void atomic_add(device type* data, long offset, int value) {
+    ::metal::atomic_fetch_add_explicit( // adds value to element `offset`, relaxed ordering
+        data + offset, value, ::metal::memory_order_relaxed);
+  }
+};
+
+// As of Metal3.2 atomic operations are not supported on half-precision floats,
+// so they must be simulated using atomic compare and exchange over a 32-bit
+// atomic type. `offset` is counted in elements of T, not in 32-bit words.
+template <typename T>
+static inline void atomic_add_helper(
+    device ::metal::atomic<uint>* data,
+    long offset,
+    T value) {
+  constexpr auto elem_per_enum = sizeof(uint) / sizeof(T); // number of T lanes in one uint
+  auto ptr = data + (offset / elem_per_enum); // the uint word that contains element `offset`
+  auto old = ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed);
+  union { // lets the loaded word be modified one T lane at a time
+    uint i;
+    T t[elem_per_enum];
+  } val;
+  do { // retried until the compare-exchange succeeds
+    val.i = old;
+    val.t[offset & (elem_per_enum - 1)] += value; // only the addressed lane changes
+  } while (!::metal::atomic_compare_exchange_weak_explicit(
+      ptr,
+      &old,
+      val.i,
+      ::metal::memory_order_relaxed,
+      ::metal::memory_order_relaxed));
+}
+
+template <>
+struct AtomicType<half> {
+  using type = ::metal::atomic<uint>; // elements are packed into 32-bit atomic words
+  static inline void atomic_add(device type* data, long offset, half value) {
+    atomic_add_helper(data, offset, value); // compare-exchange based add on the packed word
+  }
+};
+
+template <>
+struct AtomicType<short> {
+  using type = ::metal::atomic<uint>; // elements are packed into 32-bit atomic words
+  static inline void atomic_add(device type* data, long offset, short value) {
+    atomic_add_helper(data, offset, value); // compare-exchange based add on the packed word
+  }
+};
+
+template <>
+struct AtomicType<char> {
+  using type = ::metal::atomic<uint>; // elements are packed into 32-bit atomic words
+  static inline void atomic_add(device type* data, long offset, char value) {
+    atomic_add_helper(data, offset, value); // compare-exchange based add on the packed word
+  }
+};
+
+template <>
+struct AtomicType<uchar> {
+  using type = ::metal::atomic<uint>; // elements are packed into 32-bit atomic words
+  static inline void atomic_add(device type* data, long offset, uchar value) {
+    atomic_add_helper(data, offset, value); // T is deduced as uchar, matching this specialization
+  }
+};
+
+template <>
+struct AtomicType<bfloat> {
+  using type = ::metal::atomic<uint>; // elements are packed into 32-bit atomic words
+  static inline void atomic_add(device type* data, long offset, bfloat value) {
+    atomic_add_helper<bfloat>(data, offset, value); // compare-exchange based add on the packed word
+  }
+};
+
+// Metal supports atomic_store_explicit for bools, but
+// sizeof(::metal::atomic_bool) is 4. Therefore it cannot be used to
+// atomically modify unaligned memory, so fall back to the compare-and-exchange
+// trick. As accumulation over booleans is just an OR operation, do nothing if
+// value is false.
+template <>
+struct AtomicType<bool> {
+  using type = ::metal::atomic<uint>;
+  static inline void atomic_add(device type* data, long offset, bool value) {
+    if (!value) {
+      return;
+    }
+    auto ptr = data + (offset >> 2); // four bools per uint word
+    auto old =
+        ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed);
+    union { // view of the word as four individual bools
+      uint i;
+      bool t[4];
+    } val;
+    do { // retried until the compare-exchange succeeds
+      val.i = old;
+      val.t[offset & 3] = true; // set only the addressed bool
+    } while (!::metal::atomic_compare_exchange_weak_explicit(
+        ptr,
+        &old,
+        val.i,
+        ::metal::memory_order_relaxed,
+        ::metal::memory_order_relaxed));
+  }
+};
+
+// ComplexHalf atomic op: a half2 fills one uint, so one compare-exchange covers both parts
+template <>
+struct AtomicType<half2> {
+  using type = ::metal::atomic<uint>;
+  static inline void atomic_add(device type* data, long offset, half2 value) {
+    auto ptr = data + offset; // one uint word per half2 element
+    auto old =
+        ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed);
+    while (!::metal::atomic_compare_exchange_weak_explicit(
+        ptr,
+        &old,
+        as_type<uint>(as_type<half2>(old) + value), // add as half2, store back as raw bits
+        ::metal::memory_order_relaxed,
+        ::metal::memory_order_relaxed))
+      ; // empty body: the loop just retries the exchange
+  }
+};
+
+// There is no atomic 64-bit add in Metal yet, but the templates below implement
+// a consistent add, i.e. if multiple threads modify the same 64-bit value, the
+// result stored at the address will eventually be equal to its original value
+// plus the sum of all operands.
+template <>
+struct AtomicType<long> {
+  using type = ::metal::atomic<uint>;
+  static inline void atomic_add(device type* data, long offset, long value) {
+    const auto value_bits = as_type<ulong>(value); // raw 64-bit pattern of the addend
+    const uint low = static_cast<uint>(value_bits);
+    uint high = static_cast<uint>(value_bits >> 32);
+    auto ptr = data + (offset << 1); // two uint words per 64-bit element, low word first
+    auto old_low =
+        atomic_fetch_add_explicit(ptr, low, ::metal::memory_order_relaxed);
+    high += (old_low + low < old_low) ? 1 : 0; // carry out of the low-word addition
+    atomic_fetch_add_explicit(ptr + 1, high, ::metal::memory_order_relaxed);
+  }
+};
+
+// ComplexFloat atomic op, which again is not really atomic, but eventually
+// consistent: the two components are added by two separate atomic operations
+template <>
+struct AtomicType<float2> {
+  using type = ::metal::atomic<float>;
+  static inline void atomic_add(device type* data, long offset, float2 value) {
+    auto ptr = data + (offset << 1); // two atomic floats per float2 element
+    atomic_fetch_add_explicit(ptr + 0, value.x, ::metal::memory_order_relaxed);
+    atomic_fetch_add_explicit(ptr + 1, value.y, ::metal::memory_order_relaxed);
+  }
+};
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..c508bbd55afa7077644bc5ff722ccbc46056e99c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/common.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// Set of global constants that could be shareable between CPU and Metal code
+
+#ifdef __METAL__
+#include <metal_array>
+#define C10_METAL_CONSTEXPR constant constexpr
+#else
+#include <array>
+#define C10_METAL_CONSTEXPR constexpr
+#endif
+
+#define C10_METAL_ALL_TYPES_FUNCTOR(_) \
+  _(Byte, 0)                           \
+  _(Char, 1)                           \
+  _(Short, 2)                          \
+  _(Int, 3)                            \
+  _(Long, 4)                           \
+  _(Half, 5)                           \
+  _(Float, 6)                          \
+  _(ComplexHalf, 8)                    \
+  _(ComplexFloat, 9)                   \
+  _(Bool, 11)                          \
+  _(BFloat16, 15)
+
+namespace c10 {
+namespace metal {
+C10_METAL_CONSTEXPR unsigned max_ndim = 16;
+C10_METAL_CONSTEXPR unsigned simdgroup_size = 32;
+
+#ifdef __METAL__
+template <typename T, unsigned N>
+using array = ::metal::array<T, N>;
+#else
+template <typename T, unsigned N>
+using array = std::array<T, N>;
+#endif
+
+enum class ScalarType { // one enumerator per C10_METAL_ALL_TYPES_FUNCTOR entry, using the value listed there
+#define _DEFINE_ENUM_VAL_(_v, _n) _v = _n,
+  C10_METAL_ALL_TYPES_FUNCTOR(_DEFINE_ENUM_VAL_)
+#undef _DEFINE_ENUM_VAL_
+};
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..25786e69bb6d9c37d69ce603aed53c8cb04a4a10
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/error.h
@@ -0,0 +1,116 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/metal/common.h>
+
+namespace c10 {
+namespace metal {
+C10_METAL_CONSTEXPR unsigned error_message_count = 30;
+struct ErrorMessage { // one error record, filled in by report_error
+  char file[128]; // NUL-terminated file name, truncated to fit
+  char func[128]; // NUL-terminated function name, truncated to fit
+  char message[250]; // NUL-terminated text built from the report_error arguments
+  unsigned int line; // line number passed to report_error
+};
+
+struct ErrorMessages { // error buffer with the same field layout on both sides of the #ifdef
+#ifdef __METAL__
+  ::metal::atomic<unsigned int> count; // incremented by every report_error call, even past capacity
+#else
+  unsigned int count; // plain-integer view of the same counter for non-Metal code
+#endif
+  ErrorMessage msg[error_message_count]; // fixed-capacity storage for the records
+};
+
+#ifdef __METAL__
+namespace detail {
+static uint strncpy(device char* dst, constant const char* src, unsigned len) {
+  // Copies at most len - 1 characters of src, NUL-terminates dst and returns the
+  // count copied. The bound is `i + 1 < len` so that len == 0 cannot wrap around.
+  uint i = 0;
+  for (; i + 1 < len && src[i] != 0; i++)
+    dst[i] = src[i];
+  if (len != 0) dst[i] = 0; // a zero-sized destination has no room for the terminator
+  return i;
+}
+
+inline uint print_arg( // string overload: copies `arg` into ptr and returns the characters written
+    device char* ptr,
+    unsigned len,
+    constant const char* arg) {
+  return strncpy(ptr, arg, len); // detail::strncpy truncates to len - 1 and NUL-terminates
+}
+
+// Returns number length as string in base10 (a leading '-' is counted). The
+// magnitude is taken in unsigned arithmetic so the most negative long works.
+static inline uint base10_length(long num) {
+  uint rc = (num < 0) ? 2 : 1; // one digit minimum, plus the sign when negative
+  const ulong bits = static_cast<ulong>(num);
+  ulong mag = (num < 0) ? (0 - bits) : bits; // |num| without signed overflow
+  while (mag > 9) {
+    mag /= 10;
+    rc++;
+  }
+  return rc;
+}
+
+// Converts signed integer to string; returns chars written, or 0 if it does not fit in len
+inline uint print_arg(device char* ptr, unsigned len, long arg) {
+  const auto arg_len = base10_length(arg);
+  if (arg_len >= len)
+    return 0;
+  ulong mag = static_cast<ulong>(arg); // digits come from the unsigned magnitude
+  if (arg < 0) {
+    ptr[0] = '-';
+    mag = 0 - mag; // well-defined even for the most negative long
+  }
+  uint idx = 1;
+  do { // emit digits right-to-left, least significant first
+    ptr[arg_len - idx] = static_cast<char>('0' + (mag % 10));
+    mag /= 10;
+    idx++;
+  } while (mag > 0);
+  ptr[arg_len] = 0;
+  return arg_len;
+}
+
+template <typename T>
+inline void print_args(device char* ptr, unsigned len, T arg) { // base case: last argument
+  print_arg(ptr, len, arg); // overload is picked by the type of arg
+}
+
+template <typename T, typename... Args>
+inline void print_args(device char* ptr, unsigned len, T arg, Args... args) {
+  const auto rc = print_arg(ptr, len, arg); // characters written for the first argument
+  print_args(ptr + rc, len - rc, args...); // append the rest after it, in the remaining space
+}
+
+} // namespace detail
+
+template <typename... Args>
+static void report_error(
+    device ErrorMessages* msgs,
+    constant const char* file,
+    int line,
+    constant const char* func,
+    Args... args) {
+  // Reserve a slot; reports past the end of the fixed-size buffer are dropped.
+  const auto slot =
+      atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed);
+  if (slot < error_message_count) {
+    device auto* entry = msgs->msg + slot;
+    entry->line = line;
+    detail::strncpy(entry->file, file, 128);
+    detail::strncpy(entry->func, func, 128);
+    detail::print_args(entry->message, 250, args...);
+  }
+}
+
+#define TORCH_REPORT_ERROR(buf, ...) \
+  ::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__)
+#endif
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h
new file mode 100644
index 0000000000000000000000000000000000000000..18061b711232ddc8053f6672b23814fee5023926
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/expm1f.h
@@ -0,0 +1,102 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copy-and-pasted from:
+// https://github.com/ml-explore/mlx/blob/99c33d011d63174f50cea37c3eede002958be6d3/mlx/backend/metal/kernels/expm1f.h
+
+#pragma once
+
+#include <metal_math>
+
+// Original license copied below:
+//  Copyright (c) 2015-2023 Norbert Juffa
+//  All rights reserved.
+//
+//  Redistribution and use in source and binary forms, with or without
+//  modification, are permitted provided that the following conditions
+//  are met:
+//
+//  1. Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//
+//  2. Redistributions in binary form must reproduce the above copyright
+//     notice, this list of conditions and the following disclaimer in the
+//     documentation and/or other materials provided with the distribution.
+//
+//  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+//  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+//  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+//  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+//  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+//  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+//  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+//  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+//  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+//  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+//  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+namespace c10 {
+namespace metal {
+
+/* Compute exponential base e minus 1. Maximum ulp error = 0.997458
+
+   i = rint(a/log(2)), f = a-i*log(2). Then expm1(a) = 2**i * (expm1(f)+1) - 1.
+   Compute r = expm1(f). Then expm1(a)= 2 * (0.5 * 2**i * r + 0.5 * 2**i - 0.5).
+   With t = 0.5*2**i, expm1(a) = 2*(r * t + t-0.5). However, for best accuracy,
+   when i == 1, expm1(a)= 2*(r + 0.5), and when i == 0, expm1(a) = r.
+
+   NOTE: Scale factor b is only applied if i < 0 or i > 1 (should be power of 2)
+*/
+inline float expm1f_scaled_unchecked(float a, float b) {
+  float f, j, r, s, t, u, v, x, y;
+  int i;
+
+  // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
+  j = ::metal::fma(1.442695f, a, 12582912.f); // 0x1.715476p0, 0x1.8p23
+  j = j - 12582912.0f; // 0x1.8p23
+  i = (int)j; // j already holds an integral value here
+  f = ::metal::fma(j, -6.93145752e-1f, a); // reduced argument f = a - j * log(2)
+
+  // approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2]
+  s = f * f;
+  if (a == 0.0f)
+    s = a; // ensure -0 is passed through
+  // err = 0.997458  ulp1 = 11081805
+  r = 1.97350979e-4f; // 0x1.9de000p-13
+  r = ::metal::fma(r, f, 1.39309070e-3f); // 0x1.6d30bcp-10
+  r = ::metal::fma(r, f, 8.33343994e-3f); // 0x1.1111f6p-7
+  r = ::metal::fma(r, f, 4.16668020e-2f); // 0x1.55559ep-5
+  r = ::metal::fma(r, f, 1.66666716e-1f); // 0x1.55555cp-3
+  r = ::metal::fma(r, f, 4.99999970e-1f); // 0x1.fffffep-2
+  u = (j == 1) ? (f + 0.5f) : f; // the i == 1 case folds the +0.5 in here
+  v = ::metal::fma(r, s, u);
+  s = 0.5f * b; // from here on s is the scaled 0.5 of the header formula
+  t = ::metal::ldexp(s, i); // t = 0.5 * b * 2**i
+  y = t - s;
+  x = (t - y) - s; // double-float canonicalization of difference
+  r = ::metal::fma(v, t, x) + y;
+  r = r + r; // final factor of 2 from the header formula
+  if (j == 0)
+    r = v; // special case i == 0: result is the polynomial value itself
+  if (j == 1)
+    r = v + v; // special case i == 1: 2 * (r + 0.5), with the 0.5 already in u
+  return r;
+}
+
+/* Compute exponential base e minus 1. max ulp err = 0.99746 */
+inline float expm1f(float a) {
+  /* severe overflow and underflow: the polynomial result is not used there */
+  if (::metal::abs(a - 1.0f) > 88.0f) {
+    const float p = ::metal::pow(2, a);
+    return ::metal::fma(p, p, -1.0f);
+  }
+
+  /* regular range: unscaled evaluation (scale factor 1) */
+  const float r = expm1f_scaled_unchecked(a, 1.0f);
+  return r;
+}
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fb235e226ad27e7bb94b76a02172df86ce4c17f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/igamma.h
@@ -0,0 +1,749 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/metal/utils.h>
+#include <metal_math>
+#include <metal_stdlib>
+
+using namespace c10::metal;
+using namespace metal;
+
+namespace c10 {
+namespace metal {
+
+template <typename T>
+inline float log_gamma(const T);
+
+inline float expm1f(float a);
+
+template <typename T>
+float erfc(T x);
+
+} // namespace metal
+} // namespace c10
+
+namespace {
+
+template <typename T>
+inline float lgamma(const T a) { // lets the ported code below call ::lgamma
+  return log_gamma(a); // forwards to the log_gamma declared in c10::metal above
+}
+
+inline float expm1(float a) { // lets the ported code below call ::expm1
+  return expm1f(a); // forwards to the expm1f declared in c10::metal above
+}
+
+// NOTE: The following code was ported directly from the CUDA implementation in
+// `aten/src/ATen/native/cuda/IGammaKernel.cu`
+
+/*
+ * This implementation of the regularized incomplete gamma functions and
+ * their helper functions are derived from the implementation of SciPy's
+ * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations.
+ * See NOTICE for the licenses.
+ */
+// regularized lower & upper incomplete gamma
+template <typename scalar_t>
+scalar_t ratevl(
+    scalar_t x,
+    const scalar_t num[],
+    int64_t M,
+    const scalar_t denom[],
+    int64_t N) {
+  // evaluates a rational function, i.e., the ratio of two polynomials;
+  // the M + 1 coefficients of the numerator are given by `num` and the N + 1
+  // coefficients of the denominator are given by `denom`
+
+  using accscalar_t = opmath_t<scalar_t>;
+  int64_t i, dir; // dir is the step (+1 or -1) used to walk the coefficient arrays
+  accscalar_t y, num_ans, denom_ans;
+  accscalar_t absx = ::fabs(x);
+  thread const accscalar_t* p; // cursor into num, then denom
+
+  if (absx > 1) {
+    /* Evaluate as a polynomial in 1/x. */
+    dir = -1;
+    p = num + M; // start from the last coefficient and walk backwards
+    y = 1 / x;
+  } else {
+    dir = 1;
+    p = num;
+    y = x;
+  }
+
+  /* Evaluate the numerator */
+  num_ans = *p;
+  p += dir;
+  for (i = 1; i <= M; i++) {
+    num_ans = num_ans * y + *p; // Horner step in y
+    p += dir;
+  }
+  /* Evaluate the denominator */
+  if (absx > 1) {
+    p = denom + N;
+  } else {
+    p = denom;
+  }
+
+  denom_ans = *p;
+  p += dir;
+  for (i = 1; i <= N; i++) {
+    denom_ans = denom_ans * y + *p; // Horner step in y
+    p += dir;
+  }
+  if (absx > 1) {
+    i = N - M; // power of x that remains after both polynomials were taken in 1/x
+    return ::pow(x, static_cast<accscalar_t>(i)) * num_ans / denom_ans;
+  } else {
+    return num_ans / denom_ans;
+  }
+}
+
+template <typename scalar_t>
+scalar_t lanczos_sum_expg_scaled(scalar_t x) {
+  // lanczos approximation: ratio of the two 13-coefficient polynomials below at x
+  using accscalar_t = opmath_t<scalar_t>;
+
+  const accscalar_t lanczos_sum_expg_scaled_num[13] = {
+      0.006061842346248906525783753964555936883222,
+      0.5098416655656676188125178644804694509993,
+      19.51992788247617482847860966235652136208,
+      449.9445569063168119446858607650988409623,
+      6955.999602515376140356310115515198987526,
+      75999.29304014542649875303443598909137092,
+      601859.6171681098786670226533699352302507,
+      3481712.15498064590882071018964774556468,
+      14605578.08768506808414169982791359218571,
+      43338889.32467613834773723740590533316085,
+      86363131.28813859145546927288977868422342,
+      103794043.1163445451906271053616070238554,
+      56906521.91347156388090791033559122686859};
+  const accscalar_t lanczos_sum_expg_scaled_denom[13] = {
+      1.,
+      66.,
+      1925.,
+      32670.,
+      357423.,
+      2637558.,
+      13339535.,
+      45995730.,
+      105258076.,
+      150917976.,
+      120543840.,
+      39916800.,
+      0};
+  return ratevl( // M and N are passed as (array length - 1), i.e. 12 each
+      static_cast<accscalar_t>(x),
+      lanczos_sum_expg_scaled_num,
+      sizeof(lanczos_sum_expg_scaled_num) /
+              sizeof(lanczos_sum_expg_scaled_num[0]) -
+          1,
+      lanczos_sum_expg_scaled_denom,
+      sizeof(lanczos_sum_expg_scaled_denom) /
+              sizeof(lanczos_sum_expg_scaled_denom[0]) -
+          1);
+}
+
+template <typename scalar_t>
+scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {
+  // compute x^a * exp(-x) / gamma(a)
+  // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with
+  // exp(a - x).
+
+  using accscalar_t = opmath_t<scalar_t>;
+  accscalar_t ax, fac, res, num, numfac;
+  const accscalar_t MAXLOG = 88.72283905206835; // exponents below -MAXLOG are returned as 0
+  const accscalar_t EXP1 = 2.718281828459045; // the constant e
+  const accscalar_t lanczos_g = 6.024680040776729583740234375;
+
+  if (::fabs(a - x) > 0.4 * ::fabs(a)) { // x far from a: evaluate directly in log space
+    ax = a * ::log(x) - x - ::lgamma(a);
+    if (ax < -MAXLOG) {
+      return 0.0;
+    }
+    return ::exp(ax);
+  }
+
+  fac = a + lanczos_g - 0.5;
+  res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a);
+
+  if ((a < 200) && (x < 200)) {
+    res *= ::exp(a - x) * ::pow(x / fac, a);
+  } else { // large a or x: the same factor written through log1p
+    num = x - a - lanczos_g + 0.5;
+    numfac = num / fac;
+    res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac);
+  }
+  return res;
+}
+
+template <typename scalar_t>
+scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
+  // Power-series evaluation of igam, DLMF 8.11.4. [igam1]
+
+  using accscalar_t = opmath_t<scalar_t>;
+  const int MAXITER = 2000;
+  const accscalar_t MACHEP = 5.9604644775390625E-8;
+
+  // Common prefactor of the series; when it is exactly zero the result is zero
+  // as well and the summation is skipped.
+  const accscalar_t prefactor = _igam_helper_fac(a, x);
+  if (prefactor == 0.0) {
+    return 0.0;
+  }
+
+  accscalar_t denom = a; // equals a + k after k iterations
+  accscalar_t term = 1.0; // current series term
+  accscalar_t total = 1.0; // running sum of the terms
+
+  for (int k = 0; k < MAXITER; k++) {
+    denom += 1.0;
+    term *= x / denom;
+    total += term;
+    // Stop once the newest term is negligible relative to the sum.
+    if (term <= MACHEP * total) {
+      break;
+    }
+  }
+
+  return (total * prefactor / a);
+}
+
+template <typename scalar_t>
+scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
+  // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in
+  // _igam_helper_series but extra care is taken to avoid cancellation.
+
+  using accscalar_t = opmath_t<scalar_t>;
+  int n;
+  accscalar_t fac = 1; // holds (-x)^n / n! after n iterations
+  accscalar_t sum = 0;
+  accscalar_t term, logx;
+  const int MAXITER = 2000; // upper bound on the number of series terms
+  const accscalar_t MACHEP = 5.9604644775390625E-8; // relative size at which terms are dropped
+
+  for (n = 1; n < MAXITER; n++) {
+    fac *= -x / n;
+    term = fac / (a + n);
+    sum += term;
+    if (::fabs(term) <= MACHEP * ::fabs(sum)) {
+      break;
+    }
+  }
+
+  logx = ::log(x);
+  term = -::expm1(a * logx - ::lgamma(1 + a)); // 1 - exp(...) formed via expm1
+  return term - ::exp(a * logx - ::lgamma(a)) * sum;
+}
+
+template <typename scalar_t>
+scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
+  // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
+
+  using accscalar_t = opmath_t<scalar_t>;
+  const accscalar_t d[25][25] = {
+      {-3.3333333333333333e-1,  8.3333333333333333e-2,
+       -1.4814814814814815e-2,  1.1574074074074074e-3,
+       3.527336860670194e-4,    -1.7875514403292181e-4,
+       3.9192631785224378e-5,   -2.1854485106799922e-6,
+       -1.85406221071516e-6,    8.296711340953086e-7,
+       -1.7665952736826079e-7,  6.7078535434014986e-9,
+       1.0261809784240308e-8,   -4.3820360184533532e-9,
+       9.1476995822367902e-10,  -2.551419399494625e-11,
+       -5.8307721325504251e-11, 2.4361948020667416e-11,
+       -5.0276692801141756e-12, 1.1004392031956135e-13,
+       3.3717632624009854e-13,  -1.3923887224181621e-13,
+       2.8534893807047443e-14,  -5.1391118342425726e-16,
+       -1.9752288294349443e-15},
+      {-1.8518518518518519e-3,  -3.4722222222222222e-3,  2.6455026455026455e-3,
+       -9.9022633744855967e-4,  2.0576131687242798e-4,   -4.0187757201646091e-7,
+       -1.8098550334489978e-5,  7.6491609160811101e-6,   -1.6120900894563446e-6,
+       4.6471278028074343e-9,   1.378633446915721e-7,    -5.752545603517705e-8,
+       1.1951628599778147e-8,   -1.7543241719747648e-11, -1.0091543710600413e-9,
+       4.1627929918425826e-10,  -8.5639070264929806e-11, 6.0672151016047586e-14,
+       7.1624989648114854e-12,  -2.9331866437714371e-12, 5.9966963656836887e-13,
+       -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14,
+       -4.13125571381061e-15},
+      {4.1335978835978836e-3,  -2.6813271604938272e-3,  7.7160493827160494e-4,
+       2.0093878600823045e-6,  -1.0736653226365161e-4,  5.2923448829120125e-5,
+       -1.2760635188618728e-5, 3.4235787340961381e-8,   1.3721957309062933e-6,
+       -6.298992138380055e-7,  1.4280614206064242e-7,   -2.0477098421990866e-10,
+       -1.4092529910867521e-8, 6.228974084922022e-9,    -1.3670488396617113e-9,
+       9.4283561590146782e-13, 1.2872252400089318e-10,  -5.5645956134363321e-11,
+       1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12,
+       4.6622399463901357e-13, -9.905105763906906e-14,  1.8931876768373515e-17,
+       8.8592218725911273e-15},
+      {6.4943415637860082e-4,   2.2947209362139918e-4,  -4.6918949439525571e-4,
+       2.6772063206283885e-4,   -7.5618016718839764e-5, -2.3965051138672967e-7,
+       1.1082654115347302e-5,   -5.6749528269915966e-6, 1.4230900732435884e-6,
+       -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8,
+       -1.9111168485973654e-8,  2.3928620439808118e-12, 2.0620131815488798e-9,
+       -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14,
+       -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12,
+       6.2088195734079014e-17,  2.126978363279737e-13,  -9.3446887915174333e-14,
+       2.0453671226782849e-14},
+      {-8.618882909167117e-4,   7.8403922172006663e-4,
+       -2.9907248030319018e-4,  -1.4638452578843418e-6,
+       6.6414982154651222e-5,   -3.9683650471794347e-5,
+       1.1375726970678419e-5,   2.5074972262375328e-10,
+       -1.6954149536558306e-6,  8.9075075322053097e-7,
+       -2.2929348340008049e-7,  2.956794137544049e-11,
+       2.8865829742708784e-8,   -1.4189739437803219e-8,
+       3.4463580499464897e-9,   -2.3024517174528067e-13,
+       -3.9409233028046405e-10, 1.8602338968504502e-10,
+       -4.356323005056618e-11,  1.2786001016296231e-15,
+       4.6792750266579195e-12,  -2.1492464706134829e-12,
+       4.9088156148096522e-13,  -6.3385914848915603e-18,
+       -5.0453320690800944e-14},
+      {-3.3679855336635815e-4, -6.9728137583658578e-5,  2.7727532449593921e-4,
+       -1.9932570516188848e-4, 6.7977804779372078e-5,   1.419062920643967e-7,
+       -1.3594048189768693e-5, 8.0184702563342015e-6,   -2.2914811765080952e-6,
+       -3.252473551298454e-10, 3.4652846491085265e-7,   -1.8447187191171343e-7,
+       4.8240967037894181e-8,  -1.7989466721743515e-14, -6.3061945000135234e-9,
+       3.1624176287745679e-9,  -7.8409242536974293e-10, 5.1926791652540407e-15,
+       9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11,
+       -3.661886712685252e-17, -1.210902069055155e-12,  5.6807435849905643e-13,
+       -1.3249659916340829e-13},
+      {5.3130793646399222e-4,  -5.9216643735369388e-4,  2.7087820967180448e-4,
+       7.9023532326603279e-7,  -8.1539693675619688e-5,  5.6116827531062497e-5,
+       -1.8329116582843376e-5, -3.0796134506033048e-9,  3.4651553688036091e-6,
+       -2.0291327396058604e-6, 5.7887928631490037e-7,   2.338630673826657e-13,
+       -8.8286007463304835e-8, 4.7435958880408128e-8,   -1.2545415020710382e-8,
+       8.6496488580102925e-14, 1.6846058979264063e-9,   -8.5754928235775947e-10,
+       2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11,
+       1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18,
+       3.6902800842763467e-13},
+      {3.4436760689237767e-4,   5.1717909082605922e-5,
+       -3.3493161081142236e-4,  2.812695154763237e-4,
+       -1.0976582244684731e-4,  -1.2741009095484485e-7,
+       2.7744451511563644e-5,   -1.8263488805711333e-5,
+       5.7876949497350524e-6,   4.9387589339362704e-10,
+       -1.0595367014026043e-6,  6.1667143761104075e-7,
+       -1.7562973359060462e-7,  -1.2974473287015439e-12,
+       2.695423606288966e-8,    -1.4578352908731271e-8,
+       3.887645959386175e-9,    -3.8810022510194121e-17,
+       -5.3279941738772867e-10, 2.7437977643314845e-10,
+       -6.9957960920705679e-11, 2.5899863874868481e-17,
+       8.8566890996696381e-12,  -4.403168815871311e-12,
+       1.0865561947091654e-12},
+      {-6.5262391859530942e-4,  8.3949872067208728e-4,  -4.3829709854172101e-4,
+       -6.969091458420552e-7,   1.6644846642067548e-4,  -1.2783517679769219e-4,
+       4.6299532636913043e-5,   4.5579098679227077e-9,  -1.0595271125805195e-5,
+       6.7833429048651666e-6,   -2.1075476666258804e-6, -1.7213731432817145e-11,
+       3.7735877416110979e-7,   -2.1867506700122867e-7, 6.2202288040189269e-8,
+       6.5977038267330006e-16,  -9.5903864974256858e-9, 5.2132144922808078e-9,
+       -1.3991589583935709e-9,  5.382058999060575e-16,  1.9484714275467745e-10,
+       -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18,
+       -3.3721464474854592e-12},
+      {-5.9676129019274625e-4,  -7.2048954160200106e-5,
+       6.7823088376673284e-4,   -6.4014752602627585e-4,
+       2.7750107634328704e-4,   1.8197008380465151e-7,
+       -8.4795071170685032e-5,  6.105192082501531e-5,
+       -2.1073920183404862e-5,  -8.8585890141255994e-10,
+       4.5284535953805377e-6,   -2.8427815022504408e-6,
+       8.7082341778646412e-7,   3.6886101871706965e-12,
+       -1.5344695190702061e-7,  8.862466778790695e-8,
+       -2.5184812301826817e-8,  -1.0225912098215092e-14,
+       3.8969470758154777e-9,   -2.1267304792235635e-9,
+       5.7370135528051385e-10,  -1.887749850169741e-19,
+       -8.0931538694657866e-11, 4.2382723283449199e-11,
+       -1.1002224534207726e-11},
+      {1.3324454494800656e-3,   -1.9144384985654775e-3,  1.1089369134596637e-3,
+       9.932404122642299e-7,    -5.0874501293093199e-4,  4.2735056665392884e-4,
+       -1.6858853767910799e-4,  -8.1301893922784998e-9,  4.5284402370562147e-5,
+       -3.127053674781734e-5,   1.044986828530338e-5,    4.8435226265680926e-11,
+       -2.1482565873456258e-6,  1.329369701097492e-6,    -4.0295693092101029e-7,
+       -1.7567877666323291e-13, 7.0145043163668257e-8,   -4.040787734999483e-8,
+       1.1474026743371963e-8,   3.9642746853563325e-18,  -1.7804938269892714e-9,
+       9.7480262548731646e-10,  -2.6405338676507616e-10, 5.794875163403742e-18,
+       3.7647749553543836e-11},
+      {1.579727660730835e-3,   1.6251626278391582e-4,   -2.0633421035543276e-3,
+       2.1389686185689098e-3,  -1.0108559391263003e-3,  -3.9912705529919201e-7,
+       3.6235025084764691e-4,  -2.8143901463712154e-4,  1.0449513336495887e-4,
+       2.1211418491830297e-9,  -2.5779417251947842e-5,  1.7281818956040463e-5,
+       -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6,
+       -6.8693396379526735e-7, 2.0653236975414887e-7,   4.6714772409838506e-14,
+       -3.5609886164949055e-8, 2.0470855345905963e-8,   -5.8091738633283358e-9,
+       -1.332821287582869e-16, 9.0354604391335133e-10,  -4.9598782517330834e-10,
+       1.3481607129399749e-10},
+      {-4.0725121195140166e-3, 6.4033628338080698e-3,  -4.0410161081676618e-3,
+       -2.183732802866233e-6,  2.1740441801254639e-3,  -1.9700440518418892e-3,
+       8.3595469747962458e-4,  1.9445447567109655e-8,  -2.5779387120421696e-4,
+       1.9009987368139304e-4,  -6.7696499937438965e-5, -1.4440629666426572e-10,
+       1.5712512518742269e-5,  -1.0304008744776893e-5, 3.304517767401387e-6,
+       7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7,
+       -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8,
+       -1.1407719956357511e-8, 3.2355857064185555e-9,  4.1759468293455945e-20,
+       -5.0423112718105824e-10},
+      {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3,
+       -9.8576315587856125e-3, 5.0134695031021538e-3,  1.2807521786221875e-6,
+       -2.0626019342754683e-3, 1.7109128573523058e-3,  -6.7695312714133799e-4,
+       -6.9011545676562133e-9, 1.8855128143995902e-4,  -1.3395215663491969e-4,
+       4.6263183033528039e-5,  4.0034230613321351e-11, -1.0255652921494033e-5,
+       6.612086372797651e-6,   -2.0913022027253008e-6, -2.0951775649603837e-13,
+       3.9756029041993247e-7,  -2.3956211978815887e-7, 7.1182883382145864e-8,
+       8.925574873053455e-16,  -1.2101547235064676e-8, 6.9350618248334386e-9,
+       -1.9661464453856102e-9},
+      {1.7402027787522711e-2,   -2.9527880945699121e-2, 2.0045875571402799e-2,
+       7.0289515966903407e-6,   -1.2375421071343148e-2, 1.1976293444235254e-2,
+       -5.4156038466518525e-3,  -6.3290893396418616e-8, 1.8855118129005065e-3,
+       -1.473473274825001e-3,   5.5515810097708387e-4,  5.2406834412550662e-10,
+       -1.4357913535784836e-4,  9.9181293224943297e-5,  -3.3460834749478311e-5,
+       -3.5755837291098993e-12, 7.1560851960630076e-6,  -4.5516802628155526e-6,
+       1.4236576649271475e-6,   1.8803149082089664e-14, -2.6623403898929211e-7,
+       1.5950642189595716e-7,   -4.7187514673841102e-8, -6.5107872958755177e-17,
+       7.9795091026746235e-9},
+      {3.0249124160905891e-2,  2.4817436002649977e-3,  -4.9939134373457022e-2,
+       5.9915643009307869e-2,  -3.2483207601623391e-2, -5.7212968652103441e-6,
+       1.5085251778569354e-2,  -1.3261324005088445e-2, 5.5515262632426148e-3,
+       3.0263182257030016e-8,  -1.7229548406756723e-3, 1.2893570099929637e-3,
+       -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4,
+       -7.7378565221244477e-5, 2.5625836246985201e-5,  1.0766165333192814e-12,
+       -5.3246809282422621e-6, 3.349634863064464e-6,   -1.0381253128684018e-6,
+       -5.608909920621128e-15, 1.9150821930676591e-7,  -1.1418365800203486e-7,
+       3.3654425209171788e-8},
+      {-9.9051020880159045e-2, 1.7954011706123486e-1,   -1.2989606383463778e-1,
+       -3.1478872752284357e-5, 9.0510635276848131e-2,   -9.2828824411184397e-2,
+       4.4412112839877808e-2,  2.7779236316835888e-7,   -1.7229543805449697e-2,
+       1.4182925050891573e-2,  -5.6214161633747336e-3,  -2.39598509186381e-9,
+       1.6029634366079908e-3,  -1.1606784674435773e-3,  4.1001337768153873e-4,
+       1.8365800754090661e-11, -9.5844256563655903e-5,  6.3643062337764708e-5,
+       -2.076250624489065e-5,  -1.1806020912804483e-13, 4.2131808239120649e-6,
+       -2.6262241337012467e-6, 8.0770620494930662e-7,   6.0125912123632725e-16,
+       -1.4729737374018841e-7},
+      {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1,
+       -4.6435192311733545e-1, 2.6640934719197893e-1,  3.4038266027147191e-5,
+       -1.3784338709329624e-1, 1.276467178337056e-1,   -5.6213828755200985e-2,
+       -1.753150885483011e-7,  1.9235592956768113e-2,  -1.5088821281095315e-2,
+       5.7401854451350123e-3,  1.0622382710310225e-9,  -1.5335082692563998e-3,
+       1.0819320643228214e-3,  -3.7372510193945659e-4, -6.6170909729031985e-12,
+       8.4263617380909628e-5,  -5.5150706827483479e-5, 1.7769536448348069e-5,
+       3.8827923210205533e-14, -3.53513697488768e-6,   2.1865832130045269e-6,
+       -6.6812849447625594e-7},
+      {7.2438608504029431e-1,   -1.3918010932653375,    1.0654143352413968,
+       1.876173868950258e-4,    -8.2705501176152696e-1, 8.9352433347828414e-1,
+       -4.4971003995291339e-1,  -1.6107401567546652e-6, 1.9235590165271091e-1,
+       -1.6597702160042609e-1,  6.8882222681814333e-2,  1.3910091724608687e-8,
+       -2.146911561508663e-2,   1.6228980898865892e-2,  -5.9796016172584256e-3,
+       -1.1287469112826745e-10, 1.5167451119784857e-3,  -1.0478634293553899e-3,
+       3.5539072889126421e-4,   8.1704322111801517e-13, -7.7773013442452395e-5,
+       5.0291413897007722e-5,   -1.6035083867000518e-5, 1.2469354315487605e-14,
+       3.1369106244517615e-6},
+      {1.6668949727276811,     1.165462765994632e-1,   -3.3288393225018906,
+       4.4692325482864037,     -2.6977693045875807,    -2.600667859891061e-4,
+       1.5389017615694539,     -1.4937962361134612,    6.8881964633233148e-1,
+       1.3077482004552385e-6,  -2.5762963325596288e-1, 2.1097676102125449e-1,
+       -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2,
+       -1.7813678334552311e-2, 6.3970330388900056e-3,  4.9430807090480523e-11,
+       -1.5554602758465635e-3, 1.0561196919903214e-3,  -3.5277184460472902e-4,
+       9.3002334645022459e-14, 7.5285855026557172e-5,  -4.8186515569156351e-5,
+       1.5227271505597605e-5},
+      {-6.6188298861372935,    1.3397985455142589e+1,  -1.0789350606845146e+1,
+       -1.4352254537875018e-3, 9.2333694596189809,     -1.0456552819547769e+1,
+       5.5105526029033471,     1.2024439690716742e-5,  -2.5762961164755816,
+       2.3207442745387179,     -1.0045728797216284,    -1.0207833290021914e-7,
+       3.3975092171169466e-1,  -2.6720517450757468e-1, 1.0235252851562706e-1,
+       8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2,
+       -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3,
+       -1.1082898580743683e-3, 3.654545161310169e-4,   -5.1290032026971794e-11,
+       -7.6340103696869031e-5},
+      {-1.7112706061976095e+1, -1.1208044642899116,    3.7131966511885444e+1,
+       -5.2298271025348962e+1, 3.3058589696624618e+1,  2.4791298976200222e-3,
+       -2.061089403411526e+1,  2.088672775145582e+1,   -1.0045703956517752e+1,
+       -1.2238783449063012e-5, 4.0770134274221141,     -3.473667358470195,
+       1.4329352617312006,     7.1359914411879712e-8,  -4.4797257159115612e-1,
+       3.4112666080644461e-1,  -1.2699786326594923e-1, -2.8953677269081528e-10,
+       3.3125776278259863e-2,  -2.3274087021036101e-2, 8.0399993503648882e-3,
+       -1.177805216235265e-9,  -1.8321624891071668e-3, 1.2108282933588665e-3,
+       -3.9479941246822517e-4},
+      {7.389033153567425e+1,   -1.5680141270402273e+2, 1.322177542759164e+2,
+       1.3692876877324546e-2,  -1.2366496885920151e+2, 1.4620689391062729e+2,
+       -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1,
+       -3.8210340013273034e+1, 1.719522294277362e+1,   9.3519707955168356e-7,
+       -6.2716159907747034,    5.1168999071852637,     -2.0319658112299095,
+       -4.9507215582761543e-9, 5.9626397294332597e-1,  -4.4220765337238094e-1,
+       1.6079998700166273e-1,  -2.4733786203223402e-8, -4.0307574759979762e-2,
+       2.7849050747097869e-2,  -9.4751858992054221e-3, 6.419922235909132e-6,
+       2.1250180774699461e-3},
+      {2.1216837098382522e+2,  1.3107863022633868e+1,  -4.9698285932871748e+2,
+       7.3121595266969204e+2,  -4.8213821720890847e+2, -2.8817248692894889e-2,
+       3.2616720302947102e+2,  -3.4389340280087117e+2, 1.7195193870816232e+2,
+       1.4038077378096158e-4,  -7.52594195897599e+1,   6.651969984520934e+1,
+       -2.8447519748152462e+1, -7.613702615875391e-7,  9.5402237105304373,
+       -7.5175301113311376,    2.8943997568871961,     -4.6612194999538201e-7,
+       -8.0615149598794088e-1, 5.8483006570631029e-1,  -2.0845408972964956e-1,
+       1.4765818959305817e-4,  5.1000433863753019e-2,  -3.3066252141883665e-2,
+       1.5109265210467774e-2},
+      {-9.8959643098322368e+2, 2.1925555360905233e+3,  -1.9283586782723356e+3,
+       -1.5925738122215253e-1, 1.9569985945919857e+3,  -2.4072514765081556e+3,
+       1.3756149959336496e+3,  1.2920735237496668e-3,  -7.525941715948055e+2,
+       7.3171668742208716e+2,  -3.4137023466220065e+2, -9.9857390260608043e-6,
+       1.3356313181291573e+2,  -1.1276295161252794e+2, 4.6310396098204458e+1,
+       -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1,
+       -4.1690817945270892,    3.1008219800117808e-3,  1.1220095449981468,
+       -7.6052379926149916e-1, 3.6262236505085254e-1,  2.216867741940747e-1,
+       4.8683443692930507e-1}};
+
+  int k, n, sgn;
+  int maxpow = 0;
+  const accscalar_t MACHEP = 5.9604644775390625E-8;
+  accscalar_t lambda = x / a;
+  accscalar_t sigma = (x - a) / a;
+  accscalar_t eta, res, ck, ckterm, term, absterm;
+  accscalar_t absoldterm = INFINITY;
+  accscalar_t etapow[25] = {1};
+  accscalar_t sum = 0;
+  accscalar_t afac = 1;
+
+  if (igam) {
+    sgn = -1;
+  } else {
+    sgn = 1;
+  }
+
+  if (lambda > 1) {
+    eta = ::sqrt(-2 * (::log1p(sigma) - sigma));
+  } else if (lambda < 1) {
+    eta = -::sqrt(-2 * (::log1p(sigma) - sigma));
+  } else {
+    eta = 0;
+  }
+  res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2));
+
+  for (k = 0; k < 25; k++) {
+    ck = d[k][0];
+    for (n = 1; n < 25; n++) {
+      if (n > maxpow) {
+        etapow[n] = eta * etapow[n - 1];
+        maxpow += 1;
+      }
+      ckterm = d[k][n] * etapow[n];
+      ck += ckterm;
+      if (::fabs(ckterm) < MACHEP * ::fabs(ck)) {
+        break;
+      }
+    }
+    term = ck * afac;
+    absterm = ::fabs(term);
+    if (absterm > absoldterm) {
+      break;
+    }
+    sum += term;
+    if (absterm < MACHEP * ::fabs(sum)) {
+      break;
+    }
+    absoldterm = absterm;
+    afac /= a;
+  }
+  res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a);
+
+  return res;
+}
+
+template <typename scalar_t>
+scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) {
+  // Compute igamc using DLMF 8.9.2. [igam1]
+  //
+  // Evaluates a continued fraction by forward recurrence on its numerators
+  // (pk) and denominators (qk), and returns the last accepted convergent
+  // multiplied by the prefactor obtained from _igam_helper_fac(a, x).
+  // Intermediate math runs in opmath_t<scalar_t>; the result is converted back
+  // to scalar_t on return.
+
+  using accscalar_t = opmath_t<scalar_t>;
+  int i;
+  accscalar_t ans, ax, c, yc, r, t, y, z;
+  accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
+  // Upper bound on the number of recurrence steps.
+  const int MAXITER = 2000;
+  // 2^-24: relative-change threshold used as the convergence test.
+  const accscalar_t MACHEP = 5.9604644775390625E-8;
+  // 2^24 and its reciprocal: used to rescale the recurrence state when the
+  // numerator grows too large.
+  const accscalar_t BIG = 16777216.;
+  const accscalar_t BIGINV = 5.9604644775390625E-8;
+
+  // Multiplicative prefactor; when it is exactly zero the final product is
+  // zero regardless of the fraction, so return early.
+  ax = _igam_helper_fac(a, x);
+  if (ax == 0.0) {
+    return 0.0;
+  }
+
+  /* continued fraction */
+  // Seed the recurrence with the first two numerator/denominator pairs; the
+  // initial estimate is the convergent pkm1 / qkm1.
+  y = 1.0 - a;
+  z = x + y + 1.0;
+  c = 0.0;
+  pkm2 = 1.0;
+  qkm2 = x;
+  pkm1 = x + 1.0;
+  qkm1 = z * x;
+  ans = pkm1 / qkm1;
+
+  for (i = 0; i < MAXITER; i++) {
+    c += 1.0;
+    y += 1.0;
+    z += 2.0;
+    yc = y * c;
+    // Three-term recurrence for the next numerator and denominator.
+    pk = pkm1 * z - pkm2 * yc;
+    qk = qkm1 * z - qkm2 * yc;
+    if (qk != 0) {
+      // t is the relative change between consecutive convergents.
+      r = pk / qk;
+      t = ::fabs((ans - r) / r);
+      ans = r;
+    } else {
+      // Convergent undefined for this step: keep the previous estimate and
+      // force another iteration (1.0 is always above MACHEP).
+      t = 1.0;
+    }
+    pkm2 = pkm1;
+    pkm1 = pk;
+    qkm2 = qkm1;
+    qkm1 = qk;
+    // Scale numerators and denominators by the same factor so their ratio is
+    // unchanged while their magnitudes stay bounded.
+    if (::fabs(pk) > BIG) {
+      pkm2 *= BIGINV;
+      pkm1 *= BIGINV;
+      qkm2 *= BIGINV;
+      qkm1 *= BIGINV;
+    }
+    if (t <= MACHEP) {
+      break;
+    }
+  }
+  // If MAXITER is exhausted without convergence, the latest estimate is used.
+  return ans * ax;
+}
+
+template <typename scalar_t>
+scalar_t calc_igammac(scalar_t a, scalar_t x) {
+  /* Regularized upper incomplete gamma function. The evaluation method is
+   * picked from the values of a and x:
+   * - out-of-domain or boundary inputs return a fixed value (NAN, 0 or 1)
+   * - a large and a ~ x: Uniform Asymptotic Expansions for Large Parameter
+   *   (see DLMF 8.12.4 [igam1])
+   * - x > 1.1: one minus the regularized lower incomplete gamma series when
+   *   x < a, otherwise the continued fraction
+   * - x <= 1.1: one minus the regularized lower incomplete gamma series when
+   *   a is large enough relative to x, otherwise the series from
+   *   [igam2] eq (5)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+
+  // Thresholds delimiting the asymptotic regime.
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  // Outside the region where the function is defined.
+  if ((x < 0) || (a < 0)) {
+    return NAN;
+  }
+  // Boundary: a == 0.
+  if (a == 0) {
+    if (x > 0) {
+      return 0.0;
+    }
+    return NAN;
+  }
+  // Boundary: x == 0 with a > 0.
+  if (x == 0) {
+    return 1.0;
+  }
+  // Boundary: infinite a and/or x.
+  if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 1.0;
+  }
+  if (isinf(x)) {
+    return 0.0;
+  }
+
+  // Relative distance between x and a decides whether we are in the
+  // asymptotic regime.
+  const accscalar_t rel_dist = ::fabs(x - a) / a;
+  const bool moderate_a_near_x =
+      (a > SMALL) && (a < LARGE) && (rel_dist < SMALLRATIO);
+  const bool large_a_near_x =
+      (a > LARGE) && (rel_dist < LARGERATIO / ::sqrt(a));
+  if (moderate_a_near_x || large_a_near_x) {
+    return _igam_helper_asymptotic_series(a, x, 0);
+  }
+
+  if (x > 1.1) {
+    if (!(x < a)) {
+      return _igamc_helper_continued_fraction(a, x);
+    }
+    return 1.0 - _igam_helper_series(a, x);
+  }
+
+  // Remaining inputs: choose between the complement of the lower-gamma series
+  // and the direct upper-gamma series.
+  bool via_lower_series;
+  if (x <= 0.5) {
+    via_lower_series = (-0.4 / ::log(x) < a);
+  } else {
+    via_lower_series = (x * 1.1 < a);
+  }
+  if (via_lower_series) {
+    return 1.0 - _igam_helper_series(a, x);
+  }
+  return _igamc_helper_series(a, x);
+}
+
+template <typename scalar_t>
+scalar_t calc_igamma(scalar_t a, scalar_t x) {
+  /* Regularized lower incomplete gamma function. The evaluation method is
+   * picked from the values of a and x:
+   * - out-of-domain or boundary inputs return a fixed value (NAN, 0 or 1)
+   * - a large and a ~ x: Uniform Asymptotic Expansions for Large Parameter
+   *   (see DLMF 8.12.3 [igam1])
+   * - x > 1 and x > a: one minus the regularized upper incomplete gamma
+   * - otherwise: the series from [igam2] eq (4)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+
+  // Thresholds delimiting the asymptotic regime.
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  // Boundary values following SciPy.
+  if ((x < 0) || (a < 0)) {
+    // Outside the region where the function is defined.
+    return NAN;
+  }
+  if (a == 0) {
+    if (x > 0) {
+      return 1.0;
+    }
+    return NAN;
+  }
+  if (x == 0) {
+    // Zero integration limit.
+    return 0.0;
+  }
+  if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 0.0;
+  }
+  if (isinf(x)) {
+    return 1.0;
+  }
+
+  /* Asymptotic regime where a ~ x. */
+  const accscalar_t rel_dist = ::fabs(x - a) / a;
+  const bool moderate_a_near_x =
+      (a > SMALL) && (a < LARGE) && (rel_dist < SMALLRATIO);
+  const bool large_a_near_x =
+      (a > LARGE) && (rel_dist < LARGERATIO / ::sqrt(a));
+  if (moderate_a_near_x || large_a_near_x) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  }
+
+  // For x beyond both 1 and a, go through the upper incomplete gamma.
+  const bool use_complement = (x > 1.0) && (x > a);
+  if (use_complement) {
+    return 1.0 - calc_igammac(a, x);
+  }
+
+  return _igam_helper_series(a, x);
+}
+
+} // namespace
+
+// end of regularized lower & upper incomplete gamma
+
+namespace c10 {
+namespace metal {
+
+// Public entry point for the regularized lower incomplete gamma function;
+// forwards both arguments unchanged to calc_igamma.
+template <typename T>
+inline T igamma(T a, T b) {
+  return calc_igamma<T>(a, b);
+}
+
+// Public entry point for the regularized upper incomplete gamma function;
+// forwards both arguments unchanged to calc_igammac.
+template <typename T>
+inline T igammac(T a, T b) {
+  return calc_igammac<T>(a, b);
+}
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a35aa1b87a2aa9a80cfaafd3d0cf0cf3076a215
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/indexing.h
@@ -0,0 +1,1050 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Metal indexing primitives
+#pragma once
+#include <c10/metal/common.h>
+#include <c10/metal/utils.h>
+#include <metal_stdlib>
+
+namespace c10 {
+namespace metal {
+
+// Converts per-dimension coordinates into a linear offset from the start of
+// the tensor: the sum of idx[d] * strides[d] over the first `ndim` dimensions.
+// Each stride is converted to T before multiplying, so the whole computation
+// is carried out in T.
+template <typename T>
+inline T offset_from_coord(
+    thread T idx[max_ndim],
+    constant long* strides,
+    uint ndim) {
+  T offset = 0;
+  for (uint dim = 0; dim < ndim; ++dim) {
+    const T stride = T(strides[dim]);
+    offset += idx[dim] * stride;
+  }
+  return offset;
+}
+
+// Decomposes a flat thread index into per-dimension coordinates written to
+// `pos`. Dimension 0 is peeled off first (it varies fastest): at every step
+// the remainder by the dimension size becomes the coordinate and the quotient
+// is carried over to the next dimension.
+template <typename T>
+inline void pos_from_thread_index(
+    T idx,
+    thread T pos[max_ndim],
+    constant long* sizes,
+    uint ndim) {
+  for (uint dim = 0; dim < ndim; ++dim) {
+    const T extent = T(sizes[dim]);
+    pos[dim] = idx % extent;
+    idx /= extent;
+  }
+}
+
+// Maps a flat thread index directly to a strided offset, using 64-bit
+// arithmetic: the index is first split into coordinates according to `sizes`
+// and the coordinates are then combined with `strides`.
+inline long offset_from_thread_index(
+    long idx,
+    constant long* sizes,
+    constant long* strides,
+    uint ndim) {
+  long coord[max_ndim];
+  pos_from_thread_index(idx, coord, sizes, ndim);
+  return offset_from_coord(coord, strides, ndim);
+}
+
+// Elementwise unary kernel for contiguous data: each thread applies a
+// default-constructed functor F to the input element at its grid position and
+// stores the result at the same position in the output.
+template <typename T, typename F>
+kernel void unary_dense(
+    device result_of<F, T>* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  output[index] = op(input[index]);
+}
+
+// Elementwise unary kernel for strided data. The thread index is split into
+// coordinates using `sizes`; separate input and output offsets are then built
+// from the respective strides. The offsets index the typed pointers directly,
+// i.e. they are counted in elements. Index math is done in 32-bit `int`.
+template <typename T, typename F>
+kernel void unary_strided(
+    device result_of<F, T>* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant long* sizes [[buffer(2)]],
+    constant long* input_strides [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant uint& ndim [[buffer(5)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim);
+  const auto src_offs = offset_from_coord(coord, input_strides, ndim);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim);
+  output[dst_offs] = op(input[src_offs]);
+}
+
+// REGISTER_UNARY_OP(NAME, DTYPE0, DTYPE1)
+// Explicitly instantiates the unary_dense and unary_strided kernels for input
+// type DTYPE0 using the functor NAME##_functor.
+// - A static_assert checks that DTYPE1 is exactly the functor's result type
+//   for a DTYPE0 argument.
+// - The kernels are exported under the host names
+//   "<NAME>_dense_<DTYPE1>_<DTYPE0>" and "<NAME>_strided_<DTYPE1>_<DTYPE0>".
+// The expansion does not end with a semicolon; the invocation supplies it.
+#define REGISTER_UNARY_OP(NAME, DTYPE0, DTYPE1)                                \
+  static_assert(                                                               \
+      ::metal::                                                                \
+          is_same_v<DTYPE1, ::c10::metal::result_of<NAME##_functor, DTYPE0>>,  \
+      "Output dtype mismatch for unary op " #NAME " and input " #DTYPE0);      \
+  template [[host_name(#NAME "_dense_" #DTYPE1 "_" #DTYPE0)]] kernel void ::   \
+      c10::metal::unary_dense<DTYPE0, NAME##_functor>(                         \
+          device ::c10::metal::result_of<NAME##_functor, DTYPE0> * output,     \
+          constant DTYPE0 * input,                                             \
+          uint index);                                                         \
+  template [[host_name(#NAME "_strided_" #DTYPE1 "_" #DTYPE0)]] kernel void :: \
+      c10::metal::unary_strided<DTYPE0, NAME##_functor>(                       \
+          device ::c10::metal::result_of<NAME##_functor, DTYPE0> * output,     \
+          constant DTYPE0 * input,                                             \
+          constant long* sizes,                                                \
+          constant long* input_strides,                                        \
+          constant long* output_strides,                                       \
+          constant uint& ndim,                                                 \
+          uint index)
+
+// DEFINE_UNARY_FLOATING_FUNCTOR(NAME)
+// Defines `struct NAME##_functor` wrapping the function NAME with two
+// overloads of operator(), selected by enable_if on the argument type:
+// - floating-point T: calls NAME(x) and converts the result back to T
+// - integral T: converts x to float, calls NAME and returns a float
+// The expansion does not end with a semicolon; the invocation supplies it.
+#define DEFINE_UNARY_FLOATING_FUNCTOR(NAME)                                     \
+  struct NAME##_functor {                                                       \
+    template <typename T>                                                       \
+    inline ::metal::enable_if_t<::metal::is_floating_point_v<T>, T> operator()( \
+        const T x) {                                                            \
+      return T(NAME(x));                                                        \
+    }                                                                           \
+    template <typename T>                                                       \
+    inline ::metal::enable_if_t<::metal::is_integral_v<T>, float> operator()(   \
+        const T x) {                                                            \
+      return NAME(static_cast<float>(x));                                       \
+    }                                                                           \
+  }
+
+// Elementwise unary kernel with an extra scalar parameter, for contiguous
+// data: each thread stores F()(input[index], alpha) at the same position in
+// the output. `alpha` is a single value shared by all threads.
+template <typename T, typename T2, typename F>
+kernel void unary_alpha_dense(
+    device result_of<F, T, T2>* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T2& alpha [[buffer(2)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  output[index] = op(input[index], alpha);
+}
+
+// Elementwise unary kernel with an extra scalar parameter, for strided data.
+// The thread index is split into coordinates using `sizes`; input and output
+// offsets are built from the respective strides and index the typed pointers
+// directly (element offsets). Index math is done in 32-bit `int`.
+template <typename T, typename T2, typename F>
+kernel void unary_alpha_strided(
+    device result_of<F, T, T2>* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant long* sizes [[buffer(2)]],
+    constant long* input_strides [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant uint& ndim [[buffer(5)]],
+    constant T2& alpha [[buffer(6)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim);
+  const auto src_offs = offset_from_coord(coord, input_strides, ndim);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim);
+  output[dst_offs] = op(input[src_offs], alpha);
+}
+
+// REGISTER_UNARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO)
+// Explicitly instantiates the unary_alpha_dense and unary_alpha_strided
+// kernels for input type DTYPEI and alpha type DTYPEA using the functor
+// NAME##_functor.
+// - A static_assert checks that DTYPEO is exactly the functor's result type
+//   for (DTYPEI, DTYPEA) arguments.
+// - The kernels are exported under the host names
+//   "<NAME>_dense_<DTYPEO>_<DTYPEI>_<DTYPEA>" and
+//   "<NAME>_strided_<DTYPEO>_<DTYPEI>_<DTYPEA>".
+// The expansion does not end with a semicolon; the invocation supplies it.
+#define REGISTER_UNARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO)              \
+  static_assert(                                                           \
+      ::metal::is_same_v<                                                  \
+          DTYPEO,                                                          \
+          ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEA>>,        \
+      "Output dtype mismatch for unary op " #NAME " and input " #DTYPEI);  \
+  template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI                 \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::     \
+      unary_alpha_dense<DTYPEI, DTYPEA, NAME##_functor>(                   \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEA> * \
+              output,                                                      \
+          constant DTYPEI * input,                                         \
+          constant DTYPEA & alpha,                                         \
+          uint index);                                                     \
+  template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI               \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::     \
+      unary_alpha_strided<DTYPEI, DTYPEA, NAME##_functor>(                 \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEA> * \
+              output,                                                      \
+          constant DTYPEI * input,                                         \
+          constant long* sizes,                                            \
+          constant long* input_strides,                                    \
+          constant long* output_strides,                                   \
+          constant uint& ndim,                                             \
+          constant DTYPEA& alpha,                                          \
+          uint index)
+
+// Reads a value of type T located `offs` bytes past `ptr` in the constant
+// address space.
+template <typename T>
+inline T val_at_offs(constant void* ptr, long offs) {
+  constant char* base = static_cast<constant char*>(ptr);
+  return *reinterpret_cast<constant T*>(base + offs);
+}
+
+// Reads a value of type T located `offs` bytes past `ptr` in the device
+// address space.
+template <typename T>
+inline T val_at_offs(device void* ptr, long offs) {
+  device char* base = static_cast<device char*>(ptr);
+  return *reinterpret_cast<device T*>(base + offs);
+}
+
+// Value at byte offset `offs`, read as the element type selected at runtime
+// by `type` and then converted to T with cast_to<T>.
+// P is the pointer type forwarded to the two-argument val_at_offs overloads.
+// If `type` matches none of the handled ScalarType values, a zero of type T
+// is returned, so that control never flows off the end of this non-void
+// function (which would be undefined behavior).
+template <typename T, typename P>
+inline T val_at_offs(P ptr, long offs, ScalarType type) {
+  switch (type) {
+    case ScalarType::Bool:
+      return cast_to<T>(val_at_offs<bool>(ptr, offs));
+    case ScalarType::Byte:
+      return cast_to<T>(val_at_offs<uchar>(ptr, offs));
+    case ScalarType::Char:
+      return cast_to<T>(val_at_offs<char>(ptr, offs));
+    case ScalarType::Short:
+      return cast_to<T>(val_at_offs<short>(ptr, offs));
+    case ScalarType::Int:
+      return cast_to<T>(val_at_offs<int>(ptr, offs));
+    case ScalarType::Long:
+      return cast_to<T>(val_at_offs<long>(ptr, offs));
+    // Floats
+    case ScalarType::Float:
+      return cast_to<T>(val_at_offs<float>(ptr, offs));
+    case ScalarType::Half:
+      return cast_to<T>(val_at_offs<half>(ptr, offs));
+    case ScalarType::BFloat16:
+      return cast_to<T>(val_at_offs<bfloat>(ptr, offs));
+      // Complex
+    case ScalarType::ComplexHalf:
+      return cast_to<T>(val_at_offs<half2>(ptr, offs));
+    case ScalarType::ComplexFloat:
+      return cast_to<T>(val_at_offs<float2>(ptr, offs));
+  }
+  // Fallback for an unhandled `type`. Placed after the switch instead of
+  // under a `default:` label so the compiler can still diagnose enumerators
+  // missing from the switch.
+  return T(0);
+}
+
+// Returns a writable reference to the T located `offs` bytes past `ptr` in
+// the device address space.
+template <typename T>
+inline device T& ref_at_offs(device void* ptr, long offs) {
+  device char* base = static_cast<device char*>(ptr);
+  return *reinterpret_cast<device T*>(base + offs);
+}
+
+// Binary elementwise ops kernels
+// Right now there are 5 flavors available:
+// - binary_dense where input, other and output are all dense and share the
+// same type
+// - binary_strided when all inputs are of the same type, but some elements are
+// strided
+// - binary_dense_cast - inputs are dense, but of different dtypes
+// - binary_strided_cast - inputs or output are strided and of different dtypes
+// - binary_dense_broadcast - one input is dense, another one is broadcastable
+// Note about accuracy (for more info see
+// https://github.com/pytorch/pytorch/issues/152736): sometimes a kernel is
+// invoked to produce `half` output while one of the arguments is float. In
+// that case the arguments should be upcast to float rather than downcast to
+// half. At the moment this is expressed with the optional `om_t` template
+// argument (which stands for opmath_type); by default it is T or opmath_t<T>,
+// depending on the kernel, but it could be something else.
+
+// Elementwise binary kernel for strided operands that share the element
+// type T. The thread index is split into coordinates using `sizes`; the three
+// offsets built from the strides are applied as byte offsets to the untyped
+// buffers. Both operands are converted to om_t before the functor is called
+// and the functor's result is cast to result_of<F, T, T> on store.
+// Only ndim.x is read by this kernel.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_strided(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant long* sizes [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant long* input_strides [[buffer(5)]],
+    constant long* other_strides [[buffer(6)]],
+    constant uint3& ndim [[buffer(7)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  using out_t = result_of<F, T, T>;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim.x);
+  const auto lhs_offs = offset_from_coord(coord, input_strides, ndim.x);
+  const auto rhs_offs = offset_from_coord(coord, other_strides, ndim.x);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim.x);
+  const auto lhs = val_at_offs<T>(input, lhs_offs);
+  const auto rhs = val_at_offs<T>(other, rhs_offs);
+  ref_at_offs<out_t>(output, dst_offs) =
+      static_cast<out_t>(op(om_t(lhs), om_t(rhs)));
+}
+
+// Elementwise binary kernel with an extra scalar parameter, for strided
+// operands that share the element type T. Offsets built from the strides are
+// applied as byte offsets to the untyped buffers. The functor receives both
+// operands and `alpha`; its result is stored as result_of<F, T, T, T2>.
+// Only ndim.x is read by this kernel.
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_strided(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    constant long* sizes [[buffer(4)]],
+    constant long* output_strides [[buffer(5)]],
+    constant long* input_strides [[buffer(6)]],
+    constant long* other_strides [[buffer(7)]],
+    constant uint3& ndim [[buffer(8)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim.x);
+  const auto lhs_offs = offset_from_coord(coord, input_strides, ndim.x);
+  const auto rhs_offs = offset_from_coord(coord, other_strides, ndim.x);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim.x);
+  const auto lhs = val_at_offs<T>(input, lhs_offs);
+  const auto rhs = val_at_offs<T>(other, rhs_offs);
+  ref_at_offs<result_of<F, T, T, T2>>(output, dst_offs) = op(lhs, rhs, alpha);
+}
+
+// Elementwise binary kernel for strided operands whose element types are
+// selected at runtime. ndim_types.x is the number of dimensions;
+// ndim_types.y and ndim_types.z are the ScalarType codes of `input` and
+// `other`. Each operand is read at its byte offset and converted to om_t; the
+// functor's result is cast to result_of<F, T, T> on store.
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_strided_cast(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant long* sizes [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant long* input_strides [[buffer(5)]],
+    constant long* other_strides [[buffer(6)]],
+    constant uint4& ndim_types [[buffer(7)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  using out_t = result_of<F, T, T>;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim_types.x);
+  const auto lhs_offs = offset_from_coord(coord, input_strides, ndim_types.x);
+  const auto rhs_offs = offset_from_coord(coord, other_strides, ndim_types.x);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim_types.x);
+  const auto lhs = val_at_offs<om_t>(
+      input, lhs_offs, static_cast<ScalarType>(ndim_types.y));
+  const auto rhs = val_at_offs<om_t>(
+      other, rhs_offs, static_cast<ScalarType>(ndim_types.z));
+  ref_at_offs<out_t>(output, dst_offs) = static_cast<out_t>(op(lhs, rhs));
+}
+
+// Elementwise binary kernel with an extra scalar parameter, for strided
+// operands whose element types are selected at runtime. ndim_types.x is the
+// number of dimensions; ndim_types.y and ndim_types.z are the ScalarType
+// codes of `input` and `other`. Each operand is read at its byte offset and
+// converted to T; the functor's result is stored as result_of<F, T, T, T2>.
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_strided_cast(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    constant long* sizes [[buffer(4)]],
+    constant long* output_strides [[buffer(5)]],
+    constant long* input_strides [[buffer(6)]],
+    constant long* other_strides [[buffer(7)]],
+    constant uint4& ndim_types [[buffer(8)]],
+    uint index [[thread_position_in_grid]]) {
+  F op;
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim_types.x);
+  const auto lhs_offs = offset_from_coord(coord, input_strides, ndim_types.x);
+  const auto rhs_offs = offset_from_coord(coord, other_strides, ndim_types.x);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim_types.x);
+  const auto lhs =
+      val_at_offs<T>(input, lhs_offs, static_cast<ScalarType>(ndim_types.y));
+  const auto rhs =
+      val_at_offs<T>(other, rhs_offs, static_cast<ScalarType>(ndim_types.z));
+  ref_at_offs<result_of<F, T, T, T2>>(output, dst_offs) = op(lhs, rhs, alpha);
+}
+
+// Dense binary kernel: element `tid` of `input` and `other` is converted to
+// om_t, combined by functor F, and the result is cast to result_of<F, T, T>
+// and written to out[tid].
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_dense(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T* other [[buffer(2)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const om_t lhs = om_t(input[tid]);
+  const om_t rhs = om_t(other[tid]);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary-with-alpha kernel: writes f(input[tid], other[tid], alpha)
+// to out[tid], with no op-math promotion of the operands.
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T* other [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  const T lhs = input[tid];
+  const T rhs = other[tid];
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Dense binary kernel for operands whose stored dtype is chosen at runtime.
+// sizes_types.x / .y scale `tid` into the offsets passed to val_at_offs for
+// `input` / `other` (presumably per-element sizes in bytes - confirm against
+// the host-side caller); sizes_types.z / .w are cast to ScalarType to select
+// how each operand is read. Operands are loaded as om_t and the functor
+// result is cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_dense_cast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant uint4& sizes_types [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<om_t>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<om_t>(other, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary-with-alpha kernel for operands whose stored dtype is chosen
+// at runtime. sizes_types.x / .y scale `tid` into the offsets passed to
+// val_at_offs for `input` / `other`; sizes_types.z / .w are cast to
+// ScalarType to select how each operand is read. Operands are loaded as T
+// and f(input, other, alpha) is written to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_cast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    constant uint4& sizes_types [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<T>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<T>(other, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Dense binary kernel with a broadcast right-hand operand: the second functor
+// argument is broadcast[tid % broadcast_numel], the first is input[tid].
+// Both are converted to om_t; the result is cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_dense_broadcast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T* broadcast [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  // Wrap the thread index so the broadcast buffer is reused cyclically.
+  const auto bcast_idx = tid % broadcast_numel;
+  const om_t lhs = om_t(input[tid]);
+  const om_t rhs = om_t(broadcast[bcast_idx]);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary kernel where the broadcast buffer supplies the FIRST functor
+// argument (broadcast[tid % broadcast_numel]) and input[tid] the second.
+// Both are converted to om_t; the result is cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_dense_broadcast_rhs(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant T* broadcast [[buffer(1)]],
+    constant T* input [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  // Wrap the thread index so the broadcast buffer is reused cyclically.
+  const auto bcast_idx = tid % broadcast_numel;
+  const om_t lhs = om_t(broadcast[bcast_idx]);
+  const om_t rhs = om_t(input[tid]);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary-with-alpha kernel with a broadcast second operand: writes
+// f(input[tid], broadcast[tid % broadcast_numel], alpha) to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_broadcast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T* broadcast [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant T2& alpha [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  // Wrap the thread index so the broadcast buffer is reused cyclically.
+  const auto bcast_idx = tid % broadcast_numel;
+  const T lhs = input[tid];
+  const T rhs = broadcast[bcast_idx];
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Dense binary-with-alpha kernel where the broadcast buffer supplies the
+// first functor argument: writes
+// f(broadcast[tid % broadcast_numel], input[tid], alpha) to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_broadcast_rhs(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant T* broadcast [[buffer(1)]],
+    constant T* input [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant T2& alpha [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  // Wrap the thread index so the broadcast buffer is reused cyclically.
+  const auto bcast_idx = tid % broadcast_numel;
+  const T lhs = broadcast[bcast_idx];
+  const T rhs = input[tid];
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Broadcast + runtime-dtype variant of the dense binary kernel. The first
+// operand is read from `input` at offset tid * sizes_types.x, the second from
+// `broadcast` at offset (tid % broadcast_numel) * sizes_types.y;
+// sizes_types.z / .w are cast to ScalarType to select how each is read.
+// Operands are loaded as om_t; the result is cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_dense_broadcast_cast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* broadcast [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant uint4& sizes_types [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<om_t>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<om_t>(
+      broadcast, (tid % broadcast_numel) * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Broadcast + runtime-dtype variant where the broadcast buffer supplies the
+// first functor argument. It is read from `broadcast` at offset
+// (tid % broadcast_numel) * sizes_types.x; the second argument is read from
+// `input` at offset tid * sizes_types.y. sizes_types.z / .w are cast to
+// ScalarType to select how each is read. Operands are loaded as om_t; the
+// result is cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_dense_broadcast_rhs_cast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant void* broadcast [[buffer(1)]],
+    constant void* input [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant uint4& sizes_types [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<om_t>(
+      broadcast, (tid % broadcast_numel) * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<om_t>(input, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Broadcast + runtime-dtype variant of the dense binary-with-alpha kernel.
+// The first operand is read from `input` at offset tid * sizes_types.x, the
+// second from `broadcast` at offset (tid % broadcast_numel) * sizes_types.y;
+// sizes_types.z / .w are cast to ScalarType to select how each is read.
+// Operands are loaded as T and f(first, second, alpha) is written to
+// out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_broadcast_cast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* broadcast [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant T2& alpha [[buffer(4)]],
+    constant uint4& sizes_types [[buffer(5)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<T>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<T>(
+      broadcast, (tid % broadcast_numel) * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Broadcast + runtime-dtype binary-with-alpha kernel where the broadcast
+// buffer supplies the first functor argument. It is read from `broadcast` at
+// offset (tid % broadcast_numel) * sizes_types.x; the second argument is read
+// from `input` at offset tid * sizes_types.y. sizes_types.z / .w are cast to
+// ScalarType to select how each is read. Operands are loaded as T and
+// f(first, second, alpha) is written to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_broadcast_rhs_cast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant void* broadcast [[buffer(1)]],
+    constant void* input [[buffer(2)]],
+    constant long& broadcast_numel [[buffer(3)]],
+    constant T2& alpha [[buffer(4)]],
+    constant uint4& sizes_types [[buffer(5)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<T>(
+      broadcast, (tid % broadcast_numel) * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<T>(input, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Dense binary kernel whose second operand is the single value scalar[0].
+// Both operands are converted to om_t; the result is cast to
+// result_of<F, T, T> and written to out[tid].
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_dense_scalar(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    device T* scalar [[buffer(2)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const om_t lhs = om_t(input[tid]);
+  const om_t rhs = om_t(scalar[0]);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary kernel whose first operand is the single value scalar[0] and
+// whose second operand is input[tid]. Both are converted to om_t; the result
+// is cast to result_of<F, T, T> and written to out[tid].
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void binary_dense_scalar_lhs(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    device T* scalar [[buffer(1)]],
+    constant T* input [[buffer(2)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const om_t lhs = om_t(scalar[0]);
+  const om_t rhs = om_t(input[tid]);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Runtime-dtype variant of the dense/scalar binary kernel. The first operand
+// is read from `input` at offset tid * sizes_types.x, the second from
+// `scalar` at offset 0; sizes_types.z / .w are cast to ScalarType to select
+// how each is read. Operands are loaded as om_t; the result is cast to
+// result_of<F, T, T>.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_dense_scalar_cast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    device void* scalar [[buffer(2)]],
+    constant uint4& sizes_types [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<om_t>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<om_t>(scalar, 0, rhs_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Runtime-dtype variant of the scalar-on-the-left dense binary kernel. The
+// first operand is read from `scalar` at offset 0, the second from `input`
+// at offset tid * sizes_types.y; sizes_types.z / .w are cast to ScalarType
+// to select how each is read. Operands are loaded as om_t; the result is
+// cast to result_of<F, T, T>.
+template <typename T, typename F, typename om_t = T>
+kernel void binary_dense_scalar_lhs_cast(
+    device result_of<F, T, T>* out [[buffer(0)]],
+    device void* scalar [[buffer(1)]],
+    constant void* input [[buffer(2)]],
+    constant uint4& sizes_types [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T>;
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<om_t>(scalar, 0, lhs_type);
+  const auto rhs = val_at_offs<om_t>(input, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, rhs));
+}
+
+// Dense binary-with-alpha kernel whose second operand is the single value
+// scalar[0]: writes f(input[tid], scalar[0], alpha) to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_scalar(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    device T* scalar [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  const T lhs = input[tid];
+  const T rhs = scalar[0];
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Dense binary-with-alpha kernel whose first operand is the single value
+// scalar[0]: writes f(scalar[0], input[tid], alpha) to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_scalar_lhs(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    device T* scalar [[buffer(1)]],
+    constant T* input [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  const T lhs = scalar[0];
+  const T rhs = input[tid];
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Runtime-dtype variant of the dense/scalar binary-with-alpha kernel. The
+// first operand is read from `input` at offset tid * sizes_types.x, the
+// second from `scalar` at offset 0; sizes_types.z / .w are cast to
+// ScalarType to select how each is read. Operands are loaded as T and
+// f(first, second, alpha) is written to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_scalar_cast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    device void* scalar [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    constant uint4& sizes_types [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<T>(input, tid * sizes_types.x, lhs_type);
+  const auto rhs = val_at_offs<T>(scalar, 0, rhs_type);
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// Runtime-dtype variant of the scalar-on-the-left binary-with-alpha kernel.
+// The first operand is read from `scalar` at offset 0, the second from
+// `input` at offset tid * sizes_types.y; sizes_types.z / .w are cast to
+// ScalarType to select how each is read. Operands are loaded as T and
+// f(first, second, alpha) is written to out[tid].
+template <typename T, typename T2, typename F>
+kernel void binary_alpha_dense_scalar_lhs_cast(
+    device result_of<F, T, T, T2>* out [[buffer(0)]],
+    device void* scalar [[buffer(1)]],
+    constant void* input [[buffer(2)]],
+    constant T2& alpha [[buffer(3)]],
+    constant uint4& sizes_types [[buffer(4)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto lhs_type = static_cast<ScalarType>(sizes_types.z);
+  const auto rhs_type = static_cast<ScalarType>(sizes_types.w);
+  const auto lhs = val_at_offs<T>(scalar, 0, lhs_type);
+  const auto rhs = val_at_offs<T>(input, tid * sizes_types.y, rhs_type);
+  F op;
+  out[tid] = op(lhs, rhs, alpha);
+}
+
+// REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, OMT)
+// First static_asserts that DTYPEO is exactly
+// result_of<NAME##_functor, DTYPEI, DTYPEI>, then explicitly instantiates the
+// twelve binary kernel templates (strided, dense, dense_broadcast[_rhs],
+// dense_scalar[_lhs], each with a `_cast` counterpart) for functor
+// NAME##_functor, element type DTYPEI and op-math type OMT.
+// Host-visible names: non-cast kernels are "<NAME>_<flavor>_<DTYPEO>_<DTYPEI>",
+// cast kernels are "<NAME>_<flavor>_cast_<DTYPEI>".
+// The expansion ends without a trailing semicolon; the use site supplies it.
+#define REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, OMT)                         \
+  static_assert(                                                               \
+      ::metal::is_same_v<                                                      \
+          DTYPEO,                                                              \
+          ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI>>,            \
+      "Output dtype mismatch for binary op " #NAME " and input " #DTYPEI);     \
+  template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI)]] kernel void :: \
+      c10::metal::binary_strided<DTYPEI, NAME##_functor, OMT>(                 \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other,                                                \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other_strides,                                        \
+          constant uint3& ndim,                                                \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_strided_cast_" #DTYPEI)]] kernel void ::c10::   \
+      metal::binary_strided_cast<DTYPEI, NAME##_functor, OMT>(                 \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other,                                                \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other_strides,                                        \
+          constant uint4& ndim_types,                                          \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI)]] kernel void ::   \
+      c10::metal::binary_dense<DTYPEI, NAME##_functor, OMT>(                   \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEI * other_,                                            \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_cast_" #DTYPEI)]] kernel void ::c10::     \
+      metal::binary_dense_cast<DTYPEI, NAME##_functor, OMT>(                   \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant void* input,                                                \
+          constant void* other,                                                \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_" #DTYPEO "_" #DTYPEI)]]        \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_broadcast<DTYPEI, NAME##_functor, OMT>(                     \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEI * broadcast_,                                        \
+          constant long& broadcast_numel,                                      \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_rhs_" #DTYPEO "_" #DTYPEI)]]    \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_broadcast_rhs<DTYPEI, NAME##_functor, OMT>(                 \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant DTYPEI * broadcast_,                                        \
+          constant DTYPEI * input_,                                            \
+          constant long& broadcast_numel,                                      \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_cast_" #DTYPEI)]]               \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_broadcast_cast<DTYPEI, NAME##_functor, OMT>(                \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant void* input_,                                               \
+          constant void* broadcast_,                                           \
+          constant long& broadcast_numel,                                      \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_rhs_cast_" #DTYPEI)]]           \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_broadcast_rhs_cast<DTYPEI, NAME##_functor, OMT>(            \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant void* broadcast_,                                           \
+          constant void* input_,                                               \
+          constant long& broadcast_numel,                                      \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_" #DTYPEO "_" #DTYPEI)]]           \
+  kernel void ::c10::metal::binary_dense_scalar<DTYPEI, NAME##_functor, OMT>(  \
+      device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> * out_,   \
+      constant DTYPEI * input_,                                                \
+      device DTYPEI * scalar_,                                                 \
+      uint tid);                                                               \
+  template [[host_name(#NAME "_dense_scalar_lhs_" #DTYPEO "_" #DTYPEI)]]       \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_scalar_lhs<DTYPEI, NAME##_functor, OMT>(                    \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          device DTYPEI * scalar_,                                             \
+          constant DTYPEI * input_,                                            \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_cast_" #DTYPEI)]]                  \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_scalar_cast<DTYPEI, NAME##_functor, OMT>(                   \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          constant void* input_,                                               \
+          device void* scalar_,                                                \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_lhs_cast_" #DTYPEI)]]              \
+  kernel void ::c10::metal::                                                   \
+      binary_dense_scalar_lhs_cast<DTYPEI, NAME##_functor, OMT>(               \
+          device ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI> *     \
+              out_,                                                            \
+          device void* scalar_,                                                \
+          constant void* input_,                                               \
+          constant uint4& sizes_types,                                         \
+          uint tid)
+
+// OpMath Binary Op promotes inputs to a higher-precision type before the
+// Functor call: forwards to REGISTER_BINARY_OP_ with
+// OMT = ::c10::metal::opmath_t<DTYPEI>.
+#define REGISTER_OPMATH_BINARY_OP(NAME, DTYPEI, DTYPEO) \
+  REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, ::c10::metal::opmath_t<DTYPEI>)
+
+// Plain binary op registration: forwards to REGISTER_BINARY_OP_ with
+// OMT = DTYPEI, i.e. operands are handed to the functor in their own type.
+#define REGISTER_BINARY_OP(NAME, DTYPEI, DTYPEO) \
+  REGISTER_BINARY_OP_(NAME, DTYPEI, DTYPEO, DTYPEI)
+
+// REGISTER_BINARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO)
+// First static_asserts that DTYPEO is exactly
+// result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA>, then explicitly
+// instantiates the twelve binary_alpha_* kernel templates (strided, dense,
+// dense_broadcast[_rhs], dense_scalar[_lhs], each with a `_cast` counterpart)
+// for functor NAME##_functor, element type DTYPEI and alpha type DTYPEA.
+// Host-visible names: non-cast kernels are
+// "<NAME>_<flavor>_<DTYPEO>_<DTYPEI>_<DTYPEA>", cast kernels are
+// "<NAME>_<flavor>_cast_<DTYPEI>_<DTYPEA>".
+// The expansion ends without a trailing semicolon; the use site supplies it.
+#define REGISTER_BINARY_ALPHA_OP(NAME, DTYPEI, DTYPEA, DTYPEO)                 \
+  static_assert(                                                               \
+      ::metal::is_same_v<                                                      \
+          DTYPEO,                                                              \
+          ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA>>,    \
+      "Output dtype mismatch for binary op " #NAME " and input " #DTYPEI);     \
+  template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI                   \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_strided<DTYPEI, DTYPEA, NAME##_functor>(                    \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other,                                                \
+          constant DTYPEA& alpha,                                              \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other_strides,                                        \
+          constant uint3& ndim,                                                \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_strided_cast_" #DTYPEI                          \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_strided_cast<DTYPEI, DTYPEA, NAME##_functor>(               \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other,                                                \
+          constant DTYPEA& alpha,                                              \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other_strides,                                        \
+          constant uint4& ndim_types,                                          \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI                     \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense<DTYPEI, DTYPEA, NAME##_functor>(                      \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEI * other_,                                            \
+          constant DTYPEA & alpha,                                             \
+          uint tid);                                                           \
+  template                                                                     \
+      [[host_name(#NAME "_dense_cast_" #DTYPEI "_" #DTYPEA)]] kernel void ::   \
+          c10::metal::binary_alpha_dense_cast<DTYPEI, DTYPEA, NAME##_functor>( \
+              device ::c10::metal::                                            \
+                      result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *      \
+                  out_,                                                        \
+              constant void* input,                                            \
+              constant void* other,                                            \
+              constant DTYPEA& alpha,                                          \
+              constant uint4& sizes_types,                                     \
+              uint tid);                                                       \
+  template [[host_name(#NAME "_dense_broadcast_" #DTYPEO "_" #DTYPEI           \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_broadcast<DTYPEI, DTYPEA, NAME##_functor>(            \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEI * broadcast_,                                        \
+          constant long& broadcast_numel,                                      \
+          constant DTYPEA& alpha,                                              \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_rhs_" #DTYPEO "_" #DTYPEI       \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_broadcast_rhs<DTYPEI, DTYPEA, NAME##_functor>(        \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant DTYPEI * broadcast_,                                        \
+          constant DTYPEI * input_,                                            \
+          constant long& broadcast_numel,                                      \
+          constant DTYPEA& alpha,                                              \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_cast_" #DTYPEI                  \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_broadcast_cast<DTYPEI, DTYPEA, NAME##_functor>(       \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant void* input_,                                               \
+          constant void* broadcast_,                                           \
+          constant long& broadcast_numel,                                      \
+          constant DTYPEA& alpha,                                              \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_broadcast_rhs_cast_" #DTYPEI              \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_broadcast_rhs_cast<DTYPEI, DTYPEA, NAME##_functor>(   \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant void* broadcast_,                                           \
+          constant void* input_,                                               \
+          constant long& broadcast_numel,                                      \
+          constant DTYPEA& alpha,                                              \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_" #DTYPEO "_" #DTYPEI              \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_scalar<DTYPEI, DTYPEA, NAME##_functor>(               \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          device DTYPEI * scalar_,                                             \
+          constant DTYPEA & alpha,                                             \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_lhs_" #DTYPEO "_" #DTYPEI          \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_scalar_lhs<DTYPEI, DTYPEA, NAME##_functor>(           \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          device DTYPEI * scalar_,                                             \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEA & alpha,                                             \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_cast_" #DTYPEI                     \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_scalar_cast<DTYPEI, DTYPEA, NAME##_functor>(          \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          constant void* input_,                                               \
+          device void* scalar_,                                                \
+          constant DTYPEA& alpha,                                              \
+          constant uint4& sizes_types,                                         \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_scalar_lhs_cast_" #DTYPEI                 \
+                             "_" #DTYPEA)]] kernel void ::c10::metal::         \
+      binary_alpha_dense_scalar_lhs_cast<DTYPEI, DTYPEA, NAME##_functor>(      \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEA> *          \
+              out_,                                                            \
+          device void* scalar_,                                                \
+          constant void* input_,                                               \
+          constant DTYPEA& alpha,                                              \
+          constant uint4& sizes_types,                                         \
+          uint tid)
+
+// Ternary elementwise ops kernels
+// Right now there are 4 flavors available:
+// - ternary_dense - input, other1, other2, and output are all dense and
+// share the same type
+// - ternary_strided - all inputs are of the same type, but some of the
+// tensors are strided
+// - ternary_dense_cast - inputs are dense, but of different dtypes
+// - ternary_strided_cast - inputs or output are strided and of different dtypes
+// Note about accuracy (for more info see
+// https://github.com/pytorch/pytorch/issues/152736): sometimes a kernel is
+// invoked to produce `half` output while one of the arguments is float; in
+// that case the arguments should be upcast to float rather than downcast to
+// half. At the moment this is expressed with the optional `om_t` argument
+// (which stands for opmath_type), which is identical to the output type but
+// could be something else.
+
+// Strided ternary kernel for operands that all have element type T.
+// Each thread turns its flat grid index into a coordinate (via `sizes` and
+// `ndim`), resolves that coordinate against each tensor's stride array, reads
+// one value per input, converts the three values to `om_t`, applies functor F
+// and stores the result converted to result_of<F, T, T, T>.
+// `types` is part of the argument list but is not read by this kernel.
+template <typename T, typename F, typename om_t = T>
+kernel void ternary_strided(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other1 [[buffer(2)]],
+    constant void* other2 [[buffer(3)]],
+    constant long* sizes [[buffer(4)]],
+    constant long* output_strides [[buffer(5)]],
+    constant long* input_strides [[buffer(6)]],
+    constant long* other1_strides [[buffer(7)]],
+    constant long* other2_strides [[buffer(8)]],
+    constant uint& ndim [[buffer(9)]],
+    constant uint4& types [[buffer(10)]],
+    uint index [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T, T>;
+  // Coordinate of the element handled by this thread.
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim);
+  // Every operand is addressed through its own strides.
+  const auto lhs =
+      val_at_offs<T>(input, offset_from_coord(coord, input_strides, ndim));
+  const auto mid =
+      val_at_offs<T>(other1, offset_from_coord(coord, other1_strides, ndim));
+  const auto rhs =
+      val_at_offs<T>(other2, offset_from_coord(coord, other2_strides, ndim));
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim);
+  F op;
+  ref_at_offs<res_t>(output, dst_offs) =
+      static_cast<res_t>(op(om_t(lhs), om_t(mid), om_t(rhs)));
+}
+
+// Strided ternary kernel for operands whose dtypes may differ.
+// Works like ternary_strided, except that each input is loaded through the
+// three-argument val_at_offs overload, which receives the operand's runtime
+// ScalarType (types.x / types.y / types.z for input / other1 / other2) and
+// yields an `om_t` value. The functor result is stored as
+// result_of<F, T, T, T>.
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void ternary_strided_cast(
+    device void* output [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other1 [[buffer(2)]],
+    constant void* other2 [[buffer(3)]],
+    constant long* sizes [[buffer(4)]],
+    constant long* output_strides [[buffer(5)]],
+    constant long* input_strides [[buffer(6)]],
+    constant long* other1_strides [[buffer(7)]],
+    constant long* other2_strides [[buffer(8)]],
+    constant uint& ndim [[buffer(9)]],
+    constant uint4& types [[buffer(10)]],
+    uint index [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T, T>;
+  // Coordinate of the element handled by this thread.
+  int coord[max_ndim];
+  pos_from_thread_index(int(index), coord, sizes, ndim);
+  // Runtime dtypes of the three inputs.
+  const auto input_type = static_cast<ScalarType>(types.x);
+  const auto other1_type = static_cast<ScalarType>(types.y);
+  const auto other2_type = static_cast<ScalarType>(types.z);
+  const auto lhs = val_at_offs<om_t>(
+      input, offset_from_coord(coord, input_strides, ndim), input_type);
+  const auto mid = val_at_offs<om_t>(
+      other1, offset_from_coord(coord, other1_strides, ndim), other1_type);
+  const auto rhs = val_at_offs<om_t>(
+      other2, offset_from_coord(coord, other2_strides, ndim), other2_type);
+  const auto dst_offs = offset_from_coord(coord, output_strides, ndim);
+  F op;
+  ref_at_offs<res_t>(output, dst_offs) = static_cast<res_t>(op(lhs, mid, rhs));
+}
+
+// Dense ternary kernel: all inputs are arrays of T indexed directly by the
+// thread id. Each value is converted to `om_t` before functor F is applied,
+// and the result is stored as result_of<F, T, T, T>.
+template <typename T, typename F, typename om_t = opmath_t<T>>
+kernel void ternary_dense(
+    device result_of<F, T, T, T>* out [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant T* other1 [[buffer(2)]],
+    constant T* other2 [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T, T>;
+  const auto lhs = input[tid];
+  const auto mid = other1[tid];
+  const auto rhs = other2[tid];
+  F op;
+  out[tid] = static_cast<res_t>(op(om_t(lhs), om_t(mid), om_t(rhs)));
+}
+
+// Dense ternary kernel for inputs whose dtypes may differ.
+// Input k is read at offset `tid * sizes[k]` with runtime dtype `types[k]`
+// (x / y / z for input / other1 / other2) and delivered as `om_t`; the
+// functor result is stored as result_of<F, T, T, T>.
+// NOTE(review): `sizes` presumably holds per-input element sizes - confirm
+// against the host-side dispatch code.
+template <typename T, typename F, typename om_t = T>
+kernel void ternary_dense_cast(
+    device result_of<F, T, T, T>* out [[buffer(0)]],
+    constant void* input [[buffer(1)]],
+    constant void* other1 [[buffer(2)]],
+    constant void* other2 [[buffer(3)]],
+    constant uint3& sizes [[buffer(4)]],
+    constant uint3& types [[buffer(5)]],
+    uint tid [[thread_position_in_grid]]) {
+  using res_t = result_of<F, T, T, T>;
+  // Runtime dtypes of the three inputs.
+  const auto input_type = static_cast<ScalarType>(types.x);
+  const auto other1_type = static_cast<ScalarType>(types.y);
+  const auto other2_type = static_cast<ScalarType>(types.z);
+  const auto lhs = val_at_offs<om_t>(input, tid * sizes.x, input_type);
+  const auto mid = val_at_offs<om_t>(other1, tid * sizes.y, other1_type);
+  const auto rhs = val_at_offs<om_t>(other2, tid * sizes.z, other2_type);
+  F op;
+  out[tid] = static_cast<res_t>(op(lhs, mid, rhs));
+}
+
+// Emits explicit instantiations of the four ternary kernels for functor
+// NAME##_functor with input dtype DTYPEI and op-math type OMT, after a
+// static_assert that DTYPEO equals
+// result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEI>.
+// Host-visible kernel names produced (with the macro arguments stringified):
+//   <NAME>_strided_<DTYPEO>_<DTYPEI>, <NAME>_strided_cast_<DTYPEI>,
+//   <NAME>_dense_<DTYPEO>_<DTYPEI>,   <NAME>_dense_cast_<DTYPEI>
+// The expansion ends without a trailing `;`, so the macro invocation must be
+// followed by one.
+#define REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, OMT)                        \
+  static_assert(                                                               \
+      ::metal::is_same_v<                                                      \
+          DTYPEO,                                                              \
+          ::c10::metal::result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEI>>,    \
+      "Output dtype mismatch for ternary op " #NAME " and input " #DTYPEI);    \
+  template [[host_name(#NAME "_strided_" #DTYPEO "_" #DTYPEI)]] kernel void :: \
+      c10::metal::ternary_strided<DTYPEI, NAME##_functor, OMT>(                \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other1,                                               \
+          constant void* other2,                                               \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other1_strides,                                       \
+          constant long* other2_strides,                                       \
+          constant uint& ndim,                                                 \
+          constant uint4& types,                                               \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_strided_cast_" #DTYPEI)]] kernel void ::c10::   \
+      metal::ternary_strided_cast<DTYPEI, NAME##_functor, OMT>(                \
+          device void* out,                                                    \
+          constant void* input,                                                \
+          constant void* other1,                                               \
+          constant void* other2,                                               \
+          constant long* sizes,                                                \
+          constant long* output_strides,                                       \
+          constant long* input_strides,                                        \
+          constant long* other1_strides,                                       \
+          constant long* other2_strides,                                       \
+          constant uint& ndim,                                                 \
+          constant uint4& types,                                               \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_" #DTYPEO "_" #DTYPEI)]] kernel void ::   \
+      c10::metal::ternary_dense<DTYPEI, NAME##_functor, OMT>(                  \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEI> *          \
+              out_,                                                            \
+          constant DTYPEI * input_,                                            \
+          constant DTYPEI * other1_,                                           \
+          constant DTYPEI * other2_,                                           \
+          uint tid);                                                           \
+  template [[host_name(#NAME "_dense_cast_" #DTYPEI)]] kernel void ::c10::     \
+      metal::ternary_dense_cast<DTYPEI, NAME##_functor, OMT>(                  \
+          device ::c10::metal::                                                \
+                  result_of<NAME##_functor, DTYPEI, DTYPEI, DTYPEI> *          \
+              out_,                                                            \
+          constant void* input,                                                \
+          constant void* other1,                                               \
+          constant void* other2,                                               \
+          constant uint3& sizes,                                               \
+          constant uint3& types,                                               \
+          uint tid)
+
+// OpMath ternary op: registers NAME with OMT = opmath_t<DTYPEI>, so inputs
+// are promoted to the higher precision type before the functor call
+#define REGISTER_OPMATH_TERNARY_OP(NAME, DTYPEI, DTYPEO) \
+  REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, ::c10::metal::opmath_t<DTYPEI>)
+
+// Plain ternary op: registers NAME with OMT = DTYPEI, i.e. the functor is
+// called with values of the input dtype itself (no promotion)
+#define REGISTER_TERNARY_OP(NAME, DTYPEI, DTYPEO) \
+  REGISTER_TERNARY_OP_(NAME, DTYPEI, DTYPEO, DTYPEI)
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..711e446d667decbbf3e2cfc7fc5a0da5d81d3123
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/random.h
@@ -0,0 +1,83 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Philox Counter based RNG implementation for Metal
+// Borrowed from aten/src/ATen/core/PhiloxRNGEngine.h
+// Which in turn borrowed from
+// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+#pragma once
+#include <metal_stdlib>
+
+namespace c10 {
+namespace metal {
+
+namespace detail {
+
+// Converts the low 31 bits of `value` to a float by multiplying with a
+// constant just under 2^-31; the top bit of `value` is discarded.
+constexpr float uint32_to_uniform_float(uint32_t value) {
+  // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
+  constexpr float kScale = 4.6566127342e-10;
+  const uint32_t low31 = value & 0x7FFFFFFF;
+  return static_cast<float>(low31) * kScale;
+}
+
+// Splits a 64-bit value into two 32-bit halves: x receives the upper 32 bits
+// and y the lower 32 bits.
+inline uint2 splitlong(ulong v) {
+  const uint upper = static_cast<uint>(v >> 32);
+  const uint lower = static_cast<uint>(v & 0xffffffff);
+  return uint2(upper, lower);
+}
+
+} // namespace detail
+
+namespace philox4 {
+
+// Full 32x32 -> 64-bit product of a and b, returned through
+// detail::splitlong (x = upper 32 bits, y = lower 32 bits).
+uint2 mulhilo(uint a, uint b) {
+  const ulong product = static_cast<ulong>(a) * b;
+  return detail::splitlong(product);
+}
+// One Philox-4x32 round: multiplies ctr.x and ctr.z by the round constants
+// and mixes the product halves with ctr.y, ctr.w and the two key words.
+uint4 single_round(uint4 ctr, uint2 key) {
+  constexpr uint kPhiloxSA = 0xD2511F53;
+  constexpr uint kPhiloxSB = 0xCD9E8D57;
+  const uint2 prod_a = mulhilo(kPhiloxSA, ctr.x);
+  const uint2 prod_b = mulhilo(kPhiloxSB, ctr.z);
+  uint4 next;
+  next.x = prod_b.y ^ ctr.y ^ key.x;
+  next.y = prod_b.x;
+  next.z = prod_a.y ^ ctr.w ^ key.y;
+  next.w = prod_a.x;
+  return next;
+}
+
+// Applies `rounds - 1` Philox rounds to `ctr`, bumping `key` by the key
+// schedule constants after each round, and returns the resulting counter.
+// When `rounds` is 0 or 1 the counter is returned unchanged.
+uint4 multiple_rounds(uint4 ctr, uint2 key, uint rounds) {
+  constexpr uint2 kPhilox10 = {0x9E3779B9, 0xBB67AE85};
+  // Compare `round + 1 < rounds` instead of `round < rounds - 1`: `rounds` is
+  // unsigned, so `rounds - 1` would wrap around to UINT_MAX for rounds == 0
+  // and the loop would run ~2^32 times. For rounds >= 1 both forms iterate
+  // exactly the same number of times.
+  for (uint round = 0; round + 1 < rounds; ++round) {
+    ctr = single_round(ctr, key);
+    key += kPhilox10;
+  }
+  return ctr;
+}
+
+// Produces four 32-bit random words for (seed, index): the counter starts as
+// (0, 0, split(index)), the key is split(seed), and multiple_rounds is invoked
+// with a round count of 10.
+uint4 rand(long seed, long index) {
+  const uint2 index_halves = detail::splitlong(index);
+  const uint2 key = detail::splitlong(seed);
+  const uint4 ctr = uint4(0u, 0u, index_halves.x, index_halves.y);
+  return multiple_rounds(ctr, key, 10);
+}
+
+} // namespace philox4
+
+// Returns one float for (seed, index) computed with the Box-Muller formula
+// sqrt(-2 * log(u1)) * cos(2 * pi * u2), where u1 and u2 are derived from the
+// first two words of philox4::rand.
+float randn(long seed, long index) {
+  const auto bits = philox4::rand(seed, index);
+  // Using 1 - u means an input of exactly 0 maps to 1 instead of reaching
+  // log(0).
+  const float radial_u = 1.0 - detail::uint32_to_uniform_float(bits.x);
+  const float angular_u = 1.0 - detail::uint32_to_uniform_float(bits.y);
+  const auto radius = ::metal::sqrt(-2.0 * ::metal::log(radial_u));
+  const auto phase = ::metal::cos(2.0 * M_PI_F * angular_u);
+  return radius * phase;
+}
+
+// Returns one float for (seed, index), obtained by passing the first word of
+// philox4::rand through detail::uint32_to_uniform_float.
+float rand(long seed, long index) {
+  const auto bits = philox4::rand(seed, index);
+  const float sample = detail::uint32_to_uniform_float(bits.x);
+  return sample;
+}
+
+// Returns `low` plus a float sample (from the first philox4::rand word)
+// scaled by `high - low` and truncated to long.
+long randint64(long seed, long index, long low, long high) {
+  const auto bits = philox4::rand(seed, index);
+  const auto span = high - low;
+  // TODO: Implement better algorithm for large ranges
+  const auto scaled = detail::uint32_to_uniform_float(bits.x) * span;
+  return low + static_cast<long>(scaled);
+}
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f23c1af774ed88568bc1abacc668e98760bb6f98
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/reduction_utils.h
@@ -0,0 +1,364 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/metal/utils.h>
+#include <metal_compute>
+
+namespace c10 {
+namespace metal {
+namespace detail {
+// Maps an element type T to the type handed to Metal's SIMD-group
+// operations; the primary template is the identity mapping.
+template <typename T>
+struct simd_type {
+  using t = T;
+};
+
+// Helper that allows one to run simd ops over bfloat16 by upcasting them to
+// fp32
+template <typename T>
+using simd_type_t = typename simd_type<T>::t;
+
+// bfloat values are routed through float.
+template <>
+struct simd_type<bfloat> {
+  using t = float;
+};
+} // namespace detail
+
+// SIMD-group sum for every type except `long`: the value is converted to
+// detail::simd_type_t<T>, reduced with ::metal::simd_sum and converted back
+// to T.
+template <typename T>
+inline ::metal::enable_if_t<!::metal::is_same_v<T, long>, T> simd_sum(T val) {
+  using simd_t = detail::simd_type_t<T>;
+  const auto total = ::metal::simd_sum(simd_t(val));
+  return T(total);
+}
+
+// SIMD-group product for every type except `long`: the value is converted to
+// detail::simd_type_t<T>, reduced with ::metal::simd_product and converted
+// back to T.
+template <typename T>
+inline ::metal::enable_if_t<!::metal::is_same_v<T, long>, T> simd_prod(T val) {
+  using simd_t = detail::simd_type_t<T>;
+  const auto product = ::metal::simd_product(simd_t(val));
+  return T(product);
+}
+
+// Extend simd_broadcast to 64-bit integral types using int2 trick: the value
+// is reinterpreted as an int2, broadcast from `lane_id`, and reinterpreted
+// back to T.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_integral_v<T> && sizeof(T) == 8, bool> =
+        true>
+inline T simd_broadcast(T val, ushort lane_id) {
+  const int2 halves = as_type<int2>(val);
+  const int2 received = ::metal::simd_broadcast(halves, lane_id);
+  return as_type<T>(received);
+}
+
+// simd_broadcast for every type that is not a 64-bit integral: forwards
+// directly to ::metal::simd_broadcast.
+template <
+    typename T,
+    ::metal::enable_if_t<!::metal::is_integral_v<T> || sizeof(T) != 8, bool> =
+        true>
+inline T simd_broadcast(T val, ushort lane_id) {
+  return ::metal::simd_broadcast(val, lane_id);
+}
+
+// Floating simd_min/max with nan propagation: if any thread taking part in
+// the reduction holds a NaN, a quiet NaN is returned; otherwise the value is
+// reduced with ::metal::simd_max after conversion to detail::simd_type_t<T>.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_floating_point_v<T>, bool> = true>
+inline T simd_max(T val) {
+  const bool saw_nan = ::metal::simd_any(::metal::isnan(val));
+  if (saw_nan) {
+    return ::metal::numeric_limits<T>::quiet_NaN();
+  }
+  using simd_t = detail::simd_type_t<T>;
+  return T(::metal::simd_max(simd_t(val)));
+}
+
+// Floating-point simd_min with NaN propagation: if any thread taking part in
+// the reduction holds a NaN, a quiet NaN is returned; otherwise the value is
+// reduced with ::metal::simd_min after conversion to detail::simd_type_t<T>.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_floating_point_v<T>, bool> = true>
+inline T simd_min(T val) {
+  const bool saw_nan = ::metal::simd_any(::metal::isnan(val));
+  if (saw_nan) {
+    return ::metal::numeric_limits<T>::quiet_NaN();
+  }
+  using simd_t = detail::simd_type_t<T>;
+  return T(::metal::simd_min(simd_t(val)));
+}
+
+// simd_max for integral types that are not 8 bytes wide: forwards directly
+// to ::metal::simd_max.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_integral_v<T> && sizeof(T) != 8, bool> =
+        true>
+inline T simd_max(T val) {
+  return ::metal::simd_max(val);
+}
+
+// simd_min for integral types that are not 8 bytes wide: forwards directly
+// to ::metal::simd_min.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_integral_v<T> && sizeof(T) != 8, bool> =
+        true>
+inline T simd_min(T val) {
+  return ::metal::simd_min(val);
+}
+
+// Metal does not support SIMD reductions over 64-bit types, but they can be
+// implemented using simd_shuffle_down, which yields the result in
+// log2(simdgroup_size) iterations. Use the fill variant, as shuffle down
+// returns garbage if an inactive thread is referenced (on M1/M2, works fine
+// on M4), and broadcast the result to all threads in the end. Implementation
+// heavily borrows from
+// https://github.com/ml-explore/mlx/blob/86389bf9707f46101af45d90510e8e97c8a90b93/mlx/backend/metal/kernels/reduction/ops.h#L16
+template <typename T>
+inline ::metal::enable_if_t<::metal::is_same_v<T, long>, T> simd_sum(T val) {
+  // Halve the shuffle distance each step; lane 0 ends up with the total.
+  ushort delta = simdgroup_size / 2;
+  while (delta > 0) {
+    const int2 shifted = ::metal::simd_shuffle_and_fill_down(
+        as_type<int2>(val), int2(0), delta);
+    val += as_type<T>(shifted);
+    delta /= 2;
+  }
+  return simd_broadcast(val, 0);
+}
+
+// 64-bit SIMD-group product built from simd_shuffle_and_fill_down with a
+// halving shuffle distance; the value held by lane 0 is broadcast to all
+// lanes at the end.
+template <typename T>
+inline ::metal::enable_if_t<::metal::is_same_v<T, long>, T> simd_prod(T val) {
+  ushort delta = simdgroup_size / 2;
+  while (delta > 0) {
+    const int2 shifted = ::metal::simd_shuffle_and_fill_down(
+        as_type<int2>(val), int2(0), delta);
+    val *= as_type<T>(shifted);
+    delta /= 2;
+  }
+  return simd_broadcast(val, 0);
+}
+
+// 64-bit SIMD-group maximum built from simd_shuffle_and_fill_down with a
+// halving shuffle distance; the value held by lane 0 is broadcast to all
+// lanes at the end.
+template <typename T>
+inline ::metal::enable_if_t<::metal::is_same_v<T, long>, T> simd_max(T val) {
+  ushort delta = simdgroup_size / 2;
+  while (delta > 0) {
+    const int2 shifted = ::metal::simd_shuffle_and_fill_down(
+        as_type<int2>(val), int2(0), delta);
+    val = ::metal::max(val, as_type<T>(shifted));
+    delta /= 2;
+  }
+  return simd_broadcast(val, 0);
+}
+
+// 64-bit SIMD-group minimum built from simd_shuffle_and_fill_down with a
+// halving shuffle distance; the value held by lane 0 is broadcast to all
+// lanes at the end.
+template <typename T>
+inline ::metal::enable_if_t<::metal::is_same_v<T, long>, T> simd_min(T val) {
+  ushort delta = simdgroup_size / 2;
+  while (delta > 0) {
+    const int2 shifted = ::metal::simd_shuffle_and_fill_down(
+        as_type<int2>(val), int2(0), delta);
+    val = ::metal::min(val, as_type<T>(shifted));
+    delta /= 2;
+  }
+  return simd_broadcast(val, 0);
+}
+
+// argmin/argmax helpers using simd_ballot
+// Integral argmin: returns the SIMD-group minimum together with the lowest
+// lane id whose value equals it.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_integral_v<T>, bool> = true>
+inline ::c10::metal::pair<T, ushort> simd_argmin(T val) {
+  const auto lowest = simd_min(val);
+  // One ballot bit per lane holding the minimum; ctz picks the first one.
+  const auto holders = ::metal::simd_ballot(val == lowest);
+  const auto first_lane = ::metal::ctz(static_cast<ulong>(holders));
+  return {lowest, static_cast<ushort>(first_lane)};
+}
+
+// Floating-point argmin: returns the (NaN-propagating) SIMD-group minimum
+// together with the lowest lane id whose value equals it or is NaN.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_floating_point_v<T>, bool> = true>
+inline ::c10::metal::pair<T, ushort> simd_argmin(T val) {
+  const auto lowest = simd_min(val);
+  // NaN never compares equal, so NaN lanes are added to the ballot
+  // explicitly.
+  const auto holders =
+      ::metal::simd_ballot(val == lowest || ::metal::isnan(val));
+  const auto first_lane = ::metal::ctz(static_cast<ulong>(holders));
+  return {lowest, static_cast<ushort>(first_lane)};
+}
+
+// Integral argmax: returns the SIMD-group maximum together with the lowest
+// lane id whose value equals it.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_integral_v<T>, bool> = true>
+inline ::c10::metal::pair<T, ushort> simd_argmax(T val) {
+  const auto highest = simd_max(val);
+  // One ballot bit per lane holding the maximum; ctz picks the first one.
+  const auto holders = ::metal::simd_ballot(val == highest);
+  const auto first_lane = ::metal::ctz(static_cast<ulong>(holders));
+  return {highest, static_cast<ushort>(first_lane)};
+}
+
+// Floating-point argmax: returns the (NaN-propagating) SIMD-group maximum
+// together with the lowest lane id whose value equals it or is NaN.
+template <
+    typename T,
+    ::metal::enable_if_t<::metal::is_floating_point_v<T>, bool> = true>
+inline ::c10::metal::pair<T, ushort> simd_argmax(T val) {
+  const auto highest = simd_max(val);
+  // NaN never compares equal, so NaN lanes are added to the ballot
+  // explicitly.
+  const auto holders =
+      ::metal::simd_ballot(val == highest || ::metal::isnan(val));
+  const auto first_lane = ::metal::ctz(static_cast<ulong>(holders));
+  return {highest, static_cast<ushort>(first_lane)};
+}
+
+// Argmin over (value, index) pairs: finds the lane holding the minimum and
+// broadcasts that lane's `idx_val` to the whole SIMD-group.
+template <typename ARG_T, typename IDX_T>
+inline c10::metal::pair<ARG_T, IDX_T> simd_argmin(ARG_T val, IDX_T idx_val) {
+  const auto winner = simd_argmin(val);
+  const IDX_T winner_idx = simd_broadcast(idx_val, winner.second);
+  return {winner.first, winner_idx};
+}
+
+// Argmax over (value, index) pairs: finds the lane holding the maximum and
+// broadcasts that lane's `idx_val` to the whole SIMD-group.
+template <typename ARG_T, typename IDX_T>
+inline c10::metal::pair<ARG_T, IDX_T> simd_argmax(ARG_T val, IDX_T idx_val) {
+  const auto winner = simd_argmax(val);
+  const IDX_T winner_idx = simd_broadcast(idx_val, winner.second);
+  return {winner.first, winner_idx};
+}
+
+// The algorithms below are written with the hardcoded assumption that
+// simdgroup is 32 and threadgroup_max is 1024, i.e. a reduction can be done
+// in two stages max.
+// Sums `val` across the threadgroup in opmath_t<T> precision, using `data`
+// as scratch space, and returns the total read back from data[0].
+template <typename T>
+opmath_t<T> threadgroup_sum(
+    threadgroup opmath_t<T>* data,
+    T val,
+    unsigned idx,
+    unsigned size) {
+  // Stage 1: reduce inside each simdgroup; threads whose idx is a multiple of
+  // simdgroup_size store the partial result.
+  const auto partial = simd_sum(static_cast<opmath_t<T>>(val));
+  if (idx % simdgroup_size == 0) {
+    data[idx / simdgroup_size] = partial;
+  }
+  if (size > simdgroup_size) {
+    // Stage 2: the first ceil(size / simdgroup_size) threads reduce the
+    // stored partials and thread 0 writes the final value.
+    const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+    ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+    if (idx < num_partials) {
+      const auto total = simd_sum(data[idx]);
+      if (idx == 0) {
+        data[0] = total;
+      }
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return data[0];
+}
+
+// Multiplies `val` across the threadgroup in opmath_t<T> precision, using
+// `data` as scratch space, and returns the product read back from data[0].
+template <typename T>
+opmath_t<T> threadgroup_prod(
+    threadgroup opmath_t<T>* data,
+    T val,
+    unsigned idx,
+    unsigned size) {
+  // Stage 1: reduce inside each simdgroup; threads whose idx is a multiple of
+  // simdgroup_size store the partial result.
+  const auto partial = simd_prod(static_cast<opmath_t<T>>(val));
+  if (idx % simdgroup_size == 0) {
+    data[idx / simdgroup_size] = partial;
+  }
+  if (size > simdgroup_size) {
+    // Stage 2: the first ceil(size / simdgroup_size) threads reduce the
+    // stored partials and thread 0 writes the final value.
+    const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+    ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+    if (idx < num_partials) {
+      const auto total = simd_prod(data[idx]);
+      if (idx == 0) {
+        data[0] = total;
+      }
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return data[0];
+}
+
+// Computes the maximum of `val` across the threadgroup, using `data` as
+// scratch space, and returns the result read back from data[0].
+template <typename T>
+T threadgroup_max(threadgroup T* data, T val, unsigned idx, unsigned size) {
+  // Stage 1: reduce inside each simdgroup; threads whose idx is a multiple of
+  // simdgroup_size store the partial result.
+  const auto partial = simd_max(val);
+  if (idx % simdgroup_size == 0) {
+    data[idx / simdgroup_size] = partial;
+  }
+  if (size > simdgroup_size) {
+    // Stage 2: the first ceil(size / simdgroup_size) threads reduce the
+    // stored partials and thread 0 writes the final value.
+    const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+    ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+    if (idx < num_partials) {
+      const auto overall = simd_max(data[idx]);
+      if (idx == 0) {
+        data[0] = overall;
+      }
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return data[0];
+}
+
+// Computes the minimum of `val` across the threadgroup, using `data` as
+// scratch space, and returns the result read back from data[0].
+template <typename T>
+T threadgroup_min(threadgroup T* data, T val, unsigned idx, unsigned size) {
+  // Stage 1: reduce inside each simdgroup; threads whose idx is a multiple of
+  // simdgroup_size store the partial result.
+  const auto partial = simd_min(val);
+  if (idx % simdgroup_size == 0) {
+    data[idx / simdgroup_size] = partial;
+  }
+  if (size > simdgroup_size) {
+    // Stage 2: the first ceil(size / simdgroup_size) threads reduce the
+    // stored partials and thread 0 writes the final value.
+    const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+    ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+    if (idx < num_partials) {
+      const auto overall = simd_min(data[idx]);
+      if (idx == 0) {
+        data[0] = overall;
+      }
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return data[0];
+}
+
+// Runs Welford's running mean / M2 update sequentially over the first `size`
+// entries of `data` (after a threadgroup barrier) and returns
+// float3(mean, m2, size).
+template <typename T>
+float3 threadgroup_welford_reduce(threadgroup T* data, unsigned size) {
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  float mean = data[0];
+  float sum_sq_dev = 0;
+  unsigned pos = 1;
+  while (pos < size) {
+    // Deviation from the mean before this sample is folded in.
+    const float before = data[pos] - mean;
+    mean += before / (pos + 1);
+    // Second factor uses the already updated mean.
+    sum_sq_dev += before * (data[pos] - mean);
+    ++pos;
+  }
+  return float3(mean, sum_sq_dev, size);
+}
+
+// Each vec3type is a tuple of mean (x), m2 (y) and weight (z).
+// Merges two such partial results into one; when the combined weight is
+// zero, the ratio b.z / (a.z + b.z) is replaced by 0.
+template <typename T>
+float3 welford_combine(T a, T b) {
+  const float mean_gap = b.x - a.x;
+  const float total_weight = a.z + b.z;
+  const auto b_share = total_weight != 0 ? b.z / total_weight : 0.0;
+  const auto merged_mean = a.x + mean_gap * b_share;
+  const auto merged_m2 = a.y + b.y + mean_gap * mean_gap * a.z * b_share;
+  return float3(merged_mean, merged_m2, total_weight);
+}
+
+// Folds the first `size` partial Welford results stored in `data` into one
+// (after a threadgroup barrier) by repeatedly applying welford_combine.
+template <typename T>
+float3 threadgroup_welford_combine(threadgroup T* data, unsigned size) {
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  float3 merged = data[0];
+  unsigned pos = 1;
+  while (pos < size) {
+    merged = welford_combine(merged, data[pos]);
+    ++pos;
+  }
+  return merged;
+}
+
+// Returns the `idx_val` associated with the maximum `val` in the
+// threadgroup. `arg_data` and `idx_data` are scratch buffers for the
+// per-simdgroup partial results. When `size` does not exceed simdgroup_size
+// the simdgroup-level result is returned directly and no barrier is issued.
+template <typename ARG_T, typename IDX_T>
+IDX_T threadgroup_argmax(
+    threadgroup ARG_T* arg_data,
+    threadgroup IDX_T* idx_data,
+    ARG_T val,
+    IDX_T idx_val,
+    unsigned idx,
+    unsigned size) {
+  const auto local_best = simd_argmax(val, idx_val);
+  if (size <= simdgroup_size) {
+    return local_best.second;
+  }
+  // Threads whose idx is a multiple of simdgroup_size store their
+  // simdgroup's partial result.
+  if (idx % simdgroup_size == 0) {
+    const auto slot = idx / simdgroup_size;
+    arg_data[slot] = local_best.first;
+    idx_data[slot] = local_best.second;
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  // The first ceil(size / simdgroup_size) threads reduce the partials.
+  const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+  if (idx < num_partials) {
+    const auto overall = simd_argmax(arg_data[idx], idx_data[idx]);
+    if (idx == 0) {
+      idx_data[0] = overall.second;
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return idx_data[0];
+}
+
+// Returns the `idx_val` associated with the minimum `val` in the
+// threadgroup. `arg_data` and `idx_data` are scratch buffers for the
+// per-simdgroup partial results. When `size` does not exceed simdgroup_size
+// the simdgroup-level result is returned directly and no barrier is issued.
+template <typename ARG_T, typename IDX_T>
+IDX_T threadgroup_argmin(
+    threadgroup ARG_T* arg_data,
+    threadgroup IDX_T* idx_data,
+    ARG_T val,
+    IDX_T idx_val,
+    unsigned idx,
+    unsigned size) {
+  const auto local_best = simd_argmin(val, idx_val);
+  if (size <= simdgroup_size) {
+    return local_best.second;
+  }
+  // Threads whose idx is a multiple of simdgroup_size store their
+  // simdgroup's partial result.
+  if (idx % simdgroup_size == 0) {
+    const auto slot = idx / simdgroup_size;
+    arg_data[slot] = local_best.first;
+    idx_data[slot] = local_best.second;
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  // The first ceil(size / simdgroup_size) threads reduce the partials.
+  const auto num_partials = (size + simdgroup_size - 1) / simdgroup_size;
+  if (idx < num_partials) {
+    const auto overall = simd_argmin(arg_data[idx], idx_data[idx]);
+    if (idx == 0) {
+      idx_data[0] = overall.second;
+    }
+  }
+  ::metal::threadgroup_barrier(::metal::mem_flags::mem_threadgroup);
+  return idx_data[0];
+}
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0fb82cc0ad813b59aeb2e62a0d93ca37ac0d54b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/special_math.h
@@ -0,0 +1,2064 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Implementation of special math functions for Metal
+#pragma once
+#include <c10/metal/expm1f.h>
+#include <c10/metal/igamma.h>
+#include <c10/metal/utils.h>
+#include <metal_stdlib>
+
+namespace c10 {
+namespace metal {
+
+/*
+ * Approximation to the error function.
+ * Based on code from:
+ * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199
+ * Copy-n-pasted from
+ * https://github.com/ml-explore/mlx/blob/2e8cf0b4506c200a5c2d199ecbbf655fdf4c2ce2/mlx/backend/metal/kernels/erf.h#L11
+ */
+template <typename T>
+inline float erf(T x) {
+  // Piecewise polynomial approximation evaluated in float; the branch is
+  // chosen by |x| relative to 0.927734375.
+  const auto xf = static_cast<float>(x);
+  const auto ax = ::metal::abs(xf);
+  const auto xx = xf * xf;
+  if (ax > 0.927734375f) {
+    // maximum error 0.99527 ulp
+    auto hi = ::metal::fma(
+        -1.72853470e-5f, ax, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+    const auto lo = ::metal::fma(
+        -3.88396438e-3f, ax, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+    hi = ::metal::fma(hi, xx, lo);
+    hi = ::metal::fma(hi, ax, -1.06777877e-1f); // -0x1.b55cb8p-4
+    hi = ::metal::fma(hi, ax, -6.34846687e-1f); // -0x1.450aa0p-1
+    hi = ::metal::fma(hi, ax, -1.28717512e-1f); // -0x1.079d0cp-3
+    hi = ::metal::fma(hi, ax, -ax);
+    // TODO, replace with expm1 when implemented
+    // Result is 1 - exp(poly), carrying the sign of the input.
+    return ::metal::copysign(1.0f - ::metal::exp(hi), xf);
+  }
+
+  // maximum error 0.98929 ulp
+  auto p = -5.96761703e-4f; // -0x1.38e000p-11
+  p = ::metal::fma(p, xx, 4.99119423e-3f); //  0x1.471a58p-8
+  p = ::metal::fma(p, xx, -2.67681349e-2f); // -0x1.b691b2p-6
+  p = ::metal::fma(p, xx, 1.12819925e-1f); //  0x1.ce1c44p-4
+  p = ::metal::fma(p, xx, -3.76125336e-1f); // -0x1.812700p-2
+  p = ::metal::fma(p, xx, 1.28379166e-1f); //  0x1.06eba8p-3
+  return ::metal::fma(p, xf, xf);
+}
+
+template <typename T>
+float erfc(T x) { // complementary error function, formed as 1 - erf(x)
+  return 1.0 - ::c10::metal::erf(x);
+}
+
+template <typename T>
+inline float erfinv(T y) {
+  // Rational-approximation coefficients: P/Q for |y| <= 0.7, R/S for the tails.
+  constexpr float P[4] = {0.886226899, -1.645349621, 0.914624893, -0.140543331};
+  constexpr float Q[4] = {-2.118377725, 1.442710462, -0.329097515, 0.012229801};
+  constexpr float R[4] = {-1.970840454, -1.624906493, 3.429567803, 1.641345311};
+  constexpr float S[2] = {3.543889200, 1.637067800};
+
+  const float y_f = static_cast<float>(y);
+  const float y_abs = ::metal::abs(y_f);
+  // Outside (-1, 1): NaN beyond the domain, signed infinity exactly at +/-1.
+  if (y_abs > 1.0f) {
+    return NAN;
+  }
+  if (y_abs == 1.0f) {
+    return ::metal::copysign(INFINITY, y_f);
+  }
+  if (y_abs <= 0.7f) {
+    const float z = y * y;
+    const float num = ((P[3] * z + P[2]) * z + P[1]) * z + P[0];
+    const float dem = (((Q[3] * z + Q[2]) * z + Q[1]) * z + Q[0]) * z + 1.0f;
+    return y * num / dem;
+  }
+  // Tail branch: 0.7 < |y| < 1 (NaN inputs also end up here).
+  const float z = ::metal::sqrt(-1.0f * ::metal::log((1.0 - y_abs) / 2.0));
+  const float num = ((R[3] * z + R[2]) * z + R[1]) * z + R[0];
+  const float dem = (S[1] * z + S[0]) * z + 1.0f;
+  return ::metal::copysign(num, y_f) / dem;
+}
+
+/*
+ * For licensing information and documentation, please refer to the cpu
+ * implementation located in "ATen/native/Math.h".
+ */
+
+template <typename T>
+inline T chbevl(T x, const float array[], const int len) {
+  // Chebyshev recurrence over array[0..len). b2 is zero-initialised so that
+  // len < 2 (loop never runs) does not read an indeterminate value.
+  T b0 = array[0];
+  T b1 = 0;
+  T b2 = 0;
+
+  for (int i = 1; i < len; ++i) {
+    b2 = b1;
+    b1 = b0;
+    b0 = x * b1 - b2 + array[i];
+  }
+  return T{0.5} * (b0 - b2);
+}
+
+// Copied from
+// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L502
+
+template <typename T>
+inline T i0(T _x) {
+  auto x = ::metal::fabs(_x);
+  // Works on |_x|; each branch multiplies a chbevl() series by exp(x).
+  if (x <= 8.0) {
+    /* Chebyshev coefficients for exp(-x) I0(x)
+     *   in the interval [0,8].
+     *
+     * lim(x->0){ exp(-x) I0(x) } = 1.
+     */
+    constexpr float A[] = {
+        -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    // x in [0, 8] maps to the series argument y = x/2 - 2 in [-2, 2].
+    auto y = (x / 2.0) - 2.0;
+    return static_cast<T>(::metal::exp(x) * chbevl(y, A, 30));
+  }
+
+  // Handles x > 8 case
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+   */
+  constexpr float B[] = {
+      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+      4.46562142029675999901E-17,  3.46122286769746109310E-17,
+      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+      1.77256013305652638360E-15,  3.81168066935262242075E-15,
+      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+      1.54008621752140982691E-14,  3.85277838274214270114E-13,
+      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+      1.18891471078464383424E-11,  4.94060238822496958910E-10,
+      3.39623202570838634515E-9,   2.26666899049817806459E-8,
+      2.04891858946906374183E-7,   2.89137052083475648297E-6,
+      6.88975834691682398426E-5,   3.36911647825569408990E-3,
+      8.04490411014108831608E-1};
+  // Series argument is 32/x - 2; the product is also divided by sqrt(x).
+  return static_cast<T>(
+      (::metal::exp(x) * chbevl(32.0 / x - 2.0, B, 25)) / ::metal::sqrt(x));
+}
+
+template <typename T>
+inline T i0e(T _x) {
+  auto x = ::metal::fabs(_x);
+  // Works on |_x|; returns the chbevl() series with no exp(x) factor applied.
+  if (x <= 8.0) {
+    constexpr float coefficients[] = {
+        -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    // x in [0, 8] maps to the series argument y = x/2 - 2 in [-2, 2].
+    auto y = (x / 2.0) - 2.0;
+    return static_cast<T>(chbevl(y, coefficients, int{30}));
+  }
+
+  // x > 8
+  constexpr float coefficients[] = {
+      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+      4.46562142029675999901E-17,  3.46122286769746109310E-17,
+      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+      1.77256013305652638360E-15,  3.81168066935262242075E-15,
+      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+      1.54008621752140982691E-14,  3.85277838274214270114E-13,
+      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+      1.18891471078464383424E-11,  4.94060238822496958910E-10,
+      3.39623202570838634515E-9,   2.26666899049817806459E-8,
+      2.04891858946906374183E-7,   2.89137052083475648297E-6,
+      6.88975834691682398426E-5,   3.36911647825569408990E-3,
+      8.04490411014108831608E-1};
+  // Series argument is 32/x - 2; the series value is divided by sqrt(x).
+  return static_cast<T>(
+      chbevl(32.0 / x - 2.0, coefficients, 25) / ::metal::sqrt(x));
+}
+
+// Copied from
+// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L576
+
+template <typename T>
+inline T i1(T _x) {
+  const auto x = ::metal::fabs(_x);
+  // Evaluated on |_x|; the sign of _x is re-applied to the result at the end.
+  if (x <= 8.0) {
+    // Chebyshev coefficients for exp(-x) i1(x) in the interval [0, 8]
+    //   lim(x->0){ exp(-x) i1(x) / x } = 1/2
+    constexpr float coefficients[] = {
+        2.77791411276104639959E-18, -2.11142121435816608115E-17,
+        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+        2.52587186443633654823E-1};
+    const auto y = x / 2.0 - 2.0;
+    const auto out = ::metal::exp(x) * x * chbevl(y, coefficients, 29);
+    return static_cast<T>(_x < T(0.) ? -out : out);
+  }
+
+  // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+  //   in the inverted interval [8, infinity]
+  //   lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi)
+  constexpr float coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+  const auto out = (::metal::exp(x) * chbevl(32. / x - 2., coefficients, 25)) /
+      ::metal::sqrt(x);
+  return static_cast<T>(_x < T(0.) ? -out : out);
+}
+
+template <typename T>
+inline T i1e(T _x) {
+  const auto x = ::metal::fabs(_x); // sign of _x is re-applied on return
+  if (x <= 8.0) {
+    // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8].
+    // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2.
+    constexpr float coefficients[] = { // same values as the last 17 of i1's 29
+        9.38153738649577178388E-9f,
+        -4.44505912879632808065E-8f,
+        2.00329475355213526229E-7f,
+        -8.56872026469545474066E-7f,
+        3.47025130813767847674E-6f,
+        -1.32731636560394358279E-5f,
+        4.78156510755005422638E-5f,
+        -1.61760815825896745588E-4f,
+        5.12285956168575772895E-4f,
+        -1.51357245063125314899E-3f,
+        4.15642294431288815669E-3f,
+        -1.05640848946261981558E-2f,
+        2.47264490306265168283E-2f,
+        -5.29459812080949914269E-2f,
+        1.02643658689847095384E-1f,
+        -1.76416518357834055153E-1f,
+        2.52587186443633654823E-1f};
+    const auto y = x / 2.0 - 2.0;
+    const auto out = chbevl(y, coefficients, 17) * x;
+    return static_cast<T>(_x < 0. ? -out : out);
+  }
+
+  // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+  //   in the inverted interval (8, infinity].
+  // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
+  // TODO: what's an "inverted interval"? Open on the left
+  //   and closed on the right?
+  constexpr float coefficients[] = { // same values as the last 7 of i1's 25
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+  // Series argument is 32/x - 2; no exp(x) factor is applied in this variant.
+  const auto out =
+      chbevl(32. / x - 2., coefficients, 7) / ::metal::precise::sqrt(x);
+  return static_cast<T>(_x < 0. ? -out : out);
+}
+
+// gamma, lgamma
+template <typename T>
+inline float log_gamma(const T);
+
+template <typename T>
+inline float gamma(const T x) {
+  if (x < 0.001) {
+    constexpr float EULER_MASCHERONI = 0.577215664901532860606512090;
+    // For small x, 1/gamma(x) has power series x + gamma x^2  - ...
+    // So in this range, 1/gamma(x) = x + gamma x^2 with error on the order of
+    // x^3. The relative error over this interval is less than 6e-7.
+    // NOTE(review): every x < 0.001, negative values included, takes this path.
+    return 1.0 / (x * (1.0 + EULER_MASCHERONI * x));
+  }
+  if (x >= 12.0) { // large arguments are computed as exp(log_gamma(x))
+    return ::metal::exp(log_gamma(x));
+  }
+  // The algorithm directly approximates gamma over (1,2) and uses
+  // reduction identities to reduce other arguments to this interval.
+  // numerator coefficients for gamma approximation over the interval (1,2)
+  constexpr float GAMMA_NUMERATOR_COEF[8] = {
+      -1.71618513886549492533811E+0,
+      2.47656508055759199108314E+1,
+      -3.79804256470945635097577E+2,
+      6.29331155312818442661052E+2,
+      8.66966202790413211295064E+2,
+      -3.14512729688483675254357E+4,
+      -3.61444134186911729807069E+4,
+      6.64561438202405440627855E+4};
+
+  // denominator coefficients for gamma approximation over the interval (1,2)
+  constexpr float GAMMA_DENOMINATOR_COEF[8] = {
+      -3.08402300119738975254353E+1,
+      3.15350626979604161529144E+2,
+      -1.01515636749021914166146E+3,
+      -3.10777167157231109440444E+3,
+      2.25381184209801510330112E+4,
+      4.75584627752788110767815E+3,
+      -1.34659959864969306392456E+5,
+      -1.15132259675553483497211E+5};
+
+  // Add or subtract integers as necessary to bring y into (1,2)
+  float y = 1.0 + ::metal::fract(x);
+
+  float num = 0.0;
+  float den = 1.0;
+  // Rational approximation num/den evaluated at z = y - 1.
+  float z = y - 1;
+  for (int i = 0; i < 8; i++) {
+    num = (num + GAMMA_NUMERATOR_COEF[i]) * z;
+    den = den * z + GAMMA_DENOMINATOR_COEF[i];
+  }
+  float result = num / den + 1.0;
+
+  // Apply correction if argument was not initially in (1,2)
+  if (x < 1.0) {
+    // identity gamma(z) = gamma(z+1)/z
+    result /= (y - 1.0);
+  } else {
+    // identity gamma(z+n) = z*(z+1)* ... *(z+n-1)*gamma(z)
+    auto n = static_cast<int>(::metal::floor(x));
+    for (int i = 1; i < n; i++) { // n - 1 factors: y, y + 1, ...
+      result *= y++;
+    }
+  }
+
+  return result;
+}
+
+template <typename T>
+inline float log_gamma(const T x) {
+  constexpr float LOG_PI = 1.14472988584940017414342735135305;
+  constexpr float HALF_LOG_TWO_PI = 0.91893853320467274178032973640562;
+  constexpr float LGAMMA_EXPANSION_COEF[8] = {
+      1.0 / 12.0,
+      -1.0 / 360.0,
+      1.0 / 1260.0,
+      -1.0 / 1680.0,
+      1.0 / 1188.0,
+      -691.0 / 360360.0,
+      1.0 / 156.0,
+      -3617.0 / 122400.0};
+  // rc is computed from |x|; negative x is handled by the reflection below.
+  float rc;
+
+  const auto abs_x = ::metal::abs(static_cast<float>(x));
+  if (abs_x == 0) {
+    return INFINITY;
+  }
+  if (abs_x < 12.0) {
+    rc = ::metal::log(::metal::abs(gamma(abs_x)));
+  } else {
+    // Abramowitz and Stegun 6.1.41
+    // Asymptotic series should be good to at least 11 or 12 figures
+    // For error analysis, see Whittaker and Watson
+    // A Course in Modern Analysis (1927), page 252
+
+    float z = 1.0 / (abs_x * abs_x);
+    float sum = LGAMMA_EXPANSION_COEF[7];
+    // Horner evaluation in z, from the last coefficient down to the first.
+    for (int i = 6; i >= 0; i--) {
+      sum *= z;
+      sum += LGAMMA_EXPANSION_COEF[i];
+    }
+    float series = sum / abs_x;
+
+    rc = (abs_x - 0.5) * ::metal::log(abs_x) - abs_x + HALF_LOG_TWO_PI + series;
+  }
+
+  if (x >= 0) {
+    return rc;
+  }
+
+  // Reflection formula
+  // Compute arg first to workaround Metal compiler bug of sorts on M4
+  // See https://github.com/pytorch/pytorch/pull/145740 for more details
+  auto log_arg = abs_x * ::metal::abs(::metal::sinpi(abs_x));
+  return LOG_PI - rc - ::metal::log(log_arg);
+}
+
+inline float zeta(float x, float q) {
+  constexpr float MACHEP = 1.11022302462515654042E-16; // 2^-53, stored as float
+  constexpr float ZETA_EXPANSION[] = {
+      12.0,
+      -720.0,
+      30240.0,
+      -1209600.0,
+      47900160.0,
+      -1.8924375803183791606e9,
+      7.47242496e10,
+      -2.950130727918164224e12,
+      1.1646782814350067249e14,
+      -4.5979787224074726105e15,
+      1.8152105401943546773e17,
+      -7.1661652561756670113e18};
+  if (x == 1.0f) {
+    return INFINITY;
+  }
+
+  if (x < 1.0f) {
+    return NAN;
+  }
+  // q <= 0: integer q gives infinity; otherwise a non-integer x gives NaN.
+  if (q <= 0.0f) {
+    if (q == ::metal::trunc(q)) {
+      return INFINITY;
+    }
+    if (x != ::metal::trunc(x)) {
+      return NAN;
+    }
+  }
+  // Direct sum of (q + k)^(-x): at least 9 extra terms, and until a exceeds 9.
+  float s = ::metal::pow(q, -x);
+  float a = q;
+  int i = 0;
+  float b = 0.0f;
+  while ((i < 9) || (a <= 9.0f)) {
+    i += 1;
+    a += 1.0f;
+    b = ::metal::pow(a, -x);
+    s += b;
+    if ((-MACHEP * s < b) && (b < MACHEP * s)) { // early exit: b negligible
+      return s;
+    }
+  }
+  // Tail correction built from the ZETA_EXPANSION table (at most 12 terms).
+  float w = a;
+  s += b * w / (x - 1.0f);
+  s -= 0.5f * b;
+  a = 1.0f;
+  float t;
+  float k = 0.0f;
+  for (int i = 0; i < 12; i++) { // note: shadows the outer i
+    a *= x + k;
+    b /= w;
+    t = a * b / ZETA_EXPANSION[i];
+    s += t;
+    t = ::metal::fabs(t / s);
+    if (t < MACHEP) {
+      return s;
+    }
+    k += 1.0f;
+    a *= x + k;
+    b /= w;
+    k += 1.0f;
+  }
+  return s;
+}
+
+inline float calc_digamma_positive_domain(float x) {
+  // Series coefficients stored highest power of z = 1/x^2 first (Horner order).
+  constexpr float DIGAMMA_COEF[7] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2,
+  };
+  // Push x to be >= 10 using psi(x) = psi(x + 1) - 1/x
+  float result = 0;
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    constexpr float PSI_10 = 2.25175258906672110764;
+    return result + PSI_10;
+  }
+  // Compute asymptotic digamma: log(x) - 1/(2x) - y, where
+  // y = z/12 - z^2/120 + z^3/252 - ...; Horner matches the table's order.
+  float y = 0;
+  if (x < 1.0E+17) {
+    const float z = 1.0 / (x * x);
+    for (int i = 0; i <= 6; i++) {
+      y = y * z + DIGAMMA_COEF[i];
+    }
+    y *= z;
+  }
+  return result + ::metal::log(x) - (0.5 / x) - y;
+}
+
+template <typename T0>
+inline float digamma(T0 x) {
+  // Digamma over the real line, dispatching on the sign of x.
+  if (x == 0.0f) {
+    // Zero: infinity carrying the sign of -x.
+    return ::metal::copysign(INFINITY, static_cast<float>(-x));
+  }
+
+  // Positive arguments (and NaN, which fails x < 0) need no reflection.
+  if (!(x < 0.0f)) {
+    return calc_digamma_positive_domain(x);
+  }
+
+  // Negative integers return NaN (the original note cites the C++ standard
+  // and SciPy for this convention).
+  if (x == ::metal::trunc(x)) {
+    return NAN;
+  }
+
+  // Reflection via psi(1 - x). tan() has period pi, so the fractional part
+  // of x replaces x inside tan(pi * x), avoiding error from pi * x for |x| > 1.
+  const float frac = ::metal::fract(x);
+  return calc_digamma_positive_domain(1.0f - x) -
+      M_PI_F / ::metal::tan(M_PI_F * frac);
+}
+
+template <typename T0>
+inline float polygamma(const int64_t order, const T0 input) {
+  // Order 0 is digamma itself.
+  if (order == 0) {
+    return digamma(input);
+  }
+  // Otherwise: (-1)^(order + 1) * gamma(order + 1) * zeta(order + 1, input).
+  const float order_plus_one = static_cast<float>(order) + 1;
+  const float sign = (order % 2 != 0) ? 1.0f : -1.0f;
+  const float arg = input;
+  return sign * gamma(order_plus_one) * zeta(order_plus_one, arg);
+}
+
+template <typename T>
+inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> sinc(T a) {
+  // Normalized sinc: sin(pi * a) / (pi * a), with the a == 0 case pinned to 1.
+  const T zero = static_cast<T>(0);
+  if (a == zero) return static_cast<T>(1);
+  const float pi_a = M_PI_F * static_cast<float>(a);
+  return static_cast<T>(::metal::precise::sin(pi_a) / pi_a);
+}
+
+// Complex sinc: sin(a) / a with a = pi * inp, formed as sin(a) * conj(a) / |a|^2.
+template <typename T>
+inline ::metal::enable_if_t<is_complex_v<T>, T> sinc(T inp) {
+  auto a = static_cast<float2>(inp) * M_PI_F;
+  const float a2 = a.x * a.x + a.y * a.y;
+  if (a2 == 0) {
+    return T(1.0f, 0.0f); // sin(a) / a -> 1 as a -> 0, like the scalar overload
+  }
+  float cosx;
+  float sinx = ::metal::sincos(a.x, cosx);
+  float sinhy = ::metal::sinh(a.y);
+  float coshy = ::metal::cosh(a.y);
+  auto re = sinx * coshy * a.x + cosx * sinhy * a.y;
+  auto im = cosx * sinhy * a.x - sinx * coshy * a.y;
+  return T(re, im) / a2;
+}
+
+template <typename T>
+inline T spherical_bessel_j0(T x) {
+  // sin(x) / x, with infinities mapped to 0 and a polynomial in x^2 used
+  // for |x| < 0.5.
+  if (::metal::isinf(x)) {
+    return T(0.0);
+  }
+
+  if (!(::metal::fabs(static_cast<T>(x)) < T(0.5))) {
+    return static_cast<T>(::metal::sin(x) / x);
+  }
+
+  // Horner form of 1 - x^2/3! + x^4/5! - ... + x^12/13!, innermost term
+  // first; every intermediate stays in T.
+  const T x2 = x * x;
+  const T minus_one = static_cast<T>(-1.0);
+  const T plus_one = static_cast<T>(1.0);
+  T acc = plus_one / T(6227020800.0);
+  acc = minus_one / T(39916800.0) + x2 * acc;
+  acc = plus_one / T(362880.0) + x2 * acc;
+  acc = minus_one / T(5040.0) + x2 * acc;
+  acc = plus_one / T(120.0) + x2 * acc;
+  acc = minus_one / T(6.0) + x2 * acc;
+  return T(1.0) + x2 * acc;
+}
+
+template <typename T>
+inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
+    T a,
+    T b) {
+  // max + log1p(exp(-|a - b|)) in float; equal infinities are returned as-is.
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  if (::metal::isinf(lhs) && lhs == rhs) {
+    return static_cast<T>(lhs);
+  }
+  const float gap = ::metal::abs(lhs - rhs);
+  const float larger = ::metal::max(lhs, rhs);
+  return static_cast<T>(larger + ::c10::metal::log1p(::metal::exp(-gap)));
+}
+
+// The function is ported from mlx
+template <typename T>
+inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
+  if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
+      ::metal::isnan(b.y)) {
+    return T(NAN, NAN);
+  }
+  // Order by real part. minval is the complement of maxval, so when the real
+  // parts tie the two operands are still both used (maxval = b, minval = a).
+  T maxval = a.x > b.x ? a : b;
+  T minval = a.x > b.x ? b : a;
+  constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
+  if (minval.x == -inf || maxval.x == inf) {
+    return maxval;
+  }
+  // maxval + log1p(exp(minval - maxval)), with the exponential built in float2.
+  float2 maxval_ = static_cast<float2>(maxval);
+  float2 minval_ = static_cast<float2>(minval);
+  float m = ::metal::exp(minval_.x - maxval_.x);
+  float2 dexp{
+      m * ::metal::cos(minval_.y - maxval_.y),
+      m * ::metal::sin(minval_.y - maxval_.y),
+  };
+  return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
+}
+
+template <typename T>
+inline T logaddexp2(T a, T b) {
+  // log2(2^a + 2^b) in float: max + log1p(2^(-|a - b|)) / ln(2).
+  constexpr auto ln2 = float(0.693147180559945309417232121458176);
+  constexpr auto inv_ln2 = float(1) / ln2;
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  // Equal infinities are returned unchanged.
+  if (::metal::isinf(lhs) && lhs == rhs) {
+    return static_cast<T>(lhs);
+  }
+  const float larger = ::metal::max(lhs, rhs);
+  const float pow2_gap = ::metal::pow(float(2), -::metal::abs(lhs - rhs));
+  const float correction = ::c10::metal::log1p(pow2_gap) * inv_ln2;
+  return static_cast<T>(larger + correction);
+}
+
+template <typename T>
+inline float xlog1py(T x, T y) {
+  // x * log1p(y); a NaN y always gives NaN, and otherwise a zero x is
+  // returned as-is whatever log1p(y) evaluates to.
+  if (::metal::isnan(y)) {
+    return NAN;
+  }
+
+  const float product = x * ::c10::metal::log1p(y);
+  const bool x_is_zero = (x == 0);
+  return x_is_zero ? static_cast<float>(x) : product;
+}
+
+template <typename T>
+inline T entr(T a) {
+  // Entropy term -a * log(a), extended to a == 0 (-> 0), a < 0 (-> -infinity)
+  // and NaN (propagated).
+  if (a != a) {
+    return a; // NaN compares unequal to itself
+  }
+  if (a < 0) {
+    return static_cast<T>(-INFINITY);
+  }
+  if (a == 0) {
+    return 0;
+  }
+
+  return static_cast<T>(-a * ::metal::log(a));
+}
+
+// Copy-n-paste from aten/src/ATen/native/cuda/Math.cuh lines 1463-1915
+template <typename T>
+inline float bessel_j0_forward(T x) { // presumably Bessel J0, per the name
+  constexpr float PP[] = {
+      +7.96936729297347051624e-04,
+      +8.28352392107440799803e-02,
+      +1.23953371646414299388e+00,
+      +5.44725003058768775090e+00,
+      +8.74716500199817011941e+00,
+      +5.30324038235394892183e+00,
+      +9.99999999999999997821e-01,
+  };
+
+  constexpr float PQ[] = {
+      +9.24408810558863637013e-04,
+      +8.56288474354474431428e-02,
+      +1.25352743901058953537e+00,
+      +5.47097740330417105182e+00,
+      +8.76190883237069594232e+00,
+      +5.30605288235394617618e+00,
+      +1.00000000000000000218e+00,
+  };
+
+  constexpr float QP[] = {
+      -1.13663838898469149931e-02,
+      -1.28252718670509318512e+00,
+      -1.95539544257735972385e+01,
+      -9.32060152123768231369e+01,
+      -1.77681167980488050595e+02,
+      -1.47077505154951170175e+02,
+      -5.14105326766599330220e+01,
+      -6.05014350600728481186e+00,
+  };
+
+  constexpr float QQ[] = {
+      +6.43178256118178023184e+01,
+      +8.56430025976980587198e+02,
+      +3.88240183605401609683e+03,
+      +7.24046774195652478189e+03,
+      +5.93072701187316984827e+03,
+      +2.06209331660327847417e+03,
+      +2.42005740240291393179e+02,
+  };
+
+  constexpr float RP[] = {
+      -4.79443220978201773821e+09,
+      +1.95617491946556577543e+12,
+      -2.49248344360967716204e+14,
+      +9.70862251047306323952e+15,
+  };
+
+  constexpr float RQ[] = {
+      +4.99563147152651017219e+02,
+      +1.73785401676374683123e+05,
+      +4.84409658339962045305e+07,
+      +1.11855537045356834862e+10,
+      +2.11277520115489217587e+12,
+      +3.10518229857422583814e+14,
+      +3.18121955943204943306e+16,
+      +1.71086294081043136091e+18,
+  };
+  // Fold negative arguments onto |x|.
+  if (x < T(0)) {
+    x = -x;
+  }
+  // x <= 5: RP/RQ rational function in x^2 (plus a tiny-x shortcut).
+  if (x <= T(5.0)) {
+    if (x < T(0.00001)) {
+      return 1.0 - x * x / 4.0;
+    }
+
+    float rp = 0.0;
+
+    for (auto index = 0; index <= 3; index++) {
+      rp = rp * (x * x) + RP[index];
+    }
+
+    float rq = 0.0;
+
+    for (auto index = 0; index <= 7; index++) {
+      rq = rq * (x * x) + RQ[index];
+    }
+
+    return (x * x - 5.78318596294678452118e+00) *
+        (x * x - T(3.04712623436620863991e+01)) * rp / rq;
+  }
+  // x > 5: Horner sums in 25/x^2 feed the cos/sin combination below.
+  float pp = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pp = pp * (25.0 / (x * x)) + PP[index];
+  }
+
+  float pq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pq = pq * (25.0 / (x * x)) + PQ[index];
+  }
+
+  float qp = 0.0;
+
+  for (auto index = 0; index <= 7; index++) {
+    qp = qp * (25.0 / (x * x)) + QP[index];
+  }
+
+  float qq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    qq = qq * (25.0 / (x * x)) + QQ[index];
+  }
+  // 0.785398... is pi/4 and 0.797884... is sqrt(2/pi).
+  return (pp / pq *
+              ::metal::precise::cos(
+                  x - T(0.785398163397448309615660845819875721)) -
+          5.0 / x * (qp / qq) *
+              ::metal::precise::sin(
+                  x - 0.785398163397448309615660845819875721)) *
+      0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x);
+} // bessel_j0_forward(T x)
+
+template <typename T>
+inline float bessel_y0_forward(T x) { // presumably Bessel Y0, per the name
+  constexpr float PP[] = {
+      +7.96936729297347051624e-04,
+      +8.28352392107440799803e-02,
+      +1.23953371646414299388e+00,
+      +5.44725003058768775090e+00,
+      +8.74716500199817011941e+00,
+      +5.30324038235394892183e+00,
+      +9.99999999999999997821e-01,
+  };
+
+  constexpr float PQ[] = {
+      +9.24408810558863637013e-04,
+      +8.56288474354474431428e-02,
+      +1.25352743901058953537e+00,
+      +5.47097740330417105182e+00,
+      +8.76190883237069594232e+00,
+      +5.30605288235394617618e+00,
+      +1.00000000000000000218e+00,
+  };
+
+  constexpr float QP[] = {
+      -1.13663838898469149931e-02,
+      -1.28252718670509318512e+00,
+      -1.95539544257735972385e+01,
+      -9.32060152123768231369e+01,
+      -1.77681167980488050595e+02,
+      -1.47077505154951170175e+02,
+      -5.14105326766599330220e+01,
+      -6.05014350600728481186e+00,
+  };
+
+  constexpr float QQ[] = {
+      +6.43178256118178023184e+01,
+      +8.56430025976980587198e+02,
+      +3.88240183605401609683e+03,
+      +7.24046774195652478189e+03,
+      +5.93072701187316984827e+03,
+      +2.06209331660327847417e+03,
+      +2.42005740240291393179e+02,
+  };
+
+  constexpr float YP[] = {
+      +1.55924367855235737965e+04,
+      -1.46639295903971606143e+07,
+      +5.43526477051876500413e+09,
+      -9.82136065717911466409e+11,
+      +8.75906394395366999549e+13,
+      -3.46628303384729719441e+15,
+      +4.42733268572569800351e+16,
+      -1.84950800436986690637e+16,
+  };
+
+  constexpr float YQ[] = {
+      +1.04128353664259848412e+03,
+      +6.26107330137134956842e+05,
+      +2.68919633393814121987e+08,
+      +8.64002487103935000337e+10,
+      +2.02979612750105546709e+13,
+      +3.17157752842975028269e+15,
+      +2.50596256172653059228e+17,
+  };
+  // x <= 5: -inf at zero, NaN below zero, else YP/YQ plus a log(x) * J0 term.
+  if (x <= T(5.0)) {
+    if (x == T(0.0)) {
+      return -INFINITY;
+    }
+
+    if (x < T(0.0)) {
+      return NAN;
+    }
+
+    float yp = 0.0;
+
+    for (auto index = 0; index <= 7; index++) {
+      yp = yp * (x * x) + YP[index];
+    }
+
+    float yq = 0.0;
+
+    for (auto index = 0; index <= 6; index++) {
+      yq = yq * (x * x) + YQ[index];
+    }
+    // 0.636619... is 2/pi.
+    return yp / yq +
+        (0.636619772367581343075535053490057448 * ::metal::precise::log(x) *
+         bessel_j0_forward(x));
+  }
+  // x > 5: Horner sums in 25/x^2 feed the sin/cos combination below.
+  float pp = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pp = pp * (25.0 / (x * x)) + PP[index];
+  }
+
+  float pq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pq = pq * (25.0 / (x * x)) + PQ[index];
+  }
+
+  float qp = 0.0;
+
+  for (auto index = 0; index <= 7; index++) {
+    qp = qp * (25.0 / (x * x)) + QP[index];
+  }
+
+  float qq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    qq = qq * (25.0 / (x * x)) + QQ[index];
+  }
+  // 0.785398... is pi/4 and 0.797884... is sqrt(2/pi).
+  return (pp / pq *
+              ::metal::precise::sin(
+                  x - 0.785398163397448309615660845819875721) +
+          5.0 / x * (qp / qq) *
+              ::metal::precise::cos(
+                  x - 0.785398163397448309615660845819875721)) *
+      0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x);
+} // bessel_y0_forward(T x)
+
+template <typename T>
+inline float bessel_j1_forward(T x) { // presumably Bessel J1, per the name
+  constexpr float PP[] = {
+      +7.62125616208173112003e-04,
+      +7.31397056940917570436e-02,
+      +1.12719608129684925192e+00,
+      +5.11207951146807644818e+00,
+      +8.42404590141772420927e+00,
+      +5.21451598682361504063e+00,
+      +1.00000000000000000254e+00,
+  };
+
+  constexpr float PQ[] = {
+      +5.71323128072548699714e-04,
+      +6.88455908754495404082e-02,
+      +1.10514232634061696926e+00,
+      +5.07386386128601488557e+00,
+      +8.39985554327604159757e+00,
+      +5.20982848682361821619e+00,
+      +9.99999999999999997461e-01,
+  };
+
+  constexpr float QP[] = {
+      +5.10862594750176621635e-02,
+      +4.98213872951233449420e+00,
+      +7.58238284132545283818e+01,
+      +3.66779609360150777800e+02,
+      +7.10856304998926107277e+02,
+      +5.97489612400613639965e+02,
+      +2.11688757100572135698e+02,
+      +2.52070205858023719784e+01,
+  };
+
+  constexpr float QQ[] = {
+      +7.42373277035675149943e+01,
+      +1.05644886038262816351e+03,
+      +4.98641058337653607651e+03,
+      +9.56231892404756170795e+03,
+      +7.99704160447350683650e+03,
+      +2.82619278517639096600e+03,
+      +3.36093607810698293419e+02,
+  };
+
+  constexpr float RP[] = {
+      -8.99971225705559398224e+08,
+      +4.52228297998194034323e+11,
+      -7.27494245221818276015e+13,
+      +3.68295732863852883286e+15,
+  };
+
+  constexpr float RQ[] = {
+      +6.20836478118054335476e+02,
+      +2.56987256757748830383e+05,
+      +8.35146791431949253037e+07,
+      +2.21511595479792499675e+10,
+      +4.74914122079991414898e+12,
+      +7.84369607876235854894e+14,
+      +8.95222336184627338078e+16,
+      +5.32278620332680085395e+18,
+  };
+  // Negative x: negate the value obtained for -x (done through a self-call).
+  if (x < T(0.0)) {
+    return -bessel_j1_forward(-x);
+  }
+  // x <= 5: RP/RQ rational function in x^2, times x and two quadratic factors.
+  if (x <= T(5.0)) {
+    float rp = 0.0;
+
+    for (auto index = 0; index <= 3; index++) {
+      rp = rp * (x * x) + RP[index];
+    }
+
+    float rq = 0.0;
+
+    for (auto index = 0; index <= 7; index++) {
+      rq = rq * (x * x) + RQ[index];
+    }
+
+    return rp / rq * x * (x * x - 1.46819706421238932572e+01) *
+        (x * x - 4.92184563216946036703e+01);
+  }
+  // x > 5: Horner sums in (5/x)^2 feed the cos/sin combination below.
+  float pp = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pp = pp * (5.0 / x * (5.0 / x)) + PP[index];
+  }
+
+  float pq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pq = pq * (5.0 / x * (5.0 / x)) + PQ[index];
+  }
+
+  float qp = 0.0;
+
+  for (auto index = 0; index <= 7; index++) {
+    qp = qp * (5.0 / x * (5.0 / x)) + QP[index];
+  }
+
+  float qq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    qq = qq * (5.0 / x * (5.0 / x)) + QQ[index];
+  }
+  // 2.356194... is 3*pi/4 and 0.797884... is sqrt(2/pi).
+  return (pp / pq *
+              ::metal::precise::cos(
+                  x - 2.356194490192344928846982537459627163) -
+          5.0 / x * (qp / qq) *
+              ::metal::precise::sin(
+                  x - 2.356194490192344928846982537459627163)) *
+      0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x);
+} // bessel_j1_forward(T x)
+
+// Bessel function of the second kind of order one, Y1(x), returned as float.
+//
+//  - x == 0 returns -INFINITY; x < 0 returns NAN.
+//  - 0 < x <= 5: x * YP(x^2) / YQ(x^2) plus
+//    0.6366... * (J1(x) * log(x) - 1 / x).
+//  - x > 5: asymptotic form combining rational functions of (5 / x)^2 with
+//    sin/cos of (x - 2.3561...) and a 0.7978... / sqrt(x) scale.
+// NOTE(review): 0.6366..., 2.3561... and 0.7978... agree numerically with
+// 2/pi, 3*pi/4 and sqrt(2/pi); the tables look like Cephes y1 - confirm.
+template <typename T>
+inline float bessel_y1_forward(T x) {
+  // PP/PQ and QP/QQ: numerator/denominator coefficients of the two rational
+  // functions used for x > 5 (entry 0 multiplies the highest power).
+  constexpr float PP[] = {
+      +7.62125616208173112003e-04,
+      +7.31397056940917570436e-02,
+      +1.12719608129684925192e+00,
+      +5.11207951146807644818e+00,
+      +8.42404590141772420927e+00,
+      +5.21451598682361504063e+00,
+      +1.00000000000000000254e+00,
+  };
+
+  constexpr float PQ[] = {
+      +5.71323128072548699714e-04,
+      +6.88455908754495404082e-02,
+      +1.10514232634061696926e+00,
+      +5.07386386128601488557e+00,
+      +8.39985554327604159757e+00,
+      +5.20982848682361821619e+00,
+      +9.99999999999999997461e-01,
+  };
+
+  constexpr float QP[] = {
+      +5.10862594750176621635e-02,
+      +4.98213872951233449420e+00,
+      +7.58238284132545283818e+01,
+      +3.66779609360150777800e+02,
+      +7.10856304998926107277e+02,
+      +5.97489612400613639965e+02,
+      +2.11688757100572135698e+02,
+      +2.52070205858023719784e+01,
+  };
+
+  constexpr float QQ[] = {
+      +7.42373277035675149943e+01,
+      +1.05644886038262816351e+03,
+      +4.98641058337653607651e+03,
+      +9.56231892404756170795e+03,
+      +7.99704160447350683650e+03,
+      +2.82619278517639096600e+03,
+      +3.36093607810698293419e+02,
+  };
+
+  // YP/YQ: numerator/denominator coefficients (in x^2) for the x <= 5 branch.
+  constexpr float YP[] = {
+      +1.26320474790178026440e+09,
+      -6.47355876379160291031e+11,
+      +1.14509511541823727583e+14,
+      -8.12770255501325109621e+15,
+      +2.02439475713594898196e+17,
+      -7.78877196265950026825e+17,
+  };
+
+  constexpr float YQ[] = {
+      +5.94301592346128195359e+02,
+      +2.35564092943068577943e+05,
+      +7.34811944459721705660e+07,
+      +1.87601316108706159478e+10,
+      +3.88231277496238566008e+12,
+      +6.20557727146953693363e+14,
+      +6.87141087355300489866e+16,
+      +3.97270608116560655612e+18,
+  };
+
+  if (x <= T(5.0)) {
+    if (x == T(0.0)) {
+      return -INFINITY;
+    }
+
+    // x == 0 was handled above, so this rejects strictly negative input.
+    if (x <= T(0.0)) {
+      return NAN;
+    }
+
+    // Horner evaluation of YP in x^2.
+    float yp = 0.0;
+
+    for (auto index = 0; index <= 5; index++) {
+      yp = yp * (x * x) + YP[index];
+    }
+
+    // Horner evaluation of YQ in x^2.
+    float yq = 0.0;
+
+    for (auto index = 0; index <= 7; index++) {
+      yq = yq * (x * x) + YQ[index];
+    }
+
+    return x * (yp / yq) +
+        (0.636619772367581343075535053490057448 *
+         (bessel_j1_forward(x) * ::metal::precise::log(x) - 1.0 / x));
+  }
+
+  // x > 5 (or NaN): every polynomial below is evaluated in (5 / x)^2.
+  float pp = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pp = pp * (5.0 / x * (5.0 / x)) + PP[index];
+  }
+
+  float pq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    pq = pq * (5.0 / x * (5.0 / x)) + PQ[index];
+  }
+
+  float qp = 0.0;
+
+  for (auto index = 0; index <= 7; index++) {
+    qp = qp * (5.0 / x * (5.0 / x)) + QP[index];
+  }
+
+  float qq = 0.0;
+
+  for (auto index = 0; index <= 6; index++) {
+    qq = qq * (5.0 / x * (5.0 / x)) + QQ[index];
+  }
+
+  // (pp/pq) * sin(shifted x) + (5/x) * (qp/qq) * cos(shifted x), then scaled
+  // by 0.7978... / sqrt(x).
+  return (pp / pq *
+              ::metal::precise::sin(
+                  x - 2.356194490192344928846982537459627163) +
+          5.0 / x * (qp / qq) *
+              ::metal::precise::cos(
+                  x - 2.356194490192344928846982537459627163)) *
+      0.797884560802865355879892119868763737 / ::metal::precise::sqrt(x);
+} // bessel_y1_forward(T x)
+
+// Modified Bessel function of the first kind of order zero, I0(x), returned
+// as float. Only |x| is used, so the result is even in x.
+//
+//  - |x| <= 8: series A evaluated at (0.5 * |x| - 2), multiplied by exp(|x|).
+//  - |x| > 8: series B evaluated at (32 / |x| - 2), multiplied by
+//    exp(|x|) / sqrt(|x|).
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+// NOTE(review): looks like the Cephes i0 Chebyshev tables - confirm upstream.
+template <typename T>
+inline float modified_bessel_i0_forward(T x) {
+  // Series coefficients for |x| <= 8 (30 entries).
+  constexpr float A[] = {
+      -4.41534164647933937950e-18, +3.33079451882223809783e-17,
+      -2.43127984654795469359e-16, +1.71539128555513303061e-15,
+      -1.16853328779934516808e-14, +7.67618549860493561688e-14,
+      -4.85644678311192946090e-13, +2.95505266312963983461e-12,
+      -1.72682629144155570723e-11, +9.67580903537323691224e-11,
+      -5.18979560163526290666e-10, +2.65982372468238665035e-09,
+      -1.30002500998624804212e-08, +6.04699502254191894932e-08,
+      -2.67079385394061173391e-07, +1.11738753912010371815e-06,
+      -4.41673835845875056359e-06, +1.64484480707288970893e-05,
+      -5.75419501008210370398e-05, +1.88502885095841655729e-04,
+      -5.76375574538582365885e-04, +1.63947561694133579842e-03,
+      -4.32430999505057594430e-03, +1.05464603945949983183e-02,
+      -2.37374148058994688156e-02, +4.93052842396707084878e-02,
+      -9.49010970480476444210e-02, +1.71620901522208775349e-01,
+      -3.04682672343198398683e-01, +6.76795274409476084995e-01,
+  };
+
+  // Series coefficients for |x| > 8 (25 entries).
+  constexpr float B[] = {
+      -7.23318048787475395456e-18, -4.83050448594418207126e-18,
+      +4.46562142029675999901e-17, +3.46122286769746109310e-17,
+      -2.82762398051658348494e-16, -3.42548561967721913462e-16,
+      +1.77256013305652638360e-15, +3.81168066935262242075e-15,
+      -9.55484669882830764870e-15, -4.15056934728722208663e-14,
+      +1.54008621752140982691e-14, +3.85277838274214270114e-13,
+      +7.18012445138366623367e-13, -1.79417853150680611778e-12,
+      -1.32158118404477131188e-11, -3.14991652796324136454e-11,
+      +1.18891471078464383424e-11, +4.94060238822496958910e-10,
+      +3.39623202570838634515e-09, +2.26666899049817806459e-08,
+      +2.04891858946906374183e-07, +2.89137052083475648297e-06,
+      +6.88975834691682398426e-05, +3.36911647825569408990e-03,
+      +8.04490411014108831608e-01,
+  };
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (::metal::fabs(x) <= 8.0) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 30; index++) {
+      p = q;
+      q = a;
+      a = (.5 * ::metal::fabs(x) - 2.0) * q - p + A[index];
+    }
+
+    return ::metal::exp(::metal::fabs(x)) * (T(0.5) * (a - p));
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (32.0 / ::metal::fabs(x) - 2.0) * q - p + B[index];
+  }
+
+  return ::metal::exp(::metal::fabs(x)) * (.5 * (b - p)) /
+      ::metal::precise::sqrt(::metal::fabs(x));
+} // modified_bessel_i0_forward(T x)
+
+// Modified Bessel function of the first kind of order one, I1(x), returned
+// as float.
+//
+//  - |x| <= 8: series A evaluated at (0.5 * |x| - 2); the sum is multiplied
+//    by x and exp(|x|), so the sign of the result follows x.
+//  - |x| > 8: series B evaluated at (32 / |x| - 2), multiplied by
+//    exp(|x|) / sqrt(|x|) and negated explicitly for negative x.
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+// NOTE(review): looks like the Cephes i1 Chebyshev tables - confirm upstream.
+template <typename T>
+inline float modified_bessel_i1_forward(T x) {
+  // Series coefficients for |x| <= 8 (29 entries).
+  constexpr float A[] = {
+      +2.77791411276104639959e-18, -2.11142121435816608115e-17,
+      +1.55363195773620046921e-16, -1.10559694773538630805e-15,
+      +7.60068429473540693410e-15, -5.04218550472791168711e-14,
+      +3.22379336594557470981e-13, -1.98397439776494371520e-12,
+      +1.17361862988909016308e-11, -6.66348972350202774223e-11,
+      +3.62559028155211703701e-10, -1.88724975172282928790e-09,
+      +9.38153738649577178388e-09, -4.44505912879632808065e-08,
+      +2.00329475355213526229e-07, -8.56872026469545474066e-07,
+      +3.47025130813767847674e-06, -1.32731636560394358279e-05,
+      +4.78156510755005422638e-05, -1.61760815825896745588e-04,
+      +5.12285956168575772895e-04, -1.51357245063125314899e-03,
+      +4.15642294431288815669e-03, -1.05640848946261981558e-02,
+      +2.47264490306265168283e-02, -5.29459812080949914269e-02,
+      +1.02643658689847095384e-01, -1.76416518357834055153e-01,
+      +2.52587186443633654823e-01,
+  };
+
+  // Series coefficients for |x| > 8 (25 entries).
+  constexpr float B[] = {
+      +7.51729631084210481353e-18, +4.41434832307170791151e-18,
+      -4.65030536848935832153e-17, -3.20952592199342395980e-17,
+      +2.96262899764595013876e-16, +3.30820231092092828324e-16,
+      -1.88035477551078244854e-15, -3.81440307243700780478e-15,
+      +1.04202769841288027642e-14, +4.27244001671195135429e-14,
+      -2.10154184277266431302e-14, -4.08355111109219731823e-13,
+      -7.19855177624590851209e-13, +2.03562854414708950722e-12,
+      +1.41258074366137813316e-11, +3.25260358301548823856e-11,
+      -1.89749581235054123450e-11, -5.58974346219658380687e-10,
+      -3.83538038596423702205e-09, -2.63146884688951950684e-08,
+      -2.51223623787020892529e-07, -3.88256480887769039346e-06,
+      -1.10588938762623716291e-04, -9.76109749136146840777e-03,
+      +7.78576235018280120474e-01,
+  };
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (::metal::fabs(x) <= T(8.0)) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 29; index++) {
+      p = q;
+      q = a;
+      a = (.5 * ::metal::fabs(x) - 2.0) * q - p + A[index];
+    }
+
+    return .5 * (a - p) * x * ::metal::precise::exp(::metal::fabs(x));
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (32.0 / ::metal::fabs(x) - 2.0) * q - p + B[index];
+  }
+
+  // The large-|x| expression only depends on |x|; restore the sign here.
+  if (x < 0.0) {
+    return -(
+        ::metal::precise::exp(::metal::fabs(x)) * (0.5 * (b - p)) /
+        ::metal::precise::sqrt(::metal::fabs(x)));
+  }
+
+  return ::metal::precise::exp(::metal::fabs(x)) * (0.5 * (b - p)) /
+      ::metal::precise::sqrt(::metal::fabs(x));
+} // modified_bessel_i1_forward(T x)
+
+// Modified Bessel function of the second kind of order zero, K0(x), returned
+// as float.
+//
+//  - x == 0 returns INFINITY; x < 0 returns NAN.
+//  - 0 < x <= 2: series A evaluated at (x^2 - 2), minus
+//    log(0.5 * x) * I0(x).
+//  - x > 2: series B evaluated at (8 / x - 2), multiplied by
+//    exp(-x) / sqrt(x).
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+// NOTE(review): looks like the Cephes k0 Chebyshev tables - confirm upstream.
+template <typename T>
+inline float modified_bessel_k0_forward(T x) {
+  // Series coefficients for 0 < x <= 2 (10 entries).
+  constexpr float A[] = {
+      +1.37446543561352307156e-16,
+      +4.25981614279661018399e-14,
+      +1.03496952576338420167e-11,
+      +1.90451637722020886025e-09,
+      +2.53479107902614945675e-07,
+      +2.28621210311945178607e-05,
+      +1.26461541144692592338e-03,
+      +3.59799365153615016266e-02,
+      +3.44289899924628486886e-01,
+      -5.35327393233902768720e-01,
+  };
+
+  // Series coefficients for x > 2 (25 entries).
+  constexpr float B[] = {
+      +5.30043377268626276149e-18, -1.64758043015242134646e-17,
+      +5.21039150503902756861e-17, -1.67823109680541210385e-16,
+      +5.51205597852431940784e-16, -1.84859337734377901440e-15,
+      +6.34007647740507060557e-15, -2.22751332699166985548e-14,
+      +8.03289077536357521100e-14, -2.98009692317273043925e-13,
+      +1.14034058820847496303e-12, -4.51459788337394416547e-12,
+      +1.85594911495471785253e-11, -7.95748924447710747776e-11,
+      +3.57739728140030116597e-10, -1.69753450938905987466e-09,
+      +8.57403401741422608519e-09, -4.66048989768794782956e-08,
+      +2.76681363944501510342e-07, -1.83175552271911948767e-06,
+      +1.39498137188764993662e-05, -1.28495495816278026384e-04,
+      +1.56988388573005337491e-03, -3.14481013119645005427e-02,
+      +2.44030308206595545468e+00,
+  };
+
+  if (x == 0.0) {
+    return INFINITY;
+  }
+
+  if (x < 0.0) {
+    return NAN;
+  }
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (x <= 2.0) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 10; index++) {
+      p = q;
+      q = a;
+      a = (x * x - 2.0) * q - p + A[index];
+    }
+
+    return 0.5 * (a - p) -
+        ::metal::log(0.5 * x) * modified_bessel_i0_forward(x);
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (8.0 / x - 2.0) * q - p + B[index];
+  }
+
+  return ::metal::exp(-x) * (0.5 * (b - p)) / ::metal::sqrt(x);
+} // modified_bessel_k0_forward(T x)
+
+// Modified Bessel function of the second kind of order one, K1(x), returned
+// as float.
+//
+//  - x == 0 returns INFINITY; x < 0 returns NAN.
+//  - 0 < x <= 2: log(0.5 * x) * I1(x) plus series A, evaluated at (x^2 - 2),
+//    divided by x.
+//  - x > 2: series B evaluated at (8 / x - 2), multiplied by
+//    exp(-x) / sqrt(x).
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+// NOTE(review): looks like the Cephes k1 Chebyshev tables - confirm upstream.
+template <typename T>
+inline float modified_bessel_k1_forward(T x) {
+  // Series coefficients for 0 < x <= 2 (11 entries).
+  constexpr float A[] = {
+      -7.02386347938628759343e-18,
+      -2.42744985051936593393e-15,
+      -6.66690169419932900609e-13,
+      -1.41148839263352776110e-10,
+      -2.21338763073472585583e-08,
+      -2.43340614156596823496e-06,
+      -1.73028895751305206302e-04,
+      -6.97572385963986435018e-03,
+      -1.22611180822657148235e-01,
+      -3.53155960776544875667e-01,
+      +1.52530022733894777053e+00,
+  };
+
+  // Series coefficients for x > 2 (25 entries).
+  constexpr float B[] = {
+      -5.75674448366501715755e-18, +1.79405087314755922667e-17,
+      -5.68946255844285935196e-17, +1.83809354436663880070e-16,
+      -6.05704724837331885336e-16, +2.03870316562433424052e-15,
+      -7.01983709041831346144e-15, +2.47715442448130437068e-14,
+      -8.97670518232499435011e-14, +3.34841966607842919884e-13,
+      -1.28917396095102890680e-12, +5.13963967348173025100e-12,
+      -2.12996783842756842877e-11, +9.21831518760500529508e-11,
+      -4.19035475934189648750e-10, +2.01504975519703286596e-09,
+      -1.03457624656780970260e-08, +5.74108412545004946722e-08,
+      -3.50196060308781257119e-07, +2.40648494783721712015e-06,
+      -1.93619797416608296024e-05, +1.95215518471351631108e-04,
+      -2.85781685962277938680e-03, +1.03923736576817238437e-01,
+      +2.72062619048444266945e+00,
+  };
+
+  if (x == 0.0) {
+    return INFINITY;
+  }
+
+  if (x < 0.0) {
+    return NAN;
+  }
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (x <= 2.0) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 11; index++) {
+      p = q;
+      q = a;
+      a = (x * x - T(2.0)) * q - p + A[index];
+    }
+
+    return ::metal::precise::log(T(0.5) * x) * modified_bessel_i1_forward(x) +
+        0.5 * (a - p) / x;
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (8.0 / x - 2.0) * q - p + B[index];
+  }
+
+  return ::metal::precise::exp(-x) * (0.5 * (b - p)) /
+      ::metal::precise::sqrt(x);
+}
+
+// Exponentially scaled modified Bessel function of the second kind of order
+// zero, returned as float.
+//
+//  - x == 0 returns INFINITY; x < 0 returns NAN.
+//  - 0 < x <= 2: (series A at (x^2 - 2) minus log(0.5 * x) * I0(x)),
+//    multiplied by exp(x).
+//  - x > 2: series B at (8 / x - 2) divided by sqrt(x); no exponential factor
+//    is applied in this branch.
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+template <typename T>
+inline float scaled_modified_bessel_k0_forward(T x) {
+  // Series coefficients for 0 < x <= 2 (10 entries).
+  constexpr float A[] = {
+      +1.37446543561352307156e-16,
+      +4.25981614279661018399e-14,
+      +1.03496952576338420167e-11,
+      +1.90451637722020886025e-09,
+      +2.53479107902614945675e-07,
+      +2.28621210311945178607e-05,
+      +1.26461541144692592338e-03,
+      +3.59799365153615016266e-02,
+      +3.44289899924628486886e-01,
+      -5.35327393233902768720e-01,
+  };
+
+  // Series coefficients for x > 2 (25 entries).
+  constexpr float B[] = {
+      +5.30043377268626276149e-18, -1.64758043015242134646e-17,
+      +5.21039150503902756861e-17, -1.67823109680541210385e-16,
+      +5.51205597852431940784e-16, -1.84859337734377901440e-15,
+      +6.34007647740507060557e-15, -2.22751332699166985548e-14,
+      +8.03289077536357521100e-14, -2.98009692317273043925e-13,
+      +1.14034058820847496303e-12, -4.51459788337394416547e-12,
+      +1.85594911495471785253e-11, -7.95748924447710747776e-11,
+      +3.57739728140030116597e-10, -1.69753450938905987466e-09,
+      +8.57403401741422608519e-09, -4.66048989768794782956e-08,
+      +2.76681363944501510342e-07, -1.83175552271911948767e-06,
+      +1.39498137188764993662e-05, -1.28495495816278026384e-04,
+      +1.56988388573005337491e-03, -3.14481013119645005427e-02,
+      +2.44030308206595545468e+00,
+  };
+
+  if (x == 0.0) {
+    return INFINITY;
+  }
+
+  if (x < 0.0) {
+    return NAN;
+  }
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (x <= 2.0) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 10; index++) {
+      p = q;
+      q = a;
+      a = (x * x - T(2.0)) * q - p + A[index];
+    }
+
+    return (0.5 * (a - p) -
+            ::metal::precise::log(0.5 * x) * modified_bessel_i0_forward(x)) *
+        ::metal::precise::exp(x);
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (8.0 / x - 2.0) * q - p + B[index];
+  }
+
+  return 0.5 * (b - p) / ::metal::precise::sqrt(x);
+}
+
+// Exponentially scaled modified Bessel function of the second kind of order
+// one, returned as float.
+//
+//  - x == 0 returns INFINITY; x < 0 returns NAN.
+//  - 0 < x <= 2: (log(0.5 * x) * I1(x) plus series A at (x^2 - 2) divided by
+//    x), multiplied by exp(x).
+//  - x > 2: series B at (8 / x - 2) divided by sqrt(x); no exponential factor
+//    is applied in this branch.
+// Both series are summed with the three-term recurrence
+// a = y * q - p + coeff, and the result is 0.5 * (a - p).
+template <typename T>
+inline float scaled_modified_bessel_k1_forward(T x) {
+  // Series coefficients for 0 < x <= 2 (11 entries).
+  constexpr float A[] = {
+      -7.02386347938628759343e-18,
+      -2.42744985051936593393e-15,
+      -6.66690169419932900609e-13,
+      -1.41148839263352776110e-10,
+      -2.21338763073472585583e-08,
+      -2.43340614156596823496e-06,
+      -1.73028895751305206302e-04,
+      -6.97572385963986435018e-03,
+      -1.22611180822657148235e-01,
+      -3.53155960776544875667e-01,
+      +1.52530022733894777053e+00,
+  };
+
+  // Series coefficients for x > 2 (25 entries).
+  constexpr float B[] = {
+      -5.75674448366501715755e-18, +1.79405087314755922667e-17,
+      -5.68946255844285935196e-17, +1.83809354436663880070e-16,
+      -6.05704724837331885336e-16, +2.03870316562433424052e-15,
+      -7.01983709041831346144e-15, +2.47715442448130437068e-14,
+      -8.97670518232499435011e-14, +3.34841966607842919884e-13,
+      -1.28917396095102890680e-12, +5.13963967348173025100e-12,
+      -2.12996783842756842877e-11, +9.21831518760500529508e-11,
+      -4.19035475934189648750e-10, +2.01504975519703286596e-09,
+      -1.03457624656780970260e-08, +5.74108412545004946722e-08,
+      -3.50196060308781257119e-07, +2.40648494783721712015e-06,
+      -1.93619797416608296024e-05, +1.95215518471351631108e-04,
+      -2.85781685962277938680e-03, +1.03923736576817238437e-01,
+      +2.72062619048444266945e+00,
+  };
+
+  if (x == 0.0) {
+    return INFINITY;
+  }
+
+  if (x < 0.0) {
+    return NAN;
+  }
+
+  // p is assigned on the first loop iteration of whichever branch runs.
+  float p;
+  float q = 0.0;
+
+  if (x <= 2.0) {
+    float a = A[0];
+
+    for (uint8_t index = 1; index < 11; index++) {
+      p = q;
+      q = a;
+      a = (x * x - 2.0) * q - p + A[index];
+    }
+
+    return (::metal::precise::log(0.5 * x) * modified_bessel_i1_forward(x) +
+            0.5 * (a - p) / x) *
+        ::metal::precise::exp(x);
+  }
+
+  float b = B[0];
+
+  for (uint8_t index = 1; index < 25; index++) {
+    p = q;
+    q = b;
+    b = (8.0 / x - 2.0) * q - p + B[index];
+  }
+
+  return (0.5 * (b - p) / ::metal::precise::sqrt(x));
+}
+
+// Chebyshev polynomial of the first kind, T_n(x), returned as float.
+//
+//  - n < 0 yields 0.
+//  - |x| == 1 is answered directly: +1, or -1 for x == -1 with odd n.
+//  - n > 6 with |x| < 1 uses cos(n * acos(x)).
+//  - otherwise the recurrence T_k = 2x * T_{k-1} - T_{k-2} is iterated from
+//    T_0 = 1, T_1 = x; the loop stops early once the running value is NaN.
+template <typename T>
+float chebyshev_polynomial_t_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (::metal::fabs(x) == 1.0) {
+    if (x > 0.0 || n % 2 == 0) {
+      return 1.0;
+    }
+
+    return -1.0;
+  }
+
+  if ((n > 6) && (::metal::precise::fabs(x) < 1.0)) {
+    return ::metal::precise::cos(n * ::metal::precise::acos(x));
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  if (n == 1) {
+    return x;
+  }
+
+  float p = 1.0;
+  float q = x;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = (x + x) * q - p;
+    p = q;
+    q = r;
+  }
+  return r;
+}
+
+// Chebyshev polynomial of the second kind, U_n(x), returned as float.
+//
+//  - n < 0 yields 0.
+//  - |x| == 1 is answered directly: n + 1, or -(n + 1) for x == -1 with odd n.
+//  - n > 8 with |x| < 1 uses sin((n + 1) * theta) / sin(theta) with
+//    theta = acos(x), falling back to (n + 1) * cos((n + 1) * theta) / x when
+//    sin(theta) evaluates to zero.
+//  - otherwise the recurrence U_k = 2x * U_{k-1} - U_{k-2} is iterated from
+//    U_0 = 1, U_1 = 2x; the loop stops early once the running value is NaN.
+template <typename T>
+float chebyshev_polynomial_u_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (::metal::fabs(x) == 1.0) {
+    if (x > 0.0 || n % 2 == 0) {
+      return n + 1;
+    }
+
+    return -(n + 1);
+  }
+
+  if ((n > 8) && (::metal::fabs(x) < 1.0)) {
+    const auto acos_x = ::metal::precise::acos(x);
+    if (::metal::precise::sin(acos_x) != 0.0) {
+      return ::metal::precise::sin((n + 1) * acos_x) /
+          ::metal::precise::sin(acos_x);
+    }
+
+    return (n + 1) * ::metal::precise::cos((n + 1) * acos_x) / x;
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  auto q = 2.0 * x;
+  if (n == 1) {
+    return q;
+  }
+
+  auto p = 1.0;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = 2 * x * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+}
+
+// Chebyshev polynomial of the third kind, V_n(x), returned as float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields 1; x == -1 yields 2n + 1 for even n and -(2n + 1) for odd n.
+//  - n > 8 with |x| < 1 uses cos((n + 0.5) * theta) / cos(theta / 2) with
+//    theta = acos(x), unless sin(theta / 2) evaluates to exactly 1, in which
+//    case the x == -1 values are returned.
+//  - otherwise the recurrence V_k = 2x * V_{k-1} - V_{k-2} is iterated from
+//    V_0 = 1, V_1 = 2x - 1; the loop stops early once the running value is
+//    NaN.
+template <typename T>
+float chebyshev_polynomial_v_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (::metal::fabs(x) == 1.0) {
+    if (x > 0.0) {
+      return 1.0;
+    }
+
+    if (n % 2 == 0) {
+      return n + n + 1;
+    }
+
+    return -(n + n + 1);
+  }
+
+  if ((n > 8) && (::metal::fabs(x) < 1.0)) {
+    const auto acos_x = ::metal::precise::acos(x);
+    if (::metal::precise::sin(.5 * acos_x) != 1.0) {
+      return ::metal::precise::cos((n + 0.5) * acos_x) /
+          ::metal::precise::cos(.5 * acos_x);
+    }
+
+    if (n % 2 == 0) {
+      return n + n + 1;
+    }
+
+    return -(n + n + 1);
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  auto q = 2.0 * x - 1.0;
+  if (n == 1) {
+    return q;
+  }
+
+  auto p = 1.0;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = 2 * x * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // chebyshev_polynomial_v_forward(T x, int64_t n)
+
+// Chebyshev polynomial of the fourth kind, W_n(x), returned as float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields 2n + 1; x == -1 yields +1 for even n and -1 for odd n.
+//  - n > 8 with |x| < 1 uses sin((n + 0.5) * theta) / sin(theta / 2) with
+//    theta = acos(x), unless cos(theta / 2) evaluates to exactly 1, in which
+//    case the |x| == 1 values are returned.
+//  - otherwise the recurrence W_k = 2x * W_{k-1} - W_{k-2} is iterated from
+//    W_0 = 1, W_1 = 2x + 1; the loop stops early once the running value is
+//    NaN.
+template <typename T>
+float chebyshev_polynomial_w_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (::metal::fabs(x) == 1.0) {
+    if (x > 0.0) {
+      return n + n + 1;
+    }
+
+    if (n % 2 == 0) {
+      return 1.0;
+    }
+
+    return -1.0;
+  }
+
+  if ((n > 8) && (::metal::fabs(x) < 1.0)) {
+    const auto acos_x = ::metal::precise::acos(x);
+    if (::metal::precise::cos(.5 * acos_x) != 1.0) {
+      return ::metal::precise::sin((n + 0.5) * acos_x) /
+          ::metal::precise::sin(.5 * acos_x);
+    }
+
+    if (x > 0.0) {
+      return n + n + 1;
+    }
+
+    if (n % 2 == 0) {
+      return 1.0;
+    }
+
+    return -1.0;
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  auto q = 2.0 * x + 1.0;
+  if (n == 1) {
+    return q;
+  }
+
+  auto p = 1.0;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = 2.0 * x * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // chebyshev_polynomial_w_forward(T x, int64_t n)
+
+// Shifted Chebyshev polynomial of the first kind, T_n(2x - 1), returned as
+// float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields 1; x == 0 yields +1 for even n and -1 for odd n.
+//  - n > 6 with |2x - 1| < 1 uses cos(n * acos(2x - 1)).
+//  - otherwise the three-term recurrence in (2x - 1) is iterated from the
+//    seeds 1 and (2x - 1); the loop stops early once the running value is NaN.
+template <typename T>
+float shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (x == T(1.0)) {
+    return 1.0;
+  }
+
+  if (x == 0.0) {
+    if (n % 2 == 0) {
+      return 1.0;
+    }
+
+    return -1.0;
+  }
+
+  const float xpxm1 = x + x - 1.0;
+  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
+    return ::metal::precise::cos(n * ::metal::precise::acos(xpxm1));
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  if (n == 1) {
+    return xpxm1;
+  }
+
+  float p = 1.0;
+  float q = xpxm1;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = (xpxm1 + xpxm1) * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
+
+// Shifted Chebyshev polynomial of the second kind, U_n(2x - 1), returned as
+// float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields n + 1; x == 0 yields n + 1 for even n and -(n + 1) for odd
+//    n.
+//  - n > 6 with |2x - 1| < 1 uses sin((n + 1) * theta) / sin(theta) with
+//    theta = acos(2x - 1), falling back to
+//    (n + 1) * cos((n + 1) * theta) / (2x - 1) when sin(theta) is zero.
+//  - otherwise the three-term recurrence in (2x - 1) is iterated from the
+//    seeds 1 and 2 * (2x - 1); the loop stops early once the running value is
+//    NaN.
+template <typename T>
+float shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (x == 1.0) {
+    return n + 1;
+  }
+
+  if (x == 0.0) {
+    if (n % 2 == 0) {
+      return n + 1;
+    }
+
+    return -(n + 1);
+  }
+  const float xpxm1 = x + x - 1.0;
+  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
+    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
+    const float divisor = ::metal::precise::sin(acos_2xm1);
+    if (divisor != 0.0) {
+      return ::metal::precise::sin((n + 1) * acos_2xm1) / divisor;
+    }
+
+    return (n + 1) * ::metal::precise::cos((n + 1) * acos_2xm1) / xpxm1;
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  if (n == 1) {
+    return xpxm1 + xpxm1;
+  }
+
+  float p = 1.0;
+  float q = xpxm1 + xpxm1;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = (xpxm1 + xpxm1) * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
+
+// Shifted Chebyshev polynomial of the third kind, V_n(2x - 1), returned as
+// float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields 1; x == 0 yields 2n + 1 for even n and -(2n + 1) for odd n.
+//  - n > 6 with |2x - 1| < 1 uses cos((n + 0.5) * theta) / cos(theta / 2)
+//    with theta = acos(2x - 1), unless sin(theta / 2) evaluates to exactly 1,
+//    in which case the x == 0 values are returned.
+//  - otherwise the three-term recurrence in (2x - 1) is iterated from the
+//    seeds 1 and 2 * (2x - 1) - 1; the loop stops early once the running value
+//    is NaN.
+template <typename T>
+float shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (x == 1.0) {
+    return 1.0;
+  }
+
+  if (x == 0.0) {
+    if (n % 2 == 0) {
+      return (n + n + 1);
+    }
+
+    return -(n + n + 1);
+  }
+
+  const float xpxm1 = x + x - 1.0;
+  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
+    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
+    if (::metal::precise::sin(acos_2xm1 / 2.0) != 1.0) {
+      return ::metal::precise::cos((n + 0.5) * acos_2xm1) /
+          ::metal::precise::cos(acos_2xm1 / 2.0);
+    }
+
+    if (n % 2 == 0) {
+      return n + n + 1;
+    }
+
+    return -(n + n + 1);
+  }
+
+  if (n == 0) {
+    return T(1.0);
+  }
+
+  if (n == 1) {
+    return xpxm1 + xpxm1 - 1.0;
+  }
+
+  float p = 1.0;
+  float q = xpxm1 + xpxm1 - 1.0;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = (xpxm1 + xpxm1) * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
+
+// Shifted Chebyshev polynomial of the fourth kind, W_n(2x - 1), returned as
+// float.
+//
+//  - n < 0 yields 0.
+//  - x == 1 yields 2n + 1; x == 0 yields +1 for even n and -1 for odd n.
+//  - n > 4 with |2x - 1| < 1 uses sin((n + 0.5) * theta) / sin(theta / 2)
+//    with theta = acos(2x - 1), unless cos(theta / 2) evaluates to exactly 1,
+//    in which case the x == 1 value 2n + 1 is returned.
+//  - otherwise the three-term recurrence in (2x - 1) is iterated from the
+//    seeds 1 and 2 * (2x - 1) + 1; the loop stops early once the running value
+//    is NaN.
+template <typename T>
+float shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+
+  if (x == 1.0) {
+    return n + n + 1;
+  }
+
+  if (x == 0.0) {
+    if (n % 2 == 0) {
+      return 1.0;
+    }
+
+    return -1.0;
+  }
+
+  const float xpxm1 = x + x - 1.0;
+  if ((n > 4) && (::metal::abs(xpxm1) < 1.0)) {
+    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
+    if (::metal::precise::cos(acos_2xm1 / 2.0) != 1.0) {
+      return ::metal::precise::sin((n + 0.5) * acos_2xm1) /
+          ::metal::precise::sin(acos_2xm1 / 2.0);
+    }
+
+    // cos(theta / 2) can only round to 1 when theta is ~0, i.e. 2x - 1 is ~1,
+    // where the limit of sin((n + 0.5) * theta) / sin(theta / 2) is 2n + 1.
+    // The former parity-based +/-1 result is the value at 2x - 1 == -1 and was
+    // wrong for this end of the interval.
+    return n + n + 1;
+  }
+
+  if (n == 0) {
+    return 1.0;
+  }
+
+  if (n == 1) {
+    return xpxm1 + xpxm1 + 1.0;
+  }
+
+  float p = 1.0;
+  float q = xpxm1 + xpxm1 + 1.0;
+  // Seed r with q: when x is NaN the loop body never runs, and the function
+  // must return NaN instead of reading an uninitialized value.
+  float r = q;
+
+  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
+    r = (xpxm1 + xpxm1) * q - p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
+
+// Upper bound on the polynomial order, compared against n by the Hermite
+// evaluators in this file (they return NaN for larger n). The value is the
+// same for every T.
+// TODO: Add 512 if/when double will be supported in Metal
+template <typename T>
+inline constexpr int getHermitianLimit() {
+  return int(128);
+}
+
+// Hermite polynomial H_n(x), returned as float.
+//
+// Returns 0 for n < 0, 1 for n == 0, 2x for n == 1 and NaN when n exceeds
+// getHermitianLimit<T>(). Larger orders are built with the recurrence
+// next = 2x * current - weight * previous, where weight runs over
+// 2, 4, ..., 2n - 2.
+template <typename T>
+inline float hermite_polynomial_h_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+  if (n == 0) {
+    return 1.0;
+  }
+  if (n == 1) {
+    return x + x;
+  }
+  if (n > getHermitianLimit<T>()) {
+    return NAN;
+  }
+
+  // n >= 2 from here on, so the loop below always runs at least once.
+  float previous = 1.0;
+  float current = x + x;
+  int64_t weight = 2;
+
+  while (weight < n + n) {
+    const float next = (x + x) * current - weight * previous;
+    previous = current;
+    current = next;
+    weight += 2;
+  }
+
+  return current;
+} // hermite_polynomial_h_forward(T x, int64_t n)
+
+// Hermite polynomial He_n(x), returned as float.
+//
+// Returns 0 for n < 0, 1 for n == 0, x for n == 1 and NaN when n exceeds
+// getHermitianLimit<T>(). Larger orders are built with the recurrence
+// next = x * current - order * previous for order = 1 .. n - 1.
+template <typename T>
+inline float hermite_polynomial_he_forward(T x, int64_t n) {
+  if (n < 0) {
+    return 0.0;
+  }
+  if (n == 0) {
+    return 1.0;
+  }
+  if (n == 1) {
+    return x;
+  }
+  if (n > getHermitianLimit<T>()) {
+    return NAN;
+  }
+
+  // n >= 2 from here on, so the loop below always runs at least once.
+  float previous = 1.0;
+  float current = x;
+  int64_t order = 1;
+
+  while (order < n) {
+    const float next = x * current - order * previous;
+    previous = current;
+    current = next;
+    order++;
+  }
+
+  return current;
+} // hermite_polynomial_he_forward(T x, int64_t n)
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..13c23ac7ed705a4e8fc76ba144f603be82a9c503
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/metal/utils.h
@@ -0,0 +1,386 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Metal helper functions
+#pragma once
+#include <c10/metal/common.h>
+#include <metal_stdlib>
+
+namespace c10 {
+namespace metal {
+
+// Implementation-detail trait structs; the public aliases built on top of
+// them (vec2type_t, vec4type_t, opmath_t, accum_t) are declared after this
+// namespace.
+namespace detail {
+// Maps a scalar element type to the matching 2/3/4-wide Metal vector types.
+// The primary template is intentionally empty, so unsupported element types
+// fail to compile when type2/type3/type4 is requested.
+template <typename T>
+struct vectypes {};
+
+template <>
+struct vectypes<float> {
+  using type4 = float4;
+  using type3 = float3;
+  using type2 = float2;
+};
+
+template <>
+struct vectypes<half> {
+  using type4 = half4;
+  using type3 = half3;
+  using type2 = half2;
+};
+
+template <>
+struct vectypes<bfloat> {
+  using type4 = bfloat4;
+  using type3 = bfloat3;
+  using type2 = bfloat2;
+};
+
+template <>
+struct vectypes<short> {
+  using type4 = short4;
+  using type3 = short3;
+  using type2 = short2;
+};
+
+template <>
+struct vectypes<int> {
+  using type4 = int4;
+  using type3 = int3;
+  using type2 = int2;
+};
+
+// 64-bit integers map to the long vector types. This specialization used to
+// alias the short vectors (copy-paste from vectypes<short>), which would
+// silently narrow every lane to 16 bits.
+template <>
+struct vectypes<long> {
+  using type4 = long4;
+  using type3 = long3;
+  using type2 = long2;
+};
+
+// Type used for intermediate math on values of type T. Defaults to T itself;
+// the specializations below widen half/bfloat to float and short/char/uchar
+// to int.
+template <typename T>
+struct OpMathType {
+  using type = T;
+};
+
+template <>
+struct OpMathType<half> {
+  using type = float;
+};
+
+template <>
+struct OpMathType<short> {
+  using type = int;
+};
+
+template <>
+struct OpMathType<char> {
+  using type = int;
+};
+
+template <>
+struct OpMathType<uchar> {
+  using type = int;
+};
+
+template <>
+struct OpMathType<bfloat> {
+  using type = float;
+};
+
+// Type promotion structure for higher precision accumulation
+template <typename T>
+struct AccumulationType {
+  using type = T;
+};
+
+// Specialization for half - promote to float for accumulation
+template <>
+struct AccumulationType<half> {
+  using type = float;
+};
+
+// Specialization for bfloat - promote to float for accumulation
+template <>
+struct AccumulationType<bfloat> {
+  using type = float;
+};
+
+} // namespace detail
+
+// Maximum of two floating-point values of the same type. Returns NaN when
+// the operands are unordered (at least one of them is NaN).
+template <typename T>
+::metal::enable_if_t<::metal::is_floating_point_v<T>, T> max(T a, T b) {
+  if (::metal::isunordered(a, b)) {
+    return NAN;
+  }
+  return ::metal::max(a, b);
+}
+
+// Maximum of two integral values; the second operand is converted to the
+// type of the first before the comparison, and that type is returned.
+template <typename T, typename U>
+::metal::enable_if_t<::metal::is_integral_v<T>&& ::metal::is_integral_v<U>, T>
+max(T a, U b) {
+  const T rhs = static_cast<T>(b);
+  return ::metal::max(a, rhs);
+}
+
+// Minimum of two floating-point values of the same type. Returns NaN when
+// the operands are unordered (at least one of them is NaN).
+template <typename T>
+::metal::enable_if_t<::metal::is_floating_point_v<T>, T> min(T a, T b) {
+  if (::metal::isunordered(a, b)) {
+    return NAN;
+  }
+  return ::metal::min(a, b);
+}
+
+// Minimum of two integral values; the second operand is converted to the
+// type of the first before the comparison, and that type is returned.
+template <typename T, typename U>
+::metal::enable_if_t<::metal::is_integral_v<T>&& ::metal::is_integral_v<U>, T>
+min(T a, U b) {
+  const T rhs = static_cast<T>(b);
+  return ::metal::min(a, rhs);
+}
+
+// bfloat specialization of min: the comparison itself is carried out on
+// float copies of the operands; unordered operands yield NaN.
+template <>
+inline bfloat min(bfloat a, bfloat b) {
+  if (::metal::isunordered(a, b)) {
+    return bfloat(NAN);
+  }
+  return bfloat(::metal::min(float(a), float(b)));
+}
+
+// bfloat specialization of max: the comparison itself is carried out on
+// float copies of the operands; unordered operands yield NaN.
+template <>
+inline bfloat max(bfloat a, bfloat b) {
+  if (::metal::isunordered(a, b)) {
+    return bfloat(NAN);
+  }
+  return bfloat(::metal::max(float(a), float(b)));
+}
+
+// 2-wide vector type for element type T (detail::vectypes<T>::type2).
+template <typename T>
+using vec2type_t = typename detail::vectypes<T>::type2;
+
+// 4-wide vector type for element type T (detail::vectypes<T>::type4).
+template <typename T>
+using vec4type_t = typename detail::vectypes<T>::type4;
+
+// Type used for intermediate math on T (detail::OpMathType<T>::type).
+template <typename T>
+using opmath_t = typename detail::OpMathType<T>::type;
+
+// Type used for accumulation of T (detail::AccumulationType<T>::type).
+template <typename T>
+using accum_t = typename detail::AccumulationType<T>::type;
+
+// Result type of invoking a callable F with arguments of types Args...
+// TODO: Maybe move it to a type_traits header
+template <typename F, typename... Args>
+using result_of = decltype(::metal::declval<F>()(::metal::declval<Args>()...));
+
+// True only for float2 and half2, the two types the helpers in this file
+// treat as complex numbers.
+template <typename T>
+constexpr constant bool is_complex_v =
+    ::metal::is_same_v<T, float2> || ::metal::is_same_v<T, half2>;
+
+// True when T is a floating-point type and also a scalar.
+template <typename T>
+constexpr constant bool is_scalar_floating_point_v =
+    ::metal::is_floating_point_v<T> && ::metal::is_scalar_v<T>;
+
+// True when T is an integral type and also a scalar.
+template <typename T>
+constexpr constant bool is_scalar_integral_v =
+    ::metal::is_integral_v<T> && ::metal::is_scalar_v<T>;
+
+// Type of the expression U(0) + V(0), i.e. the usual arithmetic result type
+// of mixing U and V.
+template <typename U, typename V>
+using common_dtype = decltype(U(0) + V(0));
+
+// floor_divide
+// Integer division rounded toward negative infinity. The truncated quotient
+// is lowered by one only when the operands have different signs and the
+// division leaves a remainder.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_integral_v<T> && is_scalar_integral_v<U>,
+        bool> = true>
+inline common_dtype<T, U> floor_divide(T x, U y) {
+  const auto truncated = x / y;
+  const bool same_sign = (x < 0) == (y < 0);
+  // x % y is only evaluated when the signs differ, as in a nested ternary.
+  if (same_sign || x % y == 0) {
+    return truncated;
+  }
+  return truncated - 1;
+}
+
+// Floating-point floor division: floor of the quotient x / y.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_floating_point_v<T> && is_scalar_floating_point_v<U>,
+        bool> = true>
+inline common_dtype<T, U> floor_divide(T x, U y) {
+  const auto quotient = x / y;
+  return ::metal::floor(quotient);
+}
+
+// fmod
+// Integral fmod: the result of the built-in % operator.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_integral_v<T> && is_scalar_integral_v<U>,
+        bool> = true>
+inline common_dtype<T, U> fmod(T x, U y) {
+  const auto leftover = x % y;
+  return leftover;
+}
+
+// Floating-point fmod: forwards both operands to ::metal::fmod.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_floating_point_v<T> && is_scalar_floating_point_v<U>,
+        bool> = true>
+inline common_dtype<T, U> fmod(T x, U y) {
+  const auto leftover = ::metal::fmod(x, y);
+  return leftover;
+}
+
+// cast_to primitives
+//  - No-op if types are the same
+// cast_to overload selected when source and destination types are identical:
+// hands the value back without any conversion.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<::metal::is_same_v<U, T>, bool> = true>
+inline T cast_to(const U from) {
+  const T unchanged = from;
+  return unchanged;
+}
+//  - Simple cast between scalar and complex dtypes
+// cast_to overload for distinct types that are either both complex or both
+// non-complex: a plain static_cast.
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        !::metal::is_same_v<U, T> && (is_complex_v<T> == is_complex_v<U>),
+        bool> = true>
+inline T cast_to(const U from) {
+  const T converted = static_cast<T>(from);
+  return converted;
+}
+
+// - Scalar to complex
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<is_complex_v<T> && !is_complex_v<U>, bool> = true>
+inline T cast_to(const U from) {
+  return T(float(from), 0.0);
+}
+// - Complex to scalar (should not really be used, but exists for compliteness)
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<!is_complex_v<T> && is_complex_v<U>, bool> = true>
+inline T cast_to(const U from) {
+  return static_cast<T>(from.x);
+}
+
+// Generalizable math operators (used for both scalar and complex)
+
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<!is_complex_v<T>, bool> = true> // only T is constrained
+inline common_dtype<T, U> mul(const T x, const U y) {
+  return x * y;
+}
+
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<is_complex_v<T> && is_complex_v<U>, bool> = true>
+inline common_dtype<T, U> mul(const T x, const U y) { // .x = real, .y = imag
+  return T(x.x * y.x - x.y * y.y, x.x * y.y + x.y * y.x);
+}
+
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<!is_complex_v<T>, bool> = true> // only T is constrained
+inline common_dtype<T, U> div(const T x, const U y) {
+  return x / y;
+}
+
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<is_complex_v<T> && is_complex_v<U>, bool> = true>
+inline common_dtype<T, U> div(const T x, const U y) { // x * conj(y) / |y|^2
+  return T(::metal::dot(x, y), x.y * y.x - x.x * y.y) / ::metal::dot(y, y);
+}
+
+// Remainder operator (floored modulo: x - y * floor_divide(x, y))
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_floating_point_v<T> || is_scalar_floating_point_v<U>,
+        bool> = true>
+inline float remainder(const T x, const U y) { // either operand may be floating
+  const auto x_f = static_cast<float>(x); // both operands are widened to float
+  const auto y_f = static_cast<float>(y);
+  return x_f - y_f * floor_divide(x_f, y_f);
+}
+
+template <
+    typename T,
+    typename U,
+    ::metal::enable_if_t<
+        is_scalar_integral_v<T> && is_scalar_integral_v<U>,
+        bool> = true>
+inline common_dtype<T, U> remainder(const T x, const U y) {
+  auto rc = x % y; // for signed operands, (x ^ y) > 0 means the signs match
+  return rc == 0 || (x ^ y) > 0 ? rc : rc + y;
+}
+
+// Based on algorithm described in
+// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202
+inline float log1p(float x) {
+  const auto xp1 = 1.0f + x;
+  // First two elements of Taylor series for log(1+x) in Horner's form are:
+  // log(1+x) = x * (1 - x * (.5 ...)), but if 1 + x rounds to 1, it's just x
+  if (xp1 == 1.0f) {
+    return x;
+  }
+  auto rc = ::metal::precise::log(xp1);
+  if (x > -.5 && x < .5) { // the rescaling is applied only for |x| < 0.5
+    // Order of operations is important here for higher precision
+    rc *= x / (xp1 - 1.0f);
+  }
+  return rc;
+}
+
+// Complex log1p for z = x + iy passed as float2(x, y); ported from mlx
+inline float2 log1p(float2 in) {
+  float x = in.x;
+  float y = in.y;
+  float zabs = ::metal::precise::sqrt(x * x + y * y); // |z|
+  float theta = ::metal::atan2(y, x + 1); // arg(1 + z), returned as imag part
+  if (zabs < 0.5f) {
+    float r = x * (2 + x) + y * y; // algebraically |1 + z|^2 - 1
+    if (r == 0) { // handle underflow
+      return {x, theta};
+    }
+    return {0.5f * log1p(r), theta};
+  } else {
+    auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y); // |1 + z|
+    return {::metal::log(z0), theta};
+  }
+}
+
+template <typename T1, typename T2 = T1>
+struct pair { // plain two-member aggregate; T2 defaults to T1
+  T1 first;
+  T2 second;
+};
+
+template <typename T>
+inline T conj(T a) { // generic case: the value is returned unchanged
+  return a;
+}
+
+template <>
+inline half2 conj(half2 a) { // complex case: negates the .y component
+  return half2(a.x, -a.y);
+}
+
+template <>
+inline float2 conj(float2 a) { // complex case: negates the .y component
+  return float2(a.x, -a.y);
+}
+
+#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \
+  MACRO(float);                          \
+  MACRO(half);                           \
+  MACRO(bfloat);                         \
+  MACRO(float2);                         \
+  MACRO(long);                           \
+  MACRO(char);                           \
+  MACRO(uchar);                          \
+  MACRO(short);                          \
+  MACRO(int); // expands MACRO(type); once for each of the nine types above
+
+#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \
+  MACRO(float);                            \
+  MACRO(half);                             \
+  MACRO(bfloat); // expands MACRO(type); for the three floating-point types
+
+} // namespace metal
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad6854b8871d9e55324bea686b1313f64c1f5883
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUCachingAllocator.h
@@ -0,0 +1,111 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstddef>
+#include <mutex>
+
+#include <c10/macros/Export.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/flat_hash_map.h>
+
+/*
+ * CPUCachingAllocator:
+ * DISCLAIMER:
+ *    This is subject to change (beta) and only supported on mobile builds.
+ *    If a code snippet such as the one in 'Usage pattern' is used outside of
+ *    a mobile build you will not observe the intended behavior.
+ *    See below for more information.
+ * Why?
+ *    It has been observed that some mobile platforms, such as pixel 3, return
+ *    memory aggressively to the system. This results in page faults in some
+ *    cases and ends up hurting performance. This caching allocator aims to
+ *    address that. Furthermore it also allows users to specify their own
+ *    allocator by implementing allocate/free virtual interfaces.
+ * What are the cons?
+ *    There are some cons that were observed where use of caching allocator
+ *    led to worse performance on some platforms. Reason being that the caching
+ *    mechanism used by this allocator left us worse off compared to the
+ *    corresponding platform's tuned memory allocator. In that case it seemed
+ *    better to not use this allocator. Note there are some ideas to fix this
+ *    in the works.
+ * Usage pattern:
+ *    Instantiate and own the caching allocator.
+ *      std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
+ *        std::make_unique<c10::CPUCachingAllocator>();
+ *    Use caching allocator with a named, scoped guard at inference time.
+ *      {
+ *        c10::WithCPUCachingAllocatorGuard guard(caching_allocator.get());
+ *        ... model.forward(...);
+ *      }
+ */
+
+namespace c10 {
+
+class C10_API CPUCachingAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * Cache key is the size of the allocation.
+   * If requested size is found in the cache returns the cached pointer.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  inline void* allocate_and_cache(const size_t bytes);
+  void free_cached();
+
+ protected:
+  // Invariants.
+  // 1. If memory is ever allocated via this allocator then
+  //    the pointer will exist in allocation_map_, unless the allocator
+  //    returned the memory to OS via free_cached.
+  //  1.1. Therefore even when the said memory is "freed" via this
+  //       allocator (and thus cached), it will continue to stay
+  //       in allocation_map_. Furthermore it will also exist in
+  //       available_map_. Thus an allocated memory pointer can be in both
+  //       allocation_map_ and available_map_ simultaneously.
+  // 2. Memory pointer may be removed from allocation_map_, when it
+  //    is freed outside of the scope of this allocator, but was allocated
+  //    by this allocator.
+  // 3. Available map only contains that memory which was allocated
+  //    by this allocator and subsequently freed by this allocator.
+  // As a result of above invariants, allocated memory ptr cannot be in
+  // available_map_ unless it is in allocation_map_ as well.
+  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
+  static ska::flat_hash_map<void*, size_t> allocation_map_;
+  // Since allocation_map_, which is a static (shared) instance, is
+  // mutated/read via all public APIs we need a static mutex as well.
+  static std::mutex mutex_;
+
+ public:
+  static void record_free(void* ptr);
+  virtual ~CPUCachingAllocator();
+  // Checks the cache to see if allocation of size bytes can be found.
+  // If so return cached memory, else
+  // allocates memory, records it for caching and returns.
+  virtual void* allocate(const size_t bytes);
+  // Checks if the memory being freed was marked for allocation by
+  // an earlier call to allocate. If so cache the allocation.
+  // Otherwise free.
+  virtual void free(void* ptr);
+};
+
+CPUCachingAllocator* GetDefaultCPUCachingAllocator();
+
+bool ThreadLocalCachingAllocatorEnabled();
+CPUCachingAllocator* GetThreadLocalCachingAllocator();
+
+class C10_API WithCPUCachingAllocatorGuard { // scoped guard (ctor/dtor pair)
+ public:
+  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
+  ~WithCPUCachingAllocatorGuard();
+
+ private: // NOTE(review): presumably the allocator active before this guard
+  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..07064210e115bb5799906828fac135ccb63a3146
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/mobile/CPUProfilingAllocator.h
@@ -0,0 +1,157 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/util/flat_hash_map.h>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace c10 {
+
+/*
+ * Given a sequence of allocations in a thread, AllocationPlan records
+ * 1. Size of each allocation.
+ * 2. Lifetime of each allocation.
+ * 3. Allocation offsets: memory offset for each allocation in a single blob
+ *    of memory.
+ * 4. Total size of a blob of memory required to satisfy all the allocations.
+ */
+class C10_API AllocationPlan {
+ private:
+  // Records size of each allocation by their sequential allocation ids.
+  std::vector<uint64_t> allocation_sizes;
+  // This maps one allocation id (X) to another allocation id (Y).
+  // Allocation X is alive until allocation Y. From allocation Y onwards
+  // allocation X is not referenced.
+  // Thus Y is the id of the first allocation after X is freed.
+  // NB: When an allocation is recorded, along with recording its size,
+  // we also set the lifetime to be numeric_limits::max().
+  // This is to track allocations that are made during the scope of
+  // profiling but were not freed until after the scope ended.
+  // Such allocations are not managed by profiling allocator.
+  std::vector<uint64_t> allocation_lifetimes;
+  // Maps an allocation to some offset in a blob of memory.
+  std::vector<uint64_t> allocation_offsets;
+  uint64_t total_size{0}; // item 4 of the class comment: size of the blob
+  void clear();
+  friend class AllocationPlanner; // both friends access the private vectors
+  friend class CPUProfilingAllocator;
+};
+
+/*
+ * Map of memory ptr to allocation id. This is auxiliary information only
+ * used to establish lifetime of allocations.
+ */
+class C10_API AllocationPlanner {
+ private:
+  AllocationPlan* allocation_plan_{nullptr}; // non-owning; set by constructor
+  // Maps allocated ptr to its allocation id.
+  // This is used when freeing the memory to look up the allocation id
+  // in order to establish the lifetime of a particular allocation.
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+  uint64_t allocation_id_{0};
+  bool validation_mode_{false}; // set from the constructor's 'validate' flag
+
+  bool validate_allocation(const uint64_t size, const void* ptr);
+  bool validate_free(const void* ptr);
+
+ public:
+  bool validation_success{true}; // starts out true
+
+  AllocationPlanner() = delete; // a plan pointer must always be supplied
+  AllocationPlanner(AllocationPlan* plan, bool validate = false)
+      : allocation_plan_(plan), validation_mode_(validate) {}
+  void record_allocation(const uint64_t size, const void* ptr);
+  void record_free(const void* ptr);
+  void formulate_plan();
+  void clear();
+};
+
+// NOT THREAD SAFE profiling allocator.
+class C10_API CPUProfilingAllocator {
+ private:
+  const AllocationPlan* plan_{nullptr}; // non-owning; see set_plan/unset_plan
+  uint64_t allocation_id_{0};
+  uint64_t current_size_{0};
+  void* blob_{nullptr}; // NOTE(review): presumably the plan's single blob
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+
+ public:
+  ~CPUProfilingAllocator();
+  void set_plan(const AllocationPlan* plan);
+  void unset_plan();
+  void* allocate(const size_t bytes);
+  void free(void* const ptr);
+};
+
+/*
+ * Usage: Profile allocations made by one run of the model.
+ * AllocationPlan plan;
+ * {
+ *   WithProfileAllocationsGuard profile_guard(&plan);
+ *   module.forward(...);
+ * }
+ * plan now contains allocation plan.
+ */
+class C10_API WithProfileAllocationsGuard {
+ public:
+  WithProfileAllocationsGuard(AllocationPlan* plan);
+  ~WithProfileAllocationsGuard();
+
+ private:
+  std::unique_ptr<AllocationPlanner> planner_; // planner owned by this guard
+};
+
+/*
+ * Usage: Validate allocation plan made with WithProfileAllocationsGuard
+ * bool plan_validation_success, success = true;
+ * for (some number of representative inputs)
+ * {
+ *   WithValidateAllocationPlanGuard guard(&plan, &plan_validation_success);
+ *   module.forward(...);
+ *   success = success && plan_validation_success;
+ * }
+ * success == true means allocations are according to plan
+ * else for some inputs allocation pattern changed.
+ */
+class C10_API WithValidateAllocationPlanGuard {
+ public:
+  WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success);
+  ~WithValidateAllocationPlanGuard();
+
+ private:
+  std::unique_ptr<AllocationPlanner> planner_; // planner owned by this guard
+  bool* success_; // non-owning pointer supplied by the caller
+};
+
+AllocationPlanner* GetThreadLocalAllocationPlanner();
+
+/*
+ * Usage: Allocate tensors according to an allocation plan.
+ * First make allocation plan.
+ *  See WithProfileAllocationsGuard usage.
+ * Second validate allocation plan.
+ *  See WithValidateAllocationPlanGuard usage.
+ * CPUProfilingAllocator profiling_allocator;
+ * {
+ *   WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan);
+ *   module.forward(...);
+ * }
+ */
+class C10_API WithProfilingAllocatorGuard {
+ public:
+  WithProfilingAllocatorGuard(
+      CPUProfilingAllocator* allocator,
+      const AllocationPlan* plan);
+  ~WithProfilingAllocatorGuard();
+};
+
+CPUProfilingAllocator* GetThreadLocalProfilingAllocator();
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..026570edcd7f2be024266f65b5745a65036bbeed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/Macros.h
@@ -0,0 +1,14 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef C10_TEST_CORE_MACROS_MACROS_H_
+#define C10_TEST_CORE_MACROS_MACROS_H_
+#ifdef _WIN32
+#define DISABLED_ON_WINDOWS(x) DISABLED_##x
+#else
+#define DISABLED_ON_WINDOWS(x) x
+#endif
+
+#endif // C10_TEST_CORE_MACROS_MACROS_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..a68a35cd968a95ef35b61b92594837fcbdbf79a6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_math_test_common.h
@@ -0,0 +1,672 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Warning: this file is included twice in
+// aten/src/ATen/test/cuda_complex_math_test.cu
+
+#include <c10/util/complex.h>
+#include <gtest/gtest.h>
+
+#ifndef PI
+#define PI 3.141592653589793238463
+#endif
+
+#ifndef tol
+#define tol 1e-6
+#endif
+
+// Exponential functions
+
+C10_DEFINE_TEST(TestExponential, IPi) {
+  // Euler's identity: exp(i*pi) = -1 (real part -1, imaginary part 0)
+  { // float, std::exp
+    c10::complex<float> e_i_pi = std::exp(c10::complex<float>(0, float(PI)));
+    C10_ASSERT_NEAR(e_i_pi.real(), -1, tol);
+    C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol);
+  }
+  { // float, ::exp
+    c10::complex<float> e_i_pi = ::exp(c10::complex<float>(0, float(PI)));
+    C10_ASSERT_NEAR(e_i_pi.real(), -1, tol);
+    C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol);
+  }
+  { // double, std::exp
+    c10::complex<double> e_i_pi = std::exp(c10::complex<double>(0, PI));
+    C10_ASSERT_NEAR(e_i_pi.real(), -1, tol);
+    C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol);
+  }
+  { // double, ::exp
+    c10::complex<double> e_i_pi = ::exp(c10::complex<double>(0, PI));
+    C10_ASSERT_NEAR(e_i_pi.real(), -1, tol);
+    C10_ASSERT_NEAR(e_i_pi.imag(), 0, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestExponential, EulerFormula) {
+  // exp(a + i*b) = exp(a) * (cos(b) + i * sin(b))
+  { // float, std:: overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> e = std::exp(x);
+    float expected_real = std::exp(x.real()) * std::cos(x.imag());
+    float expected_imag = std::exp(x.real()) * std::sin(x.imag());
+    C10_ASSERT_NEAR(e.real(), expected_real, tol);
+    C10_ASSERT_NEAR(e.imag(), expected_imag, tol);
+  }
+  { // float, global-namespace overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> e = ::exp(x);
+    float expected_real = ::exp(x.real()) * ::cos(x.imag());
+    float expected_imag = ::exp(x.real()) * ::sin(x.imag());
+    C10_ASSERT_NEAR(e.real(), expected_real, tol);
+    C10_ASSERT_NEAR(e.imag(), expected_imag, tol);
+  }
+  { // double, std:: overloads; expected values are kept in double precision
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> e = std::exp(x);
+    double expected_real = std::exp(x.real()) * std::cos(x.imag());
+    double expected_imag = std::exp(x.real()) * std::sin(x.imag());
+    C10_ASSERT_NEAR(e.real(), expected_real, tol);
+    C10_ASSERT_NEAR(e.imag(), expected_imag, tol);
+  }
+  { // double, global-namespace overloads; expected values kept in double
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> e = ::exp(x);
+    double expected_real = ::exp(x.real()) * ::cos(x.imag());
+    double expected_imag = ::exp(x.real()) * ::sin(x.imag());
+    C10_ASSERT_NEAR(e.real(), expected_real, tol);
+    C10_ASSERT_NEAR(e.imag(), expected_imag, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestExpm1, Normal) {
+  // expm1(x) = exp(x) - 1, compared against that expression directly
+  { // float
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l1 = std::expm1(x);
+    c10::complex<float> l2 = std::exp(x) - 1.0f;
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+  { // double
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l1 = std::expm1(x);
+    c10::complex<double> l2 = std::exp(x) - 1.0;
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestExpm1, Small) {
+  // expm1(x) = exp(x) - 1, so expm1(x) ~ x for tiny x
+  // expm1(x) provides greater precision than exp(x) - 1 for small values of x
+  { // float: expected value equals the input components
+    c10::complex<float> x(1e-30, 1e-30);
+    c10::complex<float> l1 = std::expm1(x);
+    C10_ASSERT_NEAR(l1.real(), 1e-30, tol);
+    C10_ASSERT_NEAR(l1.imag(), 1e-30, tol);
+  }
+  { // double: expected value equals the input components (1e-100)
+    c10::complex<double> x(1e-100, 1e-100);
+    c10::complex<double> l1 = std::expm1(x);
+    C10_ASSERT_NEAR(l1.real(), 1e-100, tol);
+    C10_ASSERT_NEAR(l1.imag(), 1e-100, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog, Definition) {
+  // log(x) = log(r) + i*theta, with r = abs(x) and theta = arg(x)
+  { // float, std::log
+    c10::complex<float> x(1.2, 3.4);
+    c10::complex<float> l = std::log(x);
+    float expected_real = std::log(std::abs(x));
+    float expected_imag = std::arg(x);
+    C10_ASSERT_NEAR(l.real(), expected_real, tol);
+    C10_ASSERT_NEAR(l.imag(), expected_imag, tol);
+  }
+  { // float, ::log
+    c10::complex<float> x(1.2, 3.4);
+    c10::complex<float> l = ::log(x);
+    float expected_real = ::log(std::abs(x));
+    float expected_imag = std::arg(x);
+    C10_ASSERT_NEAR(l.real(), expected_real, tol);
+    C10_ASSERT_NEAR(l.imag(), expected_imag, tol);
+  }
+  { // double, std::log; expected values are kept in double precision
+    c10::complex<double> x(1.2, 3.4);
+    c10::complex<double> l = std::log(x);
+    double expected_real = std::log(std::abs(x));
+    double expected_imag = std::arg(x);
+    C10_ASSERT_NEAR(l.real(), expected_real, tol);
+    C10_ASSERT_NEAR(l.imag(), expected_imag, tol);
+  }
+  { // double, ::log; expected values are kept in double precision
+    c10::complex<double> x(1.2, 3.4);
+    c10::complex<double> l = ::log(x);
+    double expected_real = ::log(std::abs(x));
+    double expected_imag = std::arg(x);
+    C10_ASSERT_NEAR(l.real(), expected_real, tol);
+    C10_ASSERT_NEAR(l.imag(), expected_imag, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog10, Rev) {
+  // Round trip: log10(10^x) = x
+  { // float, std:: overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l = std::log10(std::pow(float(10), x));
+    C10_ASSERT_NEAR(l.real(), float(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), float(1.2), tol);
+  }
+  { // float, global-namespace overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l = ::log10(::pow(float(10), x));
+    C10_ASSERT_NEAR(l.real(), float(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), float(1.2), tol);
+  }
+  { // double, std:: overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l = std::log10(std::pow(double(10), x));
+    C10_ASSERT_NEAR(l.real(), double(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), double(1.2), tol);
+  }
+  { // double, global-namespace overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l = ::log10(::pow(double(10), x));
+    C10_ASSERT_NEAR(l.real(), double(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), double(1.2), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog2, Rev) {
+  // Round trip: log2(2^x) = x (every case builds 2^x with std::pow)
+  { // float, std::log2
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l = std::log2(std::pow(float(2), x));
+    C10_ASSERT_NEAR(l.real(), float(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), float(1.2), tol);
+  }
+  { // float, ::log2
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l = ::log2(std::pow(float(2), x));
+    C10_ASSERT_NEAR(l.real(), float(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), float(1.2), tol);
+  }
+  { // double, std::log2
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l = std::log2(std::pow(double(2), x));
+    C10_ASSERT_NEAR(l.real(), double(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), double(1.2), tol);
+  }
+  { // double, ::log2
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l = ::log2(std::pow(double(2), x));
+    C10_ASSERT_NEAR(l.real(), double(0.1), tol);
+    C10_ASSERT_NEAR(l.imag(), double(1.2), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog1p, Normal) {
+  // log1p(x) = log(1 + x), compared against that expression directly
+  { // float
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l1 = std::log1p(x);
+    c10::complex<float> l2 = std::log(1.0f + x);
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+  { // double
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l1 = std::log1p(x);
+    c10::complex<double> l2 = std::log(1.0 + x);
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog1p, Small) {
+  // log(1 + x) ~ x for |x| << 1; checked as a ratio so tiny values still count
+  { // float
+    c10::complex<float> x(1e-9, 2e-9);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real() / x.real(), 1, tol);
+    C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol);
+  }
+  { // double
+    c10::complex<double> x(1e-100, 2e-100);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real() / x.real(), 1, tol);
+    C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog1p, Extreme) {
+  // log1p on the brink of overflow / underflow; log(1 + x) ~ x for |x| << 1
+  { // float: 1 + x = 1e-30 i
+    c10::complex<float> x(-1, 1e-30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), -69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  { // float: 1 + x = 1e30 i
+    c10::complex<float> x(-1, 1e30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  { // float: huge real part
+    c10::complex<float> x(1e30, 1);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-30, tol);
+  }
+  { // float: 1 + x ~ 1 + i
+    c10::complex<float> x(1e-30, 1);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  { // float: both parts huge
+    c10::complex<float> x(1e30, 1e30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.42412638010134, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  { // float: both parts tiny, result ~ x
+    c10::complex<float> x(1e-38, 1e-38);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-38, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-38, tol);
+  }
+  { // float: tiny parts of different magnitude, result ~ x
+    c10::complex<float> x(1e-38, 2e-30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-38, tol);
+    C10_ASSERT_NEAR(l.imag(), 2e-30, tol);
+  }
+  { // double: 1 + x = 1e-250 i
+    c10::complex<double> x(-1, 1e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), -575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  { // double: 1 + x = 1e250 i
+    c10::complex<double> x(-1, 1e250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  { // double: huge real part
+    c10::complex<double> x(1e250, 1);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-250, tol);
+  }
+  { // double: 1 + x ~ 1 + i
+    c10::complex<double> x(1e-250, 1);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  { // double: both parts huge
+    c10::complex<double> x(1e250, 1e250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.9928468387914, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  { // double: both parts tiny, result ~ x
+    c10::complex<double> x(1e-250, 1e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-250, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-250, tol);
+  }
+  { // double: both parts tiny, result ~ x
+    c10::complex<double> x(1e-250, 2e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-250, tol);
+    C10_ASSERT_NEAR(l.imag(), 2e-250, tol);
+  }
+  { // double: tiny parts of different magnitude, result ~ x
+    c10::complex<double> x(2e-308, 1.5e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 2e-308, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5e-250, tol);
+  }
+}
+
+// Power functions
+
+C10_DEFINE_TEST(TestPowSqrt, Equal) {
+  // x^0.5 = sqrt(x): pow with exponent 0.5 must agree with sqrt
+  { // float, std:: overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = std::pow(x, float(0.5));
+    c10::complex<float> z = std::sqrt(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // float, global-namespace overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = ::pow(x, float(0.5));
+    c10::complex<float> z = ::sqrt(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, std:: overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = std::pow(x, double(0.5));
+    c10::complex<double> z = std::sqrt(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, global-namespace overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = ::pow(x, double(0.5));
+    c10::complex<double> z = ::sqrt(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestPow, Square) {
+  // x^2 = x * x: pow with exponent 2 must agree with plain multiplication
+  { // float, std::pow
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = std::pow(x, float(2));
+    c10::complex<float> z = x * x;
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // float, ::pow
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = ::pow(x, float(2));
+    c10::complex<float> z = x * x;
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, std::pow
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = std::pow(x, double(2));
+    c10::complex<double> z = x * x;
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, ::pow
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = ::pow(x, double(2));
+    c10::complex<double> z = x * x;
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+}
+
+// Trigonometric functions and hyperbolic functions
+
+C10_DEFINE_TEST(TestSinCosSinhCosh, Identity) {
+  // sin(x + i * y) = sin(x) * cosh(y) + i * cos(x) * sinh(y)
+  // cos(x + i * y) = cos(x) * cosh(y) - i * sin(x) * sinh(y)
+  { // float, std::sin
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = std::sin(x);
+    float expected_real = std::sin(x.real()) * std::cosh(x.imag());
+    float expected_imag = std::cos(x.real()) * std::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // float, ::sin
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = ::sin(x);
+    float expected_real = ::sin(x.real()) * ::cosh(x.imag());
+    float expected_imag = ::cos(x.real()) * ::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // float, std::cos
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = std::cos(x);
+    float expected_real = std::cos(x.real()) * std::cosh(x.imag());
+    float expected_imag = -std::sin(x.real()) * std::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // float, ::cos
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = ::cos(x);
+    float expected_real = ::cos(x.real()) * ::cosh(x.imag());
+    float expected_imag = -::sin(x.real()) * ::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // double, std::sin; expected values are kept in double precision
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = std::sin(x);
+    double expected_real = std::sin(x.real()) * std::cosh(x.imag());
+    double expected_imag = std::cos(x.real()) * std::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // double, ::sin; expected values are kept in double precision
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = ::sin(x);
+    double expected_real = ::sin(x.real()) * ::cosh(x.imag());
+    double expected_imag = ::cos(x.real()) * ::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // double, std::cos; expected values are kept in double precision
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = std::cos(x);
+    double expected_real = std::cos(x.real()) * std::cosh(x.imag());
+    double expected_imag = -std::sin(x.real()) * std::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+  { // double, ::cos; expected values are kept in double precision
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = ::cos(x);
+    double expected_real = ::cos(x.real()) * ::cosh(x.imag());
+    double expected_imag = -::sin(x.real()) * ::sinh(x.imag());
+    C10_ASSERT_NEAR(y.real(), expected_real, tol);
+    C10_ASSERT_NEAR(y.imag(), expected_imag, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestTan, Identity) {
+  // tan(x) = sin(x) / cos(x), with both sides evaluated on complex x
+  { // float, std:: overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = std::tan(x);
+    c10::complex<float> z = std::sin(x) / std::cos(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // float, global-namespace overloads
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> y = ::tan(x);
+    c10::complex<float> z = ::sin(x) / ::cos(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, std:: overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = std::tan(x);
+    c10::complex<double> z = std::sin(x) / std::cos(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+  { // double, global-namespace overloads
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> y = ::tan(x);
+    c10::complex<double> z = ::sin(x) / ::cos(x);
+    C10_ASSERT_NEAR(y.real(), z.real(), tol);
+    C10_ASSERT_NEAR(y.imag(), z.imag(), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestTanh, Identity) {
+  // Identity checked per component within tol: tanh(x) = sinh(x) / cosh(x)
+  { // float, std:: overloads
+    const c10::complex<float> arg(0.1, 1.2);
+    const c10::complex<float> direct = std::tanh(arg);
+    const c10::complex<float> ratio = std::sinh(arg) / std::cosh(arg);
+    C10_ASSERT_NEAR(direct.real(), ratio.real(), tol);
+    C10_ASSERT_NEAR(direct.imag(), ratio.imag(), tol);
+  }
+  { // float, global-namespace overloads
+    const c10::complex<float> arg(0.1, 1.2);
+    const c10::complex<float> direct = ::tanh(arg);
+    const c10::complex<float> ratio = ::sinh(arg) / ::cosh(arg);
+    C10_ASSERT_NEAR(direct.real(), ratio.real(), tol);
+    C10_ASSERT_NEAR(direct.imag(), ratio.imag(), tol);
+  }
+  { // double, std:: overloads
+    const c10::complex<double> arg(0.1, 1.2);
+    const c10::complex<double> direct = std::tanh(arg);
+    const c10::complex<double> ratio = std::sinh(arg) / std::cosh(arg);
+    C10_ASSERT_NEAR(direct.real(), ratio.real(), tol);
+    C10_ASSERT_NEAR(direct.imag(), ratio.imag(), tol);
+  }
+  { // double, global-namespace overloads
+    const c10::complex<double> arg(0.1, 1.2);
+    const c10::complex<double> direct = ::tanh(arg);
+    const c10::complex<double> ratio = ::sinh(arg) / ::cosh(arg);
+    C10_ASSERT_NEAR(direct.real(), ratio.real(), tol);
+    C10_ASSERT_NEAR(direct.imag(), ratio.imag(), tol);
+  }
+}
+
+// Inverse (reverse) trigonometric functions
+
+C10_DEFINE_TEST(TestRevTrigonometric, Rev) {
+  // Round trips compared with the input, per component, within tol:
+  //   asin(sin(x)) = x, acos(cos(x)) = x,
+  //   atan(tan(x)) = x
+  { // float, std:: overloads
+    const c10::complex<float> arg(0.5, 0.6);
+    const c10::complex<float> sin_val = std::sin(arg);
+    const c10::complex<float> sin_back = std::asin(sin_val);
+    const c10::complex<float> cos_val = std::cos(arg);
+    const c10::complex<float> cos_back = std::acos(cos_val);
+    const c10::complex<float> tan_val = std::tan(arg);
+    const c10::complex<float> tan_back = std::atan(tan_val);
+    C10_ASSERT_NEAR(arg.real(), sin_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sin_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cos_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cos_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tan_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tan_back.imag(), tol);
+  }
+  { // float, global-namespace overloads
+    const c10::complex<float> arg(0.5, 0.6);
+    const c10::complex<float> sin_val = ::sin(arg);
+    const c10::complex<float> sin_back = ::asin(sin_val);
+    const c10::complex<float> cos_val = ::cos(arg);
+    const c10::complex<float> cos_back = ::acos(cos_val);
+    const c10::complex<float> tan_val = ::tan(arg);
+    const c10::complex<float> tan_back = ::atan(tan_val);
+    C10_ASSERT_NEAR(arg.real(), sin_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sin_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cos_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cos_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tan_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tan_back.imag(), tol);
+  }
+  { // double, std:: overloads
+    const c10::complex<double> arg(0.5, 0.6);
+    const c10::complex<double> sin_val = std::sin(arg);
+    const c10::complex<double> sin_back = std::asin(sin_val);
+    const c10::complex<double> cos_val = std::cos(arg);
+    const c10::complex<double> cos_back = std::acos(cos_val);
+    const c10::complex<double> tan_val = std::tan(arg);
+    const c10::complex<double> tan_back = std::atan(tan_val);
+    C10_ASSERT_NEAR(arg.real(), sin_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sin_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cos_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cos_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tan_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tan_back.imag(), tol);
+  }
+  { // double, global-namespace overloads
+    const c10::complex<double> arg(0.5, 0.6);
+    const c10::complex<double> sin_val = ::sin(arg);
+    const c10::complex<double> sin_back = ::asin(sin_val);
+    const c10::complex<double> cos_val = ::cos(arg);
+    const c10::complex<double> cos_back = ::acos(cos_val);
+    const c10::complex<double> tan_val = ::tan(arg);
+    const c10::complex<double> tan_back = ::atan(tan_val);
+    C10_ASSERT_NEAR(arg.real(), sin_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sin_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cos_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cos_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tan_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tan_back.imag(), tol);
+  }
+}
+
+// Inverse (reverse) hyperbolic functions
+
+C10_DEFINE_TEST(TestRevHyperbolic, Rev) {
+  // Round trips compared with the input, per component, within tol:
+  //   asinh(sinh(x)) = x, acosh(cosh(x)) = x,
+  //   atanh(tanh(x)) = x
+  { // float, std:: overloads
+    const c10::complex<float> arg(0.5, 0.6);
+    const c10::complex<float> sinh_val = std::sinh(arg);
+    const c10::complex<float> sinh_back = std::asinh(sinh_val);
+    const c10::complex<float> cosh_val = std::cosh(arg);
+    const c10::complex<float> cosh_back = std::acosh(cosh_val);
+    const c10::complex<float> tanh_val = std::tanh(arg);
+    const c10::complex<float> tanh_back = std::atanh(tanh_val);
+    C10_ASSERT_NEAR(arg.real(), sinh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sinh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cosh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cosh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tanh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tanh_back.imag(), tol);
+  }
+  { // float, global-namespace overloads
+    const c10::complex<float> arg(0.5, 0.6);
+    const c10::complex<float> sinh_val = ::sinh(arg);
+    const c10::complex<float> sinh_back = ::asinh(sinh_val);
+    const c10::complex<float> cosh_val = ::cosh(arg);
+    const c10::complex<float> cosh_back = ::acosh(cosh_val);
+    const c10::complex<float> tanh_val = ::tanh(arg);
+    const c10::complex<float> tanh_back = ::atanh(tanh_val);
+    C10_ASSERT_NEAR(arg.real(), sinh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sinh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cosh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cosh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tanh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tanh_back.imag(), tol);
+  }
+  { // double, std:: overloads
+    const c10::complex<double> arg(0.5, 0.6);
+    const c10::complex<double> sinh_val = std::sinh(arg);
+    const c10::complex<double> sinh_back = std::asinh(sinh_val);
+    const c10::complex<double> cosh_val = std::cosh(arg);
+    const c10::complex<double> cosh_back = std::acosh(cosh_val);
+    const c10::complex<double> tanh_val = std::tanh(arg);
+    const c10::complex<double> tanh_back = std::atanh(tanh_val);
+    C10_ASSERT_NEAR(arg.real(), sinh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sinh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cosh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cosh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tanh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tanh_back.imag(), tol);
+  }
+  { // double, global-namespace overloads
+    const c10::complex<double> arg(0.5, 0.6);
+    const c10::complex<double> sinh_val = ::sinh(arg);
+    const c10::complex<double> sinh_back = ::asinh(sinh_val);
+    const c10::complex<double> cosh_val = ::cosh(arg);
+    const c10::complex<double> cosh_back = ::acosh(cosh_val);
+    const c10::complex<double> tanh_val = ::tanh(arg);
+    const c10::complex<double> tanh_back = ::atanh(tanh_val);
+    C10_ASSERT_NEAR(arg.real(), sinh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), sinh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), cosh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), cosh_back.imag(), tol);
+    C10_ASSERT_NEAR(arg.real(), tanh_back.real(), tol);
+    C10_ASSERT_NEAR(arg.imag(), tanh_back.imag(), tol);
+  }
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..94586ba1293ac4c922d6638817ce7a92b14d83b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/test/util/complex_test_common.h
@@ -0,0 +1,663 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <c10/macros/Macros.h>
+#include <c10/util/complex.h>
+#include <c10/util/hash.h>
+#include <gtest/gtest.h>
+#include <sstream>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+
+#if (defined(__CUDACC__) || defined(__HIPCC__))
+#define MAYBE_GLOBAL __global__
+#else
+#define MAYBE_GLOBAL
+#endif
+
+#define PI 3.141592653589793238463
+
+namespace memory {
+
+MAYBE_GLOBAL void test_size() { // c10::complex<T> is exactly two T's wide
+  static_assert(2 * sizeof(float) == sizeof(c10::complex<float>), "");
+  static_assert(2 * sizeof(double) == sizeof(c10::complex<double>), "");
+}
+
+MAYBE_GLOBAL void test_align() { // alignment equals the full two-component size
+  static_assert(2 * sizeof(float) == alignof(c10::complex<float>), "");
+  static_assert(2 * sizeof(double) == alignof(c10::complex<double>), "");
+}
+
+MAYBE_GLOBAL void test_pod() { // both instantiations are standard-layout types
+  static_assert(std::is_standard_layout<c10::complex<float>>::value, "");
+  static_assert(std::is_standard_layout<c10::complex<double>>::value, "");
+}
+
+TEST(TestMemory, ReinterpretCast) {
+  { // std::complex<float> storage read as c10::complex<float>
+    std::complex<float> source(1, 2);
+    c10::complex<float> viewed = *reinterpret_cast<c10::complex<float>*>(&source);
+    ASSERT_EQ(viewed.real(), float(1));
+    ASSERT_EQ(viewed.imag(), float(2));
+  }
+
+  { // c10::complex<float> storage read as std::complex<float>
+    c10::complex<float> source(3, 4);
+    std::complex<float> viewed = *reinterpret_cast<std::complex<float>*>(&source);
+    ASSERT_EQ(viewed.real(), float(3));
+    ASSERT_EQ(viewed.imag(), float(4));
+  }
+
+  { // std::complex<double> storage read as c10::complex<double>
+    std::complex<double> source(1, 2);
+    c10::complex<double> viewed = *reinterpret_cast<c10::complex<double>*>(&source);
+    ASSERT_EQ(viewed.real(), double(1));
+    ASSERT_EQ(viewed.imag(), double(2));
+  }
+
+  { // c10::complex<double> storage read as std::complex<double>
+    c10::complex<double> source(3, 4);
+    std::complex<double> viewed = *reinterpret_cast<std::complex<double>*>(&source);
+    ASSERT_EQ(viewed.real(), double(3));
+    ASSERT_EQ(viewed.imag(), double(4));
+  }
+}
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+TEST(TestMemory, ThrustReinterpretCast) {
+  { // thrust::complex<float> storage read as c10::complex<float>
+    thrust::complex<float> source(1, 2);
+    c10::complex<float> viewed = *reinterpret_cast<c10::complex<float>*>(&source);
+    ASSERT_EQ(viewed.real(), float(1));
+    ASSERT_EQ(viewed.imag(), float(2));
+  }
+
+  { // c10::complex<float> storage read as thrust::complex<float>
+    c10::complex<float> source(3, 4);
+    thrust::complex<float> viewed = *reinterpret_cast<thrust::complex<float>*>(&source);
+    ASSERT_EQ(viewed.real(), float(3));
+    ASSERT_EQ(viewed.imag(), float(4));
+  }
+
+  { // thrust::complex<double> storage read as c10::complex<double>
+    thrust::complex<double> source(1, 2);
+    c10::complex<double> viewed = *reinterpret_cast<c10::complex<double>*>(&source);
+    ASSERT_EQ(viewed.real(), double(1));
+    ASSERT_EQ(viewed.imag(), double(2));
+  }
+
+  { // c10::complex<double> storage read as thrust::complex<double>
+    c10::complex<double> source(3, 4);
+    thrust::complex<double> viewed =
+        *reinterpret_cast<thrust::complex<double>*>(&source);
+    ASSERT_EQ(viewed.real(), double(3));
+    ASSERT_EQ(viewed.imag(), double(4));
+  }
+}
+#endif
+
+} // namespace memory
+
+namespace constructors {
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_construct_from_scalar() { // (re, im), (re) and () constructors
+  constexpr scalar_t re = scalar_t(1.23);
+  constexpr scalar_t im = scalar_t(4.56);
+  constexpr scalar_t none = scalar_t();
+  static_assert(c10::complex<scalar_t>(re, im).real() == re, "");
+  static_assert(c10::complex<scalar_t>(re, im).imag() == im, "");
+  static_assert(c10::complex<scalar_t>(re).real() == re, "");
+  static_assert(c10::complex<scalar_t>(re).imag() == none, "");
+  static_assert(c10::complex<scalar_t>().real() == none, "");
+  static_assert(c10::complex<scalar_t>().imag() == none, "");
+}
+
+template <typename scalar_t, typename other_t>
+C10_HOST_DEVICE void test_construct_from_other() { // complex<other_t> -> complex<scalar_t>
+  constexpr other_t re = other_t(1.23);
+  constexpr other_t im = other_t(4.56);
+  constexpr scalar_t re_cast = scalar_t(re);
+  constexpr scalar_t im_cast = scalar_t(im);
+  static_assert(
+      c10::complex<scalar_t>(c10::complex<other_t>(re, im)).real() == re_cast,
+      "");
+  static_assert(
+      c10::complex<scalar_t>(c10::complex<other_t>(re, im)).imag() == im_cast,
+      "");
+}
+
+MAYBE_GLOBAL void test_convert_constructors() {
+  test_construct_from_scalar<float>();
+  test_construct_from_scalar<double>();
+
+  static_assert( // implicit conversion: every pairing except double -> float
+      std::is_convertible<c10::complex<float>, c10::complex<float>>::value, "");
+  static_assert(
+      !std::is_convertible<c10::complex<double>, c10::complex<float>>::value,
+      "");
+  static_assert(
+      std::is_convertible<c10::complex<float>, c10::complex<double>>::value,
+      "");
+  static_assert(
+      std::is_convertible<c10::complex<double>, c10::complex<double>>::value,
+      "");
+
+  static_assert( // explicit construction: all four pairings
+      std::is_constructible<c10::complex<float>, c10::complex<float>>::value,
+      "");
+  static_assert(
+      std::is_constructible<c10::complex<double>, c10::complex<float>>::value,
+      "");
+  static_assert(
+      std::is_constructible<c10::complex<float>, c10::complex<double>>::value,
+      "");
+  static_assert(
+      std::is_constructible<c10::complex<double>, c10::complex<double>>::value,
+      "");
+
+  test_construct_from_other<float, float>();
+  test_construct_from_other<float, double>();
+  test_construct_from_other<double, float>();
+  test_construct_from_other<double, double>();
+}
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_construct_from_std() { // std::complex -> c10::complex, same value type
+  constexpr scalar_t re = scalar_t(1.23);
+  constexpr scalar_t im = scalar_t(4.56);
+  static_assert(
+      c10::complex<scalar_t>(std::complex<scalar_t>(re, im)).real() == re,
+      "");
+  static_assert(
+      c10::complex<scalar_t>(std::complex<scalar_t>(re, im)).imag() == im,
+      "");
+}
+
+MAYBE_GLOBAL void test_std_conversion() {
+  test_construct_from_std<float>();
+  test_construct_from_std<double>();
+}
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename scalar_t>
+void test_construct_from_thrust() { // thrust::complex -> c10::complex, checked at run time
+  constexpr scalar_t re = scalar_t(1.23);
+  constexpr scalar_t im = scalar_t(4.56);
+  ASSERT_EQ(
+      c10::complex<scalar_t>(thrust::complex<scalar_t>(re, im)).real(),
+      re);
+  ASSERT_EQ(
+      c10::complex<scalar_t>(thrust::complex<scalar_t>(re, im)).imag(),
+      im);
+}
+
+TEST(TestConstructors, FromThrust) {
+  test_construct_from_thrust<float>();
+  test_construct_from_thrust<double>();
+}
+#endif
+
+TEST(TestConstructors, UnorderedMap) { // complex keys hashed with c10::hash
+  std::unordered_map<
+      c10::complex<double>,
+      c10::complex<double>,
+      c10::hash<c10::complex<double>>>
+      table;
+  auto first_key = c10::complex<double>(2.5, 3);
+  auto second_key = c10::complex<double>(2, 0);
+  auto first_val = c10::complex<double>(2, -3.2);
+  auto second_val = c10::complex<double>(0, -3);
+  table[first_key] = first_val;
+  table[second_key] = second_val;
+  ASSERT_EQ(table[first_key], first_val);
+  ASSERT_EQ(table[second_key], second_val);
+}
+
+} // namespace constructors
+
+namespace assignment {
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> one() { // starts as (3, 4), then is assigned the real scalar 1
+  c10::complex<scalar_t> result(3, 4);
+  result = scalar_t(1);
+  return result;
+}
+
+MAYBE_GLOBAL void test_assign_real() { // scalar assignment sets real and zeroes imag
+  static_assert(one<float>().real() == float(1), "");
+  static_assert(one<float>().imag() == float(), "");
+  static_assert(one<double>().real() == double(1), "");
+  static_assert(one<double>().imag() == double(), "");
+}
+
+constexpr std::tuple<c10::complex<double>, c10::complex<float>> one_two() {
+  constexpr c10::complex<float> src(1, 2); // c10::complex source
+  c10::complex<double> ret0;
+  c10::complex<float> ret1;
+  ret0 = ret1 = src; // chained: src -> complex<float> -> complex<double>
+  return std::make_tuple(ret0, ret1);
+}
+
+MAYBE_GLOBAL void test_assign_other() { // assignment from c10::complex of another value type
+  constexpr auto tup = one_two();
+  static_assert(std::get<c10::complex<double>>(tup).real() == double(1), "");
+  static_assert(std::get<c10::complex<double>>(tup).imag() == double(2), "");
+  static_assert(std::get<c10::complex<float>>(tup).real() == float(1), "");
+  static_assert(std::get<c10::complex<float>>(tup).imag() == float(2), "");
+}
+
+constexpr std::tuple<c10::complex<double>, c10::complex<float>> one_two_std() {
+  constexpr std::complex<float> src(1, 2); // std::complex source; (1, 2) matches test_assign_std
+  c10::complex<double> ret0;
+  c10::complex<float> ret1;
+  ret0 = ret1 = src; // chained: std::complex<float> -> complex<float> -> complex<double>
+  return std::make_tuple(ret0, ret1);
+}
+
+MAYBE_GLOBAL void test_assign_std() { // assignment from std::complex
+  constexpr auto tup = one_two_std(); // must use the std::complex helper, not one_two()
+  static_assert(std::get<c10::complex<double>>(tup).real() == double(1), "");
+  static_assert(std::get<c10::complex<double>>(tup).imag() == double(2), "");
+  static_assert(std::get<c10::complex<float>>(tup).real() == float(1), "");
+  static_assert(std::get<c10::complex<float>>(tup).imag() == float(2), "");
+}
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+C10_HOST_DEVICE std::tuple<c10::complex<double>, c10::complex<float>>
+one_two_thrust() {
+  thrust::complex<float> src(1, 2); // thrust::complex source
+  c10::complex<double> ret0;
+  c10::complex<float> ret1;
+  ret0 = ret1 = src; // chained: thrust::complex<float> -> complex<float> -> complex<double>
+  return std::make_tuple(ret0, ret1);
+}
+
+TEST(TestAssignment, FromThrust) { // run-time check of assignment from thrust::complex
+  auto tup = one_two_thrust();
+  ASSERT_EQ(std::get<c10::complex<double>>(tup).real(), double(1));
+  ASSERT_EQ(std::get<c10::complex<double>>(tup).imag(), double(2));
+  ASSERT_EQ(std::get<c10::complex<float>>(tup).real(), float(1));
+  ASSERT_EQ(std::get<c10::complex<float>>(tup).imag(), float(2));
+}
+#endif
+
+} // namespace assignment
+
+namespace literals {
+
+MAYBE_GLOBAL void test_complex_literals() { // _if / _id yield purely imaginary float / double values
+  using namespace c10::complex_literals;
+  static_assert(std::is_same<decltype(0.5_if), c10::complex<float>>::value, "");
+  static_assert((0.5_if).real() == float(), "");
+  static_assert((0.5_if).imag() == float(0.5), "");
+  static_assert(
+      std::is_same<decltype(0.5_id), c10::complex<double>>::value, "");
+  static_assert((0.5_id).real() == double(), ""); // compare in double, the _id value type
+  static_assert((0.5_id).imag() == double(0.5), "");
+
+  static_assert(std::is_same<decltype(1_if), c10::complex<float>>::value, "");
+  static_assert((1_if).real() == float(), "");
+  static_assert((1_if).imag() == float(1), "");
+  static_assert(std::is_same<decltype(1_id), c10::complex<double>>::value, "");
+  static_assert((1_id).real() == double(), "");
+  static_assert((1_id).imag() == double(1), "");
+}
+
+} // namespace literals
+
+namespace real_imag {
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> zero_one() { // default value with imag set to 1
+  c10::complex<scalar_t> z;
+  z.imag(scalar_t(1));
+  return z;
+}
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> one_zero() { // default value with real set to 1
+  c10::complex<scalar_t> z;
+  z.real(scalar_t(1));
+  return z;
+}
+
+MAYBE_GLOBAL void test_real_imag_modify() { // each setter changes only its own component
+  static_assert(zero_one<float>().real() == float(0), "");
+  static_assert(zero_one<float>().imag() == float(1), "");
+  static_assert(zero_one<double>().real() == double(0), "");
+  static_assert(zero_one<double>().imag() == double(1), "");
+
+  static_assert(one_zero<float>().real() == float(1), "");
+  static_assert(one_zero<float>().imag() == float(0), "");
+  static_assert(one_zero<double>().real() == double(1), "");
+  static_assert(one_zero<double>().imag() == double(0), "");
+}
+
+} // namespace real_imag
+
+namespace arithmetic_assign {
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> p(scalar_t value) { // (2, 2) += value
+  c10::complex<scalar_t> acc(scalar_t(2), scalar_t(2));
+  acc += value;
+  return acc;
+}
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> m(scalar_t value) { // (2, 2) -= value
+  c10::complex<scalar_t> acc(scalar_t(2), scalar_t(2));
+  acc -= value;
+  return acc;
+}
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> t(scalar_t value) { // (2, 2) *= value
+  c10::complex<scalar_t> acc(scalar_t(2), scalar_t(2));
+  acc *= value;
+  return acc;
+}
+
+template <typename scalar_t>
+constexpr c10::complex<scalar_t> d(scalar_t value) { // (2, 2) /= value
+  c10::complex<scalar_t> acc(scalar_t(2), scalar_t(2));
+  acc /= value;
+  return acc;
+}
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_arithmetic_assign_scalar() { // compound assignment with a real scalar
+  constexpr c10::complex<scalar_t> added = p(scalar_t(1));
+  static_assert(added.real() == scalar_t(3), "");
+  static_assert(added.imag() == scalar_t(2), "");
+  constexpr c10::complex<scalar_t> subtracted = m(scalar_t(1));
+  static_assert(subtracted.real() == scalar_t(1), "");
+  static_assert(subtracted.imag() == scalar_t(2), "");
+  constexpr c10::complex<scalar_t> scaled = t(scalar_t(2));
+  static_assert(scaled.real() == scalar_t(4), "");
+  static_assert(scaled.imag() == scalar_t(4), "");
+  constexpr c10::complex<scalar_t> divided = d(scalar_t(2));
+  static_assert(divided.real() == scalar_t(1), "");
+  static_assert(divided.imag() == scalar_t(1), "");
+}
+
+template <typename scalar_t, typename rhs_t>
+constexpr c10::complex<scalar_t> p( // (real, imag) += rhs
+    scalar_t real,
+    scalar_t imag,
+    c10::complex<rhs_t> rhs) {
+  c10::complex<scalar_t> acc(real, imag);
+  acc += rhs;
+  return acc;
+}
+
+template <typename scalar_t, typename rhs_t>
+constexpr c10::complex<scalar_t> m( // (real, imag) -= rhs
+    scalar_t real,
+    scalar_t imag,
+    c10::complex<rhs_t> rhs) {
+  c10::complex<scalar_t> acc(real, imag);
+  acc -= rhs;
+  return acc;
+}
+
+template <typename scalar_t, typename rhs_t>
+constexpr c10::complex<scalar_t> t( // (real, imag) *= rhs
+    scalar_t real,
+    scalar_t imag,
+    c10::complex<rhs_t> rhs) {
+  c10::complex<scalar_t> acc(real, imag);
+  acc *= rhs;
+  return acc;
+}
+
+template <typename scalar_t, typename rhs_t>
+constexpr c10::complex<scalar_t> d( // (real, imag) /= rhs
+    scalar_t real,
+    scalar_t imag,
+    c10::complex<rhs_t> rhs) {
+  c10::complex<scalar_t> acc(real, imag);
+  acc /= rhs;
+  return acc;
+}
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_arithmetic_assign_complex() { // compound assignment with float (_if) and double (_id) complex operands
+  using namespace c10::complex_literals;
+  constexpr c10::complex<scalar_t> added_f = p(scalar_t(2), scalar_t(2), 1.0_if);
+  static_assert(added_f.real() == scalar_t(2), "");
+  static_assert(added_f.imag() == scalar_t(3), "");
+  constexpr c10::complex<scalar_t> added_d = p(scalar_t(2), scalar_t(2), 1.0_id);
+  static_assert(added_d.real() == scalar_t(2), "");
+
+  // The imaginary-part check is skipped on older nvcc because of a bug in
+  // its constexpr evaluation; the bug is fixed as of CUDA 11.2
+#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020)
+  static_assert(added_d.imag() == scalar_t(3), "");
+#endif
+
+  constexpr c10::complex<scalar_t> subtracted_f = m(scalar_t(2), scalar_t(2), 1.0_if);
+  static_assert(subtracted_f.real() == scalar_t(2), "");
+  static_assert(subtracted_f.imag() == scalar_t(1), "");
+  constexpr c10::complex<scalar_t> subtracted_d = m(scalar_t(2), scalar_t(2), 1.0_id);
+  static_assert(subtracted_d.real() == scalar_t(2), "");
+
+  // The imaginary-part check is skipped on older nvcc because of a bug in
+  // its constexpr evaluation; the bug is fixed as of CUDA 11.2
+#if !defined(__CUDACC__) || (defined(CUDA_VERSION) && CUDA_VERSION >= 11020)
+  static_assert(subtracted_d.imag() == scalar_t(1), "");
+#endif
+
+  constexpr c10::complex<scalar_t> scaled_f = t(scalar_t(1), scalar_t(-2), 1.0_if);
+  static_assert(scaled_f.real() == scalar_t(2), "");
+  static_assert(scaled_f.imag() == scalar_t(1), "");
+  constexpr c10::complex<scalar_t> scaled_d = t(scalar_t(1), scalar_t(-2), 1.0_id);
+  static_assert(scaled_d.real() == scalar_t(2), "");
+  static_assert(scaled_d.imag() == scalar_t(1), "");
+
+  constexpr c10::complex<scalar_t> divided_f = d(scalar_t(-1), scalar_t(2), 1.0_if);
+  static_assert(divided_f.real() == scalar_t(2), "");
+  static_assert(divided_f.imag() == scalar_t(1), "");
+  constexpr c10::complex<scalar_t> divided_d = d(scalar_t(-1), scalar_t(2), 1.0_id);
+  static_assert(divided_d.real() == scalar_t(2), "");
+  static_assert(divided_d.imag() == scalar_t(1), "");
+}
+
+MAYBE_GLOBAL void test_arithmetic_assign() {
+  test_arithmetic_assign_scalar<float>();
+  test_arithmetic_assign_scalar<double>();
+  test_arithmetic_assign_complex<float>();
+  test_arithmetic_assign_complex<double>();
+}
+
+} // namespace arithmetic_assign
+
+namespace arithmetic {
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_arithmetic_() { // compile-time checks of unary and binary +, -, *, /
+  static_assert( // unary plus
+      c10::complex<scalar_t>(1, 2) == +c10::complex<scalar_t>(1, 2), "");
+  static_assert( // unary minus
+      c10::complex<scalar_t>(-1, -2) == -c10::complex<scalar_t>(1, 2), "");
+
+  static_assert( // addition: complex + complex, complex + scalar, scalar + complex
+      c10::complex<scalar_t>(1, 2) + c10::complex<scalar_t>(3, 4) ==
+          c10::complex<scalar_t>(4, 6),
+      "");
+  static_assert(
+      c10::complex<scalar_t>(1, 2) + scalar_t(3) ==
+          c10::complex<scalar_t>(4, 2),
+      "");
+  static_assert(
+      scalar_t(3) + c10::complex<scalar_t>(1, 2) ==
+          c10::complex<scalar_t>(4, 2),
+      "");
+
+  static_assert( // subtraction: the same three operand combinations
+      c10::complex<scalar_t>(1, 2) - c10::complex<scalar_t>(3, 4) ==
+          c10::complex<scalar_t>(-2, -2),
+      "");
+  static_assert(
+      c10::complex<scalar_t>(1, 2) - scalar_t(3) ==
+          c10::complex<scalar_t>(-2, 2),
+      "");
+  static_assert(
+      scalar_t(3) - c10::complex<scalar_t>(1, 2) ==
+          c10::complex<scalar_t>(2, -2),
+      "");
+
+  static_assert( // multiplication: the same three operand combinations
+      c10::complex<scalar_t>(1, 2) * c10::complex<scalar_t>(3, 4) ==
+          c10::complex<scalar_t>(-5, 10),
+      "");
+  static_assert(
+      c10::complex<scalar_t>(1, 2) * scalar_t(3) ==
+          c10::complex<scalar_t>(3, 6),
+      "");
+  static_assert(
+      scalar_t(3) * c10::complex<scalar_t>(1, 2) ==
+          c10::complex<scalar_t>(3, 6),
+      "");
+
+  static_assert( // division: the same three operand combinations
+      c10::complex<scalar_t>(-5, 10) / c10::complex<scalar_t>(3, 4) ==
+          c10::complex<scalar_t>(1, 2),
+      "");
+  static_assert(
+      c10::complex<scalar_t>(5, 10) / scalar_t(5) ==
+          c10::complex<scalar_t>(1, 2),
+      "");
+  static_assert(
+      scalar_t(25) / c10::complex<scalar_t>(3, 4) ==
+          c10::complex<scalar_t>(3, -4),
+      "");
+}
+
+MAYBE_GLOBAL void test_arithmetic() { // instantiates the checks for float and double
+  test_arithmetic_<float>();
+  test_arithmetic_<double>();
+}
+
+template <typename T, typename int_t>
+void test_binary_ops_for_int_type_(T real, T img, int_t num) { // complex<T> op integer, both operand orders
+  c10::complex<T> c(real, img);
+  ASSERT_EQ(c + num, c10::complex<T>(real + num, img));
+  ASSERT_EQ(num + c, c10::complex<T>(num + real, img));
+  ASSERT_EQ(c - num, c10::complex<T>(real - num, img));
+  ASSERT_EQ(num - c, c10::complex<T>(num - real, -img));
+  ASSERT_EQ(c * num, c10::complex<T>(real * num, img * num));
+  ASSERT_EQ(num * c, c10::complex<T>(num * real, num * img));
+  ASSERT_EQ(c / num, c10::complex<T>(real / num, img / num));
+  ASSERT_EQ( // expected value written as num * conj(c) / norm(c)
+      num / c,
+      c10::complex<T>(num * real / std::norm(c), -num * img / std::norm(c)));
+}
+
+template <typename T>
+void test_binary_ops_for_all_int_types_(T real, T img, int8_t i) { // repeats the check with i passed as int8/16/32/64
+  test_binary_ops_for_int_type_<T, int8_t>(real, img, i);
+  test_binary_ops_for_int_type_<T, int16_t>(real, img, i);
+  test_binary_ops_for_int_type_<T, int32_t>(real, img, i);
+  test_binary_ops_for_int_type_<T, int64_t>(real, img, i);
+}
+
+TEST(TestArithmeticIntScalar, All) {
+  test_binary_ops_for_all_int_types_<float>(1.0, 0.1, 1);
+  test_binary_ops_for_all_int_types_<double>(-1.3, -0.2, -2);
+}
+
+} // namespace arithmetic
+
+namespace equality {
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_equality_() { // compile-time checks of == and != for complex/complex and complex/scalar
+  static_assert(
+      c10::complex<scalar_t>(1, 2) == c10::complex<scalar_t>(1, 2), "");
+  static_assert(c10::complex<scalar_t>(1, 0) == scalar_t(1), ""); // complex == scalar
+  static_assert(scalar_t(1) == c10::complex<scalar_t>(1, 0), ""); // scalar == complex
+  static_assert(
+      c10::complex<scalar_t>(1, 2) != c10::complex<scalar_t>(3, 4), "");
+  static_assert(c10::complex<scalar_t>(1, 2) != scalar_t(1), ""); // non-zero imag makes it unequal to the scalar
+  static_assert(scalar_t(1) != c10::complex<scalar_t>(1, 2), "");
+}
+
+MAYBE_GLOBAL void test_equality() { // instantiates the checks for float and double
+  test_equality_<float>();
+  test_equality_<double>();
+}
+
+} // namespace equality
+
+namespace io {
+
+template <typename scalar_t>
+void test_io_() { // stream a value out, then parse a different one back in
+  std::stringstream stream;
+  c10::complex<scalar_t> value(1, 2);
+  stream << value;
+  ASSERT_EQ(stream.str(), "(1,2)");
+  stream.str("(3,4)");
+  stream >> value;
+  ASSERT_TRUE(value == c10::complex<scalar_t>(3, 4));
+}
+
+TEST(TestIO, All) {
+  test_io_<float>();
+  test_io_<double>();
+}
+
+} // namespace io
+
+namespace test_std {
+
+template <typename scalar_t>
+C10_HOST_DEVICE void test_callable_() { // std:: free functions and c10::polar are callable on c10::complex
+  static_assert(std::real(c10::complex<scalar_t>(1, 2)) == scalar_t(1), "");
+  static_assert(std::imag(c10::complex<scalar_t>(1, 2)) == scalar_t(2), "");
+  std::abs(c10::complex<scalar_t>(1, 2)); // return value is discarded
+  std::arg(c10::complex<scalar_t>(1, 2)); // return value is discarded
+  static_assert(std::norm(c10::complex<scalar_t>(3, 4)) == scalar_t(25), "");
+  static_assert(
+      std::conj(c10::complex<scalar_t>(3, 4)) == c10::complex<scalar_t>(3, -4),
+      "");
+  c10::polar(float(1), float(PI / 2)); // float and double are spelled out here,
+  c10::polar(double(1), double(PI / 2)); // independent of scalar_t
+}
+
+MAYBE_GLOBAL void test_callable() { // instantiates the checks for float and double
+  test_callable_<float>();
+  test_callable_<double>();
+}
+
+template <typename scalar_t>
+void test_values_() { // run-time value checks for abs, arg and polar
+  ASSERT_EQ(std::abs(c10::complex<scalar_t>(3, 4)), scalar_t(5));
+  ASSERT_LT(std::abs(std::arg(c10::complex<scalar_t>(0, 1)) - PI / 2), 1e-6);
+  ASSERT_LT( // polar(1, PI / 2) must lie within 1e-6 of (0, 1)
+      std::abs(
+          c10::polar(scalar_t(1), scalar_t(PI / 2)) -
+          c10::complex<scalar_t>(0, 1)),
+      1e-6);
+}
+
+TEST(TestStd, BasicFunctions) {
+  test_values_<float>();
+  test_values_<double>();
+  // CSQRT edge cases: checks for overflows which are likely to occur
+  // if square root is computed using polar form
+  ASSERT_LT( // real part of the root must stay below 3e-4 in magnitude
+      std::abs(std::sqrt(c10::complex<float>(-1e20, -4988429.2)).real()), 3e-4);
+  ASSERT_LT(
+      std::abs(std::sqrt(c10::complex<double>(-1e60, -4988429.2)).real()),
+      3e-4);
+}
+
+} // namespace test_std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7bcaaa28af3871f95280a9bd764aea260405ca1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AbortHandler.h
@@ -0,0 +1,88 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <c10/macros/Macros.h>
+#include <c10/util/Backtrace.h>
+#include <c10/util/env.h>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <mutex>
+#include <optional>
+
+namespace c10 {
+class AbortHandlerHelper { // installs a terminate handler once and remembers the one it replaced
+ public:
+  static AbortHandlerHelper& getInstance() {
+#ifdef _WIN32
+    thread_local
+#endif // _WIN32
+        static AbortHandlerHelper instance;
+    return instance;
+  }
+
+  // Installs `handler` via std::set_terminate on the first call; later calls do nothing.
+  void set(std::terminate_handler handler) {
+    const std::lock_guard<std::mutex> guard(mutex_);
+    if (installed_) { return; }
+    previous_ = std::set_terminate(handler);
+    current_ = std::get_terminate();
+    installed_ = true;
+  }
+
+  std::terminate_handler getPrev() const {
+    return previous_;
+  }
+
+ private:
+  std::terminate_handler previous_ = nullptr; // handler that was active before set()
+  std::terminate_handler current_ = nullptr; // handler reported right after set()
+  bool installed_ = false;
+  std::mutex mutex_;
+  AbortHandlerHelper() = default;
+  ~AbortHandlerHelper() {
+    const bool still_active = installed_ && current_ == std::get_terminate();
+    if (still_active) { // restore only while our handler is the current one
+      std::set_terminate(previous_);
+    }
+  }
+
+ public:
+  AbortHandlerHelper(AbortHandlerHelper const&) = delete;
+  void operator=(AbortHandlerHelper const&) = delete;
+  AbortHandlerHelper(AbortHandlerHelper&&) = delete;
+  void operator=(AbortHandlerHelper&&) = delete;
+};
+
+namespace detail {
+C10_ALWAYS_INLINE void terminate_handler() {
+  // Print a notice and a backtrace to stdout, then hand over control.
+  std::cout << "Unhandled exception caught in c10/util/AbortHandler.h" << '\n';
+  std::cout << get_backtrace() << '\n' << std::flush;
+  const std::terminate_handler previous =
+      AbortHandlerHelper::getInstance().getPrev();
+  if (previous == nullptr) {
+    std::abort();
+  }
+  previous(); // chain to the handler that was replaced
+}
+} // namespace detail
+
+C10_ALWAYS_INLINE void set_terminate_handler() {
+  // On Windows it is enabled by default based on
+  // https://github.com/pytorch/pytorch/pull/50320#issuecomment-763147062
+#ifdef _WIN32
+  constexpr bool enabled_by_default = true;
+#else
+  constexpr bool enabled_by_default = false;
+#endif // _WIN32
+  const auto env_override = c10::utils::check_env("TORCH_CUSTOM_TERMINATE");
+  const bool use_custom_terminate =
+      env_override.has_value() ? env_override.value() : enabled_by_default;
+  if (use_custom_terminate) {
+    AbortHandlerHelper::getInstance().set(detail::terminate_handler);
+  }
+}
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce9fe90961700f2a1dd3f9c25e120eaa9609fc03
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/AlignOf.h
@@ -0,0 +1,181 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AlignedCharArray and AlignedCharArrayUnion classes.
+//
+//===----------------------------------------------------------------------===//
+
+// ATen: modified from llvm::AlignOf
+// replaced LLVM_ALIGNAS with alignas
+
+#pragma once
+
+#include <cstddef>
+
+namespace c10 {
+
+/// \struct AlignedCharArray
+/// \brief Helper for building an aligned character array type.
+///
+/// This template is used to explicitly build up a collection of aligned
+/// character array types. We have to build these up using a macro and explicit
+/// specialization to cope with MSVC (at least till 2015) where only an
+/// integer literal can be used to specify an alignment constraint. Once built
+/// up here, we can then begin to indirect between these using normal C++
+/// template parameters.
+
+// MSVC requires special handling here.
+#ifndef _MSC_VER
+
+template <size_t Alignment, size_t Size>
+struct AlignedCharArray {
+  // NOLINTNEXTLINE(*c-arrays)
+  alignas(Alignment) char buffer[Size];
+};
+
+#else // _MSC_VER
+
+/// \brief Create a type with an aligned char buffer.
+template <size_t Alignment, size_t Size>
+struct AlignedCharArray;
+
+// We provide special variations of this template for the most common
+// alignments because __declspec(align(...)) doesn't actually work when it is
+// a member of a by-value function argument in MSVC, even if the alignment
+// request is something reasonably like 8-byte or 16-byte. Note that we can't
+// even include the declspec with the union that forces the alignment because
+// MSVC warns on the existence of the declspec despite the union member forcing
+// proper alignment.
+
+template <size_t Size>
+struct AlignedCharArray<1, Size> {
+  union {
+    char aligned;
+    char buffer[Size];
+  };
+};
+
+template <size_t Size>
+struct AlignedCharArray<2, Size> {
+  union {
+    short aligned;
+    char buffer[Size];
+  };
+};
+
+template <size_t Size>
+struct AlignedCharArray<4, Size> {
+  union {
+    int aligned;
+    char buffer[Size];
+  };
+};
+
+template <size_t Size>
+struct AlignedCharArray<8, Size> {
+  union {
+    double aligned;
+    char buffer[Size];
+  };
+};
+
+// The rest of these are provided with a __declspec(align(...)) and we simply
+// can't pass them by-value as function arguments on MSVC.
+
+#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+  template <size_t Size>                          \
+  struct AlignedCharArray<x, Size> {              \
+    __declspec(align(x)) char buffer[Size];       \
+  };
+
+AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16)
+AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32)
+AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64)
+AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128)
+
+#undef AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+
+#endif // _MSC_VER
+
+namespace detail {
+template <
+    typename T1,
+    typename T2 = char,
+    typename T3 = char,
+    typename T4 = char,
+    typename T5 = char,
+    typename T6 = char,
+    typename T7 = char,
+    typename T8 = char,
+    typename T9 = char,
+    typename T10 = char>
+class AlignerImpl {
+  T1 t1;
+  T2 t2;
+  T3 t3;
+  T4 t4;
+  T5 t5;
+  T6 t6;
+  T7 t7;
+  T8 t8;
+  T9 t9;
+  T10 t10;
+
+ public:
+  AlignerImpl() = delete;
+};
+
+template <
+    typename T1,
+    typename T2 = char,
+    typename T3 = char,
+    typename T4 = char,
+    typename T5 = char,
+    typename T6 = char,
+    typename T7 = char,
+    typename T8 = char,
+    typename T9 = char,
+    typename T10 = char>
+union SizerImpl {
+  // NOLINTNEXTLINE(*c-arrays)
+  char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)],
+      arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)],
+      arr9[sizeof(T9)], arr10[sizeof(T10)];
+};
+} // end namespace detail
+
+/// \brief This union template exposes a suitably aligned and sized character
+/// array member which can hold elements of any of up to ten types.
+///
+/// These types may be arrays, structs, or any other types. The goal is to
+/// expose a char array buffer member which can be used as suitable storage for
+/// a placement new of any of these types. Support for more than ten types can
+/// be added at the cost of more boilerplate.
+template <
+    typename T1,
+    typename T2 = char,
+    typename T3 = char,
+    typename T4 = char,
+    typename T5 = char,
+    typename T6 = char,
+    typename T7 = char,
+    typename T8 = char,
+    typename T9 = char,
+    typename T10 = char>
+struct AlignedCharArrayUnion
+    : AlignedCharArray<
+          alignof(detail::AlignerImpl<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>),
+          sizeof(::c10::detail::
+                     SizerImpl<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>)> {};
+} // end namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h
new file mode 100644
index 0000000000000000000000000000000000000000..7410fc4e829fa44aadb22f61e85e1f05f9a81134
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ApproximateClock.h
@@ -0,0 +1,120 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright 2023-present Facebook. All Rights Reserved.
+
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <array>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <ctime>
+#include <functional>
+#include <type_traits>
+
+#if defined(C10_IOS) && defined(C10_MOBILE)
+#include <sys/time.h> // for gettimeofday()
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
+#define C10_RDTSC
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__CUDACC__) || defined(__HIPCC__)
+#undef C10_RDTSC
+#elif defined(__clang__)
+// `__rdtsc` is available by default.
+// NB: This has to be first, because Clang will also define `__GNUC__`
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#else
+#undef C10_RDTSC
+#endif
+#endif
+
+namespace c10 {
+
+using time_t = int64_t;
+using steady_clock_t = std::conditional_t<
+    std::chrono::high_resolution_clock::is_steady,
+    std::chrono::high_resolution_clock,
+    std::chrono::steady_clock>;
+
+inline time_t getTimeSinceEpoch() {
+  auto now = std::chrono::system_clock::now().time_since_epoch();
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count();
+}
+
+inline time_t getTime(bool allow_monotonic = false) {
+#if defined(C10_IOS) && defined(C10_MOBILE)
+  // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
+  // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
+  // is implemented or not
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<time_t>(now.tv_sec) * 1000000000 +
+      static_cast<time_t>(now.tv_usec) * 1000;
+#elif defined(_WIN32) || defined(__MACH__)
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             steady_clock_t::now().time_since_epoch())
+      .count();
+#else
+  // clock_gettime is *much* faster than std::chrono implementation on Linux
+  struct timespec t{};
+  auto mode = CLOCK_REALTIME;
+  if (allow_monotonic) {
+    mode = CLOCK_MONOTONIC;
+  }
+  clock_gettime(mode, &t);
+  return static_cast<time_t>(t.tv_sec) * 1000000000 +
+      static_cast<time_t>(t.tv_nsec);
+#endif
+}
+
+// We often do not need to capture true wall times. If a fast mechanism such
+// as TSC is available we can use that instead and convert back to epoch time
+// during post processing. This greatly reduces the clock's contribution to
+// profiling.
+//   http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
+//   https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
+// TODO: We should use
+// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
+inline auto getApproximateTime() {
+#if defined(C10_RDTSC)
+  return static_cast<uint64_t>(__rdtsc());
+#else
+  return getTime();
+#endif
+}
+
+using approx_time_t = decltype(getApproximateTime());
+static_assert(
+    std::is_same_v<approx_time_t, int64_t> ||
+        std::is_same_v<approx_time_t, uint64_t>,
+    "Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");
+
+// Convert `getApproximateTime` results to nanoseconds since the Unix epoch.
+class C10_API ApproximateClockToUnixTimeConverter final {
+ public:
+  ApproximateClockToUnixTimeConverter();
+  std::function<time_t(approx_time_t)> makeConverter();
+
+  struct UnixAndApproximateTimePair {
+    time_t t_;
+    approx_time_t approx_t_;
+  };
+  static UnixAndApproximateTimePair measurePair();
+
+ private:
+  static constexpr size_t replicates = 1001;
+  using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
+  time_pairs measurePairs();
+
+  time_pairs start_times_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..9da524e96ce718b7782e1584a795c919af0ecd78
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ArrayRef.h
@@ -0,0 +1,326 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// ATen: modified from llvm::ArrayRef.
+// removed llvm-specific functionality
+// removed some implicit const -> non-const conversions that rely on
+// complicated std::enable_if meta-programming
+// removed a bunch of slice variants for simplicity...
+
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/SmallVector.h>
+#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <iterator>
+#include <ostream>
+#include <type_traits>
+#include <vector>
+
+namespace c10 {
+/// ArrayRef - Represent a constant reference to an array (0 or more elements
+/// consecutively in memory), i.e. a start pointer and a length.  It allows
+/// various APIs to take consecutive elements easily and conveniently.
+///
+/// This class does not own the underlying data, it is expected to be used in
+/// situations where the data resides in some other buffer, whose lifetime
+/// extends past that of the ArrayRef. For this reason, it is not in general
+/// safe to store an ArrayRef.
+///
+/// This is intended to be trivially copyable, so it should be passed by
+/// value.
+///
+/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
+/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
+/// the underlying constexpr calls, we rely on apparent-type dispatch for
+/// inheritance. This should be fine because their memory format is the same,
+/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
+/// However, you should prefer to use ArrayRef when possible, because its use
+/// of TORCH_CHECK will lead to better user-facing error messages.
+template <typename T>
+// ArrayRef cannot be derived from. Normally, we would use `final`
+// specifier to force this constraint at compile time.  However, Intel
+// compiler does not recognize ArrayRef as a class template (which is
+// required in the definition of at::TensorAccessor, for instance)
+// when `final` specifier is used. So, we cannot define ArrayRef as
+// final because of the Intel compiler issue.
+class ArrayRef : public HeaderOnlyArrayRef<T> {
+ public:
+  /// @name Constructors, all inherited from HeaderOnlyArrayRef except for
+  /// SmallVector. As inherited constructors won't work with class template
+  /// argument deduction (CTAD) until C++23, we add deduction guides after
+  /// the class definition to enable CTAD.
+  /// @{
+
+  using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
+
+  /// Construct an ArrayRef from a SmallVector. This is templated in order to
+  /// avoid instantiating SmallVectorTemplateCommon<T> whenever we
+  /// copy-construct an ArrayRef.
+  /// NOTE: this is the only constructor that is not inherited from
+  /// HeaderOnlyArrayRef.
+  template <typename U>
+  /* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
+      : HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
+
+  /// @}
+  /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
+  /// @{
+
+  /// front - Get the first element.
+  /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
+  /// STD_TORCH_CHECK
+  constexpr const T& front() const {
+    TORCH_CHECK(
+        !this->empty(), "ArrayRef: attempted to access front() of empty list");
+    return this->Data[0];
+  }
+
+  /// back - Get the last element.
+  /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
+  /// STD_TORCH_CHECK
+  constexpr const T& back() const {
+    TORCH_CHECK(
+        !this->empty(), "ArrayRef: attempted to access back() of empty list");
+    return this->Data[this->Length - 1];
+  }
+
+  /// slice(n, m) - Take M elements of the array starting at element N
+  /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
+  /// STD_TORCH_CHECK
+  constexpr ArrayRef<T> slice(size_t N, size_t M) const {
+    TORCH_CHECK(
+        N <= this->size() && M <= this->size() - N, // no N + M wraparound
+        "ArrayRef: invalid slice, N = ",
+        N,
+        "; M = ",
+        M,
+        "; size = ",
+        this->size());
+    return ArrayRef<T>(this->data() + N, M);
+  }
+
+  /// slice(n) - Chop off the first N elements of the array.
+  /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
+  /// STD_TORCH_CHECK
+  constexpr ArrayRef<T> slice(size_t N) const {
+    TORCH_CHECK(
+        N <= this->size(),
+        "ArrayRef: invalid slice, N = ",
+        N,
+        "; size = ",
+        this->size());
+    return slice(N, this->size() - N); // should this slice be this->slice?
+  }
+
+  /// @}
+  /// @name Operator Overloads
+  /// @{
+
+  /// Vector compatibility
+  /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
+  /// STD_TORCH_CHECK
+  constexpr const T& at(size_t Index) const {
+    TORCH_CHECK(
+        Index < this->Length,
+        "ArrayRef: invalid index Index = ",
+        Index,
+        "; Length = ",
+        this->Length);
+    return this->Data[Index];
+  }
+
+  /// Disallow accidental assignment from a temporary.
+  ///
+  /// The declaration here is extra complicated so that "arrayRef = {}"
+  /// continues to select the move assignment operator.
+  template <typename U>
+  std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
+      // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+      U&& Temporary) = delete;
+
+  /// Disallow accidental assignment from a temporary.
+  ///
+  /// The declaration here is extra complicated so that "arrayRef = {}"
+  /// continues to select the move assignment operator.
+  template <typename U>
+  std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
+      std::initializer_list<U>) = delete;
+
+  /// @}
+};
+
+/// Deduction guides for ArrayRef to support CTAD with inherited constructors
+/// These mirror the constructors inherited from HeaderOnlyArrayRef
+/// @{
+
+// Single element constructor
+template <typename T>
+ArrayRef(const T&) -> ArrayRef<T>;
+
+// Pointer and length constructor
+template <typename T>
+ArrayRef(const T*, size_t) -> ArrayRef<T>;
+
+// Range constructor (begin, end)
+template <typename T>
+ArrayRef(const T*, const T*) -> ArrayRef<T>;
+
+// Generic container constructor (anything with .data() and .size())
+template <typename Container>
+ArrayRef(const Container&) -> ArrayRef<
+    std::remove_pointer_t<decltype(std::declval<Container>().data())>>;
+
+// std::vector constructor
+template <typename T, typename A>
+ArrayRef(const std::vector<T, A>&) -> ArrayRef<T>;
+
+// std::array constructor
+template <typename T, size_t N>
+ArrayRef(const std::array<T, N>&) -> ArrayRef<T>;
+
+// C array constructor
+template <typename T, size_t N>
+ArrayRef(const T (&)[N]) -> ArrayRef<T>;
+
+// std::initializer_list constructor
+template <typename T>
+ArrayRef(const std::initializer_list<T>&) -> ArrayRef<T>;
+
+/// @}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, ArrayRef<T> list) {
+  int i = 0;
+  out << '[';
+  for (const auto& e : list) {
+    if (i++ > 0)
+      out << ", ";
+    out << e;
+  }
+  out << ']';
+  return out;
+}
+
+/// @name ArrayRef Convenience constructors
+/// @{
+
+/// Construct an ArrayRef from a single element.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T& OneElt) {
+  return OneElt;
+}
+
+/// Construct an ArrayRef from a pointer and length.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T* data, size_t length) {
+  return ArrayRef<T>(data, length);
+}
+
+/// Construct an ArrayRef from a range.
+template <typename T>
+ArrayRef<T> makeArrayRef(const T* begin, const T* end) {
+  return ArrayRef<T>(begin, end);
+}
+
+/// Construct an ArrayRef from a SmallVector.
+template <typename T>
+ArrayRef<T> makeArrayRef(const SmallVectorImpl<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from a SmallVector.
+template <typename T, unsigned N>
+ArrayRef<T> makeArrayRef(const SmallVector<T, N>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from a std::vector.
+template <typename T>
+ArrayRef<T> makeArrayRef(const std::vector<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from a std::array.
+template <typename T, std::size_t N>
+ArrayRef<T> makeArrayRef(const std::array<T, N>& Arr) {
+  return Arr;
+}
+
+/// Construct an ArrayRef from an ArrayRef (no-op) (const)
+template <typename T>
+ArrayRef<T> makeArrayRef(const ArrayRef<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from an ArrayRef (no-op)
+template <typename T>
+ArrayRef<T>& makeArrayRef(ArrayRef<T>& Vec) {
+  return Vec;
+}
+
+/// Construct an ArrayRef from a C array.
+template <typename T, size_t N>
+// NOLINTNEXTLINE(*c-arrays*)
+ArrayRef<T> makeArrayRef(const T (&Arr)[N]) {
+  return ArrayRef<T>(Arr);
+}
+
+// WARNING: Template argument deduction will NOT perform implicit
+// conversions to get you to a c10::ArrayRef, which is why we need so
+// many overloads.
+
+template <typename T>
+bool operator==(c10::ArrayRef<T> a1, c10::ArrayRef<T> a2) {
+  return a1.equals(a2);
+}
+
+template <typename T>
+bool operator!=(c10::ArrayRef<T> a1, c10::ArrayRef<T> a2) {
+  return !a1.equals(a2);
+}
+
+template <typename T>
+bool operator==(const std::vector<T>& a1, c10::ArrayRef<T> a2) {
+  return c10::ArrayRef<T>(a1).equals(a2);
+}
+
+template <typename T>
+bool operator!=(const std::vector<T>& a1, c10::ArrayRef<T> a2) {
+  return !c10::ArrayRef<T>(a1).equals(a2);
+}
+
+template <typename T>
+bool operator==(c10::ArrayRef<T> a1, const std::vector<T>& a2) {
+  return a1.equals(c10::ArrayRef<T>(a2));
+}
+
+template <typename T>
+bool operator!=(c10::ArrayRef<T> a1, const std::vector<T>& a2) {
+  return !a1.equals(c10::ArrayRef<T>(a2));
+}
+
+using IntArrayRef = ArrayRef<int64_t>;
+
+using IntList [[deprecated(
+    "This alias is deprecated because it doesn't make ownership semantics obvious. Use IntArrayRef instead!")]] =
+    ArrayRef<int64_t>;
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..90ca6b677ab3740550f4700479497fd58c35536b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/BFloat16-inl.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/BFloat16.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e01d94ea590ccd96414ec760b09a48419de9de8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Bitset.h
@@ -0,0 +1,123 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstddef>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+namespace c10::utils {
+
+/**
+ * This is a simple bitset class with 8 * sizeof(long long int) bits.
+ * You can set bits, unset bits, query bits by index,
+ * and query for the first set bit.
+ * Before using this class, please also take a look at std::bitset,
+ * which has more functionality and is more generic. It is probably
+ * a better fit for your use case. The sole reason for c10::utils::bitset
+ * to exist is that std::bitset misses a find_first_set() method.
+ */
+struct bitset final {
+ private:
+#if defined(_MSC_VER)
+  // MSVCs _BitScanForward64 expects int64_t
+  using bitset_type = int64_t;
+#else
+  // POSIX ffsll expects long long int
+  using bitset_type = long long int;
+#endif
+ public:
+  static constexpr size_t NUM_BITS() {
+    return 8 * sizeof(bitset_type);
+  }
+
+  constexpr bitset() noexcept = default;
+  constexpr bitset(const bitset&) noexcept = default;
+  constexpr bitset(bitset&&) noexcept = default;
+  // there is an issue for gcc 5.3.0 when define default function as constexpr
+  // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
+  bitset& operator=(const bitset&) noexcept = default;
+  bitset& operator=(bitset&&) noexcept = default;
+  ~bitset() = default;
+
+  constexpr void set(size_t index) noexcept {
+    bitset_ |= (static_cast<long long int>(1) << index);
+  }
+
+  constexpr void unset(size_t index) noexcept {
+    bitset_ &= ~(static_cast<long long int>(1) << index);
+  }
+
+  constexpr bool get(size_t index) const noexcept {
+    return bitset_ & (static_cast<long long int>(1) << index);
+  }
+
+  constexpr bool is_entirely_unset() const noexcept {
+    return 0 == bitset_;
+  }
+
+  // Call the given functor with the index of each bit that is set
+  template <class Func>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+  void for_each_set_bit(Func&& func) const {
+    bitset cur = *this;
+    size_t index = cur.find_first_set();
+    while (0 != index) {
+      // -1 because find_first_set() is one-indexed (0 means "no bit set").
+      index -= 1;
+      func(index);
+      cur.unset(index);
+      index = cur.find_first_set();
+    }
+  }
+
+ private:
+  // Return the index of the first set bit. The returned index is one-indexed
+  // (i.e. if the very first bit is set, this function returns '1'), and a
+  // return of '0' means that there was no bit set.
+  size_t find_first_set() const {
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
+    unsigned long result;
+    bool has_bits_set = (0 != _BitScanForward64(&result, bitset_));
+    if (!has_bits_set) {
+      return 0;
+    }
+    return result + 1;
+#elif defined(_MSC_VER) && defined(_M_IX86)
+    unsigned long result;
+    if (static_cast<uint32_t>(bitset_) != 0) {
+      bool has_bits_set =
+          (0 != _BitScanForward(&result, static_cast<uint32_t>(bitset_)));
+      if (!has_bits_set) {
+        return 0;
+      }
+      return result + 1;
+    } else {
+      bool has_bits_set =
+          (0 != _BitScanForward(&result, static_cast<uint32_t>(bitset_ >> 32)));
+      if (!has_bits_set) {
+        return 0; // neither half has a set bit: report "no bit set"
+      }
+      return result + 33;
+    }
+#else
+    return __builtin_ffsll(bitset_);
+#endif
+  }
+
+  friend bool operator==(bitset lhs, bitset rhs) noexcept {
+    return lhs.bitset_ == rhs.bitset_;
+  }
+
+  bitset_type bitset_{0};
+};
+
+inline bool operator!=(bitset lhs, bitset rhs) noexcept {
+  return !(lhs == rhs);
+}
+
+} // namespace c10::utils
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9e010daa58b3e172456412c6478bfb1006b3e3b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/C++17.h
@@ -0,0 +1,75 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#ifndef C10_UTIL_CPP17_H_
+#define C10_UTIL_CPP17_H_
+
+#include <c10/macros/Macros.h>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \
+    __GNUC__ < 9
+#error \
+    "You're trying to build PyTorch with a too old version of GCC. We need GCC 9 or later."
+#endif
+
+#if defined(__clang__) && __clang_major__ < 9
+#error \
+    "You're trying to build PyTorch with a too old version of Clang. We need Clang 9 or later."
+#endif
+
+#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \
+    (!defined(_MSC_VER) && __cplusplus < 201703L)
+#error You need C++17 to compile PyTorch
+#endif
+
+#if defined(_WIN32) && (defined(min) || defined(max))
+#error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
+#endif
+
+/*
+ * This header adds some polyfills with C++17 functionality
+ */
+
+namespace c10 {
+
+namespace guts {
+
+#if defined(__HIP__)
+
+// Implementation from http://en.cppreference.com/w/cpp/utility/apply (but
+// modified)
+// TODO This is an incomplete implementation of std::apply, not working for
+// member functions.
+namespace detail {
+template <class F, class Tuple, std::size_t... INDEX>
+C10_HOST_DEVICE constexpr auto apply_impl(
+    F&& f,
+    Tuple&& t,
+    std::index_sequence<INDEX...>) {
+  return std::forward<F>(f)(std::get<INDEX>(std::forward<Tuple>(t))...);
+}
+} // namespace detail
+
+template <class F, class Tuple>
+C10_HOST_DEVICE constexpr auto apply(F&& f, Tuple&& t) {
+  return detail::apply_impl(
+      std::forward<F>(f),
+      std::forward<Tuple>(t),
+      std::make_index_sequence<
+          std::tuple_size<std::remove_reference_t<Tuple>>::value>{});
+}
+
+#endif
+
+} // namespace guts
+
+} // namespace c10
+
+#endif // C10_UTIL_CPP17_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h
new file mode 100644
index 0000000000000000000000000000000000000000..0037755f64a8fce82ae816391559c2123e3ad1cf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/CallOnce.h
@@ -0,0 +1,75 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/C++17.h>
+
+#include <atomic>
+#include <functional>
+#include <mutex>
+#include <utility>
+
+namespace c10 {
+
+// Custom c10 call_once implementation to avoid the deadlock in std::call_once.
+// The implementation here is a simplified version of folly's and likely has a
+// much higher memory footprint.
+template <typename Flag, typename F, typename... Args>
+inline void call_once(Flag& flag, F&& f, Args&&... args) {
+  if (C10_LIKELY(flag.test_once())) {
+    return;
+  }
+  flag.call_once_slow(std::forward<F>(f), std::forward<Args>(args)...);
+}
+
+class once_flag {
+ public:
+#ifndef _WIN32
+  // running into build error on MSVC. Can't seem to get a repro locally so I'm
+  // just avoiding constexpr
+  //
+  //   C:/actions-runner/_work/pytorch/pytorch\c10/util/CallOnce.h(26): error:
+  //   defaulted default constructor cannot be constexpr because the
+  //   corresponding implicitly declared default constructor would not be
+  //   constexpr 1 error detected in the compilation of
+  //   "C:/actions-runner/_work/pytorch/pytorch/aten/src/ATen/cuda/cub.cu".
+  constexpr
+#endif
+      once_flag() noexcept = default;
+  once_flag(const once_flag&) = delete;
+  once_flag& operator=(const once_flag&) = delete;
+  once_flag(once_flag&&) = delete;
+  once_flag& operator=(once_flag&&) = delete;
+  ~once_flag() = default;
+  bool test_once() {
+    return init_.load(std::memory_order_acquire);
+  }
+
+ private:
+  template <typename Flag, typename F, typename... Args>
+  friend void call_once(Flag& flag, F&& f, Args&&... args);
+
+  template <typename F, typename... Args>
+  void call_once_slow(F&& f, Args&&... args) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    if (init_.load(std::memory_order_relaxed)) {
+      return;
+    }
+    std::invoke(std::forward<F>(f), std::forward<Args>(args)...);
+    init_.store(true, std::memory_order_release);
+  }
+
+  void reset_once() {
+    init_.store(false, std::memory_order_release);
+  }
+
+ private:
+  std::mutex mutex_;
+  std::atomic<bool> init_{false};
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h
new file mode 100644
index 0000000000000000000000000000000000000000..56dd979ce833087e264e6e8faef8563019fa3ea5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ConstexprCrc.h
@@ -0,0 +1,137 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/IdWrapper.h>
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace c10::util {
+
+namespace detail {
+// NOLINTNEXTLINE(*c-arrays*)
+constexpr uint64_t crc64_table[] = {
+    0x0000000000000000, 0x7ad870c830358979, 0xf5b0e190606b12f2,
+    0x8f689158505e9b8b, 0xc038e5739841b68f, 0xbae095bba8743ff6,
+    0x358804e3f82aa47d, 0x4f50742bc81f2d04, 0xab28ecb46814fe75,
+    0xd1f09c7c5821770c, 0x5e980d24087fec87, 0x24407dec384a65fe,
+    0x6b1009c7f05548fa, 0x11c8790fc060c183, 0x9ea0e857903e5a08,
+    0xe478989fa00bd371, 0x7d08ff3b88be6f81, 0x07d08ff3b88be6f8,
+    0x88b81eabe8d57d73, 0xf2606e63d8e0f40a, 0xbd301a4810ffd90e,
+    0xc7e86a8020ca5077, 0x4880fbd87094cbfc, 0x32588b1040a14285,
+    0xd620138fe0aa91f4, 0xacf86347d09f188d, 0x2390f21f80c18306,
+    0x594882d7b0f40a7f, 0x1618f6fc78eb277b, 0x6cc0863448deae02,
+    0xe3a8176c18803589, 0x997067a428b5bcf0, 0xfa11fe77117cdf02,
+    0x80c98ebf2149567b, 0x0fa11fe77117cdf0, 0x75796f2f41224489,
+    0x3a291b04893d698d, 0x40f16bccb908e0f4, 0xcf99fa94e9567b7f,
+    0xb5418a5cd963f206, 0x513912c379682177, 0x2be1620b495da80e,
+    0xa489f35319033385, 0xde51839b2936bafc, 0x9101f7b0e12997f8,
+    0xebd98778d11c1e81, 0x64b116208142850a, 0x1e6966e8b1770c73,
+    0x8719014c99c2b083, 0xfdc17184a9f739fa, 0x72a9e0dcf9a9a271,
+    0x08719014c99c2b08, 0x4721e43f0183060c, 0x3df994f731b68f75,
+    0xb29105af61e814fe, 0xc849756751dd9d87, 0x2c31edf8f1d64ef6,
+    0x56e99d30c1e3c78f, 0xd9810c6891bd5c04, 0xa3597ca0a188d57d,
+    0xec09088b6997f879, 0x96d1784359a27100, 0x19b9e91b09fcea8b,
+    0x636199d339c963f2, 0xdf7adabd7a6e2d6f, 0xa5a2aa754a5ba416,
+    0x2aca3b2d1a053f9d, 0x50124be52a30b6e4, 0x1f423fcee22f9be0,
+    0x659a4f06d21a1299, 0xeaf2de5e82448912, 0x902aae96b271006b,
+    0x74523609127ad31a, 0x0e8a46c1224f5a63, 0x81e2d7997211c1e8,
+    0xfb3aa75142244891, 0xb46ad37a8a3b6595, 0xceb2a3b2ba0eecec,
+    0x41da32eaea507767, 0x3b024222da65fe1e, 0xa2722586f2d042ee,
+    0xd8aa554ec2e5cb97, 0x57c2c41692bb501c, 0x2d1ab4dea28ed965,
+    0x624ac0f56a91f461, 0x1892b03d5aa47d18, 0x97fa21650afae693,
+    0xed2251ad3acf6fea, 0x095ac9329ac4bc9b, 0x7382b9faaaf135e2,
+    0xfcea28a2faafae69, 0x8632586aca9a2710, 0xc9622c4102850a14,
+    0xb3ba5c8932b0836d, 0x3cd2cdd162ee18e6, 0x460abd1952db919f,
+    0x256b24ca6b12f26d, 0x5fb354025b277b14, 0xd0dbc55a0b79e09f,
+    0xaa03b5923b4c69e6, 0xe553c1b9f35344e2, 0x9f8bb171c366cd9b,
+    0x10e3202993385610, 0x6a3b50e1a30ddf69, 0x8e43c87e03060c18,
+    0xf49bb8b633338561, 0x7bf329ee636d1eea, 0x012b592653589793,
+    0x4e7b2d0d9b47ba97, 0x34a35dc5ab7233ee, 0xbbcbcc9dfb2ca865,
+    0xc113bc55cb19211c, 0x5863dbf1e3ac9dec, 0x22bbab39d3991495,
+    0xadd33a6183c78f1e, 0xd70b4aa9b3f20667, 0x985b3e827bed2b63,
+    0xe2834e4a4bd8a21a, 0x6debdf121b863991, 0x1733afda2bb3b0e8,
+    0xf34b37458bb86399, 0x8993478dbb8deae0, 0x06fbd6d5ebd3716b,
+    0x7c23a61ddbe6f812, 0x3373d23613f9d516, 0x49aba2fe23cc5c6f,
+    0xc6c333a67392c7e4, 0xbc1b436e43a74e9d, 0x95ac9329ac4bc9b5,
+    0xef74e3e19c7e40cc, 0x601c72b9cc20db47, 0x1ac40271fc15523e,
+    0x5594765a340a7f3a, 0x2f4c0692043ff643, 0xa02497ca54616dc8,
+    0xdafce7026454e4b1, 0x3e847f9dc45f37c0, 0x445c0f55f46abeb9,
+    0xcb349e0da4342532, 0xb1eceec59401ac4b, 0xfebc9aee5c1e814f,
+    0x8464ea266c2b0836, 0x0b0c7b7e3c7593bd, 0x71d40bb60c401ac4,
+    0xe8a46c1224f5a634, 0x927c1cda14c02f4d, 0x1d148d82449eb4c6,
+    0x67ccfd4a74ab3dbf, 0x289c8961bcb410bb, 0x5244f9a98c8199c2,
+    0xdd2c68f1dcdf0249, 0xa7f41839ecea8b30, 0x438c80a64ce15841,
+    0x3954f06e7cd4d138, 0xb63c61362c8a4ab3, 0xcce411fe1cbfc3ca,
+    0x83b465d5d4a0eece, 0xf96c151de49567b7, 0x76048445b4cbfc3c,
+    0x0cdcf48d84fe7545, 0x6fbd6d5ebd3716b7, 0x15651d968d029fce,
+    0x9a0d8ccedd5c0445, 0xe0d5fc06ed698d3c, 0xaf85882d2576a038,
+    0xd55df8e515432941, 0x5a3569bd451db2ca, 0x20ed197575283bb3,
+    0xc49581ead523e8c2, 0xbe4df122e51661bb, 0x3125607ab548fa30,
+    0x4bfd10b2857d7349, 0x04ad64994d625e4d, 0x7e7514517d57d734,
+    0xf11d85092d094cbf, 0x8bc5f5c11d3cc5c6, 0x12b5926535897936,
+    0x686de2ad05bcf04f, 0xe70573f555e26bc4, 0x9ddd033d65d7e2bd,
+    0xd28d7716adc8cfb9, 0xa85507de9dfd46c0, 0x273d9686cda3dd4b,
+    0x5de5e64efd965432, 0xb99d7ed15d9d8743, 0xc3450e196da80e3a,
+    0x4c2d9f413df695b1, 0x36f5ef890dc31cc8, 0x79a59ba2c5dc31cc,
+    0x037deb6af5e9b8b5, 0x8c157a32a5b7233e, 0xf6cd0afa9582aa47,
+    0x4ad64994d625e4da, 0x300e395ce6106da3, 0xbf66a804b64ef628,
+    0xc5bed8cc867b7f51, 0x8aeeace74e645255, 0xf036dc2f7e51db2c,
+    0x7f5e4d772e0f40a7, 0x05863dbf1e3ac9de, 0xe1fea520be311aaf,
+    0x9b26d5e88e0493d6, 0x144e44b0de5a085d, 0x6e963478ee6f8124,
+    0x21c640532670ac20, 0x5b1e309b16452559, 0xd476a1c3461bbed2,
+    0xaeaed10b762e37ab, 0x37deb6af5e9b8b5b, 0x4d06c6676eae0222,
+    0xc26e573f3ef099a9, 0xb8b627f70ec510d0, 0xf7e653dcc6da3dd4,
+    0x8d3e2314f6efb4ad, 0x0256b24ca6b12f26, 0x788ec2849684a65f,
+    0x9cf65a1b368f752e, 0xe62e2ad306bafc57, 0x6946bb8b56e467dc,
+    0x139ecb4366d1eea5, 0x5ccebf68aecec3a1, 0x2616cfa09efb4ad8,
+    0xa97e5ef8cea5d153, 0xd3a62e30fe90582a, 0xb0c7b7e3c7593bd8,
+    0xca1fc72bf76cb2a1, 0x45775673a732292a, 0x3faf26bb9707a053,
+    0x70ff52905f188d57, 0x0a2722586f2d042e, 0x854fb3003f739fa5,
+    0xff97c3c80f4616dc, 0x1bef5b57af4dc5ad, 0x61372b9f9f784cd4,
+    0xee5fbac7cf26d75f, 0x9487ca0fff135e26, 0xdbd7be24370c7322,
+    0xa10fceec0739fa5b, 0x2e675fb4576761d0, 0x54bf2f7c6752e8a9,
+    0xcdcf48d84fe75459, 0xb71738107fd2dd20, 0x387fa9482f8c46ab,
+    0x42a7d9801fb9cfd2, 0x0df7adabd7a6e2d6, 0x772fdd63e7936baf,
+    0xf8474c3bb7cdf024, 0x829f3cf387f8795d, 0x66e7a46c27f3aa2c,
+    0x1c3fd4a417c62355, 0x935745fc4798b8de, 0xe98f353477ad31a7,
+    0xa6df411fbfb21ca3, 0xdc0731d78f8795da, 0x536fa08fdfd90e51,
+    0x29b7d047efec8728,
+};
+
+inline constexpr uint64_t crc64impl( // Table-driven update: one crc64_table lookup per input byte.
+    uint64_t accumulator, // running checksum; returned unchanged when size == 0
+    const char* data, // bytes to fold in; only data[0..size) is read
+    size_t size) {
+  for (size_t i = 0; i < size; ++i) {
+    accumulator =
+        crc64_table[(accumulator ^ data[i]) & 0xFF] ^ (accumulator >> 8); // & 0xFF keeps the index within 0..255
+  }
+  return accumulator;
+}
+} // namespace detail
+
+struct crc64_t final : IdWrapper<crc64_t, uint64_t> { // Distinct type wrapping a 64-bit checksum value.
+  constexpr crc64_t(uint64_t checksum) : IdWrapper(checksum) {} // not marked explicit, so a uint64_t converts implicitly
+  constexpr uint64_t checksum() const { // Returns the wrapped value via underlyingId().
+    return this->underlyingId();
+  }
+};
+
+// CRC64 with Jones coefficients and an init value of 0.
+inline constexpr crc64_t crc64(const char* str, size_t size) { // Checksums exactly `size` bytes starting at `str`.
+  return crc64_t{detail::crc64impl(0, str, size)};
+}
+
+inline constexpr crc64_t crc64(std::string_view str) { // Convenience overload: forwards str.data() and str.size().
+  return crc64(str.data(), str.size());
+}
+} // namespace c10::util
+
+// Allow usage of crc64_t in std::unordered_set
+C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::crc64_t)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccd1ac50400d3dcdc160c42e8745bac7139c8217
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Deprecated.h
@@ -0,0 +1,7 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <torch/headeronly/util/Deprecated.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h
new file mode 100644
index 0000000000000000000000000000000000000000..682b8f364a2094c0feec2b6c19a8e2e54d296ee1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DimVector.h
@@ -0,0 +1,22 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/SymInt.h>
+#include <c10/core/impl/SizesAndStrides.h>
+#include <c10/util/SmallVector.h>
+#include <cstddef>
+#include <cstdint>
+
+namespace c10 {
+
+constexpr size_t kDimVectorStaticSize = C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE; // SmallVector size parameter shared by both aliases below
+
+/// A container for sizes or strides
+using DimVector = SmallVector<int64_t, kDimVectorStaticSize>;
+using SymDimVector = SmallVector<c10::SymInt, kDimVectorStaticSize>; // same size parameter, with c10::SymInt elements
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h
new file mode 100644
index 0000000000000000000000000000000000000000..37e0af4319435c223442cc52d2b34d8e12e2715b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/DynamicCounter.h
@@ -0,0 +1,54 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string_view>
+
+#include <c10/macros/Macros.h>
+
+namespace c10::monitor {
+
+class C10_API DynamicCounter {
+ public:
+  using Callback = std::function<int64_t()>; // takes no arguments; its int64_t result is what a query reports
+
+  // Creates a dynamic counter that can be queried at any point in time by
+  // multiple backends. Only one counter with a given key can exist at any point
+  // in time.
+  //
+  // The callback is invoked every time the counter is queried.
+  // The callback must be thread-safe.
+  // The callback must not throw.
+  // The callback must not block.
+  DynamicCounter(std::string_view key, Callback getCounterCallback);
+
+  // Unregisters the callback.
+  // Waits for all ongoing callback invocations to finish.
+  ~DynamicCounter();
+
+ private:
+  struct Guard; // only forward-declared here; the definition is not in this header
+  std::unique_ptr<Guard> guard_;
+};
+
+namespace detail {
+class DynamicCounterBackendIf { // Abstract interface: both counter operations are pure virtual.
+ public:
+  virtual ~DynamicCounterBackendIf() = default;
+
+  virtual void registerCounter( // receives the counter's key and the callback to query it
+      std::string_view key,
+      DynamicCounter::Callback getCounterCallback) = 0;
+  // MUST wait for all ongoing callback invocations to finish
+  virtual void unregisterCounter(std::string_view key) = 0;
+};
+
+void C10_API registerDynamicCounterBackend( // Declaration only; the backend is passed by value as a unique_ptr, so ownership moves to the callee.
+    std::unique_ptr<DynamicCounterBackendIf> /*backend*/);
+} // namespace detail
+} // namespace c10::monitor
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6b4a7fa25013fa413504a69fb177b0e1d6febcc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Exception.h
@@ -0,0 +1,875 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef C10_UTIL_EXCEPTION_H_
+#define C10_UTIL_EXCEPTION_H_
+
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Backtrace.h>
+#include <c10/util/Lazy.h>
+#include <c10/util/StringUtil.h>
+
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <variant>
+#include <vector>
+
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+#define __func__ __FUNCTION__
+#endif
+
+namespace c10 {
+
+/// The primary ATen error class.
+/// Provides a complete error message with source location information via
+/// `what()`, and a more concise message via `what_without_backtrace()`.
+/// Don't throw this directly; use TORCH_CHECK/TORCH_INTERNAL_ASSERT instead.
+///
+/// NB: c10::Error is handled specially by the default torch to suppress the
+/// backtrace, see torch/csrc/Exceptions.h
+class C10_API Error : public std::exception {
+ private:
+  // The actual error message.
+  std::string msg_;
+
+  // Context for the message (in order of decreasing specificity).  Context will
+  // be automatically formatted appropriately, so it is not necessary to add
+  // extra leading/trailing newlines to strings inside this vector
+  std::vector<std::string> context_;
+
+  // The C++ backtrace at the point when this exception was raised.  This
+  // may be empty if there is no valid backtrace.  (We don't use optional
+  // here to reduce the dependencies this file has.)
+  Backtrace backtrace_;
+
+  // String storage for the formatted messages: we need fields for the strings
+  // so that we can return a const char* (as the signature of std::exception
+  // requires).  NOTE(review): this comment used to say they are derived from
+  // `msg_stack_` and backtrace_, but no `msg_stack_` member is declared here;
+  // presumably msg_ and context_ are meant -- confirm in the implementation.
+  mutable OptimisticLazy<std::string> what_;
+  std::string what_without_backtrace_;
+
+  // This is a little debugging trick: you can stash a relevant pointer
+  // in caller, and then when you catch the exception, you can compare
+  // against pointers you have on hand to get more information about
+  // where the exception came from.  In Caffe2, this is used to figure
+  // out which operator raised an exception.
+  const void* caller_;
+
+ public:
+  // PyTorch-style Error constructor.  NB: the implementation of this
+  // is actually in Logging.cpp
+  Error(SourceLocation source_location, std::string msg);
+
+  // Caffe2-style error message
+  Error(
+      const char* file,
+      const uint32_t line,
+      const char* condition,
+      const std::string& msg,
+      Backtrace backtrace,
+      const void* caller = nullptr);
+
+  // Base constructor
+  Error(
+      std::string msg,
+      Backtrace backtrace = nullptr,
+      const void* caller = nullptr);
+
+  // Add some new context to the message stack.  The last added context
+  // will be formatted at the end of the context list upon printing.
+  // WARNING: This method is O(n) in the size of the stack, so don't go
+  // wild adding a ridiculous amount of context to error messages.
+  void add_context(std::string msg);
+
+  const std::string& msg() const {
+    return msg_;
+  }
+
+  const std::vector<std::string>& context() const {
+    return context_;
+  }
+
+  const Backtrace& backtrace() const;
+
+  /// Returns the complete error message, including the source location.
+  /// The returned pointer is invalidated if you call add_context() on
+  /// this object.
+  const char* what() const noexcept override;
+
+  const void* caller() const noexcept {
+    return caller_;
+  }
+
+  /// Returns only the error message string, without source location.
+  /// The returned pointer is invalidated if you call add_context() on
+  /// this object.
+  virtual const char* what_without_backtrace() const noexcept { // virtual: ErrorAlwaysShowCppStacktrace overrides it to return what()
+    return what_without_backtrace_.c_str();
+  }
+
+ private:
+  void refresh_what();
+  std::string compute_what(bool include_backtrace) const;
+};
+
+class C10_API Warning { // Describes one warning: its category, source location, message, and verbatim flag.
+ public:
+  class C10_API UserWarning{}; // empty type used as a warning_variant_t alternative
+  class C10_API DeprecationWarning{}; // empty type used as a warning_variant_t alternative
+
+  using warning_variant_t = std::variant<UserWarning, DeprecationWarning>;
+
+  Warning( // message supplied as std::string
+      warning_variant_t type,
+      const SourceLocation& source_location,
+      std::string msg,
+      bool verbatim);
+
+  Warning( // message supplied as a C string
+      warning_variant_t type,
+      SourceLocation source_location,
+      const char* msg,
+      bool verbatim);
+
+  Warning( // message supplied as CompileTimeEmptyString
+      warning_variant_t type,
+      SourceLocation source_location,
+      ::c10::detail::CompileTimeEmptyString msg,
+      bool verbatim);
+
+  // Getters for members
+  warning_variant_t type() const;
+  const SourceLocation& source_location() const;
+  const std::string& msg() const;
+  bool verbatim() const;
+
+ private:
+  // The type of warning
+  warning_variant_t type_;
+
+  // Where the warning happened.
+  SourceLocation source_location_;
+
+  // The actual warning message.
+  std::string msg_;
+
+  // See note: [Verbatim Warnings]
+  bool verbatim_;
+};
+
+using UserWarning = Warning::UserWarning;
+using DeprecationWarning = Warning::DeprecationWarning;
+
+// Issue a warning with a given message. Dispatched to the current
+// warning handler.
+void C10_API warn(const Warning& warning);
+
+class C10_API WarningHandler {
+ public:
+  virtual ~WarningHandler() = default;
+  /// The default warning handler. Prints the message to stderr.
+  virtual void process(const Warning& warning);
+};
+
+namespace WarningUtils {
+
+// Note: [Verbatim Warnings]
+// Warnings originating in C++ code can appear out-of-place to Python users:
+// a user runs a line in Python, but the warning references a line in C++.
+// Some parts of PyTorch, like the JIT, are cognizant of this mismatch
+// and take care to map warnings back to the user's program, but most
+// of PyTorch simply throws a context-free warning. To allow warning
+// handlers to add context where appropriate, warn takes the
+// "verbatim" flag. When this is false a warning handler might append
+// the C++ warning to a Python warning message that relates the warning
+// back to the user's program. Callers who have already accounted for
+// context in their warnings should set verbatim to true so their warnings
+// appear without modification.
+
+/// Sets the global warning handler. This is not thread-safe, so it should
+/// generally be called once during initialization or while holding the GIL
+/// for programs that use python.
+/// User is responsible for keeping the WarningHandler alive until
+/// it is not needed.
+C10_API void set_warning_handler(WarningHandler* handler) noexcept(true);
+/// Gets the global warning handler.
+C10_API WarningHandler* get_warning_handler() noexcept(true);
+
+class C10_API WarningHandlerGuard { // Installs a handler on construction and puts the previous one back on destruction.
+  WarningHandler* prev_handler_; // handler returned by get_warning_handler() when the guard was created
+
+ public:
+  WarningHandlerGuard(WarningHandler* new_handler)
+      : prev_handler_(c10::WarningUtils::get_warning_handler()) {
+    c10::WarningUtils::set_warning_handler(new_handler);
+  }
+  WarningHandlerGuard(WarningHandlerGuard&& other) = delete;
+  WarningHandlerGuard(const WarningHandlerGuard&) = delete;
+  WarningHandlerGuard& operator=(const WarningHandlerGuard&) = delete;
+  WarningHandlerGuard& operator=(WarningHandlerGuard&&) = delete;
+  ~WarningHandlerGuard() {
+    c10::WarningUtils::set_warning_handler(prev_handler_); // restore the handler saved in the constructor
+  }
+};
+
+/// The TORCH_WARN_ONCE macro is difficult to test for. Use
+/// set_warnAlways(true) to turn it into TORCH_WARN, which can be
+/// tested for more easily.
+C10_API void set_warnAlways(bool /*setting*/) noexcept(true);
+C10_API bool get_warnAlways() noexcept(true);
+
+// A RAII guard that sets warn_always (not thread-local) on
+// construction, and sets it back to the original value upon destruction.
+struct C10_API WarnAlways {
+ public:
+  explicit WarnAlways(bool setting = true);
+  ~WarnAlways();
+
+ private:
+  bool prev_setting;
+};
+
+} // namespace WarningUtils
+
+// Like Error, but we always report the C++ backtrace, instead of only
+// reporting when TORCH_SHOW_CPP_STACKTRACES
+class C10_API ErrorAlwaysShowCppStacktrace : public Error {
+  using Error::Error;
+  const char* what_without_backtrace() const noexcept override {
+    return what();
+  }
+};
+
+// Used in ATen for out-of-bound indices that can reasonably only be detected
+// lazily inside a kernel (See: advanced indexing).  These turn into
+// IndexError when they cross to Python.
+class C10_API IndexError : public Error {
+  using Error::Error;
+};
+
+// Used in ATen for invalid values.  These turn into
+// ValueError when they cross to Python.
+class C10_API ValueError : public Error {
+  using Error::Error;
+};
+
+// Used in ATen for invalid types.  These turn into
+// TypeError when they cross to Python.
+class C10_API TypeError : public Error {
+  using Error::Error;
+};
+
+// Used in ATen for functionality that is not implemented.  These turn into
+// NotImplementedError when they cross to Python.
+class C10_API NotImplementedError : public Error {
+  using Error::Error;
+};
+
+// Used in ATen for buffer-related errors, e.g. trying to create a DLPack of
+// an unsupported device.  These turn into BufferError when they cross to
+// Python.
+class C10_API BufferError : public Error {
+  using Error::Error;
+};
+
+// Used in ATen for non finite indices.  These turn into
+// ExitException when they cross to Python.
+class C10_API EnforceFiniteError : public Error {
+  using Error::Error;
+};
+
+// Used in Onnxifi backend lowering.  These turn into
+// ExitException when they cross to Python.
+class C10_API OnnxfiBackendSystemError : public Error {
+  using Error::Error;
+};
+
+// Used for numerical errors from the linalg module. These
+// turn into LinAlgError when they cross into Python.
+class C10_API LinAlgError : public Error {
+  using Error::Error;
+};
+
+class C10_API OutOfMemoryError : public Error {
+  using Error::Error;
+};
+
+// Used for handling syntactic errors in input arguments.
+// These turn into SyntaxError when they cross into Python.
+class C10_API SyntaxError : public Error {
+  using Error::Error;
+};
+
+// Raised when an accelerator API call hits an error.
+// These turn into AcceleratorError when they cross into Python.
+class C10_API AcceleratorError : public Error {
+  int32_t error_code; // numeric code given to the constructor; read back with get_error_code()
+
+ public:
+  AcceleratorError(SourceLocation loc, int32_t code, const std::string& msg) // forwards loc and msg to Error and stores code
+      : Error(loc, msg), error_code(code) {}
+  int32_t get_error_code() const {
+    return error_code;
+  }
+};
+
+// Base error type for all distributed errors.
+// These turn into DistError when they cross into Python.
+class C10_API DistError : public Error {
+  using Error::Error;
+};
+
+// Used for collective communication library errors from the distributed module.
+// These turn into DistBackendError when they cross into Python.
+class C10_API DistBackendError : public DistError {
+  using DistError::DistError;
+};
+
+// Used for errors originating from the store.
+// These turn into DistStoreError when they cross into Python.
+class C10_API DistStoreError : public DistError {
+  using DistError::DistError;
+};
+
+// Used for errors originating from the TCP/IP stack and not from collective
+// libraries. These turn into DistNetworkError when they cross into Python.
+class C10_API DistNetworkError : public DistError {
+  using DistError::DistError;
+};
+
+// Raised when a queue is empty and a non-blocking pop is called.
+// Translated to torch.distributed.QueueEmptyError in Python
+class C10_API DistQueueEmptyError : public DistStoreError {
+  using DistStoreError::DistStoreError;
+};
+
+// A utility function to return an exception std::string by prepending its
+// exception type before its what() content
+C10_API std::string GetExceptionString(const std::exception& e);
+
+} // namespace c10
+
+// Private helper macro for implementing TORCH_INTERNAL_ASSERT and TORCH_CHECK
+//
+// Note: In the debug build with MSVC, __LINE__ might be of long type (a.k.a.
+// int32_t), which is different from the definition of `SourceLocation` that
+// requires unsigned int (a.k.a. uint32_t) and may cause a compile error with
+// the message: "error C2397: conversion from 'long' to 'uint32_t' requires a
+// narrowing conversion". Here the static cast is used to pass the build. If
+// this is used inside a lambda, __func__ expands to operator(), which isn't
+// very useful, but that is hard to fix in a macro, so we suppress the warning.
+#define C10_THROW_ERROR(err_type, msg) \
+  throw ::c10::err_type(               \
+      {__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, msg)
+
+#define C10_BUILD_ERROR(err_type, msg) \
+  ::c10::err_type({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, msg)
+
+// Private helper macro to work around MSVC's misexpansion of nested macro
+// invocations involving __VA_ARGS__.  See
+// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly
+#define C10_EXPAND_MSVC_WORKAROUND(x) x
+
+#include <torch/headeronly/util/Exception.h>
+
+// ----------------------------------------------------------------------------
+// Error reporting macros
+// ----------------------------------------------------------------------------
+
+#ifdef STRIP_ERROR_MESSAGES
+#define TORCH_RETHROW(e, ...)                       \
+  do {                                              \
+    (void)e; /* Suppress unused variable warning */ \
+    throw;                                          \
+  } while (false)
+#else
+#define TORCH_RETHROW(e, ...)               \
+  do {                                      \
+    e.add_context(::c10::str(__VA_ARGS__)); \
+    throw;                                  \
+  } while (false)
+#endif
+
+// A utility macro to provide assert()-like functionality; that is, enforcement
+// of internal invariants in code.  It supports an arbitrary number of extra
+// arguments (evaluated only on failure), which will be printed in the assert
+// failure message using operator<< (this is useful to print some variables
+// which may be useful for debugging.)
+//
+// Usage:
+//    TORCH_INTERNAL_ASSERT(should_be_true);
+//    TORCH_INTERNAL_ASSERT(x == 0, "x = ", x);
+//
+// Assuming no bugs in PyTorch, the conditions tested by this macro should
+// always be true; e.g., it should be possible to disable all of these
+// conditions without changing observable user behavior.  If you would like to
+// do error reporting for user input, please use TORCH_CHECK instead.
+//
+// NOTE: It is SAFE to use this macro in production code; on failure, this
+// simply raises an exception, it does NOT unceremoniously quit the process
+// (unlike assert()).
+//
+#ifdef STRIP_ERROR_MESSAGES
+#define TORCH_INTERNAL_ASSERT(cond, ...)                              \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {                               \
+    ::c10::detail::torchCheckFail(                                    \
+        __func__,                                                     \
+        __FILE__,                                                     \
+        static_cast<uint32_t>(__LINE__),                              \
+        #cond " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__)); \
+  }
+#else
+// It would be nice if we could build a combined string literal out of
+// the TORCH_INTERNAL_ASSERT prefix and a user-provided string literal
+// as the first argument, but there doesn't seem to be any good way to
+// do that while still supporting having a first argument that isn't a
+// string literal.
+#define TORCH_INTERNAL_ASSERT(cond, ...)                                         \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {                                          \
+    ::c10::detail::torchInternalAssertFail(                                      \
+        __func__,                                                                \
+        __FILE__,                                                                \
+        static_cast<uint32_t>(__LINE__),                                         \
+        #cond                                                                    \
+        " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \
+            __LINE__) ", please report a bug to PyTorch. ",                      \
+        c10::str(__VA_ARGS__));                                                  \
+  }
+#endif
+
+// A utility macro to make it easier to test for error conditions from user
+// input.  Like TORCH_INTERNAL_ASSERT, it supports an arbitrary number of extra
+// arguments (evaluated only on failure), which will be printed in the error
+// message using operator<< (e.g., you can pass any object which has
+// operator<< defined.  Most objects in PyTorch have these definitions!)
+//
+// Usage:
+//    TORCH_CHECK(should_be_true); // A default error message will be provided
+//                                 // in this case; but we recommend writing an
+//                                 // explicit error message, as it is more
+//                                 // user friendly.
+//    TORCH_CHECK(x == 0, "Expected x to be 0, but got ", x);
+//
+// On failure, this macro will raise an exception.  If this exception propagates
+// to Python, it will convert into a Python RuntimeError.
+//
+// NOTE: It is SAFE to use this macro in production code; on failure, this
+// simply raises an exception, it does NOT unceremoniously quit the process
+// (unlike CHECK() from glog.)
+//
+#define TORCH_CHECK_WITH(error_t, cond, ...) \
+  TORCH_CHECK_WITH_MSG(error_t, cond, "", __VA_ARGS__)
+
+#ifdef STRIP_ERROR_MESSAGES
+#define TORCH_CHECK_MSG(cond, type, ...) \
+  (#cond #type " CHECK FAILED at " C10_STRINGIZE(__FILE__))
+#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...)                \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {                               \
+    C10_THROW_ERROR(Error, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \
+  }
+#else
+
+namespace c10::detail {
+template <typename... Args>
+auto torchCheckMsgImpl(const char* /*msg*/, const Args&... args) { // General case: the default msg is ignored and the user args go to ::c10::str.
+  return ::c10::str(args...);
+}
+inline C10_API const char* torchCheckMsgImpl(const char* msg) { // No user arguments: the default message is returned as-is.
+  return msg;
+}
+// If there is just 1 user-provided C-string argument, use it.
+inline C10_API const char* torchCheckMsgImpl(
+    const char* /*msg*/,
+    const char* args) {
+  return args;
+}
+} // namespace c10::detail
+
+#define TORCH_CHECK_MSG(cond, type, ...)                   \
+  (::c10::detail::torchCheckMsgImpl(                       \
+      "Expected " #cond                                    \
+      " to be true, but got false.  "                      \
+      "(Could this error message be improved?  If so, "    \
+      "please report an enhancement request to PyTorch.)", \
+      ##__VA_ARGS__))
+#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...)                  \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {                                 \
+    C10_THROW_ERROR(error_t, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \
+  }
+#endif
+
+namespace c10::detail {
+
+[[noreturn]] C10_API void torchCheckFail( // overload taking the message as std::string
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const std::string& msg);
+[[noreturn]] C10_API void torchCheckFail( // overload taking the message as a C string
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg);
+
+// The c10::str() call that creates userMsg can have 1 of 3 return
+// types depending on the number and types of arguments passed to
+// TORCH_INTERNAL_ASSERT.  0 arguments will get a
+// CompileTimeEmptyString, 1 const char * will be passed straight
+// through, and anything else will get converted to std::string.
+[[noreturn]] C10_API void torchInternalAssertFail(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* condMsg,
+    const char* userMsg);
+[[noreturn]] inline C10_API void torchInternalAssertFail( // No user message: forwards condMsg alone to torchCheckFail.
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* condMsg,
+    ::c10::detail::CompileTimeEmptyString /*userMsg*/) {
+  torchCheckFail(func, file, line, condMsg);
+}
+[[noreturn]] C10_API void torchInternalAssertFail(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* condMsg,
+    const std::string& userMsg);
+
+} // namespace c10::detail
+
+#ifdef STANDALONE_TORCH_HEADER
+
+// TORCH_CHECK throws std::runtime_error instead of c10::Error which is
+// useful when certain headers are used in a libtorch-independent way,
+// e.g. when Vectorized<T> is used in AOTInductor generated code.
+#ifdef STRIP_ERROR_MESSAGES
+#define TORCH_CHECK(cond, ...)                \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {       \
+    throw std::runtime_error(TORCH_CHECK_MSG( \
+        cond,                                 \
+        "",                                   \
+        __func__,                             \
+        ", ",                                 \
+        __FILE__,                             \
+        ":",                                  \
+        __LINE__,                             \
+        ", ",                                 \
+        __VA_ARGS__));                        \
+  }
+#else
+#define TORCH_CHECK(cond, ...)                \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {       \
+    throw std::runtime_error(TORCH_CHECK_MSG( \
+        cond,                                 \
+        "",                                   \
+        __func__,                             \
+        ", ",                                 \
+        __FILE__,                             \
+        ":",                                  \
+        __LINE__,                             \
+        ", ",                                 \
+        ##__VA_ARGS__));                      \
+  }
+#endif
+
+#else
+
+#ifdef STRIP_ERROR_MESSAGES
+#define TORCH_CHECK(cond, ...)                   \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {          \
+    ::c10::detail::torchCheckFail(               \
+        __func__,                                \
+        __FILE__,                                \
+        static_cast<uint32_t>(__LINE__),         \
+        TORCH_CHECK_MSG(cond, "", __VA_ARGS__)); \
+  }
+#else
+#define TORCH_CHECK(cond, ...)                     \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {            \
+    ::c10::detail::torchCheckFail(                 \
+        __func__,                                  \
+        __FILE__,                                  \
+        static_cast<uint32_t>(__LINE__),           \
+        TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__)); \
+  }
+#endif
+
+#endif
+
+// A utility macro that does what `TORCH_CHECK` does, except that it expands
+// to nothing when __CUDACC__ or __HIPCC__ is defined. Intended for code shared
+// between host and device code as an alternative to `TORCH_CHECK`.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...)
+#else
+#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...) TORCH_CHECK(cond, ##__VA_ARGS__)
+#endif
+
+// Debug only version of TORCH_INTERNAL_ASSERT. This macro only checks in debug
+// build, and does nothing in release build.  It is appropriate to use
+// in situations where you want to add an assert to a hotpath, but it is
+// too expensive to run this assert on production builds.
+#ifdef NDEBUG
+// Optimized version - generates no code.
+#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) \
+  while (false)                               \
+  C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__))
+#else
+#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) \
+  C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__))
+#endif
+
+// TODO: We're going to get a lot of similar looking string literals
+// this way; check if this actually affects binary size.
+
+// Like TORCH_CHECK, but raises LinAlgError instead of Error.
+#define TORCH_CHECK_LINALG(cond, ...) \
+  TORCH_CHECK_WITH_MSG(LinAlgError, cond, "LINALG", __VA_ARGS__)
+
+// Like TORCH_CHECK, but raises IndexErrors instead of Errors.
+#define TORCH_CHECK_INDEX(cond, ...) \
+  TORCH_CHECK_WITH_MSG(IndexError, cond, "INDEX", __VA_ARGS__)
+
+// Like TORCH_CHECK, but raises ValueErrors instead of Errors.
+#define TORCH_CHECK_VALUE(cond, ...) \
+  TORCH_CHECK_WITH_MSG(ValueError, cond, "VALUE", __VA_ARGS__)
+
+// Like TORCH_CHECK, but raises TypeErrors instead of Errors.
+#define TORCH_CHECK_TYPE(cond, ...) \
+  TORCH_CHECK_WITH_MSG(TypeError, cond, "TYPE", __VA_ARGS__)
+
+// Like TORCH_CHECK, but raises NotImplementedErrors instead of Errors.
+#define TORCH_CHECK_NOT_IMPLEMENTED(cond, ...) \
+  TORCH_CHECK_WITH_MSG(NotImplementedError, cond, "TYPE", __VA_ARGS__)
+
+// Like TORCH_CHECK, but raises BufferError instead of Errors.
+#define TORCH_CHECK_BUFFER(cond, ...) \
+  TORCH_CHECK_WITH_MSG(BufferError, cond, "TYPE", __VA_ARGS__)
+
+#define TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE(cond, ...) \
+  TORCH_CHECK_WITH_MSG(                                   \
+      ErrorAlwaysShowCppStacktrace, cond, "TYPE", ##__VA_ARGS__)
+
+#ifdef STRIP_ERROR_MESSAGES
+#define WARNING_MESSAGE_STRING(...) \
+  ::c10::detail::CompileTimeEmptyString {}
+#else
+#define WARNING_MESSAGE_STRING(...) ::c10::str(__VA_ARGS__)
+#endif
+
+// Report a warning to the user.  Accepts an arbitrary number of extra
+// arguments which are concatenated into the warning message using operator<<
+//
+#ifdef DISABLE_WARN
+#define _TORCH_WARN_WITH(...) ((void)0);
+#else
+#define _TORCH_WARN_WITH(warning_t, ...)                     \
+  ::c10::warn(::c10::Warning(                                \
+      warning_t(),                                           \
+      {__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, \
+      WARNING_MESSAGE_STRING(__VA_ARGS__),                   \
+      false));
+#endif
+
+#define TORCH_WARN(...) _TORCH_WARN_WITH(::c10::UserWarning, __VA_ARGS__);
+
+#define TORCH_WARN_DEPRECATION(...) \
+  _TORCH_WARN_WITH(::c10::DeprecationWarning, __VA_ARGS__);
+
+// Report a warning to the user only once.  Accepts an arbitrary number of extra
+// arguments which are concatenated into the warning message using operator<<
+//
+#define _TORCH_WARN_ONCE(...)                                \
+  [[maybe_unused]] static const auto C10_ANONYMOUS_VARIABLE( \
+      torch_warn_once_) = [&] {                              \
+    TORCH_WARN(__VA_ARGS__);                                 \
+    return true;                                             \
+  }()
+
+#ifdef DISABLE_WARN
+#define TORCH_WARN_ONCE(...) ((void)0);
+#else
+#define TORCH_WARN_ONCE(...)                   \
+  if (::c10::WarningUtils::get_warnAlways()) { \
+    TORCH_WARN(__VA_ARGS__);                   \
+  } else {                                     \
+    _TORCH_WARN_ONCE(__VA_ARGS__);             \
+  }
+#endif
+
+// Report an error with a specific argument
+// NOTE: using the argument name in TORCH_CHECK's message is preferred
+#define TORCH_CHECK_ARG(cond, argN, ...) \
+  TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)
+
+#ifndef FATAL_IF
+#ifdef C10_USE_GLOG
+#define FATAL_IF(condition)                                              \
+  condition ? (void)0                                                    \
+            : ::c10::LoggerVoidify() &                                   \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
+              .stream()
+#else
+#define FATAL_IF(condition)            \
+  condition ? (void)0                  \
+            : ::c10::LoggerVoidify() & \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
+#endif
+#endif
+
+#ifndef NON_FATAL_IF
+#ifdef C10_USE_GLOG
+#define NON_FATAL_IF(condition)                                \
+  condition ? (void)0                                          \
+            : ::c10::LoggerVoidify() &                         \
+          ::c10::MessageLogger(                                \
+              __FILE__, __LINE__, ::google::GLOG_FATAL, false) \
+              .stream()
+#else
+#define NON_FATAL_IF(condition)                                              \
+  condition ? (void)0                                                        \
+            : ::c10::LoggerVoidify() &                                       \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
+              .stream()
+#endif
+#endif
+
+// Binary comparison check macros
+#define TORCH_CHECK_OP(val1, val2, op)                                      \
+  NON_FATAL_IF(((val1)op(val2)))                                            \
+      << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
+      << (val2) << "). "
+
+#define TORCH_DCHECK_OP(val1, val2, op)                                       \
+  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
+                             << (val1) << " vs. " << (val2) << "). "
+
+#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+
+// Debug versions of TORCH_CHECK_OP macros
+#ifndef NDEBUG
+#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
+#else // !NDEBUG
+// Optimized versions - generate no code
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >)
+#endif // NDEBUG
+
+// Null pointer check macro
+#define TORCH_CHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
+
+#ifndef NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
+#else // !NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  TORCH_CHECK_NOTNULL(val)
+#endif // NDEBUG
+
+// ----------------------------------------------------------------------------
+// Deprecated macros
+// ----------------------------------------------------------------------------
+
+namespace c10::detail {
+
+/*
+// Deprecation disabled until we fix sites in our codebase
+[[deprecated("AT_ERROR(msg) is deprecated, use TORCH_CHECK(false, msg)
+instead.")]]
+*/
+inline void deprecated_AT_ERROR() {}
+
+/*
+// Deprecation disabled until we fix sites in our codebase
+[[deprecated("AT_ASSERT is deprecated, if you mean to indicate an
+internal invariant failure, use " \
+                       "TORCH_INTERNAL_ASSERT instead; if you mean to do user
+error checking, use " \ "TORCH_CHECK.  See
+https://github.com/pytorch/pytorch/issues/20287 for more details.")]]
+*/
+inline void deprecated_AT_ASSERT() {}
+
+/*
+// Deprecation disabled until we fix sites in our codebase
+[[deprecated("AT_ASSERTM is deprecated, if you mean to indicate an
+internal invariant failure, use " \
+                       "TORCH_INTERNAL_ASSERT instead; if you mean to do user
+error checking, use " \ "TORCH_CHECK.  See
+https://github.com/pytorch/pytorch/issues/20287 for more details.")]]
+*/
+inline void deprecated_AT_ASSERTM() {}
+
+} // namespace c10::detail
+
+// Deprecated alias; this alias was deprecated because people kept mistakenly
+// using it for user error checking.  Use TORCH_INTERNAL_ASSERT or TORCH_CHECK
+// instead. See https://github.com/pytorch/pytorch/issues/20287 for more
+// details.
+#define AT_ASSERT(...)                                              \
+  do {                                                              \
+    ::c10::detail::deprecated_AT_ASSERT();                          \
+    C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)); \
+  } while (false)
+
+// Deprecated alias, like AT_ASSERT.  The new TORCH_INTERNAL_ASSERT macro
+// supports both 0-ary and variadic calls, so having a separate
+// message-accepting macro is not necessary.
+//
+// NB: we MUST include cond explicitly here, as MSVC will miscompile the macro
+// expansion, shunting all of __VA_ARGS__ to cond.  An alternate workaround
+// can be seen at
+// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly
+#define AT_ASSERTM(cond, ...)                                             \
+  do {                                                                    \
+    ::c10::detail::deprecated_AT_ASSERTM();                               \
+    C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__)); \
+  } while (false)
+
+// Deprecated alias; this alias was deprecated because it represents extra API
+// surface that makes it hard for people to understand what macro to use.
+// Use TORCH_CHECK(false, ...) or TORCH_INTERNAL_ASSERT(false, ...) to
+// unconditionally fail at a line of code.
+#define AT_ERROR(...)                                                        \
+  do {                                                                       \
+    ::c10::detail::deprecated_AT_ERROR();                                    \
+    C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
+  } while (false)
+
+#endif // C10_UTIL_EXCEPTION_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h
new file mode 100644
index 0000000000000000000000000000000000000000..24cdba8d3ea3d9850b673974971c9eca37ff365f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ExclusivelyOwned.h
@@ -0,0 +1,145 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <utility>
+
+namespace c10 {
+
+// See example implementation in TensorBase.h and TensorBody.h.
+// Synopsis:
+//
+// repr_type -- type to use to store an owned T in ExclusivelyOwned.
+//
+// pointer_type -- pointer-esque type to return from
+// ExclusivelyOwned's get() and operator*() methods.
+//
+// const_pointer_type -- similar to pointer_type, used for the const methods.
+//
+// static repr_type nullRepr() -- return a null instance of repr_type.
+//
+// template <class... Args>
+// static repr_type createInPlace(Args&&... args) -- used by the in-place
+// ExclusivelyOwned constructor.
+//
+// static repr_type moveToRepr(T&& x) -- move the given x into an
+// instance of repr_type. used by the ExclusivelyOwned(T&&)
+// constructor.
+//
+// static void destroyOwned(repr_type x) -- free memory for a
+// known-exclusively-owned instance of x. Replaces calling repr_type's
+// destructor. Being able to implement this more efficiently than
+// repr_type's destructor is the main reason to use ExclusivelyOwned
+// for a type.
+//
+// static T take(repr_type&) -- move out of the given repr_type into an owned T.
+//
+// static pointer_type getImpl(const repr_type&) -- return a pointer
+// to the given repr_type. May take repr_type by value if that is more
+// efficient.
+template <typename T>
+struct ExclusivelyOwnedTraits;
+
+/// ExclusivelyOwned is a smart-pointer-like wrapper around an
+/// exclusively-owned instance of some type T that normally has
+/// mandatory reference counting (currently just Tensor). If you have
+/// an isolated piece of code that knows that it has sole ownership of
+/// an object of one of these types (i.e., because you created it
+/// directly or using a factory function) and that object will not
+/// escape from that isolated piece of code, then moving the object
+/// into an ExclusivelyOwned will avoid an atomic reference count
+/// decrement at destruction time.
+///
+/// If you directly create the Tensor in the first
+/// place, you can use the in_place constructor of ExclusivelyOwned to
+/// additionally avoid doing any stores to initialize the refcount &
+/// weakcount.
+template <typename T>
+class ExclusivelyOwned {
+  using EOT = ExclusivelyOwnedTraits<T>;
+  typename ExclusivelyOwnedTraits<T>::repr_type repr_;
+
+ public:
+  ExclusivelyOwned() : repr_(EOT::nullRepr()) {}
+
+  explicit ExclusivelyOwned(T&& t) : repr_(EOT::moveToRepr(std::move(t))) {}
+
+  template <class... Args>
+  explicit ExclusivelyOwned(std::in_place_t /*unused*/, Args&&... args)
+      : repr_(EOT::createInPlace(std::forward<Args>(args)...)) {}
+
+  ExclusivelyOwned(const ExclusivelyOwned&) = delete;
+
+  ExclusivelyOwned(ExclusivelyOwned&& rhs) noexcept
+      : repr_(std::move(rhs.repr_)) {
+    rhs.repr_ = EOT::nullRepr();
+  }
+
+  ExclusivelyOwned& operator=(const ExclusivelyOwned&) = delete;
+
+  ExclusivelyOwned& operator=(ExclusivelyOwned&& rhs) noexcept {
+    EOT::destroyOwned(repr_);
+    repr_ = std::move(rhs.repr_);
+    rhs.repr_ = EOT::nullRepr();
+    return *this;
+  }
+
+  ExclusivelyOwned& operator=(T&& rhs) noexcept {
+    EOT::destroyOwned(repr_);
+    repr_ = EOT::moveToRepr(std::move(rhs));
+    return *this;
+  }
+
+  ~ExclusivelyOwned() {
+    EOT::destroyOwned(repr_);
+    // Don't bother to call the destructor of repr_, since we already
+    // did specialized destruction for the exclusively-owned case in
+    // destroyOwned!
+  }
+
+  // We don't provide this because it would require us to be able to
+  // differentiate an owned-but-empty T from a lack of T. This is
+  // particularly problematic for Tensor, which wants to use an
+  // undefined Tensor as its null state.
+  explicit operator bool() const noexcept = delete;
+
+  operator T() && {
+    return take();
+  }
+
+  // NOTE: the equivalent operation on MaybeOwned is a moving
+  // operator*. For ExclusivelyOwned, take() and operator*() may well
+  // have different return types, so they are different functions.
+  T take() && {
+    return EOT::take(repr_);
+  }
+
+  typename EOT::const_pointer_type operator->() const {
+    return get();
+  }
+
+  typename EOT::const_pointer_type get() const {
+    return EOT::getImpl(repr_);
+  }
+
+  typename EOT::pointer_type operator->() {
+    return get();
+  }
+
+  typename EOT::pointer_type get() {
+    return EOT::getImpl(repr_);
+  }
+
+  std::remove_pointer_t<typename EOT::const_pointer_type>& operator*() const {
+    return *get();
+  }
+
+  std::remove_pointer_t<typename EOT::pointer_type>& operator*() {
+    return *get();
+  }
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h
new file mode 100644
index 0000000000000000000000000000000000000000..964c57668f629d342576a50f173247192e9f6c4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/FileSystem.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Shim header for <filesystem>, for compilers so old that they only provide
+// it in the experimental namespace
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+#else
+#error "Neither <filesystem> nor <experimental/filesystem> is available."
+#endif
+
+namespace c10 {
+
+#if __has_include(<filesystem>)
+// NOLINTNEXTLINE(misc-unused-alias-decls)
+namespace filesystem = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+// NOLINTNEXTLINE(misc-unused-alias-decls)
+namespace filesystem = std::experimental::filesystem;
+#endif
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2485bfdebae3a17f0fc8131cfcf24c01052c2a9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Flags.h
@@ -0,0 +1,247 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef C10_UTIL_FLAGS_H_
+#define C10_UTIL_FLAGS_H_
+
+/* Commandline flags support for C10.
+ *
+ * This is a portable commandline flags tool for c10, so we can optionally
+ * choose to use gflags or a lightweight custom implementation if gflags is
+ * not possible on a certain platform. If you have gflags installed, setting
+ * the macro C10_USE_GFLAGS will seamlessly route everything to gflags.
+ *
+ * To define a flag foo of type bool that defaults to true, do the following in the
+ * *global* namespace:
+ *     C10_DEFINE_bool(foo, true, "An example.");
+ *
+ * To use it in another .cc file, you can use C10_DECLARE_* as follows:
+ *     C10_DECLARE_bool(foo);
+ *
+ * In both cases, you can then access the flag via FLAGS_foo.
+ *
+ * It is recommended that you build with gflags. To learn more about the flags
+ * usage, refer to the gflags page here:
+ *
+ * https://gflags.github.io/gflags/
+ *
+ * Note about Python users / devs: gflags is initiated from a C++ function
+ * ParseCommandLineFlags, and is usually done in native binaries in the main
+ * function. As Python does not have a modifiable main function, it is usually
+ * difficult to change the flags after Python starts. Hence, it is recommended
+ * that one sets the default value of the flags to one that's acceptable in
+ * general - that will allow Python to run without wrong flags.
+ */
+
+#include <c10/macros/Export.h>
+#include <string>
+
+#include <c10/util/Registry.h>
+
+namespace c10 {
+/**
+ * Sets the usage message when a commandline tool is called with "--help".
+ */
+C10_API void SetUsageMessage(const std::string& str);
+
+/**
+ * Returns the usage message for the commandline tool set by SetUsageMessage.
+ */
+C10_API const char* UsageMessage();
+
+/**
+ * Parses the commandline flags.
+ *
+ * This command parses all the commandline arguments passed in via pargc
+ * and pargv. Once it is finished, pargc and pargv will contain the remaining
+ * commandline args that c10 does not deal with. Note that following
+ * convention, argv[0] contains the binary name and is not parsed.
+ */
+C10_API bool ParseCommandLineFlags(int* pargc, char*** pargv);
+
+/**
+ * Checks whether the commandline flags have already been parsed.
+ */
+C10_API bool CommandLineFlagsHasBeenParsed();
+
+} // namespace c10
+
+////////////////////////////////////////////////////////////////////////////////
+// Below are gflags and non-gflags specific implementations.
+// In general, they define the following macros for one to declare (use
+// C10_DECLARE) or define (use C10_DEFINE) flags:
+// C10_{DECLARE,DEFINE}_{int,int64,double,bool,string}
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef C10_USE_GFLAGS
+
+////////////////////////////////////////////////////////////////////////////////
+// Begin gflags section: most functions are basically rerouted to gflags.
+////////////////////////////////////////////////////////////////////////////////
+#include <gflags/gflags.h>
+
+// C10 uses hidden visibility by default. However, in gflags, it only uses
+// export on Windows platform (with dllexport) but not on linux/mac (with
+// default visibility). As a result, to ensure that we are always exporting
+// global variables, we will redefine the GFLAGS_DLL_DEFINE_FLAG macro if we
+// are building C10 as a shared library.
+// This has to be done after the inclusion of gflags, because some early
+// versions of gflags.h (e.g. 2.0 on ubuntu 14.04) directly define the
+// macros, so we need to do our definition after gflags.h has been included.
+#ifdef GFLAGS_DLL_DEFINE_FLAG
+#undef GFLAGS_DLL_DEFINE_FLAG
+#endif // GFLAGS_DLL_DEFINE_FLAG
+#ifdef GFLAGS_DLL_DECLARE_FLAG
+#undef GFLAGS_DLL_DECLARE_FLAG
+#endif // GFLAGS_DLL_DECLARE_FLAG
+#define GFLAGS_DLL_DEFINE_FLAG C10_EXPORT
+#define GFLAGS_DLL_DECLARE_FLAG C10_IMPORT
+
+// gflags before 2.0 uses namespace google and after 2.1 uses namespace gflags.
+// Using GFLAGS_GFLAGS_H_ to capture this change.
+#ifndef GFLAGS_GFLAGS_H_
+namespace gflags = google;
+#endif // GFLAGS_GFLAGS_H_
+
+// Motivation about the gflags wrapper:
+// (1) We would need to make sure that the gflags version and the non-gflags
+// version of C10 are going to expose the same flags abstraction. One should
+// explicitly use FLAGS_flag_name to access the flags.
+// (2) For flag names, it is recommended to start with c10_ to distinguish it
+// from regular gflags flags. For example, do
+//    C10_DEFINE_bool(c10_my_flag, true, "An example");
+// to allow one to use FLAGS_c10_my_flag.
+// (3) Gflags has a design issue that does not properly expose the global flags,
+// if one builds the library with -fvisibility=hidden. The current gflags (as of
+// Aug 2018) only deals with the Windows case using dllexport, and not the Linux
+// counterparts. As a result, we will explicitly use C10_EXPORT to export the
+// flags defined in C10. This is done via a global reference, so the flag
+// itself is not duplicated - under the hood it is the same global gflags flag.
+#define C10_GFLAGS_DEF_WRAPPER(type, real_type, name, default_value, help_str) \
+  DEFINE_##type(name, default_value, help_str);
+
+#define C10_DEFINE_int(name, default_value, help_str) \
+  C10_GFLAGS_DEF_WRAPPER(int32, gflags::int32, name, default_value, help_str)
+#define C10_DEFINE_int32(name, default_value, help_str) \
+  C10_DEFINE_int(name, default_value, help_str)
+#define C10_DEFINE_int64(name, default_value, help_str) \
+  C10_GFLAGS_DEF_WRAPPER(int64, gflags::int64, name, default_value, help_str)
+#define C10_DEFINE_double(name, default_value, help_str) \
+  C10_GFLAGS_DEF_WRAPPER(double, double, name, default_value, help_str)
+#define C10_DEFINE_bool(name, default_value, help_str) \
+  C10_GFLAGS_DEF_WRAPPER(bool, bool, name, default_value, help_str)
+#define C10_DEFINE_string(name, default_value, help_str) \
+  C10_GFLAGS_DEF_WRAPPER(string, ::fLS::clstring, name, default_value, help_str)
+
+// DECLARE_typed_var should be used in header files and in the global namespace.
+#define C10_GFLAGS_DECLARE_WRAPPER(type, real_type, name) DECLARE_##type(name);
+
+#define C10_DECLARE_int(name) \
+  C10_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name)
+#define C10_DECLARE_int32(name) C10_DECLARE_int(name)
+#define C10_DECLARE_int64(name) \
+  C10_GFLAGS_DECLARE_WRAPPER(int64, gflags::int64, name)
+#define C10_DECLARE_double(name) \
+  C10_GFLAGS_DECLARE_WRAPPER(double, double, name)
+#define C10_DECLARE_bool(name) C10_GFLAGS_DECLARE_WRAPPER(bool, bool, name)
+#define C10_DECLARE_string(name) \
+  C10_GFLAGS_DECLARE_WRAPPER(string, ::fLS::clstring, name)
+
+#define TORCH_DECLARE_int(name) C10_DECLARE_int(name)
+#define TORCH_DECLARE_int32(name) C10_DECLARE_int32(name)
+#define TORCH_DECLARE_int64(name) C10_DECLARE_int64(name)
+#define TORCH_DECLARE_double(name) C10_DECLARE_double(name)
+#define TORCH_DECLARE_bool(name) C10_DECLARE_bool(name)
+#define TORCH_DECLARE_string(name) C10_DECLARE_string(name)
+
+////////////////////////////////////////////////////////////////////////////////
+// End gflags section.
+////////////////////////////////////////////////////////////////////////////////
+
+#else // C10_USE_GFLAGS
+
+////////////////////////////////////////////////////////////////////////////////
+// Begin non-gflags section: providing equivalent functionality.
+////////////////////////////////////////////////////////////////////////////////
+
+namespace c10 {
+
+class C10_API C10FlagParser {
+ public:
+  bool success() {
+    return success_;
+  }
+
+ protected:
+  template <typename T>
+  bool Parse(const std::string& content, T* value);
+  bool success_{false};
+};
+
+C10_DECLARE_REGISTRY(C10FlagsRegistry, C10FlagParser, const std::string&);
+
+} // namespace c10
+
+// The macros are defined outside the c10 namespace. In your code, you should
+// write the C10_DEFINE_* and C10_DECLARE_* macros outside any namespace
+// as well.
+
+#define C10_DEFINE_typed_var(type, name, default_value, help_str)       \
+  C10_EXPORT type FLAGS_##name = default_value;                         \
+  namespace c10 {                                                       \
+  namespace {                                                           \
+  class C10FlagParser_##name : public C10FlagParser {                   \
+   public:                                                              \
+    explicit C10FlagParser_##name(const std::string& content) {         \
+      success_ = C10FlagParser::Parse<type>(content, &FLAGS_##name);    \
+    }                                                                   \
+  };                                                                    \
+  RegistererC10FlagsRegistry g_C10FlagsRegistry_##name(                 \
+      #name,                                                            \
+      C10FlagsRegistry(),                                               \
+      RegistererC10FlagsRegistry::DefaultCreator<C10FlagParser_##name>, \
+      "(" #type ", default " #default_value ") " help_str);             \
+  }                                                                     \
+  }
+
+#define C10_DEFINE_int(name, default_value, help_str) \
+  C10_DEFINE_typed_var(int, name, default_value, help_str)
+#define C10_DEFINE_int32(name, default_value, help_str) \
+  C10_DEFINE_int(name, default_value, help_str)
+#define C10_DEFINE_int64(name, default_value, help_str) \
+  C10_DEFINE_typed_var(int64_t, name, default_value, help_str)
+#define C10_DEFINE_double(name, default_value, help_str) \
+  C10_DEFINE_typed_var(double, name, default_value, help_str)
+#define C10_DEFINE_bool(name, default_value, help_str) \
+  C10_DEFINE_typed_var(bool, name, default_value, help_str)
+#define C10_DEFINE_string(name, default_value, help_str) \
+  C10_DEFINE_typed_var(std::string, name, default_value, help_str)
+
+// C10_DECLARE_typed_var should be used in header files and in the global namespace.
+#define C10_DECLARE_typed_var(type, name) C10_API extern type FLAGS_##name
+
+#define C10_DECLARE_int(name) C10_DECLARE_typed_var(int, name)
+#define C10_DECLARE_int32(name) C10_DECLARE_int(name)
+#define C10_DECLARE_int64(name) C10_DECLARE_typed_var(int64_t, name)
+#define C10_DECLARE_double(name) C10_DECLARE_typed_var(double, name)
+#define C10_DECLARE_bool(name) C10_DECLARE_typed_var(bool, name)
+#define C10_DECLARE_string(name) C10_DECLARE_typed_var(std::string, name)
+
+#define TORCH_DECLARE_typed_var(type, name) TORCH_API extern type FLAGS_##name
+
+#define TORCH_DECLARE_int(name) TORCH_DECLARE_typed_var(int, name)
+#define TORCH_DECLARE_int32(name) TORCH_DECLARE_int(name)
+#define TORCH_DECLARE_int64(name) TORCH_DECLARE_typed_var(int64_t, name)
+#define TORCH_DECLARE_double(name) TORCH_DECLARE_typed_var(double, name)
+#define TORCH_DECLARE_bool(name) TORCH_DECLARE_typed_var(bool, name)
+#define TORCH_DECLARE_string(name) TORCH_DECLARE_typed_var(std::string, name)
+
+////////////////////////////////////////////////////////////////////////////////
+// End non-gflags section.
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // C10_USE_GFLAGS
+
+#endif // C10_UTIL_FLAGS_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd690e5aa345ac097a2b4022b6e5a42677e403f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float4_e2m1fn_x2.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float4_e2m1fn_x2.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed07b955168f7ab08b4a20657d8f36ea7cd4123c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fn.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e4m3fn.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h
new file mode 100644
index 0000000000000000000000000000000000000000..30481a62430fdf08f2107bc1ab50e811314767f3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e4m3fnuz.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4e0802e2f7b1a6712f95dea5b82267d8a8498dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2-inl.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e5m2.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4e0802e2f7b1a6712f95dea5b82267d8a8498dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e5m2.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3e8c25099a630204f3c4ee345fd2a3653c14116
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e5m2fnuz.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h
new file mode 100644
index 0000000000000000000000000000000000000000..030b23d64750b7378c8fc281c96d2fe662e38d88
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Float8_e8m0fnu.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Float8_e8m0fnu.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..78c3d37c1698db15f05b3b3367765075be2d9046
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Half-inl.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/Half.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..b985cd3e51c325b50dd5ee368c216689888123d6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IdWrapper.h
@@ -0,0 +1,82 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstddef>
+#include <functional>
+#include <utility>
+
+namespace c10 {
+
+/**
+ * CRTP-style base for strongly typed id classes. Deriving from
+ * IdWrapper<Concrete, Underlying> yields a very lightweight value type whose
+ * only public operations are equality comparison and hashing. Example:
+ *
+ *   struct MyIdType final : IdWrapper<MyIdType, uint32_t> {
+ *     constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {}
+ *   };
+ *
+ * Then, in the global top level namespace:
+ *
+ *   C10_DEFINE_HASH_FOR_IDWRAPPER(MyIdType);
+ *
+ * operator==, operator!= and hash_value() are then available for MyIdType,
+ * provided the underlying type supports == and std::hash.
+ */
+template <class ConcreteType, class UnderlyingType>
+class IdWrapper {
+ public:
+  using underlying_type = UnderlyingType;
+  using concrete_type = ConcreteType;
+
+ protected:
+  // Construction and raw-id access are reserved for the derived id type.
+  constexpr explicit IdWrapper(underlying_type value) noexcept(
+      noexcept(underlying_type(std::declval<underlying_type>())))
+      : id_(value) {}
+
+  constexpr underlying_type underlyingId() const
+      noexcept(noexcept(underlying_type(std::declval<underlying_type>()))) {
+    return id_;
+  }
+
+ private:
+  friend size_t hash_value(const concrete_type& wrapped) {
+    const std::hash<underlying_type> hasher{};
+    return hasher(wrapped.id_);
+  }
+
+  // TODO: operator== is unconditionally noexcept because a conditional
+  // noexcept did not work with GCC 4.8; revisit once GCC 4.8 is dropped.
+  friend constexpr bool operator==(
+      const concrete_type& a,
+      const concrete_type& b) noexcept {
+    return a.id_ == b.id_;
+  }
+
+  // TODO: same GCC 4.8 limitation as operator== above.
+  friend constexpr bool operator!=(
+      const concrete_type& a,
+      const concrete_type& b) noexcept {
+    const bool equal = (a == b);
+    return !equal;
+  }
+
+  underlying_type id_;
+};
+
+} // namespace c10
+// Specializes std::hash<ClassName> by forwarding to hash_value(ClassName).
+#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \
+  namespace std {                                \
+  template <>                                    \
+  struct hash<ClassName> {                       \
+    size_t operator()(ClassName x) const {       \
+      return hash_value(x);                      \
+    }                                            \
+  };                                             \
+  }
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h
new file mode 100644
index 0000000000000000000000000000000000000000..a28803082f7b641b92dae8acf320b7b9be348d74
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/IntrusiveList.h
@@ -0,0 +1,211 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/Exception.h>
+
+namespace c10 {
+
+template <typename T>
+class IntrusiveList;
+// Embeds the prev/next links of an IntrusiveList directly in the element type.
+class IntrusiveListHook {
+  template <typename P, typename T>
+  friend class ListIterator;
+
+  template <typename T>
+  friend class IntrusiveList;
+  // Links of a circular list; a hook that is not in a list points at itself.
+  IntrusiveListHook* next_{nullptr}; // replaced by `this` in the constructor
+  IntrusiveListHook* prev_{nullptr};
+  // Splices this hook into a list immediately before `next_node`.
+  void link_before(IntrusiveListHook* next_node) {
+    next_ = next_node; // NOTE(review): old links are overwritten unchecked
+    prev_ = next_node->prev_;
+    next_node->prev_ = this;
+    prev_->next_ = this;
+  }
+
+ public:
+  IntrusiveListHook() : next_(this), prev_(this) {} // starts out unlinked
+
+  IntrusiveListHook(const IntrusiveListHook&) = delete;
+  IntrusiveListHook& operator=(const IntrusiveListHook&) = delete;
+  IntrusiveListHook(IntrusiveListHook&&) = delete;
+  IntrusiveListHook& operator=(IntrusiveListHook&&) = delete;
+  // Removes this hook from its list; fails the check if it is not linked.
+  void unlink() {
+    TORCH_CHECK(is_linked());
+    next_->prev_ = prev_; // bridge the two neighbours over this hook
+    prev_->next_ = next_;
+    next_ = this; // back to the self-referential (unlinked) state
+    prev_ = this;
+  }
+  // A hook that is still linked removes itself on destruction.
+  ~IntrusiveListHook() {
+    if (is_linked()) {
+      unlink();
+    }
+  }
+  // True while next_ refers to some other hook.
+  bool is_linked() const {
+    return next_ != this;
+  }
+};
+
+// Bidirectional iterator over an IntrusiveList<T>; dereferencing casts the
+// hook down to T. P is (const) IntrusiveListHook for (const_)iterator.
+template <typename P, typename T>
+class ListIterator {
+  static_assert(std::is_same_v<std::remove_const_t<P>, IntrusiveListHook>);
+  static_assert(std::is_base_of_v<IntrusiveListHook, T>);
+  P* ptr_;
+
+  friend class IntrusiveList<T>;
+  // Lets cross-constness conversions/comparisons below read the other ptr_.
+  template <typename, typename>
+  friend class ListIterator;
+
+ public:
+  using iterator_category = std::bidirectional_iterator_tag;
+  using value_type = std::conditional_t<std::is_const_v<P>, const T, T>;
+  using difference_type = std::ptrdiff_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+
+  explicit ListIterator(P* ptr) : ptr_(ptr) {}
+  ~ListIterator() = default;
+  ListIterator(const ListIterator&) = default;
+  ListIterator& operator=(const ListIterator&) = default;
+  ListIterator(ListIterator&&) = default;
+  ListIterator& operator=(ListIterator&&) = default;
+
+  // iterator -> const_iterator conversion (never the other way round).
+  template <
+      typename Q,
+      class = std::enable_if_t<std::is_const_v<P> && !std::is_const_v<Q>>>
+  ListIterator(const ListIterator<Q, T>& rhs) : ptr_(rhs.ptr_) {}
+  template <
+      typename Q,
+      class = std::enable_if_t<std::is_const_v<P> && !std::is_const_v<Q>>>
+  ListIterator& operator=(const ListIterator<Q, T>& rhs) {
+    ptr_ = rhs.ptr_;
+    return *this;
+  }
+
+  // Iterators are equal when they refer to the same hook.
+  template <typename Q>
+  bool operator==(const ListIterator<Q, T>& other) const {
+    return ptr_ == other.ptr_;
+  }
+  template <typename Q>
+  bool operator!=(const ListIterator<Q, T>& other) const {
+    return !(*this == other);
+  }
+
+  auto& operator*() const { return static_cast<reference>(*ptr_); }
+  auto* operator->() const { return static_cast<pointer>(ptr_); }
+
+  // Stepping follows next_/prev_; a null iterator fails the check.
+  ListIterator& operator++() {
+    TORCH_CHECK(ptr_);
+    ptr_ = ptr_->next_;
+    return *this;
+  }
+
+  ListIterator& operator--() {
+    TORCH_CHECK(ptr_);
+    ptr_ = ptr_->prev_;
+    return *this;
+  }
+};
+
+// Doubly linked list that never allocates: elements derive from
+// IntrusiveListHook and are linked in place, with head_ as the sentinel.
+template <typename T>
+class IntrusiveList {
+  static_assert(std::is_base_of_v<IntrusiveListHook, T>);
+
+ public:
+  using iterator = ListIterator<IntrusiveListHook, T>;
+  using const_iterator = ListIterator<const IntrusiveListHook, T>;
+
+  IntrusiveList() = default;
+  IntrusiveList(const std::initializer_list<std::reference_wrapper<T>>& items) {
+    for (auto& element : items) {
+      insert(this->end(), element);
+    }
+  }
+
+  // Unlinks all remaining elements; the elements themselves are not destroyed.
+  ~IntrusiveList() {
+    while (!empty()) {
+      head_.next_->unlink();
+    }
+  }
+
+  // Nodes point back at head_, so the list is neither copyable nor movable.
+  IntrusiveList(const IntrusiveList&) = delete;
+  IntrusiveList& operator=(const IntrusiveList&) = delete;
+  IntrusiveList(IntrusiveList&&) = delete;
+  IntrusiveList& operator=(IntrusiveList&&) = delete;
+
+  // end() is the sentinel itself; begin() is the node right after it.
+  const_iterator end() const {
+    return const_iterator{&head_};
+  }
+  iterator end() {
+    return iterator{&head_};
+  }
+  const_iterator begin() const {
+    return ++end();
+  }
+  iterator begin() {
+    return ++end();
+  }
+
+  auto rbegin() const {
+    return std::reverse_iterator<const_iterator>(end());
+  }
+  auto rbegin() {
+    return std::reverse_iterator<iterator>(end());
+  }
+  auto rend() const {
+    return std::reverse_iterator<const_iterator>(begin());
+  }
+  auto rend() {
+    return std::reverse_iterator<iterator>(begin());
+  }
+
+  const_iterator iterator_to(const T& n) const {
+    return const_iterator{&n};
+  }
+  iterator iterator_to(T& n) {
+    return iterator{&n};
+  }
+
+  // Links `n` immediately before `pos` and returns an iterator to it.
+  iterator insert(iterator pos, T& n) {
+    n.link_before(pos.ptr_);
+    return iterator{&n};
+  }
+
+  // Counts the elements by walking the whole list.
+  size_t size() const {
+    size_t count = 0;
+    for (auto it = begin(), last = end(); it != last; ++it) {
+      ++count;
+    }
+    return count;
+  }
+
+  bool empty() const { return !head_.is_linked(); }
+
+ private:
+  IntrusiveListHook head_;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3e86ce2e1da5bc4d1d40ad22e4f31280ac16c2e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/MathConstants.h
@@ -0,0 +1,147 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+
+namespace c10 {
+// TODO: Replace me with inline constexpr variable when C++17 becomes available
+namespace detail {
+template <typename T>
+C10_HOST_DEVICE inline constexpr T e() {
+  return static_cast<T>(2.718281828459045235360287471352662); // Euler's number
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T euler() {
+  return static_cast<T>(0.577215664901532860606512090082402); // Euler-Mascheroni
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T frac_1_pi() {
+  return static_cast<T>(0.318309886183790671537767526745028); // 1 / pi
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T frac_1_sqrt_pi() {
+  return static_cast<T>(0.564189583547756286948079451560772); // 1 / sqrt(pi)
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T frac_sqrt_2() {
+  return static_cast<T>(0.707106781186547524400844362104849); // sqrt(2) / 2
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T frac_sqrt_3() {
+  return static_cast<T>(0.577350269189625764509148780501957); // sqrt(3) / 3
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T golden_ratio() {
+  return static_cast<T>(1.618033988749894848204586834365638); // (1 + sqrt(5)) / 2
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T ln_10() {
+  return static_cast<T>(2.302585092994045684017991454684364); // natural log of 10
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T ln_2() {
+  return static_cast<T>(0.693147180559945309417232121458176); // natural log of 2
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T log_10_e() {
+  return static_cast<T>(0.434294481903251827651128918916605); // log10(e)
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T log_2_e() {
+  return static_cast<T>(1.442695040888963407359924681001892); // log2(e)
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T pi() {
+  return static_cast<T>(3.141592653589793238462643383279502); // pi
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T sqrt_2() {
+  return static_cast<T>(1.414213562373095048801688724209698); // sqrt(2)
+}
+
+template <typename T>
+C10_HOST_DEVICE inline constexpr T sqrt_3() {
+  return static_cast<T>(1.732050807568877293527446341505872); // sqrt(3)
+}
+
+template <>
+C10_HOST_DEVICE inline constexpr BFloat16 pi<BFloat16>() {
+  // According to
+  // https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Special_values
+  // pi is encoded as 4049
+  return BFloat16(0x4049, BFloat16::from_bits());
+}
+
+template <>
+C10_HOST_DEVICE inline constexpr Half pi<Half>() {
+  return Half(0x4248, Half::from_bits()); // half-precision bits for 3.140625
+}
+} // namespace detail
+// Variable templates that evaluate the detail:: functions, e.g. c10::pi<T>.
+template <typename T>
+constexpr T e = c10::detail::e<T>();
+
+template <typename T>
+constexpr T euler = c10::detail::euler<T>();
+
+template <typename T>
+constexpr T frac_1_pi = c10::detail::frac_1_pi<T>();
+
+template <typename T>
+constexpr T frac_1_sqrt_pi = c10::detail::frac_1_sqrt_pi<T>();
+
+template <typename T>
+constexpr T frac_sqrt_2 = c10::detail::frac_sqrt_2<T>();
+
+template <typename T>
+constexpr T frac_sqrt_3 = c10::detail::frac_sqrt_3<T>();
+
+template <typename T>
+constexpr T golden_ratio = c10::detail::golden_ratio<T>();
+
+template <typename T>
+constexpr T ln_10 = c10::detail::ln_10<T>();
+
+template <typename T>
+constexpr T ln_2 = c10::detail::ln_2<T>();
+
+template <typename T>
+constexpr T log_10_e = c10::detail::log_10_e<T>();
+
+template <typename T>
+constexpr T log_2_e = c10::detail::log_2_e<T>();
+
+template <typename T>
+constexpr T pi = c10::detail::pi<T>();
+
+template <typename T>
+constexpr T sqrt_2 = c10::detail::sqrt_2<T>();
+
+template <typename T>
+constexpr T sqrt_3 = c10::detail::sqrt_3<T>();
+} // namespace c10
+
+C10_CLANG_DIAGNOSTIC_POP()
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h
new file mode 100644
index 0000000000000000000000000000000000000000..55c4697368c60f86b69db1b1bc65cf0cb2e99404
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Optional.h
@@ -0,0 +1,65 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef C10_UTIL_OPTIONAL_H_
+#define C10_UTIL_OPTIONAL_H_
+
+#include <optional>
+#include <type_traits>
+
+// Macros.h is not needed, but it does namespace shenanigans that lots
+// of downstream code seems to rely on. Feel free to remove it and fix
+// up builds.
+
+namespace c10 {
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::bad_optional_access;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::make_optional;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::nullopt_t;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::optional;
+#endif
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+
+namespace detail_ {
+// the call to convert<A>(b) has return type A and converts b to type A iff
+// decltype(b) is implicitly convertible to A
+template <class U>
+constexpr U convert(U v) {
+  return v;
+}
+} // namespace detail_
+// Returns a copy of the contained value, or invokes `func` when `v` is empty.
+template <class T, class F>
+[[deprecated("Please use std::optional::value_or instead of c10::value_or_else")]]
+constexpr T value_or_else(const std::optional<T>& v, F&& func) {
+  static_assert(std::is_convertible_v<typename std::invoke_result_t<F>, T>,
+      "func parameters must be a callable that returns a type convertible to the value stored in the optional");
+  if (v.has_value()) { return *v; }
+  return detail_::convert<T>(std::forward<F>(func)());
+}
+// Rvalue overload: moves the value out of `v` (operator* on an rvalue
+// optional yields T&&) or, when `v` is empty, returns the result of `func`.
+template <class T, class F>
+[[deprecated(
+    "Please use std::optional::value_or instead of c10::value_or_else")]] constexpr T
+value_or_else(std::optional<T>&& v, F&& func) {
+  static_assert(std::is_convertible_v<typename std::invoke_result_t<F>, T>,
+      "func parameters must be a callable that returns a type convertible to the value stored in the optional");
+  return v.has_value() ? *std::move(v)
+                       : detail_::convert<T>(std::forward<F>(func)());
+}
+
+#endif
+
+} // namespace c10
+#endif // C10_UTIL_OPTIONAL_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa4eaaceadd2588bbe53fcd51d3cbffde5d3b220
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ScopeExit.h
@@ -0,0 +1,55 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <type_traits>
+#include <utility>
+
+namespace c10 {
+
+// Runs a stored callable when destroyed, unless release()d or moved from.
+// Mostly copied from https://llvm.org/doxygen/ScopeExit_8h_source.html
+template <typename Callable>
+class scope_exit {
+  Callable on_exit_;
+  bool armed_ = true; // Cleared by release() and in a moved-from guard.
+
+ public:
+  template <typename Fp>
+  // NOLINTNEXTLINE(bugprone-forwarding-reference-overload)
+  explicit scope_exit(Fp&& F) : on_exit_(std::forward<Fp>(F)) {}
+
+  // Moving transfers the pending call and disarms the source.
+  scope_exit(scope_exit&& Rhs) noexcept
+      : on_exit_(std::move(Rhs.on_exit_)),
+        armed_(std::exchange(Rhs.armed_, false)) {}
+  scope_exit(const scope_exit&) = delete;
+  scope_exit& operator=(scope_exit&&) = delete;
+  scope_exit& operator=(const scope_exit&) = delete;
+
+  void release() {
+    armed_ = false;
+  }
+
+  ~scope_exit() {
+    if (!armed_) {
+      return;
+    }
+    on_exit_();
+  }
+};
+
+// Wraps `F` in a scope_exit guard that runs it when the returned object is
+// destroyed (usually at the end of the scope that holds the guard).
+//
+// Interface is specified by p0052r2.
+template <typename Callable>
+scope_exit<std::decay_t<Callable>> make_scope_exit(Callable&& F) {
+  using guard_type = scope_exit<std::decay_t<Callable>>;
+  return guard_type(std::forward<Callable>(F));
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c40d21a692f0470d02d25bc8794f1b8d58c55a0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallBuffer.h
@@ -0,0 +1,92 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+/** Helper class for allocating temporary fixed size arrays with SBO.
+ *
+ * This is intentionally much simpler than SmallVector, to improve performance
+ * at the expense of many features:
+ * - No zero-initialization for numeric types
+ * - No resizing after construction
+ * - No copy/move
+ * - No non-trivial types
+ */
+
+namespace c10 {
+
+template <typename T, size_t N>
+class SmallBuffer {
+  static_assert(std::is_trivial_v<T>, "SmallBuffer is intended for POD types");
+
+  std::array<T, N> storage_; // inline storage, used while size_ <= N
+  size_t size_{};
+  T* data_{}; // &storage_[0] or a heap array of size_ elements
+
+ public:
+  // Buffers of up to N elements live in storage_, larger ones on the heap.
+  // Elements are not initialized.
+  SmallBuffer(size_t size) : size_(size) {
+    data_ = (size > N) ? new T[size] : &storage_[0];
+  }
+
+  SmallBuffer(const SmallBuffer&) = delete;
+  SmallBuffer& operator=(const SmallBuffer&) = delete;
+
+  // move constructor is needed in function return
+  SmallBuffer(SmallBuffer&& rhs) noexcept : size_{rhs.size_} {
+    const bool on_heap = size_ > N;
+    if (!on_heap) {
+      // Inline elements live inside rhs itself, so they are copied over.
+      storage_ = std::move(rhs.storage_);
+      data_ = &storage_[0];
+    } else {
+      // Take over the heap array; rhs must no longer free it.
+      data_ = rhs.data_;
+      rhs.data_ = nullptr;
+    }
+    rhs.size_ = 0;
+  }
+
+  SmallBuffer& operator=(SmallBuffer&&) = delete;
+
+  // Releases the heap array, if one was allocated.
+  ~SmallBuffer() {
+    if (size_ > N) {
+      delete[] data_;
+    }
+  }
+
+  // Unchecked element access.
+  T& operator[](size_t idx) {
+    return data_[idx];
+  }
+  const T& operator[](size_t idx) const {
+    return data_[idx];
+  }
+
+  T* data() {
+    return data_;
+  }
+  const T* data() const {
+    return data_;
+  }
+
+  size_t size() const {
+    return size_;
+  }
+
+  // Iteration covers exactly size() elements.
+  T* begin() { return data_; }
+  const T* begin() const { return data_; }
+  T* end() { return begin() + size_; }
+  const T* end() const { return begin() + size_; }
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2a4dbb0f92f530cd21dc8a63ee48f82f430393d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/SmallVector.h
@@ -0,0 +1,1472 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SmallVector class.
+//
+//===----------------------------------------------------------------------===//
+
+// ATen: modified from llvm::SmallVector.
+// used std::is_trivially_{copy,move}_constructible
+// replaced iterator_range constructor with inline Container&& constructor
+// replaced LLVM_NODISCARD, LLVM_LIKELY, and LLVM_UNLIKELY with c10 equivalents
+// removed LLVM_GSL_OWNER
+// added SmallVector::at
+// added operator<< for std::ostream
+// added C10_API to export SmallVectorBase
+
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/AlignOf.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <initializer_list>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <ostream>
+#include <type_traits>
+#include <utility>
+
+namespace c10 {
+
+/// Type-independent core shared by every SmallVector instantiation.
+///
+/// Size_T is the integer type that stores Size and Capacity, so it can be
+/// tuned per element type. A 32-bit Size_T keeps the SmallVector object
+/// small; a 64-bit Size_T is desirable for cases like SmallVector<char>,
+/// where a 32-bit one would limit the vector to ~4GB. SmallVectors are used
+/// for buffering bitcode output, which can exceed 4GB.
+template <class Size_T>
+class C10_API SmallVectorBase {
+ protected:
+  void* BeginX; // start of the element storage
+  Size_T Size = 0; // number of elements in use
+  Size_T Capacity; // number of elements the storage can hold
+
+  /// Largest value representable in Size_T.
+  static constexpr size_t SizeTypeMax() {
+    return std::numeric_limits<Size_T>::max();
+  }
+
+  SmallVectorBase(void* FirstEl, size_t TotalCapacity)
+      : BeginX(FirstEl), Capacity(static_cast<Size_T>(TotalCapacity)) {}
+
+  /// Out-of-line helper for \a grow(), shared to reduce code duplication.
+  /// This function will report a fatal error if it can't grow at least to
+  /// \p MinSize.
+  void* mallocForGrow(size_t MinSize, size_t TSize, size_t& NewCapacity);
+
+  /// Out-of-line implementation of grow() that only works on POD-like data
+  /// types; kept out of line to reduce code duplication.
+  /// This function will report a fatal error if it cannot increase capacity.
+  void grow_pod(const void* FirstEl, size_t MinSize, size_t TSize);
+
+ public:
+  SmallVectorBase() = delete;
+  size_t size() const {
+    return Size;
+  }
+  size_t capacity() const {
+    return Capacity;
+  }
+
+  [[nodiscard]] bool empty() const {
+    return Size == 0;
+  }
+
+  /// Set the element count to \p N; the current array must already have
+  /// enough capacity for it (checked with assert).
+  ///
+  /// No elements are constructed or destroyed by this call.
+  ///
+  /// Clients can use this in conjunction with capacity() to write past the end
+  /// of the buffer when they know that more elements are available, and only
+  /// update the size later. This avoids the cost of value initializing elements
+  /// which will only be overwritten.
+  void set_size(size_t N) {
+    assert(N <= capacity());
+    Size = static_cast<Size_T>(N);
+  }
+};
+
+template <class T> // integer type used for a SmallVector<T>'s Size/Capacity:
+using SmallVectorSizeType = // uint64_t iff sizeof(T) < 4 and pointers >= 8 bytes
+    std::conditional_t<sizeof(T) < 4 && sizeof(void*) >= 8, uint64_t, uint32_t>;
+
+/// Layout probe: offsetof(..., FirstEl) gives the offset of the first element.
+template <class T, typename = void>
+struct SmallVectorAlignmentAndSize {
+  // NOLINTNEXTLINE(*c-arrays*)
+  alignas(SmallVectorBase<SmallVectorSizeType<T>>) char Base[sizeof(
+      SmallVectorBase<SmallVectorSizeType<T>>)]; // sized/aligned like the base
+  // NOLINTNEXTLINE(*c-arrays*)
+  alignas(T) char FirstEl[sizeof(T)]; // sized/aligned like one T
+};
+
+/// The part of SmallVectorTemplateBase that does not depend on whether T is a
+/// POD: inline-storage lookup, aliasing checks, iterators and element access.
+/// The dummy template argument lets ArrayRef avoid requiring a complete T.
+template <typename T, typename = void>
+class SmallVectorTemplateCommon
+    : public SmallVectorBase<SmallVectorSizeType<T>> {
+  using Base = SmallVectorBase<SmallVectorSizeType<T>>;
+
+  /// Find the address of the first inline element: `this` plus the offset of
+  /// FirstEl in SmallVectorAlignmentAndSize<T>. For this to be valid for T with
+  /// lots of alignment, SmallVectorStorage must be aligned even for small-size 0.
+  void* getFirstEl() const {
+    return const_cast<void*>(reinterpret_cast<const void*>(
+        reinterpret_cast<const char*>(this) +
+        offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
+  }
+  // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
+
+ protected:
+  SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} // Size: initial capacity
+
+  void grow_pod(size_t MinSize, size_t TSize) {
+    Base::grow_pod(getFirstEl(), MinSize, TSize);
+  }
+
+  /// Return true if this is a smallvector which has not had dynamic
+  /// memory allocated for it, i.e. BeginX still points at the inline storage.
+  bool isSmall() const {
+    return this->BeginX == getFirstEl();
+  }
+
+  /// Put this vector in a state of being small (inline storage, size 0).
+  void resetToSmall() {
+    this->BeginX = getFirstEl();
+    this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
+  }
+
+  /// Return true if V lies in the half-open range [First, Last).
+  bool isReferenceToRange(const void* V, const void* First, const void* Last)
+      const {
+    // Use std::less to avoid UB when comparing unrelated pointers.
+    std::less<> LessThan;
+    return !LessThan(V, First) && LessThan(V, Last);
+  }
+
+  /// Return true if V points into [begin(), end()) of this vector.
+  bool isReferenceToStorage(const void* V) const {
+    return isReferenceToRange(V, this->begin(), this->end());
+  }
+
+  /// Return true if First and Last form a valid (possibly empty) range in this
+  /// vector's storage: begin() <= First <= Last <= end().
+  bool isRangeInStorage(const void* First, const void* Last) const {
+    // Use std::less to avoid UB when comparing unrelated pointers.
+    std::less<> LessThan;
+    return !LessThan(First, this->begin()) && !LessThan(Last, First) &&
+        !LessThan(this->end(), Last);
+  }
+
+  /// Return true unless Elt will be invalidated by resizing the vector to
+  /// NewSize.
+  bool isSafeToReferenceAfterResize(const void* Elt, size_t NewSize) {
+    // Elt does not point into [begin(), end()): nothing to invalidate.
+    if (C10_LIKELY(!isReferenceToStorage(Elt)))
+      return true;
+
+    // Return false if Elt will be destroyed by shrinking.
+    if (NewSize <= this->size())
+      return Elt < this->begin() + NewSize;
+
+    // Growing: safe only while NewSize still fits in the current capacity.
+    return NewSize <= this->capacity();
+  }
+
+  /// Assert that Elt will not be invalidated by resizing the vector to NewSize.
+  void assertSafeToReferenceAfterResize(const void* Elt, size_t NewSize) {
+    (void)Elt; // Suppress unused variable warning
+    (void)NewSize; // Suppress unused variable warning
+    assert(
+        isSafeToReferenceAfterResize(Elt, NewSize) &&
+        "Attempting to reference an element of the vector in an operation "
+        "that invalidates it");
+  }
+
+  /// Assert that Elt will not be invalidated by increasing the size of the
+  /// vector by N.
+  void assertSafeToAdd(const void* Elt, size_t N = 1) {
+    this->assertSafeToReferenceAfterResize(Elt, this->size() + N);
+  }
+
+  /// Assert that no part of [From, To) will be invalidated by clearing.
+  void assertSafeToReferenceAfterClear(const T* From, const T* To) {
+    if (From == To)
+      return;
+    this->assertSafeToReferenceAfterResize(From, 0);
+    this->assertSafeToReferenceAfterResize(To - 1, 0);
+  }
+  template <
+      class ItTy,
+      std::enable_if_t<!std::is_same_v<std::remove_const_t<ItTy>, T*>, bool> =
+          false>
+  void assertSafeToReferenceAfterClear(ItTy /*unused*/, ItTy /*unused*/) {}
+
+  /// Assert that no part of [From, To) will be invalidated by growing.
+  void assertSafeToAddRange(const T* From, const T* To) {
+    if (From == To)
+      return;
+    this->assertSafeToAdd(From, To - From);
+    this->assertSafeToAdd(To - 1, To - From);
+  }
+  template <
+      class ItTy,
+      std::enable_if_t<!std::is_same_v<std::remove_const_t<ItTy>, T*>, bool> =
+          false>
+  void assertSafeToAddRange(ItTy /*unused*/, ItTy /*unused*/) {}
+
+  /// Reserve enough space to add N elements, and return the updated element
+  /// pointer in case Elt was a reference into the storage that got regrown.
+  template <class U>
+  static const T* reserveForParamAndGetAddressImpl(
+      U* This,
+      const T& Elt,
+      size_t N) {
+    size_t NewSize = This->size() + N;
+    if (C10_LIKELY(NewSize <= This->capacity()))
+      return &Elt;
+
+    bool ReferencesStorage = false;
+    int64_t Index = -1; // position of Elt inside the vector, if it aliases it
+    if constexpr (!U::TakesParamByValue) {
+      if (C10_UNLIKELY(This->isReferenceToStorage(&Elt))) {
+        ReferencesStorage = true;
+        Index = &Elt - This->begin();
+      }
+    }
+    This->grow(NewSize);
+    return ReferencesStorage ? This->begin() + Index : &Elt;
+  }
+
+ public:
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using value_type = T;
+  using iterator = T*;
+  using const_iterator = const T*;
+
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+
+  using reference = T&;
+  using const_reference = const T&;
+  using pointer = T*;
+  using const_pointer = const T*;
+
+  using Base::capacity;
+  using Base::empty;
+  using Base::size;
+
+  // forward iterator creation methods (iterators are plain pointers).
+  iterator begin() {
+    return (iterator)this->BeginX;
+  }
+  const_iterator begin() const {
+    return (const_iterator)this->BeginX;
+  }
+  iterator end() {
+    return begin() + size();
+  }
+  const_iterator end() const {
+    return begin() + size();
+  }
+
+  // reverse iterator creation methods.
+  reverse_iterator rbegin() {
+    return reverse_iterator(end());
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  reverse_iterator rend() {
+    return reverse_iterator(begin());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  size_type size_in_bytes() const {
+    return size() * sizeof(T);
+  }
+  constexpr size_type max_size() const {
+    return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T));
+  }
+
+  size_t capacity_in_bytes() const {
+    return capacity() * sizeof(T);
+  }
+
+  /// Return a pointer to the vector's buffer, even if empty().
+  pointer data() {
+    return pointer(begin());
+  }
+  /// Return a pointer to the vector's buffer, even if empty().
+  const_pointer data() const {
+    return const_pointer(begin());
+  }
+
+  // SmallVector::at is NOT from LLVM; the index is only checked by assert().
+  reference at(size_type idx) {
+    assert(idx < size());
+    return begin()[idx];
+  }
+  const_reference at(size_type idx) const {
+    assert(idx < size());
+    return begin()[idx];
+  }
+  reference operator[](size_type idx) {
+    assert(idx < size());
+    return begin()[idx];
+  }
+  const_reference operator[](size_type idx) const {
+    assert(idx < size());
+    return begin()[idx];
+  }
+
+  reference front() {
+    assert(!empty());
+    return begin()[0];
+  }
+  const_reference front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  reference back() {
+    assert(!empty());
+    return end()[-1];
+  }
+  const_reference back() const {
+    assert(!empty());
+    return end()[-1];
+  }
+};
+
+/// SmallVectorTemplateBase<TriviallyCopyable = false> - This is where we put
+/// method implementations that are designed to work with non-trivial T's.
+///
+/// We approximate is_trivially_copyable with trivial move/copy construction and
+/// trivial destruction. While the standard doesn't specify that you're allowed
+/// to copy these types with memcpy, there is no way for the type to observe
+/// this. This catches the important case of std::pair<POD, POD>, which is not
+/// trivially assignable.
+///
+/// XXX: if build fails here fall back to C10_IS_TRIVIALLY_COPYABLE and make a
+/// note
+template <
+    typename T,
+    bool = (std::is_trivially_copy_constructible_v<T>) &&
+        (std::is_trivially_move_constructible_v<T>) &&
+        std::is_trivially_destructible_v<T>>
+class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
+  friend class SmallVectorTemplateCommon<T>;
+
+ protected:
+  static constexpr bool TakesParamByValue = false; // always pass by const T&
+  using ValueParamT = const T&;
+
+  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+  static void destroy_range(T* S, T* E) { // destroys [S, E) back to front
+    while (S != E) {
+      --E;
+      E->~T();
+    }
+  }
+
+  /// Move the range [I, E) into the uninitialized memory starting with "Dest",
+  /// move-constructing each element in place.
+  template <typename It1, typename It2>
+  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+    std::uninitialized_copy(
+        std::make_move_iterator(I), std::make_move_iterator(E), Dest);
+  }
+
+  /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
+  /// copy-constructing each element in place.
+  template <typename It1, typename It2>
+  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+    std::uninitialized_copy(I, E, Dest);
+  }
+
+  /// Grow the allocated memory (without initializing new elements), doubling
+  /// the size of the allocated memory. Guarantees space for at least one more
+  /// element, or for MinSize elements in total if specified.
+  void grow(size_t MinSize = 0);
+
+  /// Create a new allocation big enough for \p MinSize and pass back its size
+  /// in \p NewCapacity. This is the first section of \a grow().
+  T* mallocForGrow(size_t MinSize, size_t& NewCapacity) {
+    return static_cast<T*>(
+        SmallVectorBase<SmallVectorSizeType<T>>::mallocForGrow(
+            MinSize, sizeof(T), NewCapacity));
+  }
+
+  /// Move existing elements over to the new allocation \p NewElts, the middle
+  /// section of \a grow().
+  void moveElementsForGrow(T* NewElts);
+
+  /// Transfer ownership of the allocation, finishing up \a grow().
+  void takeAllocationForGrow(T* NewElts, size_t NewCapacity);
+
+  /// Reserve enough space to add N elements (one by default), and return the
+  /// updated element pointer in case it was a reference to the storage.
+  const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) {
+    return this->reserveForParamAndGetAddressImpl(this, Elt, N);
+  }
+
+  /// Reserve enough space to add N elements (one by default), and return the
+  /// updated element pointer in case it was a reference to the storage.
+  T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) {
+    return const_cast<T*>(this->reserveForParamAndGetAddressImpl(this, Elt, N));
+  }
+
+  static T&& forward_value_param(T&& V) { // rvalues stay movable
+    return std::move(V);
+  }
+  static const T& forward_value_param(const T& V) { // lvalues pass through
+    return V;
+  }
+
+  void growAndAssign(size_t NumElts, const T& Elt) {
+    // Fill the new buffer before destroying the old one: Elt may alias it.
+    size_t NewCapacity = 0;
+    T* NewElts = mallocForGrow(NumElts, NewCapacity);
+    std::uninitialized_fill_n(NewElts, NumElts, Elt);
+    this->destroy_range(this->begin(), this->end());
+    takeAllocationForGrow(NewElts, NewCapacity);
+    this->set_size(NumElts);
+  }
+
+  template <typename... ArgTypes>
+  T& growAndEmplaceBack(ArgTypes&&... Args) {
+    // Build the new element first: Args may reference an element moved below.
+    size_t NewCapacity = 0;
+    T* NewElts = mallocForGrow(0, NewCapacity);
+    ::new ((void*)(NewElts + this->size())) T(std::forward<ArgTypes>(Args)...);
+    moveElementsForGrow(NewElts);
+    takeAllocationForGrow(NewElts, NewCapacity);
+    this->set_size(this->size() + 1);
+    return this->back();
+  }
+
+ public:
+  void push_back(const T& Elt) {
+    const T* EltPtr = reserveForParamAndGetAddress(Elt); // Elt may have moved
+    ::new ((void*)this->end()) T(*EltPtr);
+    this->set_size(this->size() + 1);
+  }
+
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+  void push_back(T&& Elt) {
+    T* EltPtr = reserveForParamAndGetAddress(Elt); // Elt may have moved
+    ::new ((void*)this->end()) T(::std::move(*EltPtr));
+    this->set_size(this->size() + 1);
+  }
+
+  void pop_back() {
+    this->set_size(this->size() - 1);
+    this->end()->~T(); // end() now points at the element just removed
+  }
+};
+
+// Kept out-of-line on purpose, to dissuade the C++ compiler from inlining it.
+template <typename T, bool TriviallyCopyable>
+void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) {
+  size_t GrownCapacity = 0; // filled in by mallocForGrow
+  T* const FreshElts = this->mallocForGrow(MinSize, GrownCapacity);
+  this->moveElementsForGrow(FreshElts);
+  this->takeAllocationForGrow(FreshElts, GrownCapacity);
+}
+
+// Kept out-of-line on purpose, to dissuade the C++ compiler from inlining it.
+template <typename T, bool TriviallyCopyable>
+void SmallVectorTemplateBase<T, TriviallyCopyable>::moveElementsForGrow(
+    T* NewElts) {
+  T* const OldBegin = this->begin();
+  T* const OldEnd = this->end();
+  // Move-construct into the new buffer, then destroy the moved-from originals.
+  this->uninitialized_move(OldBegin, OldEnd, NewElts);
+  destroy_range(OldBegin, OldEnd);
+}
+
+// Kept out-of-line on purpose, to dissuade the C++ compiler from inlining it.
+template <typename T, bool TriviallyCopyable>
+void SmallVectorTemplateBase<T, TriviallyCopyable>::takeAllocationForGrow(
+    T* NewElts,
+    size_t NewCapacity) {
+  // Release the previous buffer unless it is the inline one, which lives
+  // inside the vector object itself and must never be passed to free().
+  if (T* OldElts = this->begin(); !this->isSmall())
+    free(OldElts);
+  this->BeginX = NewElts;
+  this->Capacity = NewCapacity;
+}
+
+/// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
+/// method implementations that are designed to work with trivially copyable
+/// T's. This allows using memcpy in place of copy/move construction and
+/// skipping destruction.
+template <typename T>
+class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
+  friend class SmallVectorTemplateCommon<T>;
+
+ protected:
+  /// True if it's cheap enough to take parameters by value (T no larger than
+  /// two pointers). Doing so avoids reference-invalidation mitigations.
+  static constexpr bool TakesParamByValue = sizeof(T) <= 2 * sizeof(void*);
+
+  /// Either const T& or T, depending on whether it's cheap enough to take
+  /// parameters by value.
+  using ValueParamT = std::conditional_t<TakesParamByValue, T, const T&>;
+
+  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+  // No need to do a destroy loop for POD's.
+  static void destroy_range(T* /*unused*/, T* /*unused*/) {}
+
+  /// Move the range [I, E) onto the uninitialized memory
+  /// starting with "Dest", constructing elements into it as needed.
+  template <typename It1, typename It2>
+  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+    // Moving a trivially copyable T is the same as copying it.
+    uninitialized_copy(I, E, Dest);
+  }
+
+  /// Copy the range [I, E) onto the uninitialized memory
+  /// starting with "Dest", constructing elements into it as needed.
+  template <typename It1, typename It2>
+  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+    // Arbitrary iterator types; just use the basic implementation.
+    std::uninitialized_copy(I, E, Dest);
+  }
+
+  /// Pointer overload of the copy above, selected when source and destination
+  /// element types match (ignoring const on the source); uses memcpy.
+  template <typename T1, typename T2>
+  static void uninitialized_copy(
+      T1* I,
+      T1* E,
+      T2* Dest,
+      std::enable_if_t<std::is_same_v<std::remove_const_t<T1>, T2>>* /*unused*/
+      = nullptr) {
+    // Use memcpy for PODs iterated by pointers (which includes SmallVector
+    // iterators): std::uninitialized_copy optimizes to memmove, but we can
+    // use memcpy here. Note that I and E are iterators and thus might be
+    // invalid for memcpy if they are equal.
+    if (I != E)
+      memcpy(reinterpret_cast<void*>(Dest), I, (E - I) * sizeof(T));
+  }
+
+  /// Double the size of the allocated memory, guaranteeing space for at
+  /// least one more element or MinSize if specified.
+  void grow(size_t MinSize = 0) {
+    this->grow_pod(MinSize, sizeof(T));
+  }
+
+  /// Reserve enough space to add N elements (one by default), and return the
+  /// updated element pointer in case it was a reference to the storage.
+  const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) {
+    return this->reserveForParamAndGetAddressImpl(this, Elt, N);
+  }
+
+  /// Reserve enough space to add N elements (one by default), and return the
+  /// updated element pointer in case it was a reference to the storage.
+  T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) {
+    return const_cast<T*>(this->reserveForParamAndGetAddressImpl(this, Elt, N));
+  }
+
+  /// Copy \p V or return a reference, depending on \a ValueParamT.
+  static ValueParamT forward_value_param(ValueParamT V) {
+    return V;
+  }
+
+  void growAndAssign(size_t NumElts, T Elt) {
+    // Elt has been copied in case it's an internal reference, side-stepping
+    // reference invalidation problems without losing the realloc optimization.
+    this->set_size(0); // drop the old contents before growing
+    this->grow(NumElts);
+    std::uninitialized_fill_n(this->begin(), NumElts, Elt);
+    this->set_size(NumElts);
+  }
+
+  template <typename... ArgTypes>
+  T& growAndEmplaceBack(ArgTypes&&... Args) {
+    // Use push_back with a copy in case Args has an internal reference,
+    // side-stepping reference invalidation problems without losing the realloc
+    // optimization.
+    push_back(T(std::forward<ArgTypes>(Args)...));
+    return this->back();
+  }
+
+ public:
+  void push_back(ValueParamT Elt) {
+    const T* EltPtr = reserveForParamAndGetAddress(Elt); // Elt may have moved
+    memcpy(reinterpret_cast<void*>(this->end()), EltPtr, sizeof(T));
+    this->set_size(this->size() + 1);
+  }
+
+  void pop_back() {
+    this->set_size(this->size() - 1); // no destructor call needed for T
+  }
+};
+
+/// This class consists of common code factored out of the SmallVector class to
+/// reduce code duplication based on the SmallVector 'N' template parameter.
+template <typename T>
+class SmallVectorImpl : public SmallVectorTemplateBase<T> {
+  using SuperClass = SmallVectorTemplateBase<T>;
+
+ public:
+  using iterator = typename SuperClass::iterator;
+  using const_iterator = typename SuperClass::const_iterator;
+  using reference = typename SuperClass::reference;
+  using size_type = typename SuperClass::size_type;
+
+ protected:
+  using SmallVectorTemplateBase<T>::TakesParamByValue;
+  using ValueParamT = typename SuperClass::ValueParamT;
+
+  // Default ctor - Initialize to empty.
+  explicit SmallVectorImpl(unsigned N) : SmallVectorTemplateBase<T>(N) {}
+
+ public:
+  SmallVectorImpl(const SmallVectorImpl&) = delete;
+
+  ~SmallVectorImpl() {
+    // Subclass has already destructed this vector's elements.
+    // If this wasn't grown from the inline copy, deallocate the old space.
+    if (!this->isSmall())
+      free(this->begin());
+  }
+
+  void clear() {
+    this->destroy_range(this->begin(), this->end());
+    this->Size = 0;
+  }
+
+ private:
+  template <bool ForOverwrite>
+  void resizeImpl(size_type N) {
+    if (N < this->size()) {
+      this->pop_back_n(this->size() - N);
+    } else if (N > this->size()) {
+      this->reserve(N);
+      for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
+        if (ForOverwrite)
+          new (&*I) T;
+        else
+          new (&*I) T();
+      this->set_size(N);
+    }
+  }
+
+ public:
+  void resize(size_type N) {
+    resizeImpl<false>(N);
+  }
+
+  /// Like resize, but if \ref T is POD, the new values won't be initialized.
+  void resize_for_overwrite(size_type N) {
+    resizeImpl<true>(N);
+  }
+
+  void resize(size_type N, ValueParamT NV) {
+    if (N == this->size())
+      return;
+
+    if (N < this->size()) {
+      this->pop_back_n(this->size() - N);
+      return;
+    }
+
+    // N > this->size(). Defer to append.
+    this->append(N - this->size(), NV);
+  }
+
+  void reserve(size_type N) {
+    if (this->capacity() < N)
+      this->grow(N);
+  }
+
+  void pop_back_n(size_type NumItems) {
+    assert(this->size() >= NumItems);
+    this->destroy_range(this->end() - NumItems, this->end());
+    this->set_size(this->size() - NumItems);
+  }
+
+  [[nodiscard]] T pop_back_val() {
+    T Result = ::std::move(this->back());
+    this->pop_back();
+    return Result;
+  }
+
+  void swap(SmallVectorImpl& RHS) noexcept;
+
+  /// Add the specified range to the end of the SmallVector.
+  template <
+      typename in_iter,
+      typename = std::enable_if_t<std::is_convertible_v<
+          typename std::iterator_traits<in_iter>::iterator_category,
+          std::input_iterator_tag>>>
+  void append(in_iter in_start, in_iter in_end) {
+    this->assertSafeToAddRange(in_start, in_end);
+    size_type NumInputs = std::distance(in_start, in_end);
+    this->reserve(this->size() + NumInputs);
+    this->uninitialized_copy(in_start, in_end, this->end());
+    this->set_size(this->size() + NumInputs);
+  }
+
+  /// Append \p NumInputs copies of \p Elt to the end.
+  void append(size_type NumInputs, ValueParamT Elt) {
+    const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumInputs);
+    std::uninitialized_fill_n(this->end(), NumInputs, *EltPtr);
+    this->set_size(this->size() + NumInputs);
+  }
+
+  void append(std::initializer_list<T> IL) {
+    append(IL.begin(), IL.end());
+  }
+
+  void append(const SmallVectorImpl& RHS) {
+    append(RHS.begin(), RHS.end());
+  }
+
+  void assign(size_type NumElts, ValueParamT Elt) {
+    // Note that Elt could be an internal reference.
+    if (NumElts > this->capacity()) {
+      this->growAndAssign(NumElts, Elt);
+      return;
+    }
+
+    // Assign over existing elements.
+    std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt);
+    if (NumElts > this->size())
+      std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt);
+    else if (NumElts < this->size())
+      this->destroy_range(this->begin() + NumElts, this->end());
+    this->set_size(NumElts);
+  }
+
+  // FIXME: Consider assigning over existing elements, rather than clearing &
+  // re-initializing them - for all assign(...) variants.
+
+  template <
+      typename in_iter,
+      typename = std::enable_if_t<std::is_convertible_v<
+          typename std::iterator_traits<in_iter>::iterator_category,
+          std::input_iterator_tag>>>
+  void assign(in_iter in_start, in_iter in_end) {
+    this->assertSafeToReferenceAfterClear(in_start, in_end);
+    clear();
+    append(in_start, in_end);
+  }
+
+  void assign(std::initializer_list<T> IL) {
+    clear();
+    append(IL);
+  }
+
+  void assign(const SmallVectorImpl& RHS) {
+    assign(RHS.begin(), RHS.end());
+  }
+
+  iterator erase(iterator I) {
+    assert(
+        this->isReferenceToStorage(I) && "Iterator to erase is out of bounds.");
+
+    iterator N = I;
+    // Shift all elts down one.
+    std::move(I + 1, this->end(), I);
+    // Drop the last elt.
+    this->pop_back();
+    return N;
+  }
+
+  iterator erase(iterator S, iterator E) {
+    assert(this->isRangeInStorage(S, E) && "Range to erase is out of bounds.");
+
+    iterator N = S;
+    // Shift all elts down.
+    iterator I = std::move(E, this->end(), S);
+    // Drop the last elts.
+    this->destroy_range(I, this->end());
+    this->set_size(I - this->begin());
+    return N;
+  }
+
+ private:
+  template <class ArgType>
+  iterator insert_one_impl(iterator I, ArgType&& Elt) {
+    // Callers ensure that ArgType is derived from T.
+    static_assert(
+        std::is_same<std::remove_const_t<std::remove_reference_t<ArgType>>, T>::
+            value,
+        "ArgType must be derived from T!");
+
+    if (I == this->end()) { // Important special case for empty vector.
+      this->push_back(::std::forward<ArgType>(Elt));
+      return this->end() - 1;
+    }
+
+    assert(
+        this->isReferenceToStorage(I) &&
+        "Insertion iterator is out of bounds.");
+
+    // Grow if necessary.
+    size_t Index = I - this->begin();
+    std::remove_reference_t<ArgType>* EltPtr =
+        this->reserveForParamAndGetAddress(Elt);
+    I = this->begin() + Index;
+
+    ::new ((void*)this->end()) T(::std::move(this->back()));
+    // Push everything else over.
+    std::move_backward(I, this->end() - 1, this->end());
+    this->set_size(this->size() + 1);
+
+    // If we just moved the element we're inserting, be sure to update
+    // the reference (never happens if TakesParamByValue).
+    static_assert(
+        !TakesParamByValue || std::is_same_v<ArgType, T>,
+        "ArgType must be 'T' when taking by value!");
+    if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end()))
+      ++EltPtr;
+
+    *I = ::std::forward<ArgType>(*EltPtr);
+    return I;
+  }
+
+ public:
+  iterator insert(iterator I, T&& Elt) {
+    return insert_one_impl(I, this->forward_value_param(std::move(Elt)));
+  }
+
+  iterator insert(iterator I, const T& Elt) {
+    return insert_one_impl(I, this->forward_value_param(Elt));
+  }
+
+  iterator insert(iterator I, size_type NumToInsert, ValueParamT Elt) {
+    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+    size_t InsertElt = I - this->begin();
+
+    if (I == this->end()) { // Important special case for empty vector.
+      append(NumToInsert, Elt);
+      return this->begin() + InsertElt;
+    }
+
+    assert(
+        this->isReferenceToStorage(I) &&
+        "Insertion iterator is out of bounds.");
+
+    // Ensure there is enough space, and get the (maybe updated) address of
+    // Elt.
+    const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumToInsert);
+
+    // Uninvalidate the iterator.
+    I = this->begin() + InsertElt;
+
+    // If there are more elements between the insertion point and the end of the
+    // range than there are being inserted, we can use a simple approach to
+    // insertion.  Since we already reserved space, we know that this won't
+    // reallocate the vector.
+    if (size_t(this->end() - I) >= NumToInsert) {
+      T* OldEnd = this->end();
+      append(
+          std::move_iterator<iterator>(this->end() - NumToInsert),
+          std::move_iterator<iterator>(this->end()));
+
+      // Copy the existing elements that get replaced.
+      std::move_backward(I, OldEnd - NumToInsert, OldEnd);
+
+      // If we just moved the element we're inserting, be sure to update
+      // the reference (never happens if TakesParamByValue).
+      if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end())
+        EltPtr += NumToInsert;
+
+      std::fill_n(I, NumToInsert, *EltPtr);
+      return I;
+    }
+
+    // Otherwise, we're inserting more elements than exist already, and we're
+    // not inserting at the end.
+
+    // Move over the elements that we're about to overwrite.
+    T* OldEnd = this->end();
+    this->set_size(this->size() + NumToInsert);
+    size_t NumOverwritten = OldEnd - I;
+    this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten);
+
+    // If we just moved the element we're inserting, be sure to update
+    // the reference (never happens if TakesParamByValue).
+    if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end())
+      EltPtr += NumToInsert;
+
+    // Replace the overwritten part.
+    std::fill_n(I, NumOverwritten, *EltPtr);
+
+    // Insert the non-overwritten middle part.
+    std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, *EltPtr);
+    return I;
+  }
+
+  /// Inserts the elements of the range [From, To) before position \p I and
+  /// returns an iterator to the first inserted element.
+  ///
+  /// \p I is converted to an index up front because reserve() may reallocate
+  /// the buffer. Except on the append-at-end fast path, the range is walked
+  /// once by std::distance() and a second time while its elements are copied.
+  template <
+      typename ItTy,
+      typename = std::enable_if_t<std::is_convertible_v<
+          typename std::iterator_traits<ItTy>::iterator_category,
+          std::input_iterator_tag>>>
+  iterator insert(iterator I, ItTy From, ItTy To) {
+    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+    size_t InsertElt = I - this->begin();
+
+    if (I == this->end()) { // Important special case for empty vector.
+      append(From, To);
+      return this->begin() + InsertElt;
+    }
+
+    assert(
+        this->isReferenceToStorage(I) &&
+        "Insertion iterator is out of bounds.");
+
+    // Check that the reserve that follows doesn't invalidate the iterators.
+    this->assertSafeToAddRange(From, To);
+
+    size_t NumToInsert = std::distance(From, To);
+
+    // Ensure there is enough space.
+    reserve(this->size() + NumToInsert);
+
+    // Uninvalidate the iterator.
+    I = this->begin() + InsertElt;
+
+    // If there are more elements between the insertion point and the end of the
+    // range than there are being inserted, we can use a simple approach to
+    // insertion.  Since we already reserved space, we know that this won't
+    // reallocate the vector.
+    if (size_t(this->end() - I) >= NumToInsert) {
+      T* OldEnd = this->end();
+      // Move the last NumToInsert elements past the old end; this grows the
+      // vector by exactly NumToInsert.
+      append(
+          std::move_iterator<iterator>(this->end() - NumToInsert),
+          std::move_iterator<iterator>(this->end()));
+
+      // Copy the existing elements that get replaced.
+      std::move_backward(I, OldEnd - NumToInsert, OldEnd);
+
+      // The gap [I, I + NumToInsert) holds live objects, so plain assignment
+      // is used here.
+      std::copy(From, To, I);
+      return I;
+    }
+
+    // Otherwise, we're inserting more elements than exist already, and we're
+    // not inserting at the end.
+
+    // Move over the elements that we're about to overwrite.
+    T* OldEnd = this->end();
+    this->set_size(this->size() + NumToInsert);
+    size_t NumOverwritten = OldEnd - I;
+    this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten);
+
+    // Replace the overwritten part.
+    for (T* J = I; NumOverwritten > 0; --NumOverwritten) {
+      *J = *From;
+      ++J;
+      ++From;
+    }
+
+    // Insert the non-overwritten middle part.
+    this->uninitialized_copy(From, To, OldEnd);
+    return I;
+  }
+
+  /// Inserts every element of the initializer list \p IL before position \p I
+  /// by delegating to the iterator-range overload.
+  void insert(iterator I, std::initializer_list<T> IL) {
+    const T* First = IL.begin();
+    const T* Last = IL.end();
+    insert(I, First, Last);
+  }
+
+  /// Constructs a new element at the end of the vector directly from \p Args
+  /// and returns a reference to it.
+  template <typename... ArgTypes>
+  reference emplace_back(ArgTypes&&... Args) {
+    // Slow path: no spare capacity, so hand over to the growing variant.
+    if (C10_UNLIKELY(this->capacity() <= this->size()))
+      return this->growAndEmplaceBack(std::forward<ArgTypes>(Args)...);
+
+    // Fast path: placement-new into the first unused slot, then account for
+    // the new element.
+    void* Slot = (void*)this->end();
+    ::new (Slot) T(std::forward<ArgTypes>(Args)...);
+    this->set_size(this->size() + 1);
+    return this->back();
+  }
+
+  SmallVectorImpl& operator=(const SmallVectorImpl& RHS);
+
+  SmallVectorImpl& operator=(SmallVectorImpl&& RHS) noexcept(
+      std::is_nothrow_move_constructible_v<T> &&
+      std::is_nothrow_destructible_v<T>);
+
+  /// Two vectors compare equal when they have the same length and their
+  /// elements compare equal position by position.
+  bool operator==(const SmallVectorImpl& RHS) const {
+    return this->size() == RHS.size() &&
+        std::equal(this->begin(), this->end(), RHS.begin());
+  }
+  /// Negation of operator==.
+  bool operator!=(const SmallVectorImpl& RHS) const {
+    const bool Same = (*this == RHS);
+    return !Same;
+  }
+
+  /// Lexicographical ordering over the elements of both vectors.
+  bool operator<(const SmallVectorImpl& RHS) const {
+    const auto LhsFirst = this->begin();
+    const auto LhsLast = this->end();
+    return std::lexicographical_compare(
+        LhsFirst, LhsLast, RHS.begin(), RHS.end());
+  }
+};
+
+/// Exchanges the contents of this vector with those of \p RHS.
+template <typename T>
+void SmallVectorImpl<T>::swap(SmallVectorImpl<T>& RHS) noexcept {
+  // Swapping a vector with itself is a no-op.
+  if (this == &RHS)
+    return;
+
+  // Elements only need to be touched if at least one side is small; otherwise
+  // exchanging the three header fields is sufficient.
+  const bool NeitherIsSmall = !this->isSmall() && !RHS.isSmall();
+  if (NeitherIsSmall) {
+    std::swap(this->BeginX, RHS.BeginX);
+    std::swap(this->Size, RHS.Size);
+    std::swap(this->Capacity, RHS.Capacity);
+    return;
+  }
+
+  // Make sure each side has room for the other side's elements.
+  this->reserve(RHS.size());
+  RHS.reserve(this->size());
+
+  // Swap, element by element, the prefix both vectors have in common.
+  const size_t NumShared =
+      this->size() < RHS.size() ? this->size() : RHS.size();
+  for (size_type Idx = 0; Idx != NumShared; ++Idx)
+    std::swap((*this)[Idx], RHS[Idx]);
+
+  // Equal lengths: the common prefix was everything.
+  if (this->size() == RHS.size())
+    return;
+
+  if (this->size() > RHS.size()) {
+    // This vector is the longer one: copy its tail onto the end of RHS, then
+    // destroy the tail here.
+    const size_t Extra = this->size() - RHS.size();
+    this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end());
+    RHS.set_size(RHS.size() + Extra);
+    this->destroy_range(this->begin() + NumShared, this->end());
+    this->set_size(NumShared);
+  } else {
+    // RHS is the longer one: mirror image of the branch above.
+    const size_t Extra = RHS.size() - this->size();
+    this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end());
+    this->set_size(this->size() + Extra);
+    this->destroy_range(RHS.begin() + NumShared, RHS.end());
+    RHS.set_size(NumShared);
+  }
+}
+
+/// Copy assignment: makes this vector an element-wise copy of \p RHS, reusing
+/// already-constructed elements where possible.
+template <typename T>
+SmallVectorImpl<T>& SmallVectorImpl<T>::operator=(
+    const SmallVectorImpl<T>& RHS) {
+  // Self-assignment leaves the vector untouched.
+  if (this == &RHS)
+    return *this;
+
+  const size_t NewSize = RHS.size();
+  size_t NumLive = this->size();
+
+  // Shrinking (or same size): assign over the first NewSize elements and
+  // destroy whatever is left behind them.
+  if (NumLive >= NewSize) {
+    iterator Tail = this->begin();
+    if (NewSize)
+      Tail = std::copy(RHS.begin(), RHS.begin() + NewSize, this->begin());
+
+    this->destroy_range(Tail, this->end());
+    this->set_size(NewSize);
+    return *this;
+  }
+
+  // Growing. If the buffer is large enough, assign over the live elements;
+  // otherwise drop them first so that grow() has nothing to carry over.
+  // FIXME: don't do this if they're efficiently moveable.
+  if (this->capacity() >= NewSize) {
+    if (NumLive)
+      std::copy(RHS.begin(), RHS.begin() + NumLive, this->begin());
+  } else {
+    this->clear();
+    NumLive = 0;
+    this->grow(NewSize);
+  }
+
+  // Copy-construct the remaining elements into the uninitialized tail.
+  this->uninitialized_copy(
+      RHS.begin() + NumLive, RHS.end(), this->begin() + NumLive);
+
+  this->set_size(NewSize);
+  return *this;
+}
+
+/// Move assignment.
+///
+/// When \p RHS is not small, this vector's elements are destroyed, its own
+/// non-small buffer (if any) is released with free(), and RHS's BeginX, Size
+/// and Capacity are taken over before RHS.resetToSmall() is called. Otherwise
+/// the elements are moved across individually and RHS.clear() is called.
+template <typename T>
+SmallVectorImpl<T>& SmallVectorImpl<T>::
+operator=(SmallVectorImpl<T>&& RHS) noexcept(
+    std::is_nothrow_move_constructible_v<T> &&
+    std::is_nothrow_destructible_v<T>) {
+  // Avoid self-assignment.
+  if (this == &RHS)
+    return *this;
+
+  // If the RHS isn't small, clear this vector and then steal its buffer.
+  if (!RHS.isSmall()) {
+    this->destroy_range(this->begin(), this->end());
+    if (!this->isSmall())
+      free(this->begin());
+    this->BeginX = RHS.BeginX;
+    this->Size = RHS.Size;
+    this->Capacity = RHS.Capacity;
+    // RHS must stop referring to the buffer that now belongs to this vector.
+    RHS.resetToSmall();
+    return *this;
+  }
+
+  // If we already have sufficient space, assign the common elements, then
+  // destroy any excess.
+  size_t RHSSize = RHS.size();
+  size_t CurSize = this->size();
+  if (CurSize >= RHSSize) {
+    // Assign common elements.
+    iterator NewEnd = this->begin();
+    if (RHSSize)
+      NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
+
+    // Destroy excess elements and trim the bounds.
+    this->destroy_range(NewEnd, this->end());
+    this->set_size(RHSSize);
+
+    // Clear the RHS.
+    RHS.clear();
+
+    return *this;
+  }
+
+  // If we have to grow to have enough elements, destroy the current elements.
+  // This allows us to avoid copying them during the grow.
+  // FIXME: this may not actually make any sense if we can efficiently move
+  // elements.
+  if (this->capacity() < RHSSize) {
+    // Destroy current elements.
+    this->clear();
+    CurSize = 0;
+    this->grow(RHSSize);
+  } else if (CurSize) {
+    // Otherwise, use assignment for the already-constructed elements.
+    std::move(RHS.begin(), RHS.begin() + CurSize, this->begin());
+  }
+
+  // Move-construct the new elements in place.
+  this->uninitialized_move(
+      RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize);
+
+  // Set end.
+  this->set_size(RHSSize);
+
+  // Destroy the moved-from elements that are still held by RHS.
+  RHS.clear();
+  return *this;
+}
+
+/// Storage for the SmallVector elements.  This is specialized for the N=0 case
+/// to avoid allocating unnecessary storage.
+template <typename T, unsigned N>
+struct SmallVectorStorage {
+  // Raw bytes sized for N objects of type T and aligned like T; declaring
+  // this array does not construct any T.
+  alignas(T) char InlineElts[N * sizeof(T)];
+};
+
+/// We need the storage to be properly aligned even for small-size of 0 so that
+/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
+/// well-defined.
+template <typename T>
+struct alignas(T) SmallVectorStorage<T, 0> {};
+
+/// Forward declaration of SmallVector so that
+/// calculateSmallVectorDefaultInlinedElements can reference
+/// `sizeof(SmallVector<T, 0>)`.
+template <typename T, unsigned N>
+class /* LLVM_GSL_OWNER */ SmallVector;
+
+/// Helper class for calculating the default number of inline elements for
+/// `SmallVector<T>`.
+///
+/// This should be migrated to a constexpr function when our minimum
+/// compiler support is enough for multi-statement constexpr functions.
+template <typename T>
+struct CalculateSmallVectorDefaultInlinedElements {
+  // Parameter controlling the default number of inlined elements
+  // for `SmallVector<T>`.
+  //
+  // The default number of inlined elements ensures that
+  // 1. There is at least one inlined element.
+  // 2. `sizeof(SmallVector<T>) <= kPreferredSmallVectorSizeof` unless
+  // it contradicts 1.
+  static constexpr size_t kPreferredSmallVectorSizeof = 64;
+
+  // static_assert that sizeof(T) is not "too big".
+  //
+  // Because our policy guarantees at least one inlined element, it is possible
+  // for an arbitrarily large inlined element to allocate an arbitrarily large
+  // amount of inline storage. We generally consider it an antipattern for a
+  // SmallVector to allocate an excessive amount of inline storage, so we want
+  // to call attention to these cases and make sure that users are making an
+  // intentional decision if they request a lot of inline storage.
+  //
+  // We want this assertion to trigger in pathological cases, but otherwise
+  // not be too easy to hit. To accomplish that, the cutoff is actually somewhat
+  // larger than kPreferredSmallVectorSizeof (otherwise,
+  // `SmallVector<SmallVector<T>>` would be one easy way to trip it, and that
+  // pattern seems useful in practice).
+  //
+  // One wrinkle is that this assertion is in theory non-portable, since
+  // sizeof(T) is in general platform-dependent. However, we don't expect this
+  // to be much of an issue, because most LLVM development happens on 64-bit
+  // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for
+  // 32-bit hosts, dodging the issue. The reverse situation, where development
+  // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a
+  // 64-bit host, is expected to be very rare.
+  static_assert(
+      sizeof(T) <= 256,
+      "You are trying to use a default number of inlined elements for "
+      "`SmallVector<T>` but `sizeof(T)` is really big! Please use an "
+      "explicit number of inlined elements with `SmallVector<T, N>` to make "
+      "sure you really want that much inline storage.");
+
+  // Discount the size of the header itself when calculating the maximum inline
+  // bytes.
+  // NOTE(review): the subtraction is unsigned, so this relies on
+  // sizeof(SmallVector<T, 0>) not exceeding kPreferredSmallVectorSizeof.
+  static constexpr size_t PreferredInlineBytes =
+      kPreferredSmallVectorSizeof - sizeof(SmallVector<T, 0>);
+  // Whole elements of type T that fit in the remaining bytes (rounds down).
+  static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T);
+  // The result: at least one inline element, even when not a single T fits
+  // into the preferred size.
+  static constexpr size_t value =
+      NumElementsThatFit == 0 ? 1 : NumElementsThatFit;
+};
+
+/// This is a 'vector' (really, a variable-sized array), optimized
+/// for the case when the array is small.  It contains some number of elements
+/// in-place, which allows it to avoid heap allocation when the actual number of
+/// elements is below that threshold.  This allows normal "small" cases to be
+/// fast without losing generality for large inputs.
+///
+/// \note
+/// In the absence of a well-motivated choice for the number of inlined
+/// elements \p N, it is recommended to use \c SmallVector<T> (that is,
+/// omitting the \p N). This will choose a default number of inlined elements
+/// reasonable for allocation on the stack (for example, trying to keep \c
+/// sizeof(SmallVector<T>) around 64 bytes).
+///
+/// \warning This does not attempt to be exception safe.
+///
+/// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h
+template <
+    typename T,
+    unsigned N = CalculateSmallVectorDefaultInlinedElements<T>::value>
+class /* LLVM_GSL_OWNER */ SmallVector : public SmallVectorImpl<T>,
+                                         SmallVectorStorage<T, N> {
+ public:
+  /// Default constructor; forwards the inline element count N to the base.
+  SmallVector() : SmallVectorImpl<T>(N) {}
+
+  ~SmallVector() {
+    // Destroy the constructed elements in the vector.
+    this->destroy_range(this->begin(), this->end());
+  }
+
+  /// Builds the vector by assigning \p Size copies of \p Value.
+  explicit SmallVector(size_t Size, const T& Value = T())
+      : SmallVectorImpl<T>(N) {
+    this->assign(Size, Value);
+  }
+
+  /// Builds the vector by appending the iterator range [S, E).
+  template <
+      typename ItTy,
+      typename = std::enable_if_t<std::is_convertible_v<
+          typename std::iterator_traits<ItTy>::iterator_category,
+          std::input_iterator_tag>>>
+  SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
+    this->append(S, E);
+  }
+
+  // note: The enable_if restricts Container to types that have a .begin() and
+  // .end() that return valid input iterators.
+  /// Builds the vector by appending [c.begin(), c.end()).
+  template <
+      typename Container,
+      std::enable_if_t<
+          std::is_convertible_v<
+              typename std::iterator_traits<
+                  decltype(std::declval<Container>()
+                               .begin())>::iterator_category,
+              std::input_iterator_tag> &&
+              std::is_convertible_v<
+                  typename std::iterator_traits<
+                      decltype(std::declval<Container>()
+                                   .end())>::iterator_category,
+                  std::input_iterator_tag>,
+          int> = 0>
+  explicit SmallVector(Container&& c) : SmallVectorImpl<T>(N) {
+    this->append(c.begin(), c.end());
+  }
+
+  /// Builds the vector from the elements of an initializer list.
+  SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
+    this->assign(IL);
+  }
+
+  /// Copy constructor; the base-class copy assignment is skipped for an empty
+  /// source.
+  SmallVector(const SmallVector& RHS) : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(RHS);
+  }
+
+  /// Copy assignment; delegates to the base-class copy assignment.
+  SmallVector& operator=(const SmallVector& RHS) {
+    SmallVectorImpl<T>::operator=(RHS);
+    return *this;
+  }
+
+  /// Move constructor; the base-class move assignment is skipped for an empty
+  /// source.
+  SmallVector(SmallVector&& RHS) noexcept(
+      std::is_nothrow_move_assignable_v<SmallVectorImpl<T>>)
+      : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(::std::move(RHS));
+  }
+
+  // note: The enable_if restricts Container to types that have a .begin() and
+  // .end() that return valid input iterators.
+  /// Replaces the contents with a copy of [RHS.begin(), RHS.end()).
+  template <
+      typename Container,
+      std::enable_if_t<
+          std::is_convertible_v<
+              typename std::iterator_traits<
+                  decltype(std::declval<Container>()
+                               .begin())>::iterator_category,
+              std::input_iterator_tag> &&
+              std::is_convertible_v<
+                  typename std::iterator_traits<
+                      decltype(std::declval<Container>()
+                                   .end())>::iterator_category,
+                  std::input_iterator_tag>,
+          int> = 0>
+  SmallVector& operator=(const Container& RHS) {
+    this->assign(RHS.begin(), RHS.end());
+    return *this;
+  }
+
+  /// Move-constructs from any SmallVectorImpl<T>, whatever its inline size.
+  SmallVector(SmallVectorImpl<T>&& RHS) noexcept(
+      std::is_nothrow_move_assignable_v<SmallVectorImpl<T>>)
+      : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(::std::move(RHS));
+  }
+
+  /// Move assignment; delegates to the base-class move assignment.
+  SmallVector& operator=(SmallVector&& RHS) noexcept(
+      std::is_nothrow_move_assignable_v<SmallVectorImpl<T>>) {
+    SmallVectorImpl<T>::operator=(::std::move(RHS));
+    return *this;
+  }
+
+  /// Move assignment from any SmallVectorImpl<T>.
+  ///
+  /// The exception specification is stated in terms of move *assignment* of
+  /// the base class, because that is the only operation the body performs;
+  /// this also matches the three sibling move operations of this class.
+  SmallVector& operator=(SmallVectorImpl<T>&& RHS) noexcept(
+      std::is_nothrow_move_assignable_v<SmallVectorImpl<T>>) {
+    SmallVectorImpl<T>::operator=(::std::move(RHS));
+    return *this;
+  }
+
+  // note: The enable_if restricts Container to types that have a .begin() and
+  // .end() that return valid input iterators.
+  /// Replaces the contents with a copy of [C.begin(), C.end()); the elements
+  /// of \p C are copied, not moved, even when \p C is an rvalue.
+  template <
+      typename Container,
+      std::enable_if_t<
+          std::is_convertible_v<
+              typename std::iterator_traits<
+                  decltype(std::declval<Container>()
+                               .begin())>::iterator_category,
+              std::input_iterator_tag> &&
+              std::is_convertible_v<
+                  typename std::iterator_traits<
+                      decltype(std::declval<Container>()
+                                   .end())>::iterator_category,
+                  std::input_iterator_tag>,
+          int> = 0>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+  SmallVector& operator=(Container&& C) {
+    this->assign(C.begin(), C.end());
+    return *this;
+  }
+
+  /// Replaces the contents with the elements of an initializer list.
+  SmallVector& operator=(std::initializer_list<T> IL) {
+    this->assign(IL);
+    return *this;
+  }
+};
+
+/// Free-function form of SmallVector::capacity_in_bytes(); returns whatever
+/// the member function of \p X returns.
+template <typename T, unsigned N>
+inline size_t capacity_in_bytes(const SmallVector<T, N>& X) {
+  return X.capacity_in_bytes();
+}
+
+/// Streams \p list as "[e0, e1, ...]", writing each element with its own
+/// operator<<. An empty vector is written as "[]".
+template <typename T, unsigned N>
+std::ostream& operator<<(std::ostream& out, const SmallVector<T, N>& list) {
+  // A first-element flag replaces the former int counter, so nothing is
+  // incremented per element and nothing can overflow.
+  bool first = true;
+  out << '[';
+  // Elements are visited by const reference; printing must not copy them.
+  for (const auto& e : list) {
+    if (!first)
+      out << ", ";
+    first = false;
+    out << e;
+  }
+  out << ']';
+  return out;
+}
+
+template <typename RangeType>
+using ValueTypeFromRangeType = std::remove_const_t<
+    std::remove_reference_t<decltype(*std::begin(std::declval<RangeType&>()))>>;
+
+/// Given a range of type R, iterate the entire range and return a
+/// SmallVector with elements of the vector.  This is useful, for example,
+/// when you want to iterate a range and then sort the results.
+///
+/// This overload takes the inline element count \p Size explicitly.
+template <unsigned Size, typename R>
+// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+SmallVector<ValueTypeFromRangeType<R>, Size> to_vector(R&& Range) {
+  // NOTE(review): the braced return is copy-list-initialization, so the
+  // std::initializer_list constructor is considered first whenever the
+  // iterators convert to the element type; confirm before rewriting this as
+  // a parenthesized constructor call.
+  return {std::begin(Range), std::end(Range)};
+}
+/// Overload of to_vector that uses the default inline element count computed
+/// by CalculateSmallVectorDefaultInlinedElements for the range's value type.
+template <typename R>
+SmallVector<
+    ValueTypeFromRangeType<R>,
+    CalculateSmallVectorDefaultInlinedElements<
+        ValueTypeFromRangeType<R>>::value>
+// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+to_vector(R&& Range) {
+  // NOTE(review): the braced return is copy-list-initialization, so the
+  // std::initializer_list constructor is considered first whenever the
+  // iterators convert to the element type; confirm before rewriting this as
+  // a parenthesized constructor call.
+  return {std::begin(Range), std::end(Range)};
+}
+
+} // end namespace c10
+
+namespace std {
+
+/// Implement std::swap in terms of SmallVector swap.
+/// This overload takes the size-erased base type and forwards to the member
+/// swap() of \p LHS.
+template <typename T>
+inline void swap(
+    c10::SmallVectorImpl<T>& LHS,
+    c10::SmallVectorImpl<T>& RHS) noexcept {
+  LHS.swap(RHS);
+}
+
+/// Implement std::swap in terms of SmallVector swap.
+/// This overload takes two vectors with the same inline element count N and
+/// forwards to the member swap() of \p LHS.
+template <typename T, unsigned N>
+inline void swap(
+    c10::SmallVector<T, N>& LHS,
+    c10::SmallVector<T, N>& RHS) noexcept {
+  LHS.swap(RHS);
+}
+
+} // end namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c77905085305f5b2884985df2857a219a760c56
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/StringUtil.h
@@ -0,0 +1,267 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef C10_UTIL_STRINGUTIL_H_
+#define C10_UTIL_STRINGUTIL_H_
+
+#include <c10/macros/Macros.h>
+#include <c10/util/string_utils.h>
+
+#include <cstddef>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
+#endif
+
+namespace c10 {
+
+namespace detail {
+
+// Obtains the base name from a full path.
+C10_API std::string StripBasename(const std::string& full_path);
+
+C10_API std::string ExcludeFileExtension(const std::string& full_path);
+
+// Stateless stand-in for an empty string. It converts on demand to either a
+// reference to an empty std::string or to an empty C string.
+struct CompileTimeEmptyString {
+  operator const std::string&() const {
+    // Function-local static, so every conversion refers to the same object.
+    static const std::string kEmpty{};
+    return kEmpty;
+  }
+  operator const char*() const {
+    const char* const empty_c_str = "";
+    return empty_c_str;
+  }
+};
+
+// Maps an argument type of c10::str() to the type used to select a
+// _str_wrapper specialization. By default a type T is passed on as const T&.
+template <typename T>
+struct CanonicalizeStrTypes {
+  using type = const T&;
+};
+
+// Character arrays (such as string literals) are passed on as const char*.
+template <size_t N>
+// NOLINTNEXTLINE(*c-arrays*)
+struct CanonicalizeStrTypes<char[N]> {
+  using type = const char*;
+};
+
+// Base case of the variadic _str: with no values left, return the stream
+// unchanged.
+inline std::ostream& _str(std::ostream& ss) {
+  return ss;
+}
+
+// Trait that is true when the expression `std::ostream& << T{}` is
+// well-formed and has type std::ostream&, and false otherwise. Because the
+// probe uses `T{}`, a type that cannot be brace-initialized from nothing is
+// reported as not streamable.
+template <class T, class = std::ostream&>
+struct Streamable : std::false_type {};
+
+template <class T>
+struct Streamable<T, decltype(std::declval<std::ostream&>() << T{})>
+    : std::true_type {};
+
+// Streams a single value. An enum for which Streamable is false is written
+// as its underlying integer; every other value goes through operator<<.
+template <typename T>
+inline std::ostream& _str(std::ostream& ss, const T& t) {
+  if constexpr (!std::is_enum_v<T> || Streamable<T>::value) {
+    // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
+    ss << t;
+    return ss;
+  } else {
+    // NOLINTNEXTLINE(modernize-type-traits)
+    using Underlying = typename std::underlying_type<T>::type;
+    return _str(ss, static_cast<Underlying>(t));
+  }
+}
+
+// Streams an optional: the contained value when present, otherwise the
+// literal text "std::nullopt".
+template <typename T>
+inline std::ostream& _str(std::ostream& ss, const std::optional<T>& t) {
+  if (!t.has_value()) {
+    ss << "std::nullopt";
+    return ss;
+  }
+  return _str(ss, t.value());
+}
+// Overloads of _str for wide types; forces narrowing.
+C10_API std::ostream& _str(std::ostream& ss, const wchar_t* wCStr);
+C10_API std::ostream& _str(std::ostream& ss, const wchar_t& wChar);
+C10_API std::ostream& _str(std::ostream& ss, const std::wstring& wString);
+
+// Streaming a CompileTimeEmptyString writes nothing and returns the stream
+// unchanged.
+template <>
+inline std::ostream& _str<CompileTimeEmptyString>(
+    std::ostream& ss,
+    const CompileTimeEmptyString& /*unused*/) {
+  return ss;
+}
+
+// Variadic case: stream the first value, then recurse on the remaining ones
+// using the stream returned for the first.
+template <typename T, typename... Args>
+inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) {
+  std::ostream& after_first = _str(ss, t);
+  return _str(after_first, args...);
+}
+
+// Generic implementation behind c10::str(): streams every argument into a
+// std::ostringstream via _str and returns the accumulated std::string.
+template <typename... Args>
+struct _str_wrapper final {
+  static std::string call(const Args&... args) {
+    std::ostringstream ss;
+    _str(ss, args...);
+    return ss.str();
+  }
+};
+
+// Specializations for already-a-string types.
+template <>
+struct _str_wrapper<std::string> final {
+  // return by reference to avoid the binary size of a string copy
+  static const std::string& call(const std::string& str) {
+    return str;
+  }
+};
+
+// A lone C string is passed through as the same pointer; no std::string is
+// created.
+template <>
+struct _str_wrapper<const char*> final {
+  static const char* call(const char* str) {
+    return str;
+  }
+};
+
+// For c10::str() with an empty argument list (which is common in our assert
+// macros), we don't want to pay the binary size for constructing and
+// destructing a stringstream or even constructing a string.
+template <>
+struct _str_wrapper<> final {
+  static CompileTimeEmptyString call() {
+    return CompileTimeEmptyString();
+  }
+};
+
+} // namespace detail
+
+// Convert a list of string-like arguments into a single string.
+// The return type is whatever the selected detail::_str_wrapper
+// specialization returns from call().
+template <typename... Args>
+inline auto str(const Args&... args) {
+  using Wrapper = detail::_str_wrapper<
+      typename detail::CanonicalizeStrTypes<Args>::type...>;
+  return Wrapper::call(args...);
+}
+
+// Concatenates the elements of `v`, each written with operator<<, placing
+// `delimiter` between consecutive elements. An empty container yields an
+// empty string and a single element yields no delimiter.
+//
+// Only begin()/end() of the container are used. The delimiter is tracked
+// with a first-element flag, so the element count is never narrowed to int
+// and no temporary std::string is built per element.
+template <class Container>
+inline std::string Join(const std::string& delimiter, const Container& v) {
+  std::stringstream s;
+  bool first = true;
+  for (auto i = v.begin(); i != v.end(); ++i) {
+    if (!first) {
+      s << delimiter;
+    }
+    first = false;
+    s << (*i);
+  }
+  return std::move(s).str();
+}
+
+// Replace all occurrences of "from" substring to "to" string.
+// Returns number of replacements
+size_t C10_API
+ReplaceAll(std::string& s, std::string_view from, std::string_view to);
+
+/// Represents a location in source code (for debugging).
+/// The two strings are held as raw pointers; this struct does not copy or
+/// free them.
+struct C10_API SourceLocation {
+  const char* function; // function name
+  const char* file; // file name
+  uint32_t line; // line number
+};
+
+std::ostream& operator<<(std::ostream& out, const SourceLocation& loc);
+
+// unix isprint but insensitive to locale
+// True exactly for the characters 0x20..0x7e.
+inline bool isPrint(char s) {
+  return s > 0x1f && s < 0x7f;
+}
+
+// Writes `str` to `stmt` as a double-quoted literal.
+//
+// Backslash, both quote characters and the control characters \a \b \f \n \r
+// \t \v are written as their two-character escapes. Every other character
+// for which isPrint() is false is written as a backslash followed by exactly
+// three octal digits; printable characters are written unchanged.
+inline void printQuotedString(std::ostream& stmt, const std::string_view str) {
+  stmt << '"';
+  for (auto s : str) {
+    switch (s) {
+      case '\\':
+        stmt << "\\\\";
+        break;
+      case '\'':
+        stmt << "\\'";
+        break;
+      case '\"':
+        stmt << "\\\"";
+        break;
+      case '\a':
+        stmt << "\\a";
+        break;
+      case '\b':
+        stmt << "\\b";
+        break;
+      case '\f':
+        stmt << "\\f";
+        break;
+      case '\n':
+        stmt << "\\n";
+        break;
+      case '\r':
+        stmt << "\\r";
+        break;
+      case '\t':
+        stmt << "\\t";
+        break;
+      case '\v':
+        stmt << "\\v";
+        break;
+      default:
+        if (isPrint(s)) {
+          stmt << s;
+        } else {
+          // C++ io has stateful formatting settings. Messing with
+          // them is probably worse than doing this manually.
+          //
+          // The digits are taken from the byte value as an unsigned number.
+          // Where plain char is signed, bytes >= 0x80 are negative, and
+          // dividing a negative char would produce characters below '0'
+          // instead of octal digits.
+          const unsigned byte = static_cast<unsigned char>(s);
+          // NOLINTNEXTLINE(*c-arrays*)
+          char buf[4] = "000";
+          buf[0] = static_cast<char>('0' + ((byte >> 6) & 7u));
+          buf[1] = static_cast<char>('0' + ((byte >> 3) & 7u));
+          buf[2] = static_cast<char>('0' + (byte & 7u));
+          stmt << "\\" << buf;
+        }
+        break;
+    }
+  }
+  stmt << '"';
+}
+
+template <typename T>
+std::optional<T> tryToNumber(const char* symbol) = delete;
+template <typename T>
+std::optional<T> tryToNumber(const std::string& symbol) = delete;
+
+/*
+ * Convert a string to a 64-bit integer. Trailing whitespace is not supported.
+ * Similarly, an integer string with trailing characters like "123abc" will be
+ * rejected.
+ */
+template <>
+C10_API std::optional<int64_t> tryToNumber<int64_t>(const char* symbol);
+template <>
+C10_API std::optional<int64_t> tryToNumber<int64_t>(const std::string& symbol);
+
+/*
+ * Convert a string to a double. Trailing whitespace is not supported.
+ * Similarly, a numeric string with trailing characters like "123abc" will
+ * be rejected.
+ */
+template <>
+C10_API std::optional<double> tryToNumber<double>(const char* symbol);
+template <>
+C10_API std::optional<double> tryToNumber<double>(const std::string& symbol);
+
+C10_API std::vector<std::string_view> split(
+    std::string_view target,
+    char delimiter);
+} // namespace c10
+
+C10_CLANG_DIAGNOSTIC_POP()
+
+#endif // C10_UTIL_STRINGUTIL_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h
new file mode 100644
index 0000000000000000000000000000000000000000..c78564263ebfe172abcb5c097a8c222606e8f019
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Synchronized.h
@@ -0,0 +1,67 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <mutex>
+
+namespace c10 {
+
+/**
+ * A very simple synchronization class for error-free use of data
+ * in a multi-threaded context. See folly/docs/Synchronized.md for
+ * the inspiration of this class.
+ *
+ * Full URL:
+ * https://github.com/facebook/folly/blob/main/folly/docs/Synchronized.md
+ *
+ * This class implements a small subset of the generic functionality
+ * implemented by folly::Synchronized<T>. Specifically, only withLock()
+ * is implemented here since it's the smallest possible API that is
+ * able to cover a large surface area of functionality offered by
+ * folly::Synchronized<T>.
+ */
+template <typename T>
+class Synchronized final {
+  // Held for the duration of every withLock() call. It is mutable so that
+  // the const overload of withLock() can lock it as well.
+  mutable std::mutex mutex_;
+  // The protected value; the only access path is the callback of withLock().
+  T data_;
+
+ public:
+  Synchronized() = default;
+  Synchronized(T const& data) : data_(data) {}
+  Synchronized(T&& data) : data_(std::move(data)) {}
+
+  // Don't permit copy construction, move construction, copy assignment or
+  // move assignment: the underlying std::mutex is neither copyable nor
+  // movable. The deleted assignment operators use the canonical
+  // reference-returning signature.
+  Synchronized(Synchronized const&) = delete;
+  Synchronized(Synchronized&&) = delete;
+  Synchronized& operator=(Synchronized const&) = delete;
+  Synchronized& operator=(Synchronized&&) = delete;
+  ~Synchronized() = default;
+
+  /**
+   * To use, call withLock() with a callback that accepts T either
+   * by copy or by reference. Use the protected variable in the
+   * provided callback safely.
+   *
+   * The mutex is held while the callback runs. Because the return type is
+   * deduced with plain `auto`, the callback's result is returned by value.
+   */
+  template <typename CB>
+  auto withLock(CB&& cb) {
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    return std::forward<CB>(cb)(this->data_);
+  }
+
+  /**
+   * To use, call withLock() with a callback that accepts T either
+   * by copy or by const reference. Use the protected variable in
+   * the provided callback safely.
+   *
+   * The mutex is held while the callback runs. Because the return type is
+   * deduced with plain `auto`, the callback's result is returned by value.
+   */
+  template <typename CB>
+  auto withLock(CB&& cb) const {
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    return std::forward<CB>(cb)(this->data_);
+  }
+};
+} // end namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..03ba6f5b39ba567f65bfa375df66c413a88c171b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h
@@ -0,0 +1,90 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+
+#include <cstdint>
+#include <memory>
+
+namespace c10 {
+
+// Tag that identifies the kind of a debug-info entry. It is the key passed
+// to ThreadLocalDebugInfo::get/_push/_pop/_peek and to DebugInfoGuard.
+// The underlying type is uint8_t and the enumerators are numbered from 0.
+enum class C10_API_ENUM DebugInfoKind : uint8_t {
+  PRODUCER_INFO = 0,
+  MOBILE_RUNTIME_INFO,
+  PROFILER_STATE,
+  INFERENCE_CONTEXT, // for inference usage
+  PARAM_COMMS_INFO,
+
+  TEST_INFO, // used only in tests
+  TEST_INFO_2, // used only in tests
+};
+
+// Polymorphic base class of the debug-info objects handled by
+// ThreadLocalDebugInfo. The virtual destructor allows derived objects to be
+// destroyed through a DebugInfoBase pointer.
+class C10_API DebugInfoBase {
+ public:
+  DebugInfoBase() = default;
+  virtual ~DebugInfoBase() = default;
+};
+
+// Thread local debug information is propagated across the forward
+// (including async fork tasks) and backward passes and is supposed
+// to be utilized by the user's code to pass extra information from
+// the higher layers (e.g. model id) down to the lower levels
+// (e.g. to the operator observers used for debugging, logging,
+// profiling, etc)
+class C10_API ThreadLocalDebugInfo {
+ public:
+  // Look up the debug info of the given kind; the result is a non-owning
+  // pointer.
+  // NOTE(review): presumably nullptr when no entry of that kind is active -
+  // confirm against the implementation.
+  static DebugInfoBase* get(DebugInfoKind kind);
+
+  // Get current ThreadLocalDebugInfo
+  static std::shared_ptr<ThreadLocalDebugInfo> current();
+
+  // Internal, use DebugInfoGuard/ThreadLocalStateGuard
+  static void _forceCurrentDebugInfo(
+      std::shared_ptr<ThreadLocalDebugInfo> info);
+
+  // Push debug info struct of a given kind
+  static void _push(DebugInfoKind kind, std::shared_ptr<DebugInfoBase> info);
+  // Pop debug info, throws in case the last pushed
+  // debug info is not of a given kind
+  static std::shared_ptr<DebugInfoBase> _pop(DebugInfoKind kind);
+  // Peek debug info, throws in case the last pushed debug info is not of the
+  // given kind
+  static std::shared_ptr<DebugInfoBase> _peek(DebugInfoKind kind);
+
+ private:
+  // Payload of this entry and the kind it was registered under.
+  std::shared_ptr<DebugInfoBase> info_;
+  DebugInfoKind kind_;
+  // NOTE(review): presumably the entry that was current before this one was
+  // pushed, forming a linked stack - confirm against the implementation.
+  std::shared_ptr<ThreadLocalDebugInfo> parent_info_;
+
+  friend class DebugInfoGuard;
+};
+
+// DebugInfoGuard is used to set debug information,
+// ThreadLocalDebugInfo is semantically immutable, the values are set
+// through the scope-based guard object.
+// Nested DebugInfoGuard adds/overrides existing values in the scope,
+// restoring the original values after exiting the scope.
+// Users can access the values through the ThreadLocalDebugInfo::get() call;
+class C10_API DebugInfoGuard {
+ public:
+  // Sets debug info of the given kind for the lifetime of the guard.
+  DebugInfoGuard(DebugInfoKind kind, std::shared_ptr<DebugInfoBase> info);
+
+  // Takes a complete ThreadLocalDebugInfo rather than a single entry.
+  // NOTE(review): presumably installs it as the current debug info for the
+  // lifetime of the guard - confirm against the implementation.
+  explicit DebugInfoGuard(std::shared_ptr<ThreadLocalDebugInfo> info);
+
+  ~DebugInfoGuard();
+
+  // The guard can be neither copied nor moved.
+  DebugInfoGuard(const DebugInfoGuard&) = delete;
+  DebugInfoGuard(DebugInfoGuard&&) = delete;
+  DebugInfoGuard& operator=(const DebugInfoGuard&) = delete;
+  DebugInfoGuard& operator=(DebugInfoGuard&&) = delete;
+
+ private:
+  // NOTE(review): presumably records whether the destructor has anything to
+  // undo - confirm against the implementation.
+  bool active_ = false;
+  // NOTE(review): presumably the debug info that was current before this
+  // guard was created - confirm against the implementation.
+  std::shared_ptr<ThreadLocalDebugInfo> prev_info_ = nullptr;
+};
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe2282d2973c030f2abb788009acf8ce661f3fd8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeIndex.h
@@ -0,0 +1,132 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/ConstexprCrc.h>
+#include <c10/util/IdWrapper.h>
+#include <c10/util/string_view.h>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1
+#define C10_TYPENAME_CONSTEXPR constexpr
+#endif
+
+namespace c10::util {
+
+struct type_index final : IdWrapper<type_index, uint64_t> { // type id that wraps a 64-bit checksum
+  constexpr explicit type_index(uint64_t checksum) : IdWrapper(checksum) {}
+
+  // Allow usage in std::map / std::set
+  // TODO Disallow this and rather use std::unordered_map/set everywhere
+  friend constexpr bool operator<(type_index lhs, type_index rhs) noexcept {
+    return lhs.underlyingId() < rhs.underlyingId(); // ordered by the raw id value
+  }
+
+  friend std::ostream& operator<<(std::ostream& stream, type_index typeId) {
+    return stream << typeId.underlyingId(); // streams the raw id value
+  }
+};
+
+namespace detail {
+
+template <typename T> // returns the compiler's spelling of T, cut out of this function's own signature string
+inline constexpr c10::c10_string_view fully_qualified_type_name_impl() {
+#if defined(_MSC_VER) && !defined(__clang__) // MSVC (not clang-cl): signature comes from __FUNCSIG__
+  constexpr std::string_view fun_sig = __FUNCSIG__;
+#if defined(__NVCC__) // nvcc on top of MSVC spells the signature differently
+  constexpr std::string_view prefix =
+      "c10::basic_string_view<char> c10::util::detail::fully_qualified_type_name_impl<";
+  constexpr std::string_view suffix = ">()";
+#else
+  constexpr std::string_view prefix =
+      "class c10::basic_string_view<char> __cdecl c10::util::detail::fully_qualified_type_name_impl<";
+  constexpr std::string_view suffix = ">(void)";
+#endif
+#elif defined(__clang__) // clang: signature comes from __PRETTY_FUNCTION__
+  constexpr std::string_view fun_sig = __PRETTY_FUNCTION__;
+  constexpr std::string_view prefix =
+      "c10::c10_string_view c10::util::detail::fully_qualified_type_name_impl() [T = ";
+  constexpr std::string_view suffix = "]";
+#elif defined(__GNUC__) // GCC: signature comes from __PRETTY_FUNCTION__
+  constexpr std::string_view fun_sig = __PRETTY_FUNCTION__;
+  constexpr std::string_view prefix =
+      "constexpr c10::c10_string_view c10::util::detail::fully_qualified_type_name_impl() [with T = ";
+  constexpr std::string_view suffix =
+      "; c10::c10_string_view = c10::basic_string_view<char>]";
+#endif // no fallback branch: on any other compiler fun_sig/prefix/suffix stay undeclared
+#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__) // the sanity checks are skipped when these CUDA macros are defined
+  static_assert(c10::starts_with(
+      static_cast<std::string_view>(fun_sig),
+      static_cast<std::string_view>(prefix)));
+  static_assert(c10::ends_with(
+      static_cast<std::string_view>(fun_sig),
+      static_cast<std::string_view>(suffix)));
+#endif
+  return fun_sig.substr( // drop the fixed prefix and suffix, leaving only the text that spells T
+      prefix.size(), fun_sig.size() - prefix.size() - suffix.size());
+}
+
+#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__) // only declared when neither CUDA macro is defined
+template <typename T>
+inline constexpr uint64_t type_index_impl() {
+// Idea: __PRETTY_FUNCTION__ (or __FUNCSIG__ on msvc) contains a qualified name
+// of this function, including its template parameter, i.e. including the
+// type we want an id for. We use this name and run crc64 on it to get a type
+// id.
+#if defined(_MSC_VER) && !defined(__clang__)
+  return crc64(__FUNCSIG__, sizeof(__FUNCSIG__)).checksum(); // sizeof counts the trailing NUL, so the length passed includes it
+#elif defined(__clang__)
+  return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum();
+#elif defined(__GNUC__)
+  return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum();
+#endif // no fallback branch: other compilers get a function without a return statement
+}
+#endif
+
+} // namespace detail
+
+template <typename T> // returns the type_index for T (after std::decay_t) on the non-CUDA-arch path
+inline constexpr type_index get_type_index() {
+#if !defined(__CUDA_ARCH__) && !defined(__CUDA_ARCH_LIST__)
+  // To enforce that this is really computed at compile time, we pass the
+  // type index through std::integral_constant.
+  return type_index{std::integral_constant<
+      uint64_t,
+      detail::type_index_impl<std::decay_t<T>>()>::value};
+#else
+  // There's nothing in theory preventing us from running this on device code
+  // except for nvcc throwing a compiler error if we enable it.
+  return (abort(), type_index(0)); // aborts first; the type_index(0) only satisfies the return type
+#endif
+}
+
+#if !defined(TORCH_PEDANTIC) // the override below is compiled out when TORCH_PEDANTIC is defined
+// Use a precomputed hash for std::string.
+// This works around an ambiguity in how the class name is spelled in
+// __PRETTY_FUNCTION__ when the class is defined in an inline namespace.
+// In a multi-ABI C++ library, `std::string` is an alias for
+// `std::__cxx11::basic_string<char>`, which depending on compiler flags can
+// be printed as `basic_string<char>` either in the `std` namespace or in
+// the `std::__cxx11` one (`__cxx11` is an inline namespace).
+template <>
+inline constexpr type_index get_type_index<std::string>() {
+  // hashsum for std::basic_string<char>
+  return type_index{4193213214807308375ULL}; // fixed constant, so the id does not depend on how the compiler prints the name
+}
+#endif
+
+template <typename T> // public entry point: the compiler's spelling of T as a std::string_view
+inline constexpr std::string_view get_fully_qualified_type_name() noexcept {
+  return static_cast<std::string_view>( // converts the helper's c10_string_view result
+      detail::fully_qualified_type_name_impl<T>());
+}
+} // namespace c10::util
+
+C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::type_index)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h
new file mode 100644
index 0000000000000000000000000000000000000000..f511333fc7d9ca2b9e29fd7512e4cd0cb8776b25
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeSafeSignMath.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/TypeSafeSignMath.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d49c82cbd8948cdd7bb2b9fd758f7875e5dfdb7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/TypeTraits.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/TypeTraits.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h
new file mode 100644
index 0000000000000000000000000000000000000000..68d2c2ce7feac15b4fab16f4124e41633433a213
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/Unicode.h
@@ -0,0 +1,19 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#if defined(_WIN32)
+#include <c10/util/Exception.h>
+#include <c10/util/win32-headers.h>
+#include <string>
+#endif
+
+namespace c10 {
+#if defined(_WIN32) // both helpers are declared on Windows only
+C10_API std::wstring u8u16(const std::string& str); // NOTE(review): presumably UTF-8 -> UTF-16; definition is not in this header
+C10_API std::string u16u8(const std::wstring& wstr); // NOTE(review): presumably UTF-16 -> UTF-8; definition is not in this header
+#endif
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccae4f78e54b38345cab1f41b97b70293d0a35a8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/WaitCounter.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include <c10/macros/Macros.h>
+#include <c10/util/ScopeExit.h>
+#include <c10/util/SmallVector.h>
+
+namespace c10::monitor {
+namespace detail {
+class WaitCounterImpl;
+
+class WaitCounterBackendIf { // abstract backend interface: one start/stop pair per wait
+ public:
+  virtual ~WaitCounterBackendIf() = default;
+
+  virtual intptr_t start( // returns an opaque context value that is later handed back to stop()
+      std::chrono::steady_clock::time_point now) noexcept = 0;
+  virtual void stop(
+      std::chrono::steady_clock::time_point now,
+      intptr_t ctx) noexcept = 0; // NOTE(review): ctx is presumably the value start() returned; confirm in the implementation
+};
+
+class WaitCounterBackendFactoryIf { // abstract factory that creates a backend for a given counter key
+ public:
+  virtual ~WaitCounterBackendFactoryIf() = default;
+
+  // May return nullptr.
+  // In this case the counter will be ignored by the given backend.
+  virtual std::unique_ptr<WaitCounterBackendIf> create(
+      std::string_view key) noexcept = 0; // key: the counter's name
+};
+
+C10_API void registerWaitCounterBackend( // takes ownership of the factory (passed as unique_ptr)
+    std::unique_ptr<WaitCounterBackendFactoryIf> /*factory*/);
+
+C10_API std::vector<std::shared_ptr<WaitCounterBackendFactoryIf>> // factories are handed out as shared_ptr
+getRegisteredWaitCounterBackends();
+} // namespace detail
+
+// A handle to a wait counter.
+class C10_API WaitCounterHandle {
+ public:
+  explicit WaitCounterHandle(std::string_view key);
+
+  class WaitGuard { // move-only scope guard: stops the wait on destruction unless stop() already ran
+   public:
+    WaitGuard(WaitGuard&& other) noexcept
+        : handle_{std::exchange(other.handle_, {})}, // null out the source so its stop() becomes a no-op
+          ctxs_{std::move(other.ctxs_)} {}
+    WaitGuard(const WaitGuard&) = delete;
+    WaitGuard& operator=(const WaitGuard&) = delete;
+    WaitGuard& operator=(WaitGuard&&) = delete;
+
+    ~WaitGuard() {
+      stop();
+    }
+
+    void stop() {
+      if (auto handle = std::exchange(handle_, nullptr)) { // clearing handle_ makes repeated stop() calls harmless
+        handle->stop(ctxs_);
+      }
+    }
+
+   private:
+    WaitGuard(WaitCounterHandle& handle, SmallVector<intptr_t>&& ctxs) // private: only WaitCounterHandle (a friend) creates guards
+        : handle_{&handle}, ctxs_{std::move(ctxs)} {}
+
+    friend class WaitCounterHandle;
+
+    WaitCounterHandle* handle_; // null once stopped or moved from
+    SmallVector<intptr_t> ctxs_; // NOTE(review): presumably one context per backend; confirm in start()
+  };
+
+  // Starts a waiter
+  WaitGuard start();
+
+ private:
+  // Stops the waiter. Each start() call should be matched by exactly one stop()
+  // call.
+  void stop(const SmallVector<intptr_t>& ctxs);
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  detail::WaitCounterImpl& impl_; // reference member, hence the lint suppression above
+};
+} // namespace c10::monitor
+
+#define STATIC_WAIT_COUNTER(_key)                           \
+  []() -> ::c10::monitor::WaitCounterHandle& {              \
+    static ::c10::monitor::WaitCounterHandle handle(#_key); \
+    return handle;                                          \
+  }() /* immediately-invoked lambda: yields a function-local static handle named by the stringified _key */
+
+#define STATIC_SCOPED_WAIT_COUNTER(_name) \
+  auto C10_ANONYMOUS_VARIABLE(SCOPE_GUARD) = STATIC_WAIT_COUNTER(_name).start(); /* holds the WaitGuard from start(); its destructor calls stop() */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..948d03d509175254b3f54c60a4b501dd62f870b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bit_cast.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/bit_cast.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe5b67c454490e06d88752b708d9543cda0ae6d1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/bits.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/bits.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..33da59051855d7e726fe83d19b9c39e8ab355317
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_math.h
@@ -0,0 +1,411 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H)
+#error \
+    "c10/util/complex_math.h is not meant to be individually included. Include c10/util/complex.h instead."
+#endif
+
+namespace c10_complex_math {
+
+// Exponential functions
+
+template <typename T> // complex exponential, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> exp(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::exp(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::exp(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex natural logarithm, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> log(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::log(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::log(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex base-10 logarithm, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> log10(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::log10(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::log10(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex base-2 logarithm via change of base: log(x) / log(2)
+C10_HOST_DEVICE inline c10::complex<T> log2(const c10::complex<T>& x) {
+  const c10::complex<T> log2 = c10::complex<T>(::log(2.0), 0.0); // ln 2 evaluated in double, stored as a purely real complex<T>; the local shadows the function name
+  return c10_complex_math::log(x) / log2;
+}
+
+// Power functions
+//
+#if defined(_LIBCPP_VERSION) || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))
+namespace _detail { // out-of-line sqrt/acos that the wrappers in this file fall back to on these standard libraries
+C10_API c10::complex<float> sqrt(const c10::complex<float>& in);
+C10_API c10::complex<double> sqrt(const c10::complex<double>& in);
+C10_API c10::complex<float> acos(const c10::complex<float>& in);
+C10_API c10::complex<double> acos(const c10::complex<double>& in);
+} // namespace _detail
+#endif
+
+template <typename T> // complex square root
+C10_HOST_DEVICE inline c10::complex<T> sqrt(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::sqrt(static_cast<thrust::complex<T>>(x)));
+#elif !(                        \
+    defined(_LIBCPP_VERSION) || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)))
+  return static_cast<c10::complex<T>>(
+      std::sqrt(static_cast<std::complex<T>>(x)));
+#else // libc++, or libstdc++ without C99 complex: use the out-of-line _detail::sqrt (float/double only)
+  return _detail::sqrt(x);
+#endif
+}
+
+template <typename T> // complex base raised to a complex exponent of the same scalar type
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const c10::complex<T>& x,
+    const c10::complex<T>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(thrust::pow(
+      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<T>>(y)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(std::pow(
+      static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
+#endif
+}
+
+template <typename T> // complex base raised to a real exponent of the same scalar type
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const c10::complex<T>& x,
+    const T& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::pow(static_cast<thrust::complex<T>>(x), y));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::pow(static_cast<std::complex<T>>(x), y));
+#endif
+}
+
+template <typename T> // real base raised to a complex exponent of the same scalar type
+C10_HOST_DEVICE inline c10::complex<T> pow(
+    const T& x,
+    const c10::complex<T>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::pow(x, static_cast<thrust::complex<T>>(y)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::pow(x, static_cast<std::complex<T>>(y)));
+#endif
+}
+
+template <typename T, typename U, typename R = decltype(T() * U())> // mixed-precision complex^complex; R is the promoted scalar type
+C10_HOST_DEVICE inline c10::complex<R> pow(
+    const c10::complex<T>& x,
+    const c10::complex<U>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<R>>(thrust::pow( // evaluate in R so the wider operand is never narrowed to T first
+      static_cast<thrust::complex<R>>(x), static_cast<thrust::complex<R>>(y)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<R>>(std::pow( // evaluate in R so the wider operand is never narrowed to T first
+      static_cast<std::complex<R>>(x), static_cast<std::complex<R>>(y)));
+#endif
+}
+
+template <typename T, typename U, typename R = decltype(T() * U())> // mixed-precision complex^real; R is the promoted scalar type
+C10_HOST_DEVICE inline c10::complex<R> pow(
+    const c10::complex<T>& x,
+    const U& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<R>>( // keep the result in R instead of rounding it to T and widening afterwards
+      thrust::pow(static_cast<thrust::complex<R>>(x), y));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<R>>( // keep the result in R instead of rounding it to T and widening afterwards
+      std::pow(static_cast<std::complex<R>>(x), y));
+#endif
+}
+
+template <typename T, typename U, typename R = decltype(T() * U())> // mixed-precision real^complex; R is the promoted scalar type
+C10_HOST_DEVICE inline c10::complex<R> pow(
+    const T& x,
+    const c10::complex<U>& y) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<R>>( // exponent is converted to complex<R>, not complex<T>, so its precision survives
+      thrust::pow(x, static_cast<thrust::complex<R>>(y)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<R>>( // exponent is converted to complex<R>, not complex<T>, so its precision survives
+      std::pow(x, static_cast<std::complex<R>>(y)));
+#endif
+}
+
+// Trigonometric functions
+
+template <typename T> // complex sine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> sin(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::sin(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::sin(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex cosine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> cos(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::cos(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::cos(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex tangent, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> tan(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::tan(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::tan(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex arc sine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> asin(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::asin(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::asin(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex arc cosine
+C10_HOST_DEVICE inline c10::complex<T> acos(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::acos(static_cast<thrust::complex<T>>(x)));
+#elif !defined(_LIBCPP_VERSION) // NOTE(review): unlike sqrt, libstdc++ without C99 complex also takes this std::acos branch
+  return static_cast<c10::complex<T>>(
+      std::acos(static_cast<std::complex<T>>(x)));
+#else // libc++ only: use the out-of-line _detail::acos (float/double only)
+  return _detail::acos(x);
+#endif
+}
+
+template <typename T> // complex arc tangent, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> atan(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::atan(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::atan(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+// Hyperbolic functions
+
+template <typename T> // complex hyperbolic sine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> sinh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::sinh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::sinh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex hyperbolic cosine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> cosh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::cosh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::cosh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex hyperbolic tangent, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> tanh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::tanh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::tanh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex inverse hyperbolic sine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> asinh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::asinh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::asinh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex inverse hyperbolic cosine, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> acosh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::acosh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::acosh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // complex inverse hyperbolic tangent, forwarded to the toolchain's complex library
+C10_HOST_DEVICE inline c10::complex<T> atanh(const c10::complex<T>& x) {
+#if defined(__CUDACC__) || defined(__HIPCC__) // CUDA/HIP compilers: go through thrust::complex
+  return static_cast<c10::complex<T>>(
+      thrust::atanh(static_cast<thrust::complex<T>>(x)));
+#else // any other compiler: go through std::complex
+  return static_cast<c10::complex<T>>(
+      std::atanh(static_cast<std::complex<T>>(x)));
+#endif
+}
+
+template <typename T> // log(1 + z) for complex z, with two platform-specific formulations
+C10_HOST_DEVICE inline c10::complex<T> log1p(const c10::complex<T>& z) {
+#if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \
+    defined(__HIPCC__)
+  // For Mac, the new implementation yielded a high relative error. Falling back
+  // to the old version for now.
+  // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354
+  // For CUDA we also use this one, as thrust::log(thrust::complex) takes
+  // *forever* to compile
+
+  // log1p(z) = log(1 + z)
+  // Let's define 1 + z = r * e ^ (i * a), then we have
+  // log(r * e ^ (i * a)) = log(r) + i * a
+  // With z = x + iy, the term r can be written as
+  // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5
+  //   = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5
+  // So, log(r) is
+  // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2)
+  //        = 0.5 * log1p(x * (x + 2) + y ^ 2)
+  // we need to use the expression only on certain condition to avoid overflow
+  // and underflow from `(x * (x + 2) + y ^ 2)`
+  T x = z.real();
+  T y = z.imag();
+  T zabs = std::abs(z); // |z|, selects between the two formulas below
+  T theta = std::atan2(y, x + T(1)); // argument of 1 + z; the imaginary part of every result
+  if (zabs < 0.5) {
+    T r = x * (T(2) + x) + y * y; // |1 + z|^2 - 1, written without forming 1 + x
+    if (r == 0) { // handle underflow
+      return {x, theta};
+    }
+    return {T(0.5) * std::log1p(r), theta};
+  } else {
+    T z0 = std::hypot(x + 1, y); // |1 + z|
+    return {std::log(z0), theta};
+  }
+#else
+  // CPU path
+  // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354
+  c10::complex<T> u = z + T(1);
+  if (u == T(1)) { // adding z did not change 1 at this precision: return z itself
+    return z;
+  } else {
+    auto log_u = log(u);
+    if (u - T(1) == z) { // 1 + z was formed without rounding error, so log(u) needs no correction
+      return log_u;
+    }
+    return log_u * (z / (u - T(1))); // scale log(u) by z / (u - 1) to correct for the rounding in u
+  }
+#endif
+}
+
+template <typename T> // exp(z) - 1 for complex z, built from real expm1/exp/sin/cos
+C10_HOST_DEVICE inline c10::complex<T> expm1(const c10::complex<T>& z) {
+  // expm1(z) = exp(z) - 1
+  // Define z = x + i * y
+  // f = e ^ (x + i * y) - 1
+  //   = e ^ x * e ^ (i * y) - 1
+  //   = (e ^ x * cos(y) - 1) + i * (e ^ x * sin(y))
+  //   = (e ^ x - 1) * cos(y) - (1 - cos(y)) + i * e ^ x * sin(y)
+  //   = expm1(x) * cos(y) - 2 * sin(y / 2) ^ 2 + i * e ^ x * sin(y)
+  T x = z.real();
+  T y = z.imag();
+  T a = std::sin(y / 2); // sin(y / 2); 2 * a * a below is the 1 - cos(y) term
+  T er = std::expm1(x) * std::cos(y) - T(2) * a * a; // real part, last line of the derivation
+  T ei = std::exp(x) * std::sin(y); // imaginary part
+  return {er, ei};
+}
+
+} // namespace c10_complex_math
+
+using c10_complex_math::acos;
+using c10_complex_math::acosh;
+using c10_complex_math::asin;
+using c10_complex_math::asinh;
+using c10_complex_math::atan;
+using c10_complex_math::atanh;
+using c10_complex_math::cos;
+using c10_complex_math::cosh;
+using c10_complex_math::exp;
+using c10_complex_math::expm1;
+using c10_complex_math::log;
+using c10_complex_math::log10;
+using c10_complex_math::log1p;
+using c10_complex_math::log2;
+using c10_complex_math::pow;
+using c10_complex_math::sin;
+using c10_complex_math::sinh;
+using c10_complex_math::sqrt;
+using c10_complex_math::tan;
+using c10_complex_math::tanh;
+
+namespace std {
+
+using c10_complex_math::acos;
+using c10_complex_math::acosh;
+using c10_complex_math::asin;
+using c10_complex_math::asinh;
+using c10_complex_math::atan;
+using c10_complex_math::atanh;
+using c10_complex_math::cos;
+using c10_complex_math::cosh;
+using c10_complex_math::exp;
+using c10_complex_math::expm1;
+using c10_complex_math::log;
+using c10_complex_math::log10;
+using c10_complex_math::log1p;
+using c10_complex_math::log2;
+using c10_complex_math::pow;
+using c10_complex_math::sin;
+using c10_complex_math::sinh;
+using c10_complex_math::sqrt;
+using c10_complex_math::tan;
+using c10_complex_math::tanh;
+
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..44152b72cb35b7df727ece02b089350be04a9f7f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/complex_utils.h
@@ -0,0 +1,51 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H)
+#error \
+    "c10/util/complex_utils.h is not meant to be individually included. Include c10/util/complex.h instead."
+#endif
+
+#include <limits>
+
+namespace c10 {
+
+template <typename T> // type trait: false for every type by default
+struct is_complex : public std::false_type {};
+
+template <typename T> // true for std::complex<T>
+struct is_complex<std::complex<T>> : public std::true_type {};
+
+template <typename T> // true for c10::complex<T>
+struct is_complex<c10::complex<T>> : public std::true_type {};
+
+// Maps std::complex<T> / c10::complex<T> to T; any other type maps to itself
+// TODO: Write in more idiomatic C++17
+template <typename T>
+struct scalar_value_type {
+  using type = T; // primary template: identity
+};
+template <typename T>
+struct scalar_value_type<std::complex<T>> {
+  using type = T; // std::complex<T> -> T
+};
+template <typename T>
+struct scalar_value_type<c10::complex<T>> {
+  using type = T; // c10::complex<T> -> T
+};
+
+} // namespace c10
+
+namespace std {
+
+template <typename T> // limits of c10::complex<T> are inherited unchanged from those of its scalar type T
+class numeric_limits<c10::complex<T>> : public numeric_limits<T> {};
+
+template <typename T> // overload for c10::complex, declared inside namespace std
+bool isnan(const c10::complex<T>& v) {
+  return std::isnan(v.real()) || std::isnan(v.imag()); // NaN if either component is NaN
+}
+
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bc7c7956f3986ca3c3f10252bd6eb06a7fd1104
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/copysign.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+
+namespace c10 {
+
+// Note: the explicit copysign overloads for Half and BFloat16 further down
+// are needed to work around a g++-7/8 crash on aarch64, and they also make
+// copysign faster for the half-precision types
+template <typename T, typename U>
+inline auto copysign(const T& a, const U& b) {
+  return std::copysign(a, b); // generic case: defer to the standard library
+}
+
+// Implement copysign for half precision floats using bit ops
+// Sign is the most significant bit for both half and bfloat16 types
+inline c10::Half copysign(c10::Half a, c10::Half b) {
+  return c10::Half((a.x & 0x7fff) | (b.x & 0x8000), c10::Half::from_bits()); // low 15 bits of a combined with the top (sign) bit of b
+}
+
+inline c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) { // same bit trick as the Half overload
+  return c10::BFloat16(
+      (a.x & 0x7fff) | (b.x & 0x8000), c10::BFloat16::from_bits()); // low 15 bits of a combined with the top (sign) bit of b
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..538a6e271f9d56564bcd8ff73071974991513009
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/env.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <optional>
+#include <string>
+
+namespace c10::utils {
+
+// Set an environment variable.
+C10_API void set_env(
+    const char* name,
+    const char* value,
+    bool overwrite = true);
+
+// Checks whether an environment variable is set.
+C10_API bool has_env(const char* name) noexcept;
+
+// Reads an environment variable and returns
+// - std::optional<true>,              if set equal to "1"
+// - std::optional<false>,             if set equal to "0"
+// - nullopt,   otherwise
+//
+// NB:
+// Issues a warning if the value of the environment variable is not 0 or 1.
+C10_API std::optional<bool> check_env(const char* name);
+
+// Reads the value of an environment variable if it is set.
+// However, check_env should be used if the value is assumed to be a flag.
+C10_API std::optional<std::string> get_env(const char* name) noexcept;
+
+} // namespace c10::utils
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..b83f9c931e4cf13b648336b4331a6f33b0a6fda2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/floating_point_utils.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/floating_point_utils.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h
new file mode 100644
index 0000000000000000000000000000000000000000..73687a69d1bbc0bfe4a0d449cbf43f10437e29bd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/int128.h
@@ -0,0 +1,403 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// This file is based on the uint128 implementation of protobuf at
+// https://github.com/protocolbuffers/protobuf/blob/1e88936fce10cf773cb72b44c6a7f48b38c7578b/src/google/protobuf/stubs/int128.h
+//
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <cstdint>
+#include <iosfwd>
+
+namespace c10 {
+
+struct uint128_pod;
+
+// TODO(xiaofeng): Define GOOGLE_PROTOBUF_HAS_CONSTEXPR when constexpr is
+// available.
+#ifdef GOOGLE_PROTOBUF_HAS_CONSTEXPR
+#define UINT128_CONSTEXPR constexpr
+#else
+#define UINT128_CONSTEXPR
+#endif
+
+class uint128;
+inline uint128& operator<<=(uint128& self, int amount);
+
+// An unsigned 128-bit integer type. Thread-compatible.
+class C10_API uint128 {
+ public:
+  UINT128_CONSTEXPR uint128(); // Sets to 0, but don't rely on this behavior.
+  UINT128_CONSTEXPR uint128(uint64_t top, uint64_t bottom);
+#ifndef SWIG
+  UINT128_CONSTEXPR uint128(int bottom); // Negative values are sign-extended
+  UINT128_CONSTEXPR uint128(uint32_t bottom); // Top 96 bits = 0
+#endif
+  UINT128_CONSTEXPR uint128(uint64_t bottom); // Top 64 bits (hi_) = 0
+  UINT128_CONSTEXPR uint128(const uint128_pod& val);
+
+  // Copy construction, copy assignment and destruction are the trivial ones.
+
+  void Initialize(uint64_t top, uint64_t bottom);
+
+  // Arithmetic operators.
+  uint128& operator+=(const uint128& b);
+  uint128& operator-=(const uint128& b);
+  uint128& operator*=(const uint128& b);
+  // Long division/modulo for uint128.
+  uint128& operator/=(const uint128& b);
+  uint128& operator%=(const uint128& b);
+  uint128 operator++(int);
+  uint128 operator--(int);
+  // Declared as a friend (non-member) to keep MSVC happy: DivModImpl, a
+  // static function, uses operator<<=, and the linker complained about a
+  // missing static version of this overload.
+  friend uint128& operator<<=(uint128& /*self*/, int /*amount*/);
+  uint128& operator>>=(int /*amount*/);
+  uint128& operator&=(const uint128& b);
+  uint128& operator|=(const uint128& b);
+  uint128& operator^=(const uint128& b);
+  uint128& operator++();
+  uint128& operator--();
+
+  friend uint64_t Uint128Low64(const uint128& v);
+  friend uint64_t Uint128High64(const uint128& v);
+
+  // "std::" is spelled out to avoid including all of port.h.
+  C10_API friend std::ostream& operator<<(std::ostream& o, const uint128& b);
+
+ private:
+  static void DivModImpl(
+      uint128 dividend,
+      uint128 divisor,
+      uint128* quotient_ret,
+      uint128* remainder_ret);
+
+  // Little-endian memory order optimizations can benefit from
+  // having lo_ first, hi_ last.
+  // See util/endian/endian.h and Load128/Store128 for storing a uint128.
+  uint64_t lo_;
+  uint64_t hi_;
+
+  // Declared but not implemented: these catch automatic type conversions.
+  uint128(uint8_t);
+  uint128(uint16_t);
+  uint128(float v);
+  uint128(double v);
+};
+
+// This is a POD form of uint128 which can be used for static variables which
+// need to be operated on as uint128.
+struct uint128_pod {
+  // Note: The ordering of fields is different than 'class uint128' but the
+  // same as its 2-arg constructor.  This enables more obvious initialization
+  // of static instances, which is the primary reason for this struct in the
+  // first place.  This does not seem to defeat any optimizations wrt
+  // operations involving this struct.
+  uint64_t hi;
+  uint64_t lo;
+};
+
+C10_API extern const uint128_pod kuint128max;
+
+// allow uint128 to be logged
+C10_API extern std::ostream& operator<<(std::ostream& o, const uint128& b);
+
+// Methods to access low and high pieces of 128-bit value.
+// Defined externally from uint128 to facilitate conversion
+// to native 128-bit types when compilers support them.
+inline uint64_t Uint128Low64(const uint128& v) {
+  return v.lo_;
+}
+inline uint64_t Uint128High64(const uint128& v) {
+  return v.hi_;
+}
+
+// TODO: perhaps it would be nice to have int128, a signed 128-bit type?
+
+// --------------------------------------------------------------------------
+//                      Implementation details follow
+// --------------------------------------------------------------------------
+inline bool operator==(const uint128& lhs, const uint128& rhs) {
+  // Two values are equal only when both 64-bit halves agree.
+  const bool same_hi = Uint128High64(lhs) == Uint128High64(rhs);
+  return same_hi && Uint128Low64(lhs) == Uint128Low64(rhs);
+}
+inline bool operator!=(const uint128& lhs, const uint128& rhs) {
+  return !(lhs == rhs);
+}
+
+inline UINT128_CONSTEXPR uint128::uint128() : lo_(0), hi_(0) {}
+inline UINT128_CONSTEXPR uint128::uint128(uint64_t top, uint64_t bottom)
+    : lo_(bottom), hi_(top) {}
+inline UINT128_CONSTEXPR uint128::uint128(const uint128_pod& v)
+    : lo_(v.lo), hi_(v.hi) {}
+inline UINT128_CONSTEXPR uint128::uint128(uint64_t bottom)
+    : lo_(bottom), hi_(0) {}
+#ifndef SWIG
+inline UINT128_CONSTEXPR uint128::uint128(uint32_t bottom)
+    : lo_(bottom), hi_(0) {}
+inline UINT128_CONSTEXPR uint128::uint128(int bottom)
+    : lo_(bottom), hi_(static_cast<int64_t>((bottom < 0) ? -1 : 0)) {}
+#endif
+
+#undef UINT128_CONSTEXPR
+
+inline void uint128::Initialize(uint64_t top, uint64_t bottom) {
+  hi_ = top;
+  lo_ = bottom;
+}
+
+// Comparison operators.
+
+#define CMP128(op)                                                  \
+  inline bool operator op(const uint128& lhs, const uint128& rhs) { \
+    return (Uint128High64(lhs) == Uint128High64(rhs))               \
+        ? (Uint128Low64(lhs) op Uint128Low64(rhs))                  \
+        : (Uint128High64(lhs) op Uint128High64(rhs));               \
+  }
+
+CMP128(<)
+CMP128(>)
+CMP128(>=)
+CMP128(<=)
+
+#undef CMP128
+
+// Unary operators
+
+inline uint128 operator-(const uint128& val) {
+  // Two's complement negation: invert both halves, then add one.
+  const uint64_t inv_hi = ~Uint128High64(val);
+  const uint64_t inv_lo = ~Uint128Low64(val);
+  const uint64_t low = inv_lo + 1;
+  // A wrapped low half means the increment carries into the high half.
+  const uint64_t carry = (low < inv_lo) ? 1 : 0;
+  return uint128(inv_hi + carry, low);
+}
+
+inline bool operator!(const uint128& val) {
+  return (Uint128High64(val) | Uint128Low64(val)) == 0; // all bits clear
+}
+
+// Logical operators.
+
+inline uint128 operator~(const uint128& val) {
+  return uint128(~Uint128High64(val), ~Uint128Low64(val));
+}
+
+#define LOGIC128(op)                                                   \
+  inline uint128 operator op(const uint128& lhs, const uint128& rhs) { \
+    return uint128(                                                    \
+        Uint128High64(lhs) op Uint128High64(rhs),                      \
+        Uint128Low64(lhs) op Uint128Low64(rhs));                       \
+  }
+
+LOGIC128(|)
+LOGIC128(&)
+LOGIC128(^)
+
+#undef LOGIC128
+
+#define LOGICASSIGN128(op)                                      \
+  inline uint128& uint128::operator op(const uint128 & other) { \
+    hi_ op other.hi_;                                           \
+    lo_ op other.lo_;                                           \
+    return *this;                                               \
+  }
+
+LOGICASSIGN128(|=)
+LOGICASSIGN128(&=)
+LOGICASSIGN128(^=)
+
+#undef LOGICASSIGN128
+
+// Shift operators.
+
+inline uint128 operator<<(const uint128& val, int amount) {
+  // Shifting a uint64_t by 64 or more bits is undefined, so handle each
+  // range of `amount` separately.
+  if (amount >= 128) {
+    return uint128(0, 0);
+  }
+  if (amount >= 64) {
+    // Only bits from the low half survive, and they land in the high half.
+    return uint128(Uint128Low64(val) << (amount - 64), 0);
+  }
+  if (amount == 0) {
+    return val;
+  }
+  const uint64_t lo = Uint128Low64(val);
+  const uint64_t hi = Uint128High64(val);
+  return uint128((hi << amount) | (lo >> (64 - amount)), lo << amount);
+}
+
+inline uint128 operator>>(const uint128& val, int amount) {
+  // Shifting a uint64_t by 64 or more bits is undefined, so handle each
+  // range of `amount` separately.
+  if (amount >= 128) {
+    return uint128(0, 0);
+  }
+  if (amount >= 64) {
+    // Only bits from the high half survive, and they land in the low half.
+    return uint128(0, Uint128High64(val) >> (amount - 64));
+  }
+  if (amount == 0) {
+    return val;
+  }
+  const uint64_t lo = Uint128Low64(val);
+  const uint64_t hi = Uint128High64(val);
+  return uint128(hi >> amount, (lo >> amount) | (hi << (64 - amount)));
+}
+
+inline uint128& operator<<=(uint128& self, int amount) {
+  // A uint64_t shift by 64 or more bits is undefined, so each range of
+  // `amount` is handled on its own.
+  if (amount >= 128) {
+    self.lo_ = 0;
+    self.hi_ = 0;
+  } else if (amount >= 64) {
+    // The low half becomes the (shifted) high half; the low half is cleared.
+    self.hi_ = self.lo_ << (amount - 64);
+    self.lo_ = 0;
+  } else if (amount != 0) {
+    // Update hi_ first: it needs the bits lo_ is about to shift out.
+    self.hi_ = (self.hi_ << amount) | (self.lo_ >> (64 - amount));
+    self.lo_ <<= amount;
+  }
+  return self;
+}
+
+inline uint128& uint128::operator>>=(int amount) {
+  // A uint64_t shift by 64 or more bits is undefined, so each range of
+  // `amount` is handled on its own.
+  if (amount >= 128) {
+    hi_ = 0;
+    lo_ = 0;
+  } else if (amount >= 64) {
+    // The high half becomes the (shifted) low half; the high half is cleared.
+    lo_ = hi_ >> (amount - 64);
+    hi_ = 0;
+  } else if (amount != 0) {
+    // Update lo_ first: it needs the bits hi_ is about to shift out.
+    lo_ = (lo_ >> amount) | (hi_ << (64 - amount));
+    hi_ >>= amount;
+  }
+  return *this;
+}
+
+inline uint128 operator+(const uint128& lhs, const uint128& rhs) {
+  return uint128(lhs) += rhs;
+}
+
+inline uint128 operator-(const uint128& lhs, const uint128& rhs) {
+  return uint128(lhs) -= rhs;
+}
+
+inline uint128 operator*(const uint128& lhs, const uint128& rhs) {
+  return uint128(lhs) *= rhs;
+}
+
+inline uint128 operator/(const uint128& lhs, const uint128& rhs) {
+  return uint128(lhs) /= rhs;
+}
+
+inline uint128 operator%(const uint128& lhs, const uint128& rhs) {
+  return uint128(lhs) %= rhs;
+}
+
+inline uint128& uint128::operator+=(const uint128& b) {
+  // Sum each half; a low sum smaller than lo_ has wrapped and carries.
+  hi_ += b.hi_;
+  const uint64_t low_sum = lo_ + b.lo_;
+  hi_ += ((low_sum < lo_) ? 1 : 0);
+  lo_ = low_sum;
+  return *this;
+}
+
+inline uint128& uint128::operator-=(const uint128& b) {
+  // A borrow from hi_ is needed when b's low half exceeds ours.
+  const uint64_t borrow = (b.lo_ > lo_) ? 1 : 0;
+  hi_ -= b.hi_ + borrow;
+  lo_ -= b.lo_;
+  return *this;
+}
+
+inline uint128& uint128::operator*=(const uint128& b) {
+  uint64_t a96 = hi_ >> 32;
+  uint64_t a64 = hi_ & 0xffffffffu;
+  uint64_t a32 = lo_ >> 32;
+  uint64_t a00 = lo_ & 0xffffffffu;
+  uint64_t b96 = b.hi_ >> 32;
+  uint64_t b64 = b.hi_ & 0xffffffffu;
+  uint64_t b32 = b.lo_ >> 32;
+  uint64_t b00 = b.lo_ & 0xffffffffu;
+  // Schoolbook multiply of 32-bit limbs: [a96 .. a00] x [b96 .. b00].
+  // Partial products above c96 fall off the top of the 128-bit result.
+  // c96 and c64 only feed hi_, so any carry out of them can be dropped.
+  uint64_t c96 = a96 * b00 + a64 * b32 + a32 * b64 + a00 * b96;
+  uint64_t c64 = a64 * b00 + a32 * b32 + a00 * b64;
+  this->hi_ = (c96 << 32) + c64;
+  this->lo_ = 0;
+  // Add the remaining terms one at a time so that each carry reaches hi_.
+  *this += uint128(a32 * b00) << 32;
+  *this += uint128(a00 * b32) << 32;
+  *this += a00 * b00;
+  return *this;
+}
+
+inline uint128 uint128::operator++(int) {
+  uint128 tmp(*this);
+  *this += 1;
+  return tmp;
+}
+
+inline uint128 uint128::operator--(int) {
+  uint128 tmp(*this);
+  *this -= 1;
+  return tmp;
+}
+
+inline uint128& uint128::operator++() {
+  *this += 1;
+  return *this;
+}
+
+inline uint128& uint128::operator--() {
+  *this -= 1;
+  return *this;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h
new file mode 100644
index 0000000000000000000000000000000000000000..148a9bf4a20002de4396c9e0a26ea695b8ed1c98
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/intrusive_ptr.h
@@ -0,0 +1,1278 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/Exception.h>
+#include <c10/util/MaybeOwned.h>
+#include <atomic>
+#include <climits>
+#include <memory>
+#include <type_traits>
+
+namespace pybind11 {
+template <typename, typename...>
+class class_;
+}
+
+namespace torch::utils {
+class PyObjectPreservation;
+}
+
+namespace c10 {
+class intrusive_ptr_target;
+namespace raw {
+namespace weak_intrusive_ptr {
+inline void incref(intrusive_ptr_target* self);
+}
+namespace intrusive_ptr {
+inline void incref(intrusive_ptr_target* self);
+}
+
+// constructor tag used by intrusive_ptr constructors
+struct DontIncreaseRefcount {};
+} // namespace raw
+
+namespace detail {
+constexpr uint64_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF;
+constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
+    (kImpracticallyHugeReferenceCount << 32);
+constexpr uint64_t kReferenceCountOne = 1;
+constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
+constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
+// Indicates whether the object has a PyObject wrapper.
+constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);
+
+template <class TTarget>
+struct intrusive_target_default_null_type final {
+  static constexpr TTarget* singleton() noexcept {
+    return nullptr;
+  }
+};
+
+template <class TTarget, class ToNullType, class FromNullType>
+TTarget* assign_ptr_(TTarget* rhs) {
+  // Map the source null sentinel to the destination null sentinel.
+  if (rhs != FromNullType::singleton()) {
+    return rhs;
+  }
+  return ToNullType::singleton();
+}
+
+inline uint32_t refcount(uint64_t combined_refcount) {
+  return static_cast<uint32_t>(combined_refcount);
+}
+
+inline uint32_t weakcount(uint64_t combined_refcount) {
+  return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
+}
+
+inline bool has_pyobject(uint64_t combined_refcount) {
+  return (combined_refcount & kHasPyObject) != 0;
+}
+
+inline bool is_uniquely_owned(uint64_t combined_refcount) {
+  return (combined_refcount & ~detail::kHasPyObject) == detail::kUniqueRef;
+}
+
+// Relaxed is enough: an increment need only happen-before its decrement.
+inline uint64_t atomic_combined_refcount_increment(
+    std::atomic<uint64_t>& combined_refcount,
+    uint64_t inc) {
+  auto prior = combined_refcount.fetch_add(inc, std::memory_order_relaxed);
+  return prior + inc;
+}
+
+inline uint32_t atomic_weakcount_increment(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::weakcount(atomic_combined_refcount_increment(
+      combined_refcount, kWeakReferenceCountOne));
+}
+
+// Every modification of the managed object must happen-before its
+// destructor runs, and allocation of its storage must happen-before that
+// storage is deallocated.
+//
+// That requires each non-final decrement to synchronize-with the final one:
+// non-final decrements store-release and the final decrement load-acquires
+// (directly or through fences). Making every decrement acq-rel is simplest
+// and, on modern architectures and chips, also the fastest option.
+inline uint64_t atomic_combined_refcount_decrement(
+    std::atomic<uint64_t>& combined_refcount,
+    uint64_t dec) {
+  auto prior = combined_refcount.fetch_sub(dec, std::memory_order_acq_rel);
+  return prior - dec;
+}
+
+inline uint32_t atomic_weakcount_decrement(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::weakcount(atomic_combined_refcount_decrement(
+      combined_refcount, kWeakReferenceCountOne));
+}
+
+template <class T, class = void>
+struct TargetTraits {
+  static constexpr bool can_have_pyobject = false;
+};
+
+} // namespace detail
+
+/**
+ * intrusive_ptr<T> is an alternative to shared_ptr<T> that has better
+ * performance because it does the refcounting intrusively
+ * (i.e. in a member of the object itself).
+ * Your class T needs to inherit from intrusive_ptr_target to allow it to be
+ * used in an intrusive_ptr<T>. Your class's constructor should not allow
+ *`this` to escape to other threads or create an intrusive_ptr from `this`.
+ */
+
+// Note [Stack allocated intrusive_ptr_target safety]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// A well known problem with std::enable_shared_from_this is that it
+// allows you to create a std::shared_ptr from a stack allocated object,
+// which is totally bogus because the object will die once you return
+// from the stack.  In intrusive_ptr, we can detect that this has occurred,
+// because we set the refcount/weakcount of objects which inherit from
+// intrusive_ptr_target to zero, *unless* we can prove that the object
+// was dynamically allocated (e.g., via make_intrusive).
+//
+// Thus, whenever you transmute a T* into a intrusive_ptr<T>, we check
+// and make sure that the refcount isn't zero (or, a more subtle
+// test for weak_intrusive_ptr<T>, for which the refcount may validly
+// be zero, but the weak refcount better not be zero), because that
+// tells us if the object was allocated by us.  If it wasn't, no
+// intrusive_ptr for you!
+
+// NOLINTNEXTLINE(cppcoreguidelines-virtual-class-destructor)
+class C10_API intrusive_ptr_target {
+  // Note [Weak references for intrusive refcounting]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Here's the scheme:
+  //
+  //  - refcount == number of strong references to the object
+  //    weakcount == number of weak references to the object,
+  //      plus one more if refcount > 0
+  //    An invariant: refcount > 0  =>  weakcount > 0
+  //
+  //  - c10::StorageImpl stays live as long as there are any strong
+  //    or weak pointers to it (weakcount > 0, since strong
+  //    references count as a +1 to weakcount)
+  //
+  //  - finalizers are called and data_ptr is deallocated when refcount == 0
+  //
+  //  - Once refcount == 0, it can never again be > 0 (the transition
+  //    from > 0 to == 0 is monotonic)
+  //
+  //  - When you access c10::StorageImpl via a weak pointer, you must
+  //    atomically increment the use count, if it is greater than 0.
+  //    If it is not, you must report that the storage is dead.
+  //
+  // We use a single combined count for refcount and weakcount so that
+  // we can atomically operate on both at the same time, for performance
+  // and well-defined behavior.
+  //
+  // Note [PyObject preservation for Tensor and Storages]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // intrusive_ptr has special support for preserving PyObject wrappers
+  // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
+  // the combined_refcount_ is used to indicate whether the object has a
+  // PyObject wrapper.
+  //
+  //   - The PyObject, if it exists, holds a strong reference to the
+  //     intrusive_ptr_target.
+  //
+  //   - When the refcount goes from 1 to 2, we incref the PyObject.
+  //
+  //   - When the refcount goes from 2 to 1, we decref the PyObject.
+  //
+  // In other words, the intrusive_ptr keeps the PyObject alive as long as there
+  // are other C++ references to the intrusive_ptr_target.
+
+  mutable std::atomic<uint64_t> combined_refcount_;
+  static_assert(sizeof(std::atomic<uint64_t>) == 8);
+  static_assert(alignof(std::atomic<uint64_t>) == 8);
+  static_assert(std::atomic<uint64_t>::is_always_lock_free);
+
+  template <typename T, typename NullType>
+  friend class intrusive_ptr;
+  friend inline void raw::intrusive_ptr::incref(intrusive_ptr_target* self);
+
+  template <typename T, typename NullType>
+  friend class weak_intrusive_ptr;
+  friend inline void raw::weak_intrusive_ptr::incref(
+      intrusive_ptr_target* self);
+
+  template <typename T>
+  friend struct ExclusivelyOwnedTensorTraits;
+
+  friend class torch::utils::PyObjectPreservation;
+
+ protected:
+  // protected destructor. We never want to destruct intrusive_ptr_target*
+  // directly.
+  virtual ~intrusive_ptr_target() {
+// Disable -Wterminate and -Wexceptions so we're allowed to use assertions
+// (i.e. throw exceptions) in a destructor.
+// We also have to disable -Wunknown-warning-option and -Wpragmas, because
+// some other compilers don't know about -Wterminate or -Wexceptions and
+// will show a warning about unknown warning options otherwise.
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning( \
+    disable : 4297) // function assumed not to throw an exception but does
+#else
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#pragma GCC diagnostic ignored "-Wterminate"
+#pragma GCC diagnostic ignored "-Wexceptions"
+#endif
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        // Second condition is there to accommodate
+        // unsafe_adapt_non_heap_allocated: since we are doing our own
+        // deallocation in that case, it is correct for each
+        // expected_decref to have happened (some user code tried to
+        // decref and thus free the object, but it didn't happen right
+        // away) or not (no user code tried to free the object, and
+        // now it's getting destroyed through whatever mechanism the
+        // caller of unsafe_adapt_non_heap_allocated wanted to
+        // use). We choose our reference count such that the count
+        // will not dip below kImpracticallyHugeReferenceCount regardless.
+        refcount() == 0 ||
+            refcount() >= detail::kImpracticallyHugeReferenceCount,
+        "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it; refcount was ",
+        refcount());
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        // See ~intrusive_ptr for optimization that will frequently result in 1
+        // at destruction time.
+        weakcount() == 1 || weakcount() == 0 ||
+            weakcount() == detail::kImpracticallyHugeReferenceCount - 1 ||
+            weakcount() == detail::kImpracticallyHugeReferenceCount,
+        "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#else
+#pragma GCC diagnostic pop
+#endif
+  }
+
+  constexpr intrusive_ptr_target() noexcept : combined_refcount_(0) {}
+
+  // intrusive_ptr_target supports copy and move: but refcount and weakcount
+  // don't participate (since they are intrinsic properties of the memory
+  // location)
+  intrusive_ptr_target(intrusive_ptr_target&& /*other*/) noexcept
+      : intrusive_ptr_target() {}
+
+  intrusive_ptr_target& operator=(intrusive_ptr_target&& /*other*/) noexcept {
+    return *this;
+  }
+
+  intrusive_ptr_target(const intrusive_ptr_target& /*other*/) noexcept
+      : intrusive_ptr_target() {}
+
+  intrusive_ptr_target& operator=(
+      const intrusive_ptr_target& /*other*/) noexcept {
+    return *this;
+  }
+
+ private:
+  /**
+   * This is called when refcount reaches zero.
+   * You can override this to release expensive resources.
+   * There might still be weak references, so your object might not get
+   * destructed yet, but you can assume the object isn't used anymore,
+   * i.e. no more calls to methods or accesses to members (we just can't
+   * destruct it yet because we need the weakcount accessible).
+   *
+   * If there are no weak references (i.e. your class is about to be
+   * destructed), this function WILL NOT be called.
+   */
+  virtual void release_resources() {}
+
+  /**
+   * These two methods are called when the refcount transitions between one
+   * and two and the object has a PyObject wrapper.
+   */
+  virtual void incref_pyobject() const noexcept {}
+  virtual void decref_pyobject() const noexcept {}
+  virtual bool try_incref_pyobject() const noexcept {
+    return false;
+  }
+
+  uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
+    return detail::refcount(combined_refcount_.load(order));
+  }
+
+  uint32_t weakcount(
+      std::memory_order order = std::memory_order_relaxed) const {
+    return detail::weakcount(combined_refcount_.load(order));
+  }
+};
+
+namespace detail {
+
+#ifndef C10_MOBILE
+template <>
+struct TargetTraits<c10::intrusive_ptr_target> {
+  // A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
+  // or StorageImpl, so we have to allow for PyObject support.
+  static constexpr bool can_have_pyobject = true;
+};
+#endif
+
+} // namespace detail
+
+template <class TTarget, class NullType>
+class weak_intrusive_ptr;
+
+template <
+    class TTarget,
+    class NullType = detail::intrusive_target_default_null_type<TTarget>>
+class intrusive_ptr final {
+ private:
+//  the following static assert would be nice to have but it requires
+//  the target class T to be fully defined when intrusive_ptr<T> is instantiated
+//  this is a problem for classes that contain pointers to themselves
+//  static_assert(
+//      std::is_base_of_v<intrusive_ptr_target, TTarget>,
+//      "intrusive_ptr can only be used for classes that inherit from
+//      intrusive_ptr_target.");
+#ifndef _WIN32
+  // This static_assert triggers on MSVC
+  //  error C2131: expression did not evaluate to a constant
+  static_assert(
+      // NOLINTNEXTLINE(misc-redundant-expression)
+      NullType::singleton() == NullType::singleton(),
+      "NullType must have a constexpr singleton() method");
+#endif
+  static_assert(
+      std::is_base_of_v<
+          TTarget,
+          std::remove_pointer_t<decltype(NullType::singleton())>>,
+      "NullType::singleton() must return a element_type* pointer");
+
+  TTarget* target_;
+
+  template <typename T>
+  friend struct ExclusivelyOwnedTensorTraits;
+  template <class TTarget2, class NullType2>
+  friend class intrusive_ptr;
+  friend class weak_intrusive_ptr<TTarget, NullType>;
+
+  // Make pybind11::class_ be a friend class of intrusive_ptr, so that custom
+  // smart holder in pybind11 could access the private constructor of
+  // intrusive_ptr(T*) which took the ownership of the object. This is required
+  // by customer holder macro PYBIND11_DECLARE_HOLDER_TYPE, where it uses
+  // intrusive_ptr(TTarget*) to initialize and take ownership of the object. For
+  // details, see
+  // https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers
+  template <typename, typename...>
+  friend class pybind11::class_;
+
+  void retain_() noexcept {
+    // A null pointer owns nothing, so there is no count to bump.
+    if (target_ == NullType::singleton()) {
+      return;
+    }
+    const uint64_t combined = detail::atomic_combined_refcount_increment(
+        target_->combined_refcount_, detail::kReferenceCountOne);
+    const uint32_t new_refcount = detail::refcount(combined);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        new_refcount != 1,
+        "intrusive_ptr: Cannot increase refcount after it reached zero.");
+
+    if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+      // A 1 -> 2 transition means a C++ reference now exists alongside the
+      // PyObject's own reference, so the PyObject has to be kept alive too.
+      if (new_refcount == 2 && detail::has_pyobject(combined)) {
+        target_->incref_pyobject();
+      }
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          !detail::has_pyobject(combined),
+          "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
+    }
+  }
+
+  void reset_() noexcept {
+    if (target_ != NullType::singleton()) {
+      reset_not_null_(target_);
+    }
+  }
+
+  // Drops one strong reference to non-null `target`. C10_NOINLINE keeps binary
+  // size down; passing TTarget* avoids an extra dereference in reset_().
+  C10_NOINLINE static void reset_not_null_(TTarget* target) noexcept {
+    if (detail::is_uniquely_owned(
+            target->combined_refcount_.load(std::memory_order_acquire))) {
+      // Both counts are 1: no weak references exist and this is the last
+      // strong reference. No other thread can observe the effects of
+      // deleting the target (e.g. by calling use_count()) without a data
+      // race, so a plain relaxed store replaces the atomic decrement.
+      target->combined_refcount_.store(0, std::memory_order_relaxed);
+      delete target;
+      return;
+    }
+
+    auto combined_refcount = detail::atomic_combined_refcount_decrement(
+        target->combined_refcount_, detail::kReferenceCountOne);
+    uint32_t new_refcount = detail::refcount(combined_refcount);
+    bool has_pyobject = detail::has_pyobject(combined_refcount);
+    if (new_refcount == 0) {
+      if (detail::weakcount(combined_refcount) == 1) { // no weak refs left
+        delete target;
+        return;
+      }
+      // While refcount > 0, weakcount is one larger than the actual number
+      // of weak references (see the weak-reference note on
+      // intrusive_ptr_target), so that extra count is dropped here.
+      release_resources_and_decrement_weakrefs_(target);
+    } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+      // On a 2 -> 1 transition the PyObject must be decref'd: the PyObject
+      // should not be kept alive once there are no C++ references to this
+      // object other than the one held by the PyObject
+      // itself.
+      if (has_pyobject && new_refcount == 1) {
+        target->decref_pyobject();
+      }
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          !has_pyobject,
+          "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
+    }
+  }
+
+  C10_NOINLINE static void release_resources_and_decrement_weakrefs_(
+      TTarget* target) noexcept {
+    // release_resources() acts like a destructor, and destructors mutate even
+    // const objects, so casting away constness here is justified.
+    const_cast<std::remove_const_t<TTarget>*>(target)->release_resources();
+    if (detail::atomic_weakcount_decrement(target->combined_refcount_) != 0) {
+      return;
+    }
+    delete target;
+  }
+
+  // raw pointer constructors are not public because we shouldn't make
+  // intrusive_ptr out of raw pointers except from inside the make_intrusive(),
+  // reclaim() and weak_intrusive_ptr::lock() implementations.
+
+  // This constructor will increase the ref counter for you.
+  // This constructor will be used by the make_intrusive(), and also pybind11,
+  // which wrap the intrusive_ptr holder around the raw pointer and incref
+  // correspondingly (pybind11 requires raw pointer constructor to incref by
+  // default).
+  explicit intrusive_ptr(TTarget* target)
+      : intrusive_ptr(target, raw::DontIncreaseRefcount{}) {
+    if (target_ != NullType::singleton()) {
+      // We just created result.target_, so we know no other thread has
+      // access to it, so we know we needn't care about memory ordering.
+      // (On x86_64, a store with memory_order_relaxed generates a plain old
+      // `mov`, whereas an atomic increment does a lock-prefixed `add`, which is
+      // much more expensive: https://godbolt.org/z/eKPzj8.)
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          target_->combined_refcount_.load(std::memory_order_relaxed) == 0,
+          "intrusive_ptr: Newly-created target had non-zero refcounts. Does its "
+          "constructor do something strange like incref or create an "
+          "intrusive_ptr from `this`?");
+      target_->combined_refcount_.store(
+          detail::kUniqueRef, std::memory_order_relaxed);
+    }
+  }
+
+ public:
+  using element_type = TTarget;
+
+  intrusive_ptr() noexcept
+      : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
+
+  /* implicit */ intrusive_ptr(std::nullptr_t) noexcept
+      : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
+
+  // This constructor will not increase the ref counter for you.
+  // We use the tagged dispatch mechanism to explicitly mark this constructor
+  // to not increase the refcount
+  explicit intrusive_ptr(
+      TTarget* target,
+      raw::DontIncreaseRefcount /*unused*/) noexcept
+      : target_(target) {}
+
+  explicit intrusive_ptr(std::unique_ptr<TTarget> rhs) noexcept
+      : intrusive_ptr(rhs.release()) {}
+
+  intrusive_ptr(intrusive_ptr&& rhs) noexcept : target_(rhs.target_) {
+    rhs.target_ = NullType::singleton();
+  }
+
+  template <class From, class FromNullType>
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+  /* implicit */ intrusive_ptr(intrusive_ptr<From, FromNullType>&& rhs) noexcept
+      : target_(
+            detail::assign_ptr_<TTarget, NullType, FromNullType>(rhs.target_)) {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. intrusive_ptr move constructor got pointer of wrong type.");
+    rhs.target_ = FromNullType::singleton();
+  }
+
+  intrusive_ptr(const intrusive_ptr& rhs) : target_(rhs.target_) {
+    retain_();
+  }
+
+  template <class From, class FromNullType>
+  /* implicit */ intrusive_ptr(const intrusive_ptr<From, FromNullType>& rhs)
+      : target_(
+            detail::assign_ptr_<TTarget, NullType, FromNullType>(rhs.target_)) {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type.");
+    retain_();
+  }
+
+  ~intrusive_ptr() noexcept {
+    reset_();
+  }
+
+  intrusive_ptr& operator=(intrusive_ptr&& rhs) & noexcept {
+    // NOLINTNEXTLINE(*assign*)
+    return this->template operator= <TTarget, NullType>(std::move(rhs));
+  }
+
+  template <class From, class FromNullType>
+  intrusive_ptr& operator=(intrusive_ptr<From, FromNullType>&& rhs) & noexcept {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. intrusive_ptr move assignment got pointer of wrong type.");
+    intrusive_ptr tmp = std::move(rhs);
+    swap(tmp);
+    return *this;
+  }
+
+  // Assignment is implemented using copy and swap. That's safe for self
+  // assignment.
+  // NOLINTNEXTLINE(bugprone-unhandled-self-assignment)
+  intrusive_ptr& operator=(const intrusive_ptr& rhs) & noexcept {
+    // NOLINTNEXTLINE(*assign-operator, *assignment-signature)
+    return this->template operator= <TTarget, NullType>(rhs);
+  }
+
+  template <class From, class FromNullType> // both deduced from rhs
+  intrusive_ptr& operator=(
+      const intrusive_ptr<From, FromNullType>& rhs) & noexcept {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type.");
+    intrusive_ptr tmp = rhs; // copy first (the copy constructors call retain_())
+    swap(tmp); // tmp leaves with the old target; its destructor calls reset_()
+    return *this;
+  }
+
+  TTarget* get() const noexcept {
+    return target_;
+  }
+
+  TTarget& operator*() const noexcept {
+    return *target_;
+  }
+
+  TTarget* operator->() const noexcept {
+    return target_;
+  }
+
+  operator bool() const noexcept {
+    return target_ != NullType::singleton();
+  }
+
+  void reset() noexcept {
+    reset_();
+    target_ = NullType::singleton();
+  }
+
+  void swap(intrusive_ptr& rhs) noexcept {
+    std::swap(target_, rhs.target_);
+  }
+
+  // We do a lot of null-pointer checks in our code, good to have this be cheap.
+  bool defined() const noexcept {
+    return target_ != NullType::singleton();
+  }
+
+  uint32_t use_count() const noexcept {
+    if (target_ == NullType::singleton()) {
+      return 0;
+    }
+    return target_->refcount(std::memory_order_relaxed);
+  }
+
+  uint32_t weak_use_count() const noexcept {
+    if (target_ == NullType::singleton()) {
+      return 0;
+    }
+    return target_->weakcount(std::memory_order_relaxed);
+  }
+
+  bool unique() const noexcept {
+    return use_count() == 1;
+  }
+
+  /**
+   * Stronger than unique() in that it must not have any weakrefs as well.
+   */
+  bool is_uniquely_owned() const noexcept {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
+    return detail::is_uniquely_owned(
+        target_->combined_refcount_.load(std::memory_order_acquire));
+  }
+
+  /**
+   * Returns an owning (!) pointer to the underlying object and makes the
+   * intrusive_ptr instance invalid. That means the refcount is not decreased.
+   * You *must* put the returned pointer back into an intrusive_ptr using
+   * intrusive_ptr::reclaim(ptr) to properly destruct it.
+   * This is helpful for C APIs.
+   */
+  TTarget* release() noexcept {
+    // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign)
+    TTarget* result = target_;
+    target_ = NullType::singleton();
+    return result;
+  }
+
+  /**
+   * Takes an owning pointer to TTarget* and creates an intrusive_ptr that takes
+   * over ownership. That means the refcount is not increased.
+   * This is the counter-part to intrusive_ptr::release() and the pointer
+   * passed in *must* have been created using intrusive_ptr::release().
+   */
+  static intrusive_ptr reclaim(TTarget* owning_ptr) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        owning_ptr == NullType::singleton() || owning_ptr->refcount() == 0 ||
+            owning_ptr->weakcount(),
+        "TTarget violates the invariant that refcount > 0  =>  weakcount > 0");
+    return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{});
+  }
+
+  /**
+   * Takes an owning pointer to TTarget* and creates an intrusive_ptr
+   * representing a new reference, i.e. the raw pointer retains
+   * ownership.
+   */
+  static intrusive_ptr reclaim_copy(TTarget* owning_ptr) {
+    auto ret = reclaim(owning_ptr);
+    ret.retain_();
+    return ret;
+  }
+
+  /**
+   * Allocate a heap object with args and wrap it inside an intrusive_ptr and
+   * incref. This is a helper function to let make_intrusive() access private
+   * intrusive_ptr constructors.
+   */
+  template <class... Args>
+  static intrusive_ptr make(Args&&... args) {
+    return intrusive_ptr(new TTarget(std::forward<Args>(args)...));
+  }
+
+  /**
+   * Turn a new instance of TTarget (e.g., literally allocated
+   * using new TTarget(...)) into an intrusive_ptr.  If possible,
+   * use intrusive_ptr::make instead which statically guarantees
+   * that the allocation was done properly.
+   *
+   * At the moment, the only reason this method exists is because
+   * pybind11 holder types expect to be able to allocate in
+   * this way (because pybind11 handles the new allocation itself).
+   */
+  static intrusive_ptr unsafe_steal_from_new(TTarget* raw_ptr) {
+    return intrusive_ptr(raw_ptr);
+  }
+
+  /**
+   * Turn an instance of TTarget that should not be reference counted
+   * (e.g., allocated into an arena with placement new) into an
+   * intrusive_ptr. This is gratuitously unsafe and should only be
+   * used if you can guarantee that the pointer will not escape and be
+   * refcounted as normal.
+   *
+   * `expected_decrefs` is a debugging parameter: it indicates the
+   * number of strong owners the intrusive_ptr_target in question is
+   * expected to get. In most use cases, this will likely be 1.
+   *
+   * The reason this method exists is for manually sharing
+   * StorageImpls across Tensors in the static runtime. It needs
+   * access to private intrusive_ptr members so that the refcounts can
+   * be initialized to custom values.
+   */
+  static intrusive_ptr unsafe_adapt_non_heap_allocated(
+      TTarget* raw_ptr,
+      uint32_t expected_decrefs) {
+    intrusive_ptr result(raw_ptr, raw::DontIncreaseRefcount{});
+    // kImpracticallyHugeReferenceCount is impractically huge for a reference
+    // count, while being in no danger of overflowing uint32_t. We actually only
+    // need to initialize the refcount to 2 -- we are just doing an unbalanced
+    // incref to prevent the non-heap-allocated target from being
+    // freed, and we are optimizing that incref by directly
+    // initializing the refcounts rather than doing an expensive
+    // atomic increment. The reason to use kImpracticallyHugeReferenceCount is
+    // to accommodate the debug assertions in ~intrusive_ptr_target.
+#ifdef NDEBUG
+    expected_decrefs = 0;
+#endif
+    result.target_->combined_refcount_.store(
+        detail::refcount(
+            detail::kImpracticallyHugeReferenceCount + expected_decrefs) |
+            detail::kImpracticallyHugeWeakReferenceCount,
+        std::memory_order_relaxed);
+    return result;
+  }
+
+  /**
+   * Turn a **non-owning raw pointer** to an intrusive_ptr.  It is
+   * the moral equivalent of enable_shared_from_this on a shared pointer.
+   *
+   * This method is only valid for objects that are already live.  If
+   * you are looking for the moral equivalent of unique_ptr<T>(T*)
+   * constructor, see unsafe_steal_from_new.
+   *
+   * TODO: https://github.com/pytorch/pytorch/issues/56482
+   */
+  static intrusive_ptr unsafe_reclaim_from_nonowning(TTarget* raw_ptr) {
+    // See Note [Stack allocated intrusive_ptr_target safety]
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        raw_ptr == NullType::singleton() || raw_ptr->refcount() > 0,
+        "intrusive_ptr: Can only reclaim pointers that are owned by someone");
+    auto ptr = reclaim(raw_ptr); // doesn't increase refcount
+    ptr.retain_();
+    return ptr;
+  }
+};
+
+template <
+    class TTarget,
+    class NullType = detail::intrusive_target_default_null_type<TTarget>,
+    class... Args>
+inline intrusive_ptr<TTarget, NullType> make_intrusive(Args&&... args) {
+  return intrusive_ptr<TTarget, NullType>::make(std::forward<Args>(args)...);
+}
+
+template <class TTarget, class NullType>
+inline void swap(
+    intrusive_ptr<TTarget, NullType>& lhs,
+    intrusive_ptr<TTarget, NullType>& rhs) noexcept {
+  lhs.swap(rhs);
+}
+
+// To allow intrusive_ptr inside std::map or std::set, we need operator<
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator<(
+    const intrusive_ptr<TTarget1, NullType1>& lhs,
+    const intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return lhs.get() < rhs.get();
+}
+
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator==(
+    const intrusive_ptr<TTarget1, NullType1>& lhs,
+    const intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return lhs.get() == rhs.get();
+}
+
+template <class TTarget1, class NullType1>
+inline bool operator==(
+    const intrusive_ptr<TTarget1, NullType1>& lhs,
+    std::nullptr_t) noexcept {
+  return lhs.get() == nullptr;
+}
+
+template <class TTarget2, class NullType2>
+inline bool operator==(
+    std::nullptr_t,
+    const intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return nullptr == rhs.get();
+}
+
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator!=(
+    const intrusive_ptr<TTarget1, NullType1>& lhs,
+    const intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return !operator==(lhs, rhs);
+}
+
+template <class TTarget1, class NullType1>
+inline bool operator!=(
+    const intrusive_ptr<TTarget1, NullType1>& lhs,
+    std::nullptr_t) noexcept {
+  return !operator==(lhs, nullptr);
+}
+
+template <class TTarget2, class NullType2>
+inline bool operator!=(
+    std::nullptr_t,
+    const intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return !operator==(nullptr, rhs);
+}
+template <typename T>
+struct MaybeOwnedTraits<c10::intrusive_ptr<T>> {
+  using owned_type = c10::intrusive_ptr<T>;
+  using borrow_type = c10::intrusive_ptr<T>;
+
+  static borrow_type createBorrow(const owned_type& from) {
+    return borrow_type::reclaim(from.get());
+  }
+
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.release();
+    lhs = borrow_type::reclaim(rhs.get());
+  }
+
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.release();
+  }
+
+  static const owned_type& referenceFromBorrow(
+      const borrow_type& borrow) noexcept {
+    return borrow;
+  }
+
+  static const owned_type* pointerFromBorrow(
+      const borrow_type& borrow) noexcept {
+    return &borrow;
+  }
+
+  static bool debugBorrowIsValid(const borrow_type& /*borrow*/) noexcept {
+    return true;
+  }
+};
+
+template <
+    typename TTarget,
+    class NullType = detail::intrusive_target_default_null_type<TTarget>>
+class weak_intrusive_ptr final {
+ private:
+  static_assert(
+      std::is_base_of_v<intrusive_ptr_target, TTarget>,
+      "intrusive_ptr can only be used for classes that inherit from intrusive_ptr_target.");
+#ifndef _WIN32
+  // This static_assert triggers on MSVC
+  //  error C2131: expression did not evaluate to a constant
+  static_assert(
+      NullType::singleton() == NullType::singleton(),
+      "NullType must have a constexpr singleton() method");
+#endif
+  static_assert(
+      std::is_base_of_v<
+          TTarget,
+          std::remove_pointer_t<decltype(NullType::singleton())>>,
+      "NullType::singleton() must return a element_type* pointer");
+
+  TTarget* target_;
+
+  template <class TTarget2, class NullType2>
+  friend class weak_intrusive_ptr;
+
+  void retain_() {
+    if (target_ != NullType::singleton()) {
+      uint32_t new_weakcount =
+          detail::atomic_weakcount_increment(target_->combined_refcount_);
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          new_weakcount != 1,
+          "weak_intrusive_ptr: Cannot increase weakcount after it reached zero.");
+    }
+  }
+
+  void reset_() noexcept {
+    if (target_ != NullType::singleton() &&
+        detail::atomic_weakcount_decrement(target_->combined_refcount_) == 0) {
+      // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDelete)
+      delete target_;
+    }
+    target_ = NullType::singleton();
+  }
+
+  constexpr explicit weak_intrusive_ptr(TTarget* target) : target_(target) {}
+
+ public:
+  using element_type = TTarget;
+
+  explicit weak_intrusive_ptr(const intrusive_ptr<TTarget, NullType>& ptr)
+      : weak_intrusive_ptr(ptr.get()) {
+    retain_();
+  }
+
+  weak_intrusive_ptr(weak_intrusive_ptr&& rhs) noexcept : target_(rhs.target_) {
+    rhs.target_ = NullType::singleton();
+  }
+
+  template <class From, class FromNullType>
+  /* implicit */ weak_intrusive_ptr(
+      // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+      weak_intrusive_ptr<From, FromNullType>&& rhs) noexcept
+      : target_(
+            detail::assign_ptr_<TTarget, NullType, FromNullType>(rhs.target_)) {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type.");
+    rhs.target_ = FromNullType::singleton();
+  }
+
+  weak_intrusive_ptr(const weak_intrusive_ptr& rhs) : target_(rhs.target_) {
+    retain_();
+  }
+
+  template <class From, class FromNullType>
+  /* implicit */ weak_intrusive_ptr(
+      const weak_intrusive_ptr<From, FromNullType>& rhs)
+      : target_(
+            detail::assign_ptr_<TTarget, NullType, FromNullType>(rhs.target_)) {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type.");
+    retain_();
+  }
+
+  ~weak_intrusive_ptr() noexcept {
+    reset_();
+  }
+
+  weak_intrusive_ptr& operator=(weak_intrusive_ptr&& rhs) & noexcept {
+    // NOLINTNEXTLINE(*assign*)
+    return this->template operator= <TTarget, NullType>(std::move(rhs));
+  }
+
+  template <class From, class FromNullType>
+  weak_intrusive_ptr& operator=(
+      weak_intrusive_ptr<From, FromNullType>&& rhs) & noexcept {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type.");
+    weak_intrusive_ptr tmp = std::move(rhs);
+    swap(tmp);
+    return *this;
+  }
+
+  weak_intrusive_ptr& operator=(const weak_intrusive_ptr& rhs) & noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+    // NOLINTNEXTLINE(*assign*)
+    return this->template operator= <TTarget, NullType>(rhs);
+  }
+
+  weak_intrusive_ptr& operator=(
+      const intrusive_ptr<TTarget, NullType>& rhs) & noexcept {
+    weak_intrusive_ptr tmp(rhs);
+    swap(tmp);
+    return *this;
+  }
+
+  template <class From, class FromNullType> // both deduced from rhs
+  weak_intrusive_ptr& operator=(
+      const weak_intrusive_ptr<From, FromNullType>& rhs) & noexcept {
+    static_assert(
+        std::is_convertible_v<From*, TTarget*>,
+        "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type.");
+    weak_intrusive_ptr tmp = rhs; // copy first: retain_() bumps the weakcount
+    swap(tmp); // tmp leaves with the old target; its destructor calls reset_()
+    return *this;
+  }
+
+  void reset() noexcept {
+    reset_();
+  }
+
+  void swap(weak_intrusive_ptr& rhs) noexcept {
+    // Trade the raw targets only: each pointer remains a weak owner of
+    // whatever it now holds, so no weakcount is touched here.
+    std::swap(target_, rhs.target_);
+  }
+
+  // NB: This should ONLY be used by the std::hash implementation
+  // for weak_intrusive_ptr.  Another way you could do this is
+  // friend std::hash<weak_intrusive_ptr>, but this triggers two
+  // bugs:
+  //
+  //  (1) It triggers an nvcc bug, where std::hash in a friend class
+  //      declaration gets preprocessed into hash, which then cannot
+  //      actually be found.  The error in this case looks like:
+  //
+  //        error: no template named 'hash'; did you mean 'std::hash'?
+  //
+  //  (2) On OS X, std::hash is declared as a struct, not a class.
+  //      This triggers:
+  //
+  //        error: class 'hash' was previously declared as a struct
+  //        [-Werror,-Wmismatched-tags]
+  //
+  // Both of these are work-aroundable, but on the whole, I decided
+  // it would be simpler and easier to make work if we just expose
+  // an unsafe getter for target_
+  //
+  TTarget* _unsafe_get_target() const noexcept {
+    return target_;
+  }
+
+  uint32_t use_count() const noexcept {
+    if (target_ == NullType::singleton()) {
+      return 0;
+    }
+    return target_->refcount(
+        std::memory_order_relaxed); // refcount, not weakcount!
+  }
+
+  uint32_t weak_use_count() const noexcept {
+    if (target_ == NullType::singleton()) {
+      return 0;
+    }
+    return target_->weakcount(std::memory_order_relaxed);
+  }
+
+  bool expired() const noexcept {
+    return use_count() == 0;
+  }
+
+  intrusive_ptr<TTarget, NullType> lock() const noexcept {
+    if (target_ == NullType::singleton()) {
+      return intrusive_ptr<TTarget, NullType>();
+    } else {
+      bool increfed = false;
+      auto combined_refcount =
+          target_->combined_refcount_.load(std::memory_order_relaxed);
+      do {
+        if (detail::refcount(combined_refcount) == 0) {
+          // Object already destructed, no strong references left anymore.
+          // Return nullptr.
+          return intrusive_ptr<TTarget, NullType>();
+        }
+        if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+          if (detail::has_pyobject(combined_refcount) &&
+              detail::refcount(combined_refcount) == 1 && !increfed) {
+            // Object has a python wrapper with no other C++ references.
+            // We need to incref the Python object before we acquire a
+            // strong reference to the C++ object to avoid a situation
+            // where the Python object is deallocated concurrently.
+            if (!target_->try_incref_pyobject()) {
+              return intrusive_ptr<TTarget, NullType>();
+            }
+            increfed = true;
+          }
+        }
+      } while (!target_->combined_refcount_.compare_exchange_weak(
+          combined_refcount,
+          combined_refcount + detail::kReferenceCountOne,
+          std::memory_order_acquire,
+          std::memory_order_relaxed));
+
+      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+        if (increfed && detail::refcount(combined_refcount) != 1) {
+          target_->decref_pyobject();
+        }
+      }
+
+      return intrusive_ptr<TTarget, NullType>(
+          target_, raw::DontIncreaseRefcount{});
+    }
+  }
+
+  /**
+   * Returns an owning (but still only weakly referenced) pointer to the
+   * underlying object and makes the weak_intrusive_ptr instance invalid.
+   * That means the weakcount is not decreased.
+   * You *must* put the returned pointer back into a weak_intrusive_ptr using
+   * weak_intrusive_ptr::reclaim(ptr) to properly destruct it.
+   * This is helpful for C APIs.
+   */
+  TTarget* release() noexcept {
+    TTarget* result = target_;
+    target_ = NullType::singleton();
+    return result;
+  }
+
+  /**
+   * Takes an owning (but must be weakly referenced) pointer to TTarget* and
+   * creates a weak_intrusive_ptr that takes over ownership.
+   * This means that the weakcount is not increased.
+   * This is the counter-part to weak_intrusive_ptr::release() and the pointer
+   * passed in *must* have been created using weak_intrusive_ptr::release().
+   */
+  static weak_intrusive_ptr reclaim(TTarget* owning_weak_ptr) {
+    // See Note [Stack allocated intrusive_ptr_target safety]
+    // if refcount > 0, weakcount must be >1 for weak references to exist.
+    // see weak counting explanation at top of this file.
+    // if refcount == 0, weakcount only must be >0.
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        owning_weak_ptr == NullType::singleton() ||
+            owning_weak_ptr->weakcount() > 1 ||
+            (owning_weak_ptr->refcount() == 0 &&
+             owning_weak_ptr->weakcount() > 0),
+        "weak_intrusive_ptr: Can only weak_intrusive_ptr::reclaim() owning pointers that were created using weak_intrusive_ptr::release().");
+    return weak_intrusive_ptr(owning_weak_ptr);
+  }
+
+  /**
+   * Takes a pointer to TTarget* (may be weak or strong) and creates a
+   * new weak_intrusive_ptr representing a new weak reference, i.e.
+   * the raw pointer retains ownership.
+   */
+  static weak_intrusive_ptr reclaim_copy(TTarget* owning_ptr) {
+    auto ret = reclaim(owning_ptr);
+    ret.retain_();
+    return ret;
+  }
+
+  template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+  friend bool operator<(
+      const weak_intrusive_ptr<TTarget1, NullType1>& lhs,
+      const weak_intrusive_ptr<TTarget2, NullType2>& rhs) noexcept;
+  template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+  friend bool operator==(
+      const weak_intrusive_ptr<TTarget1, NullType1>& lhs,
+      const weak_intrusive_ptr<TTarget2, NullType2>& rhs) noexcept;
+};
+
+template <class TTarget, class NullType>
+inline void swap(
+    weak_intrusive_ptr<TTarget, NullType>& lhs,
+    weak_intrusive_ptr<TTarget, NullType>& rhs) noexcept {
+  lhs.swap(rhs);
+}
+
+// To allow weak_intrusive_ptr inside std::map or std::set, we need operator<
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator<(
+    const weak_intrusive_ptr<TTarget1, NullType1>& lhs,
+    const weak_intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return lhs.target_ < rhs.target_;
+}
+
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator==(
+    const weak_intrusive_ptr<TTarget1, NullType1>& lhs,
+    const weak_intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return lhs.target_ == rhs.target_;
+}
+
+template <class TTarget1, class NullType1, class TTarget2, class NullType2>
+inline bool operator!=(
+    const weak_intrusive_ptr<TTarget1, NullType1>& lhs,
+    const weak_intrusive_ptr<TTarget2, NullType2>& rhs) noexcept {
+  return !operator==(lhs, rhs);
+}
+
+// Alias for documentary purposes, to more easily distinguish
+// weak raw intrusive pointers from intrusive pointers.
+using weak_intrusive_ptr_target = intrusive_ptr_target;
+
+// This namespace provides some methods for working with
+// raw pointers that subclass intrusive_ptr_target.  They are not provided
+// as methods on intrusive_ptr_target, because ideally you would not need these
+// methods at all (use smart pointers), but if you are dealing with legacy code
+// that still needs to pass around raw pointers, you may find these quite
+// useful.
+//
+// An important usage note: some functions are only valid if you have a
+// strong raw pointer to the object, while others are only valid if you
+// have a weak raw pointer to the object.  ONLY call intrusive_ptr namespace
+// functions on strong pointers, and weak_intrusive_ptr namespace functions
+// on weak pointers.  If you mix it up, you may get an assert failure.
+namespace raw {
+
+namespace intrusive_ptr {
+
+// WARNING: Unlike the reclaim() API, it is NOT valid to pass
+// NullType::singleton to this function
+inline void incref(intrusive_ptr_target* self) {
+  if (self) {
+    uint64_t combined = detail::atomic_combined_refcount_increment(
+        self->combined_refcount_, detail::kReferenceCountOne);
+
+#ifndef C10_MOBILE
+    if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) {
+      self->incref_pyobject();
+    }
+#else
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined));
+#endif
+  }
+}
+
+// WARNING: Unlike the reclaim() API, it is NOT valid to pass
+// NullType::singleton to this function
+inline void decref(intrusive_ptr_target* self) {
+  // Let it die
+  c10::intrusive_ptr<intrusive_ptr_target>::reclaim(self);
+  // NB: Caller still has 'self' pointer, but it's now invalid.
+  // If you want more safety, use the actual c10::intrusive_ptr class
+}
+
+template <typename T>
+inline T* make_weak(T* self) {
+  // NB: 'this' is a strong pointer, but we return a weak pointer
+  auto ptr = c10::intrusive_ptr<T>::reclaim(self);
+  c10::weak_intrusive_ptr<T> wptr(ptr);
+  ptr.release();
+  return wptr.release();
+}
+
+inline uint32_t use_count(intrusive_ptr_target* self) {
+  auto ptr = c10::intrusive_ptr<intrusive_ptr_target>::reclaim(self);
+  auto r = ptr.use_count();
+  ptr.release();
+  return r;
+}
+
+} // namespace intrusive_ptr
+
+namespace weak_intrusive_ptr {
+
+inline void incref(weak_intrusive_ptr_target* self) {
+  detail::atomic_weakcount_increment(self->combined_refcount_);
+}
+
+inline void decref(weak_intrusive_ptr_target* self) {
+  // Let it die
+  c10::weak_intrusive_ptr<intrusive_ptr_target>::reclaim(self);
+  // NB: You still "have" the 'self' pointer, but it's now invalid.
+  // If you want more safety, use the actual c10::weak_intrusive_ptr class
+}
+
+template <typename T>
+inline T* lock(T* self) {
+  auto wptr = c10::weak_intrusive_ptr<T>::reclaim(self);
+  auto ptr = wptr.lock();
+  wptr.release();
+  return ptr.release();
+}
+
+// This gives the STRONG refcount of a WEAK pointer
+inline uint32_t use_count(weak_intrusive_ptr_target* self) {
+  auto wptr = c10::weak_intrusive_ptr<intrusive_ptr_target>::reclaim(self);
+  auto r = wptr.use_count();
+  wptr.release();
+  return r;
+}
+
+} // namespace weak_intrusive_ptr
+
+} // namespace raw
+
+} // namespace c10
+
+namespace std {
+// To allow intrusive_ptr and weak_intrusive_ptr inside std::unordered_map or
+// std::unordered_set, we need std::hash
+template <class TTarget, class NullType>
+struct hash<c10::intrusive_ptr<TTarget, NullType>> {
+  size_t operator()(const c10::intrusive_ptr<TTarget, NullType>& x) const {
+    return std::hash<TTarget*>()(x.get());
+  }
+};
+template <class TTarget, class NullType>
+struct hash<c10::weak_intrusive_ptr<TTarget, NullType>> {
+  size_t operator()(const c10::weak_intrusive_ptr<TTarget, NullType>& x) const {
+    return std::hash<TTarget*>()(x._unsafe_get_target());
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc2a018db397a56dee0199af77509fc23dfe405b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/irange.h
@@ -0,0 +1,128 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+#pragma once
+
+#include <c10/util/TypeSafeSignMath.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+
+namespace c10 {
+
+namespace detail {
+
+template <
+    typename I,
+    bool one_sided = false,
+    std::enable_if_t<std::is_integral_v<I>, int> = 0>
+struct integer_iterator {
+  using iterator_category = std::input_iterator_tag;
+  using value_type = I;
+  using difference_type = std::ptrdiff_t;
+  using pointer = I*;
+  using reference = I&;
+
+  explicit constexpr integer_iterator(I val) : value(val) {}
+
+  constexpr I operator*() const {
+    return value;
+  }
+
+  constexpr I const* operator->() const {
+    return &value;
+  }
+
+  constexpr integer_iterator& operator++() {
+    ++value;
+    return *this;
+  }
+
+  constexpr integer_iterator operator++(int) {
+    const auto copy = *this;
+    ++*this;
+    return copy;
+  }
+
+  constexpr bool operator==(const integer_iterator& other) const {
+    if constexpr (one_sided) {
+      // Range-for loops' end test is `begin != end`, not `begin <
+      // end`. To handle `c10::irange(n)` where n < 0 (which should be
+      // empty), we just make `begin != end` fail whenever `end` is
+      // negative.
+      return is_negative(other.value) || value == other.value;
+    } else {
+      return value == other.value;
+    }
+    // Suppress "warning: missing return statement at end of non-void function"
+    // which Nvidia's Robert Crovella confirms is an NVCC compiler error
+    // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27
+    // `__builtin_unreachable();` would be best here, but it's not
+    // available with all compilers. So we instead return an arbitrary
+    // value trusting that this line will, in fact, never be reached.
+    return false; // Horrible hack
+  }
+
+  constexpr bool operator!=(const integer_iterator& other) const {
+    return !(*this == other);
+  }
+
+ protected:
+  I value;
+};
+
+} // namespace detail
+
+template <
+    typename I,
+    bool one_sided = false,
+    std::enable_if_t<std::is_integral_v<I>, bool> = true>
+struct integer_range {
+ public:
+  constexpr integer_range(I begin, I end) : begin_(begin), end_(end) {}
+  using iterator = detail::integer_iterator<I, one_sided>;
+  constexpr iterator begin() const {
+    return begin_;
+  }
+  constexpr iterator end() const {
+    return end_;
+  }
+
+ private:
+  iterator begin_;
+  iterator end_;
+};
+
+/// Creates an integer range for the half-open interval [begin, end)
+/// If end<=begin, then the range is empty.
+/// The range has the type of the `end` integer; `begin` integer is
+/// cast to this type.
+template <
+    typename Integer1,
+    typename Integer2,
+    std::enable_if_t<std::is_integral_v<Integer1>, bool> = true,
+    std::enable_if_t<std::is_integral_v<Integer2>, bool> = true>
+constexpr integer_range<Integer2> irange(Integer1 begin, Integer2 end) {
+  // Cast `begin` once, then clamp the terminator so it is never below it:
+  // whenever end<=begin the two bounds coincide and the range is empty.
+  const Integer2 first = static_cast<Integer2>(begin);
+  const Integer2 last = std::max(first, end);
+  return {first, last};
+}
+
+/// Creates an integer range for the half-open interval [0, end)
+/// If end<=0, then the range is empty
+template <
+    typename Integer,
+    std::enable_if_t<std::is_integral_v<Integer>, bool> = true>
+constexpr integer_range<Integer, true> irange(Integer end) {
+  return {Integer(), end};
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h
new file mode 100644
index 0000000000000000000000000000000000000000..6884e20d112ace8886c69b10499f830c58c3703f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/llvmMathExtras.h
@@ -0,0 +1,910 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some functions that are useful for math stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <c10/util/bit_cast.h>
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#ifdef __ANDROID_NDK__
+#include <android/api-level.h>
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// LLVM_GNUC_PREREQ(maj, min, patch) is non-zero when the compiler's GNU C
+// version macros report at least version maj.min.patch. When
+// __GNUC_PATCHLEVEL__ is unavailable only maj.min is compared, and when the
+// version macros are missing altogether the macro evaluates to 0.
+// It is defined here only if no earlier definition exists.
+#ifndef LLVM_GNUC_PREREQ
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define LLVM_GNUC_PREREQ(maj, min, patch)                             \
+  ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \
+   ((maj) << 20) + ((min) << 10) + (patch))
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__)
+#define LLVM_GNUC_PREREQ(maj, min, patch) \
+  ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10))
+#else
+#define LLVM_GNUC_PREREQ(maj, min, patch) 0
+#endif
+#endif
+
+#ifdef _MSC_VER
+// Declare these intrinsics manually rather than including intrin.h. It's very
+// expensive, and MathExtras.h is popular.
+// #include <intrin.h>
+extern "C" {
+unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask);
+unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask);
+unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask);
+unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask);
+}
+#endif
+
+namespace c10::llvm {
+/// The behavior an operation has on an input of 0.
+/// Passed as the `ZB` argument of the counting / find helpers in this header;
+/// each helper documents which of these values it accepts.
+enum ZeroBehavior {
+  /// The returned value is undefined.
+  ZB_Undefined,
+  /// The returned value is numeric_limits<T>::max()
+  ZB_Max,
+  /// The returned value is numeric_limits<T>::digits
+  ZB_Width
+};
+
+namespace detail {
+/// Generic fallback that counts the trailing zero bits of a value by
+/// bisection. The ZeroBehavior argument is ignored: a zero input always
+/// yields numeric_limits<T>::digits.
+template <typename T, std::size_t SizeOfT>
+struct TrailingZerosCounter {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
+    if (!Val)
+      return std::numeric_limits<T>::digits;
+    // An odd value has no trailing zeros, so the search can be skipped.
+    if (Val & 0x1)
+      return 0;
+
+    // Bisection method.
+    std::size_t ZeroBits = 0;
+    // Start with a window of half the word; Mask selects the low `Shift` bits.
+    T Shift = std::numeric_limits<T>::digits >> 1;
+    T Mask = std::numeric_limits<T>::max() >> Shift;
+    while (Shift) {
+      // If the low `Shift` bits are all zero, discard them and record them.
+      if ((Val & Mask) == 0) {
+        Val >>= Shift;
+        ZeroBits |= Shift;
+      }
+      // Halve the window and shrink the mask to match it.
+      Shift >>= 1;
+      Mask >>= Shift;
+    }
+    return ZeroBits;
+  }
+};
+
+#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER)
+/// Specialization for 4-byte types that defers to a compiler intrinsic.
+/// A zero input yields 32 unless \p ZB is ZB_Undefined, in which case the
+/// zero is passed to the intrinsic unchanged.
+template <typename T>
+struct TrailingZerosCounter<T, 4> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 32;
+
+    // Prefer the GCC/Clang builtin; otherwise use the MSVC bit scan declared
+    // at the top of this header.
+#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0)
+    return __builtin_ctz(Val);
+#elif defined(_MSC_VER)
+    unsigned long Index;
+    _BitScanForward(&Index, Val);
+    return Index;
+#endif
+  }
+};
+
+#if !defined(_MSC_VER) || defined(_M_X64)
+/// Specialization for 8-byte types that defers to a compiler intrinsic.
+/// A zero input yields 64 unless \p ZB is ZB_Undefined, in which case the
+/// zero is passed to the intrinsic unchanged.
+template <typename T>
+struct TrailingZerosCounter<T, 8> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 64;
+
+    // Prefer the GCC/Clang builtin; otherwise use the 64-bit MSVC bit scan.
+#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0)
+    return __builtin_ctzll(Val);
+#elif defined(_MSC_VER)
+    unsigned long Index;
+    _BitScanForward64(&Index, Val);
+    return Index;
+#endif
+  }
+};
+#endif
+#endif
+} // namespace detail
+
+/// Returns how many consecutive 0 bits \p Val has, counting upward from the
+/// least significant bit and stopping at the first 1.
+///
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+///   valid arguments.
+template <typename T>
+std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  using Limits = std::numeric_limits<T>;
+  static_assert(
+      Limits::is_integer && !Limits::is_signed,
+      "Only unsigned integral types are allowed.");
+  // Dispatch on the byte width of T to pick the best implementation.
+  using Counter = llvm::detail::TrailingZerosCounter<T, sizeof(T)>;
+  return Counter::count(Val, ZB);
+}
+
+namespace detail {
+/// Generic fallback that counts the leading zero bits of a value by
+/// bisection. The ZeroBehavior argument is ignored: a zero input always
+/// yields numeric_limits<T>::digits.
+template <typename T, std::size_t SizeOfT>
+struct LeadingZerosCounter {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
+    if (!Val)
+      return std::numeric_limits<T>::digits;
+
+    // Bisection method.
+    std::size_t ZeroBits = 0;
+    for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
+      // If anything remains after dropping the low `Shift` bits, the top set
+      // bit is in the upper part: keep searching there. Otherwise the upper
+      // `Shift` bits of the current window are all zero, so record them.
+      T Tmp = Val >> Shift;
+      if (Tmp)
+        Val = Tmp;
+      else
+        ZeroBits |= Shift;
+    }
+    return ZeroBits;
+  }
+};
+
+#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER)
+/// Specialization for 4-byte types that defers to a compiler intrinsic.
+/// A zero input yields 32 unless \p ZB is ZB_Undefined, in which case the
+/// zero is passed to the intrinsic unchanged.
+template <typename T>
+struct LeadingZerosCounter<T, 4> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 32;
+
+#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0)
+    return __builtin_clz(Val);
+#elif defined(_MSC_VER)
+    unsigned long Index;
+    _BitScanReverse(&Index, Val);
+    // For a bit position p in [0, 31], p ^ 31 equals 31 - p, which turns the
+    // reported index into a leading-zero count.
+    return Index ^ 31;
+#endif
+  }
+};
+
+#if !defined(_MSC_VER) || defined(_M_X64)
+/// Specialization for 8-byte types that defers to a compiler intrinsic.
+/// A zero input yields 64 unless \p ZB is ZB_Undefined, in which case the
+/// zero is passed to the intrinsic unchanged.
+template <typename T>
+struct LeadingZerosCounter<T, 8> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 64;
+
+#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0)
+    return __builtin_clzll(Val);
+#elif defined(_MSC_VER)
+    unsigned long Index;
+    _BitScanReverse64(&Index, Val);
+    // For a bit position p in [0, 63], p ^ 63 equals 63 - p, which turns the
+    // reported index into a leading-zero count.
+    return Index ^ 63;
+#endif
+  }
+};
+#endif
+#endif
+} // namespace detail
+
+/// Returns how many consecutive 0 bits \p Val has, counting downward from the
+/// most significant bit and stopping at the first 1.
+///
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+///   valid arguments.
+template <typename T>
+std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  using Limits = std::numeric_limits<T>;
+  static_assert(
+      Limits::is_integer && !Limits::is_signed,
+      "Only unsigned integral types are allowed.");
+  // Dispatch on the byte width of T to pick the best implementation.
+  using Counter = llvm::detail::LeadingZerosCounter<T, sizeof(T)>;
+  return Counter::count(Val, ZB);
+}
+
+/// Returns the index of the lowest set bit of \p Val, where index 0 is the
+/// least significant bit.
+///
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+///   valid arguments.
+template <typename T>
+T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
+  const bool ZeroMapsToMax = (ZB == ZB_Max) && (Val == 0);
+  if (!ZeroMapsToMax)
+    return countTrailingZeros(Val, ZB_Undefined);
+
+  return std::numeric_limits<T>::max();
+}
+
+/// Builds a mask whose N least significant bits are 1 and whose remaining
+/// bits are 0. T must be an unsigned type and N must not exceed its width.
+template <typename T>
+T maskTrailingOnes(unsigned N) {
+  static_assert(std::is_unsigned_v<T>, "Invalid type!");
+  const unsigned Width = CHAR_BIT * sizeof(T);
+  assert(N <= Width && "Invalid bit index");
+  // Shifting by the full width would be undefined, so N == 0 is handled
+  // separately.
+  if (N == 0)
+    return 0;
+  return T(-1) >> (Width - N);
+}
+
+/// Builds a mask whose N most significant bits are 1 and whose remaining
+/// bits are 0. T must be an unsigned type.
+template <typename T>
+T maskLeadingOnes(unsigned N) {
+  // The complement of "all bits except the top N" is exactly the top N bits.
+  const unsigned LowCount = CHAR_BIT * sizeof(T) - N;
+  return ~maskTrailingOnes<T>(LowCount);
+}
+
+/// Builds a mask whose N least significant bits are 0 and whose remaining
+/// bits are 1. T must be an unsigned type.
+template <typename T>
+T maskTrailingZeros(unsigned N) {
+  // Every bit above the low N is a leading one.
+  const unsigned HighCount = CHAR_BIT * sizeof(T) - N;
+  return maskLeadingOnes<T>(HighCount);
+}
+
+/// Builds a mask whose N most significant bits are 0 and whose remaining
+/// bits are 1. T must be an unsigned type.
+template <typename T>
+T maskLeadingZeros(unsigned N) {
+  // Every bit below the top N is a trailing one.
+  const unsigned LowCount = CHAR_BIT * sizeof(T) - N;
+  return maskTrailingOnes<T>(LowCount);
+}
+
+/// Returns the index of the highest set bit of \p Val, where index 0 is the
+/// least significant bit.
+///
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+///   valid arguments.
+template <typename T>
+T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
+  if (ZB == ZB_Max && Val == 0)
+    return std::numeric_limits<T>::max();
+
+  const std::size_t Leading = countLeadingZeros(Val, ZB_Undefined);
+  // Use ^ instead of - because both gcc and llvm can remove the associated ^
+  // in the __builtin_clz intrinsic on x86.
+  return Leading ^ (std::numeric_limits<T>::digits - 1);
+}
+
+/// Macro compressed bit reversal table covering all 256 byte values:
+/// entry i holds the value of i with its 8 bits in reverse order.
+/// The R2/R4/R6 helper macros exist only to generate the initializer and are
+/// undefined again right after it.
+///
+/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+/// NOLINTNEXTLINE(*c-arrays*)
+static constexpr unsigned char BitReverseTable256[256] = {
+#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
+#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
+#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
+    R6(0),
+    R6(2),
+    R6(1),
+    R6(3)
+#undef R2
+#undef R4
+#undef R6
+};
+
+/// Returns \p Val with the order of all of its bits reversed.
+/// Works on the object representation: each byte is bit-reversed through
+/// BitReverseTable256 and the byte order is mirrored.
+template <typename T>
+T reverseBits(T Val) {
+  // NOLINTNEXTLINE(*c-arrays*)
+  unsigned char Src[sizeof(T)];
+  // NOLINTNEXTLINE(*c-arrays*)
+  unsigned char Dst[sizeof(T)];
+  std::memcpy(Src, &Val, sizeof(T));
+  for (std::size_t Idx = 0; Idx != sizeof(T); ++Idx) {
+    const std::size_t Mirror = sizeof(T) - 1 - Idx;
+    Dst[Mirror] = BitReverseTable256[Src[Idx]];
+  }
+  std::memcpy(&Val, Dst, sizeof(T));
+  return Val;
+}
+
+// NOTE: The following support functions use the _32/_64 extensions instead of
+// type overloading so that signed and unsigned integers can be used without
+// ambiguity.
+
+/// Extracts the upper 32 bits of a 64-bit value.
+constexpr inline uint32_t Hi_32(uint64_t Value) {
+  const uint64_t Upper = Value >> 32;
+  return static_cast<uint32_t>(Upper);
+}
+
+/// Extracts the lower 32 bits of a 64-bit value.
+constexpr inline uint32_t Lo_32(uint64_t Value) {
+  const uint64_t Lower = Value & UINT64_C(0xFFFFFFFF);
+  return static_cast<uint32_t>(Lower);
+}
+
+/// Combines two 32-bit halves into one 64-bit integer: \p High becomes the
+/// upper 32 bits and \p Low the lower 32 bits.
+constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
+  const uint64_t Upper = static_cast<uint64_t>(High) << 32;
+  const uint64_t Lower = static_cast<uint64_t>(Low);
+  return Upper | Lower;
+}
+
+/// Tells whether the signed value \p x is representable in N bits, i.e.
+/// lies in [-2^(N-1), 2^(N-1)). Any N >= 64 accepts every int64_t.
+template <unsigned N>
+constexpr inline bool isInt(int64_t x) {
+  if (N >= 64)
+    return true;
+  return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
+}
+// Template specializations to get better code for common cases.
+template <>
+constexpr inline bool isInt<8>(int64_t x) {
+  return INT8_MIN <= x && x <= INT8_MAX;
+}
+template <>
+constexpr inline bool isInt<16>(int64_t x) {
+  return INT16_MIN <= x && x <= INT16_MAX;
+}
+template <>
+constexpr inline bool isInt<32>(int64_t x) {
+  return INT32_MIN <= x && x <= INT32_MAX;
+}
+
+/// Tells whether the signed value \p x is an N-bit number shifted left by S,
+/// i.e. fits in N + S bits and is a multiple of 2^S.
+template <unsigned N, unsigned S>
+constexpr inline bool isShiftedInt(int64_t x) {
+  static_assert(
+      N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
+  static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
+  if (!isInt<N + S>(x))
+    return false;
+  // The low S bits must all be clear.
+  return x % (UINT64_C(1) << S) == 0;
+}
+
+/// Tells whether the unsigned value fits into N bits.
+///
+/// Two overloads are used instead of the single expression
+///
+///   return N >= 64 || X < (UINT64_C(1) << N);
+///
+/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
+/// left too many places.
+template <unsigned N>
+constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
+  static_assert(N > 0, "isUInt<0> doesn't make sense");
+  // X fits in N bits exactly when nothing is left above bit N - 1.
+  return (X >> N) == 0;
+}
+template <unsigned N>
+constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t /*X*/) {
+  return true;
+}
+
+// Template specializations to get better code for common cases.
+template <>
+constexpr inline bool isUInt<8>(uint64_t x) {
+  return x <= UINT8_MAX;
+}
+template <>
+constexpr inline bool isUInt<16>(uint64_t x) {
+  return x <= UINT16_MAX;
+}
+template <>
+constexpr inline bool isUInt<32>(uint64_t x) {
+  return x <= UINT32_MAX;
+}
+
+/// Tells whether the unsigned value \p x is an N-bit number shifted left by
+/// S, i.e. fits in N + S bits and is a multiple of 2^S.
+template <unsigned N, unsigned S>
+constexpr inline bool isShiftedUInt(uint64_t x) {
+  static_assert(
+      N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
+  static_assert(
+      N + S <= 64, "isShiftedUInt<N, S> with N + S > 64 is too wide.");
+  if (!isUInt<N + S>(x))
+    return false;
+  // Per the two static_asserts above, S must be strictly less than 64.  So
+  // 1 << S is not undefined behavior.
+  return x % (UINT64_C(1) << S) == 0;
+}
+
+/// Returns the largest value an N-bit unsigned integer can hold.
+/// N must be in [1, 64] (checked by assert).
+inline uint64_t maxUIntN(uint64_t N) {
+  assert(N > 0 && N <= 64 && "integer width out of range");
+
+  // uint64_t(1) << 64 is undefined behavior, so (uint64_t(1) << N) - 1 would
+  // need a check for N == 64. Dropping the unused high bits from an all-ones
+  // word works for every valid N and has no branch.
+  const uint64_t UnusedBits = 64 - N;
+  return UINT64_MAX >> UnusedBits;
+}
+
+// Ignore the false warning "Arithmetic overflow" for MSVC
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+
+/// Returns the smallest value an N-bit signed integer can hold.
+/// N must be in [1, 64] (checked by assert).
+inline int64_t minIntN(int64_t N) {
+  assert(N > 0 && N <= 64 && "integer width out of range");
+  // Negate in unsigned arithmetic; the conversion to int64_t happens on
+  // return.
+  const uint64_t Magnitude = UINT64_C(1) << (N - 1);
+  // NOLINTNEXTLINE(*-narrowing-conversions)
+  return -Magnitude;
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/// Returns the largest value an N-bit signed integer can hold.
+/// N must be in [1, 64] (checked by assert).
+inline int64_t maxIntN(int64_t N) {
+  assert(N > 0 && N <= 64 && "integer width out of range");
+
+  // This relies on two's complement wraparound when N == 64, so the
+  // arithmetic stays unsigned and the conversion to int64_t happens only on
+  // return to avoid UB.
+  const uint64_t SignBit = UINT64_C(1) << (N - 1);
+  // NOLINTNEXTLINE(*-narrowing-conversions)
+  return SignBit - 1;
+}
+
+/// Tells whether the unsigned value \p x fits into \p N bits, with the width
+/// given at run time. Any N >= 64 accepts every uint64_t.
+inline bool isUIntN(unsigned N, uint64_t x) {
+  if (N >= 64)
+    return true;
+  return x <= maxUIntN(N);
+}
+
+/// Tells whether the signed value \p x fits into \p N bits, with the width
+/// given at run time. Any N >= 64 accepts every int64_t.
+inline bool isIntN(unsigned N, int64_t x) {
+  if (N >= 64)
+    return true;
+  return minIntN(N) <= x && x <= maxIntN(N);
+}
+
+/// Tells whether \p Value is a non-empty run of ones that starts at the least
+/// significant bit, with every higher bit zero (32 bit version).
+/// Ex. isMask_32(0x0000FFFFU) == true.
+constexpr inline bool isMask_32(uint32_t Value) {
+  if (Value == 0)
+    return false;
+  // Adding 1 to a low run of ones clears the whole run.
+  return ((Value + 1) & Value) == 0;
+}
+
+/// Tells whether \p Value is a non-empty run of ones that starts at the least
+/// significant bit, with every higher bit zero (64 bit version).
+constexpr inline bool isMask_64(uint64_t Value) {
+  if (Value == 0)
+    return false;
+  // Adding 1 to a low run of ones clears the whole run.
+  return ((Value + 1) & Value) == 0;
+}
+
+/// Tells whether \p Value consists of one non-empty contiguous run of ones,
+/// with all other bits zero (32 bit version).
+/// Ex. isShiftedMask_32(0x0000FF00U) == true.
+constexpr inline bool isShiftedMask_32(uint32_t Value) {
+  if (Value == 0)
+    return false;
+  // Filling the zeros below the run turns a shifted mask into a plain mask.
+  const uint32_t Filled = (Value - 1) | Value;
+  return isMask_32(Filled);
+}
+
+/// Tells whether \p Value consists of one non-empty contiguous run of ones,
+/// with all other bits zero (64 bit version).
+constexpr inline bool isShiftedMask_64(uint64_t Value) {
+  if (Value == 0)
+    return false;
+  // Filling the zeros below the run turns a shifted mask into a plain mask.
+  const uint64_t Filled = (Value - 1) | Value;
+  return isMask_64(Filled);
+}
+
+/// Tells whether \p Value is a power of two greater than zero.
+/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
+constexpr inline bool isPowerOf2_32(uint32_t Value) {
+  // Clearing the lowest set bit leaves nothing iff exactly one bit was set.
+  return Value != 0 && (Value & (Value - 1)) == 0;
+}
+
+/// Tells whether \p Value is a power of two greater than zero (64 bit
+/// edition.)
+constexpr inline bool isPowerOf2_64(uint64_t Value) {
+  // Clearing the lowest set bit leaves nothing iff exactly one bit was set.
+  return Value != 0 && (Value & (Value - 1)) == 0;
+}
+
+/// Returns how many consecutive 1 bits \p Value has, counting downward from
+/// the most significant bit and stopping at the first 0.
+///
+/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+  using Limits = std::numeric_limits<T>;
+  static_assert(
+      Limits::is_integer && !Limits::is_signed,
+      "Only unsigned integral types are allowed.");
+  // Leading ones of Value are the leading zeros of its complement.
+  const T Inverted = ~Value;
+  return countLeadingZeros<T>(Inverted, ZB);
+}
+
+/// Returns how many consecutive 1 bits \p Value has, counting upward from
+/// the least significant bit and stopping at the first 0.
+///
+/// Ex. countTrailingOnes(0x00FF00FF) == 8.
+/// T must be an unsigned integral type.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+  using Limits = std::numeric_limits<T>;
+  static_assert(
+      Limits::is_integer && !Limits::is_signed,
+      "Only unsigned integral types are allowed.");
+  // Trailing ones of Value are the trailing zeros of its complement.
+  const T Inverted = ~Value;
+  return countTrailingZeros<T>(Inverted, ZB);
+}
+
+namespace detail {
+/// Counts the set bits of a value whose type is at most 4 bytes wide
+/// (enforced by the static_assert). Uses the compiler builtin on GNU
+/// compilers and a portable parallel bit-count otherwise.
+template <typename T, std::size_t SizeOfT>
+struct PopulationCounter {
+  static unsigned count(T Value) {
+    // Generic version, forward to 32 bits.
+    static_assert(SizeOfT <= 4, "Not implemented!");
+#if defined(__GNUC__) && __GNUC__ >= 4
+    return __builtin_popcount(Value);
+#else
+    uint32_t v = Value;
+    // Sum adjacent bits into 2-bit fields, then adjacent fields into 4-bit
+    // fields.
+    v = v - ((v >> 1) & 0x55555555);
+    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+    // '+' binds tighter than '&', so the mask applies to (v + (v >> 4)). The
+    // multiply then accumulates the four byte counts into the top byte.
+    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
+#endif
+  }
+};
+
+/// Specialization that counts the set bits of an 8-byte value. Uses the
+/// compiler builtin on GNU compilers and a portable parallel bit-count
+/// otherwise.
+template <typename T>
+struct PopulationCounter<T, 8> {
+  static unsigned count(T Value) {
+#if defined(__GNUC__) && __GNUC__ >= 4
+    return __builtin_popcountll(Value);
+#else
+    uint64_t v = Value;
+    // Sum adjacent bits into 2-bit, then 4-bit, then 8-bit fields.
+    v = v - ((v >> 1) & 0x5555555555555555ULL);
+    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
+    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    // The multiply accumulates the eight byte counts into the top byte.
+    return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
+#endif
+  }
+};
+} // namespace detail
+
+/// Returns the number of 1 bits in \p Value.
+/// Ex. countPopulation(0xF000F000) = 8
+/// Returns 0 if the word is zero.
+/// T must be an unsigned integral type.
+template <typename T>
+inline unsigned countPopulation(T Value) {
+  using Limits = std::numeric_limits<T>;
+  static_assert(
+      Limits::is_integer && !Limits::is_signed,
+      "Only unsigned integral types are allowed.");
+  // Dispatch on the byte width of T.
+  using Counter = detail::PopulationCounter<T, sizeof(T)>;
+  return Counter::count(Value);
+}
+
+/// Return the log base 2 of the specified value.
+///
+/// \param Value the input, forwarded unchanged to the math routine.
+/// \returns the base-2 logarithm of \p Value as a double.
+inline double Log2(double Value) {
+#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
+  // NOTE(review): presumably log2 is unavailable on these Android API levels,
+  // so the result is derived from the natural logarithm instead.
+  return __builtin_log(Value) / __builtin_log(2.0);
+#else
+  // Qualified on purpose: <cmath> only guarantees the std:: declaration.
+  return std::log2(Value);
+#endif
+}
+
+/// Returns floor(log2(Value)) for a 32-bit value. For zero the subtraction
+/// wraps, so the result is -1 converted to unsigned.
+/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
+inline unsigned Log2_32(uint32_t Value) {
+  const std::size_t Leading = countLeadingZeros(Value);
+  return static_cast<unsigned>(31 - Leading);
+}
+
+/// Returns floor(log2(Value)) for a 64-bit value. For zero the subtraction
+/// wraps, so the result is -1 converted to unsigned.
+inline unsigned Log2_64(uint64_t Value) {
+  const std::size_t Leading = countLeadingZeros(Value);
+  return static_cast<unsigned>(63 - Leading);
+}
+
+/// Returns ceil(log2(Value)) for a 32-bit value, or 32 when the value is
+/// zero.
+/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
+inline unsigned Log2_32_Ceil(uint32_t Value) {
+  // Value - 1 wraps to all ones for zero, which has no leading zeros.
+  const uint32_t Pred = Value - 1;
+  const std::size_t Leading = countLeadingZeros(Pred);
+  return static_cast<unsigned>(32 - Leading);
+}
+
+/// Returns ceil(log2(Value)) for a 64-bit value, or 64 when the value is
+/// zero.
+inline unsigned Log2_64_Ceil(uint64_t Value) {
+  // Value - 1 wraps to all ones for zero, which has no leading zeros.
+  const uint64_t Pred = Value - 1;
+  const std::size_t Leading = countLeadingZeros(Pred);
+  return static_cast<unsigned>(64 - Leading);
+}
+
+/// Computes the greatest common divisor of \p A and \p B with Euclid's
+/// algorithm. When \p B is zero the result is \p A.
+inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
+  while (B != 0) {
+    const uint64_t Remainder = A % B;
+    A = B;
+    B = Remainder;
+  }
+  return A;
+}
+
+/// This function takes a 64-bit integer and returns the bit equivalent double.
+///
+/// \param Bits the raw 64-bit pattern to reinterpret.
+/// \returns a double whose object representation is a byte copy of \p Bits.
+inline double BitsToDouble(uint64_t Bits) {
+  double D = 0;
+  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
+  // Qualified on purpose: <cstring> only guarantees the std:: declaration
+  // (this also matches reverseBits in this header).
+  std::memcpy(&D, &Bits, sizeof(Bits));
+  return D;
+}
+
+/// Reinterprets a 32-bit integer as the float with the same bit pattern.
+inline float BitsToFloat(uint32_t Bits) {
+  // TODO: Use std::bit_cast once C++20 becomes available.
+  const float Result = c10::bit_cast<float>(Bits);
+  return Result;
+}
+
+/// This function takes a double and returns the bit equivalent 64-bit integer.
+/// Note that copying doubles around changes the bits of NaNs on some hosts,
+/// notably x86, so this routine cannot be used if these bits are needed.
+///
+/// \param Double the value whose object representation is read.
+/// \returns a 64-bit integer holding a byte copy of \p Double.
+inline uint64_t DoubleToBits(double Double) {
+  // Zero-initialized like the destination in BitsToDouble; memcpy overwrites
+  // every byte.
+  uint64_t Bits = 0;
+  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
+  // Qualified on purpose: <cstring> only guarantees the std:: declaration.
+  std::memcpy(&Bits, &Double, sizeof(Double));
+  return Bits;
+}
+
+/// This function takes a float and returns the bit equivalent 32-bit integer.
+/// Note that copying floats around changes the bits of NaNs on some hosts,
+/// notably x86, so this routine cannot be used if these bits are needed.
+///
+/// \param Float the value whose object representation is read.
+/// \returns a 32-bit integer holding a byte copy of \p Float.
+inline uint32_t FloatToBits(float Float) {
+  // Zero-initialized like the destination in BitsToDouble; memcpy overwrites
+  // every byte.
+  uint32_t Bits = 0;
+  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
+  // Qualified on purpose: <cstring> only guarantees the std:: declaration.
+  std::memcpy(&Bits, &Float, sizeof(Float));
+  return Bits;
+}
+
+/// A and B are either alignments or offsets. Returns the minimum alignment
+/// that may be assumed after adding the two together.
+constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
+  // The largest power of 2 that divides both A and B is the lowest set bit
+  // of (A | B).
+  const uint64_t Merged = A | B;
+  // "1 + ~x" is used in place of "-x" to avoid MSVC warning C4146.
+  return Merged & (1 + ~Merged);
+}
+
+/// Rounds the address \c Addr up to the next multiple of \c Alignment bytes.
+///
+/// Alignment should be a power of two.  Because the rounding is upward,
+/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8.
+inline uintptr_t alignAddr(const void* Addr, size_t Alignment) {
+  assert(
+      Alignment && isPowerOf2_64((uint64_t)Alignment) &&
+      "Alignment is not a power of two!");
+
+  const uintptr_t Raw = (uintptr_t)Addr;
+  // The bump below must not wrap around the address space.
+  assert(Raw + Alignment - 1 >= Raw);
+
+  const uintptr_t LowBits = (uintptr_t)(Alignment - 1);
+  return (Raw + Alignment - 1) & ~LowBits;
+}
+
+/// Returns how many bytes must be added to \c Ptr to reach the next address
+/// that is a multiple of \c Alignment (rounding up).
+inline size_t alignmentAdjustment(const void* Ptr, size_t Alignment) {
+  const uintptr_t Aligned = alignAddr(Ptr, Alignment);
+  return Aligned - (uintptr_t)Ptr;
+}
+
+/// Returns the smallest power of two (in 64 bits) that is strictly greater
+/// than A. Returns zero on overflow.
+inline uint64_t NextPowerOf2(uint64_t A) {
+  // Smear the highest set bit into every lower position (shifts of 1, 2, 4,
+  // 8, 16 and 32), then add one to step up to the next power of two.
+  for (unsigned Shift = 1; Shift <= 32; Shift *= 2)
+    A |= (A >> Shift);
+  return A + 1;
+}
+
+/// Returns the largest power of two that is less than or equal to \p A, or 0
+/// when \p A is 0. Essentially a floor operation across the domain of powers
+/// of two.
+inline uint64_t PowerOf2Floor(uint64_t A) {
+  if (A == 0)
+    return 0;
+  // Position of the highest set bit; A is non-zero here, so the undefined
+  // zero behavior is never triggered.
+  const std::size_t TopBit = 63 - countLeadingZeros(A, ZB_Undefined);
+  return 1ull << TopBit;
+}
+
+/// Returns the smallest power of two that is greater than or equal to \p A,
+/// or 0 when \p A is 0. Essentially a ceil operation across the domain of
+/// powers of two.
+inline uint64_t PowerOf2Ceil(uint64_t A) {
+  // NextPowerOf2 is strictly-greater, so step back by one first.
+  return A ? NextPowerOf2(A - 1) : 0;
+}
+
+/// Rounds \p Value up (mod 2**64) to the nearest integer that is a multiple
+/// of \p Align. \p Align must be non-zero.
+///
+/// With a non-zero \p Skew the result is instead the smallest integer that
+/// is greater than or equal to \p Value and has the form
+/// \p Align * N + \p Skew for some integer N. A \p Skew larger than
+/// \p Align is first reduced to '\p Skew mod \p Align'.
+///
+/// Examples:
+/// \code
+///   alignTo(5, 8) = 8
+///   alignTo(17, 8) = 24
+///   alignTo(~0LL, 8) = 0
+///   alignTo(321, 255) = 510
+///
+///   alignTo(5, 8, 7) = 7
+///   alignTo(17, 8, 1) = 17
+///   alignTo(~0LL, 8, 3) = 3
+///   alignTo(321, 255, 42) = 552
+/// \endcode
+inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+  assert(Align != 0u && "Align can't be 0.");
+  const uint64_t Offset = Skew % Align;
+  // Count whole Align-sized steps after removing the skew, then add it back.
+  const uint64_t Steps = (Value + Align - 1 - Offset) / Align;
+  return Steps * Align + Offset;
+}
+
+/// Rounds \p Value up (mod 2**64) to the nearest multiple of the
+/// compile-time constant \c Align. \c Align must be non-zero.
+template <uint64_t Align>
+constexpr inline uint64_t alignTo(uint64_t Value) {
+  static_assert(Align != 0u, "Align must be non-zero");
+  const uint64_t Steps = (Value + Align - 1) / Align;
+  return Steps * Align;
+}
+
+/// Returns the integer ceil(Numerator / Denominator).
+///
+/// The result is computed from the quotient and remainder rather than by
+/// rounding the numerator up first, so it is exact for every numerator,
+/// including values near UINT64_MAX where rounding up would wrap mod 2**64.
+///
+/// \param Numerator the value being divided.
+/// \param Denominator the divisor; must be non-zero.
+inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
+  assert(Denominator != 0u && "Denominator can't be 0.");
+  const uint64_t Quotient = Numerator / Denominator;
+  // Add one only when the division left a remainder.
+  const uint64_t RoundUp = (Numerator % Denominator != 0) ? 1 : 0;
+  return Quotient + RoundUp;
+}
+
+/// \c alignTo for contexts where a constant expression is required.
+/// \sa alignTo
+///
+/// Usage: AlignTo<Align>::from_value<Value>::value is \c Value rounded up to
+/// a multiple of \c Align, computed with the same formula as the unskewed
+/// alignTo.
+///
+/// \todo FIXME: remove when \c constexpr becomes really \c constexpr
+template <uint64_t Align>
+struct AlignTo {
+  static_assert(Align != 0u, "Align must be non-zero");
+  template <uint64_t Value>
+  struct from_value {
+    static const uint64_t value = (Value + Align - 1) / Align * Align;
+  };
+};
+
+/// Returns the largest uint64_t that is less than or equal to \p Value and
+/// congruent to \p Skew mod \p Align. \p Align must be non-zero.
+inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+  assert(Align != 0u && "Align can't be 0.");
+  const uint64_t Offset = Skew % Align;
+  // Count whole Align-sized steps below Value once the skew is removed.
+  const uint64_t Steps = (Value - Offset) / Align;
+  return Steps * Align + Offset;
+}
+
+/// Returns the distance from \p Value to the next integer (mod 2**64) that
+/// is greater than or equal to \p Value and a multiple of \p Align.
+/// \p Align must be non-zero.
+inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
+  const uint64_t Aligned = alignTo(Value, Align);
+  return Aligned - Value;
+}
+
+/// Sign-extends the number held in the low B bits of X to a 32-bit integer.
+/// Requires 0 < B <= 32 (enforced by static_assert).
+template <unsigned B>
+constexpr inline int32_t SignExtend32(uint32_t X) {
+  static_assert(B > 0, "Bit width can't be 0.");
+  static_assert(B <= 32, "Bit width out of range.");
+  // Move the B-bit field to the top of the word, then shift it back down as
+  // a signed value so its top bit is replicated.
+  constexpr unsigned Pad = 32 - B;
+  return int32_t(X << Pad) >> Pad;
+}
+
+/// Sign-extends the number held in the low B bits of X to a 32-bit integer,
+/// with the bit width given at run time.
+/// Requires 0 < B <= 32 (checked by assert).
+inline int32_t SignExtend32(uint32_t X, unsigned B) {
+  assert(B > 0 && "Bit width can't be 0.");
+  assert(B <= 32 && "Bit width out of range.");
+  // Move the B-bit field to the top of the word, then shift it back down as
+  // a signed value so its top bit is replicated.
+  const unsigned Pad = 32 - B;
+  return int32_t(X << Pad) >> Pad;
+}
+
+/// Sign-extends the number held in the low B bits of x to a 64-bit integer.
+/// Requires 0 < B <= 64 (enforced by static_assert).
+template <unsigned B>
+constexpr inline int64_t SignExtend64(uint64_t x) {
+  static_assert(B > 0, "Bit width can't be 0.");
+  static_assert(B <= 64, "Bit width out of range.");
+  // Move the B-bit field to the top of the word, then shift it back down as
+  // a signed value so its top bit is replicated.
+  constexpr unsigned Pad = 64 - B;
+  return int64_t(x << Pad) >> Pad;
+}
+
+/// Sign-extends the number held in the low B bits of X to a 64-bit integer,
+/// with the bit width given at run time.
+/// Requires 0 < B <= 64 (checked by assert).
+inline int64_t SignExtend64(uint64_t X, unsigned B) {
+  assert(B > 0 && "Bit width can't be 0.");
+  assert(B <= 64 && "Bit width out of range.");
+  // Move the B-bit field to the top of the word, then shift it back down as
+  // a signed value so its top bit is replicated.
+  const unsigned Pad = 64 - B;
+  return int64_t(X << Pad) >> Pad;
+}
+
+/// Returns |X - Y| for two unsigned integers of type T, always subtracting
+/// the smaller value from the larger one.
+template <typename T>
+std::enable_if_t<std::is_unsigned_v<T>, T> AbsoluteDifference(T X, T Y) {
+  if (X > Y)
+    return X - Y;
+  return Y - X;
+}
+
+/// Adds two unsigned integers X and Y of type T, clamping the result to the
+/// maximum representable value of T on overflow.
+///
+/// \param ResultOverflowed optional; when non-null it is set to whether the
+///   true sum is larger than the maximum representable value of T.
+template <typename T>
+std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(
+    T X,
+    T Y,
+    bool* ResultOverflowed = nullptr) {
+  // Hacker's Delight, p. 29: a wrapped sum is smaller than either operand.
+  const T Sum = X + Y;
+  const bool Wrapped = Sum < X || Sum < Y;
+  if (ResultOverflowed)
+    *ResultOverflowed = Wrapped;
+  return Wrapped ? std::numeric_limits<T>::max() : Sum;
+}
+
+/// Multiply two unsigned integers, X and Y, of type T.  Clamp the result to the
+/// maximum representable value of T on overflow.  ResultOverflowed indicates if
+/// the result is larger than the maximum representable value of type T.
+///
+/// \param X first factor.
+/// \param Y second factor.
+/// \param ResultOverflowed optional; when non-null it receives the overflow
+///   flag (it is reset to false before the product is examined).
+/// \returns X * Y, or numeric_limits<T>::max() when the product overflows.
+template <typename T>
+std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingMultiply(
+    T X,
+    T Y,
+    bool* ResultOverflowed = nullptr) {
+  // Write through a local dummy when the caller does not want the flag.
+  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+  bool Dummy;
+  bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
+
+  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
+  // because it fails for uint16_t (where multiplication can have undefined
+  // behavior due to promotion to int), and requires a division in addition
+  // to the multiplication.
+
+  Overflowed = false;
+
+  // Log2(Z) would be either Log2Z or Log2Z + 1.
+  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
+  // will necessarily be less than Log2Max as desired.
+  int Log2Z = Log2_64(X) + Log2_64(Y);
+  const T Max = std::numeric_limits<T>::max();
+  int Log2Max = Log2_64(Max);
+  // Clearly fits: even Log2Z + 1 does not exceed the top bit of T.
+  if (Log2Z < Log2Max) {
+    return X * Y;
+  }
+  // Clearly overflows: the product needs more bits than T has.
+  if (Log2Z > Log2Max) {
+    Overflowed = true;
+    return Max;
+  }
+
+  // We're going to use the top bit, and maybe overflow one
+  // bit past it. Multiply all but the bottom bit then add
+  // that on at the end.
+  T Z = (X >> 1) * Y;
+  // If the half-product already uses the top bit, doubling it overflows.
+  if (Z & ~(Max >> 1)) {
+    Overflowed = true;
+    return Max;
+  }
+  Z <<= 1;
+  // Add back the contribution of X's lowest bit; that addition may itself
+  // saturate and reports its overflow through the caller's pointer.
+  if (X & 1)
+    return SaturatingAdd(Z, Y, ResultOverflowed);
+
+  return Z;
+}
+
+/// Computes X * Y + A for unsigned integers of type T, clamping the result
+/// to the maximum representable value of T on overflow.
+///
+/// \param ResultOverflowed optional; when non-null it is set to whether the
+///   true result is larger than the maximum representable value of T.
+template <typename T>
+std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingMultiplyAdd(
+    T X,
+    T Y,
+    T A,
+    bool* ResultOverflowed = nullptr) {
+  bool ProductOverflowed = false;
+  const T Product = SaturatingMultiply(X, Y, &ProductOverflowed);
+  if (ProductOverflowed) {
+    // Already saturated: report it and skip the addition.
+    if (ResultOverflowed)
+      *ResultOverflowed = true;
+    return Product;
+  }
+
+  // The addition sets the caller's flag (if any) itself.
+  return SaturatingAdd(A, Product, ResultOverflowed);
+}
+
+/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
+extern const float huge_valf;
+} // namespace c10::llvm
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ae58609b5d56135e59075d5428e03a6c99ff230
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/numa.h
@@ -0,0 +1,46 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/util/Flags.h>
+#include <cstddef>
+
+C10_DECLARE_bool(caffe2_cpu_numa_enabled);
+
+namespace c10 {
+
+/**
+ * Check whether NUMA is enabled
+ */
+C10_API bool IsNUMAEnabled();
+
+/**
+ * Bind to a given NUMA node
+ */
+C10_API void NUMABind(int numa_node_id);
+
+/**
+ * Get the NUMA id for a given pointer `ptr`
+ */
+C10_API int GetNUMANode(const void* ptr);
+
+/**
+ * Get number of NUMA nodes
+ */
+C10_API int GetNumNUMANodes();
+
+/**
+ * Move the memory pointed to by `ptr` of a given size to another NUMA node
+ */
+C10_API void NUMAMove(void* ptr, size_t size, int numa_node_id);
+
+/**
+ * Get the current NUMA node id
+ */
+C10_API int GetCurrentNUMANode();
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h
new file mode 100644
index 0000000000000000000000000000000000000000..e991a567ec5eac9c967f4743255de1eb51c9338a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/order_preserving_flat_hash_map.h
@@ -0,0 +1,2222 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Taken from
+// https://github.com/skarupke/flat_hash_map/blob/2c4687431f978f02a3780e24b8b701d22aa32d9c/flat_hash_map.hpp
+// with fixes applied:
+// - https://github.com/skarupke/flat_hash_map/pull/25
+// - https://github.com/skarupke/flat_hash_map/pull/26
+// - replace size_t with uint64_t to fix it for 32bit
+// - add "GCC diagnostic" pragma to ignore -Wshadow
+// - make sherwood_v3_table::convertible_to_iterator public because GCC5 seems
+// to have issues with it otherwise
+// - fix compiler warnings in operator templated_iterator<const value_type>
+// - make use of 'if constexpr' and eliminate AssignIfTrue template
+
+//          Copyright Malte Skarupke 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See http://www.boost.org/LICENSE_1_0.txt)
+
+// Modified to maintain insertion and deletion order through a doubly-linked
+// list
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <initializer_list>
+#include <iterator>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+#ifdef _MSC_VER
+#define SKA_NOINLINE(...) __declspec(noinline) __VA_ARGS__
+#else
+#define SKA_NOINLINE(...) __VA_ARGS__ __attribute__((noinline))
+#endif
+
+namespace ska_ordered {
+
+struct prime_number_hash_policy;
+struct power_of_two_hash_policy;
+struct fibonacci_hash_policy;
+
+namespace detailv3 {
+template <typename Result, typename Functor>
+struct functor_storage : Functor { // keeps the functor as a base class
+  functor_storage() = default;
+  functor_storage(const Functor& functor) : Functor(functor) {} // copies functor
+  template <typename... Args>
+  Result operator()(Args&&... args) { // forwards args to the non-const Functor
+    return static_cast<Functor&>(*this)(std::forward<Args>(args)...);
+  }
+  template <typename... Args>
+  Result operator()(Args&&... args) const { // forwards args to the const Functor
+    return static_cast<const Functor&>(*this)(std::forward<Args>(args)...);
+  }
+};
+template <typename Result, typename... Args>
+struct functor_storage<Result, Result (*)(Args...)> { // plain function pointers
+  typedef Result (*function_ptr)(Args...);
+  function_ptr function; // held as a member: a pointer cannot be a base class
+  functor_storage(function_ptr function) : function(function) {}
+  Result operator()(Args... args) const { // calls through the stored pointer
+    return function(std::forward<Args>(args)...);
+  }
+  operator function_ptr&() { // mutable access to the stored pointer
+    return function;
+  }
+  operator const function_ptr&() const { // const-qualified: works on const objects
+    return function;
+  }
+};
+template <typename key_type, typename value_type, typename hasher>
+struct KeyOrValueHasher : functor_storage<uint64_t, hasher> { // hashes by key
+  typedef functor_storage<uint64_t, hasher> hasher_storage;
+  KeyOrValueHasher() = default;
+  KeyOrValueHasher(const hasher& hash) : hasher_storage(hash) {}
+  uint64_t operator()(const key_type& key) { // hash a bare key
+    return static_cast<hasher_storage&>(*this)(key);
+  }
+  uint64_t operator()(const key_type& key) const {
+    return static_cast<const hasher_storage&>(*this)(key);
+  }
+  uint64_t operator()(const value_type& value) { // hash value.first only
+    return static_cast<hasher_storage&>(*this)(value.first);
+  }
+  uint64_t operator()(const value_type& value) const {
+    return static_cast<const hasher_storage&>(*this)(value.first);
+  }
+  template <typename F, typename S>
+  uint64_t operator()(const std::pair<F, S>& value) { // any pair: hash .first
+    return static_cast<hasher_storage&>(*this)(value.first);
+  }
+  template <typename F, typename S>
+  uint64_t operator()(const std::pair<F, S>& value) const {
+    return static_cast<const hasher_storage&>(*this)(value.first);
+  }
+};
+template <typename key_type, typename value_type, typename key_equal>
+struct KeyOrValueEquality : functor_storage<bool, key_equal> { // compares keys
+  typedef functor_storage<bool, key_equal> equality_storage;
+  KeyOrValueEquality() = default;
+  KeyOrValueEquality(const key_equal& equality) : equality_storage(equality) {}
+  // NOTE(review): unlike KeyOrValueHasher, no const overloads are provided.
+  bool operator()(const key_type& lhs, const key_type& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs, rhs);
+  }
+  bool operator()(const key_type& lhs, const value_type& rhs) { // key vs .first
+    return static_cast<equality_storage&>(*this)(lhs, rhs.first);
+  }
+  bool operator()(const value_type& lhs, const key_type& rhs) { // .first vs key
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs);
+  }
+  bool operator()(const value_type& lhs, const value_type& rhs) { // both .first
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs.first);
+  }
+  template <typename F, typename S>
+  bool operator()(const key_type& lhs, const std::pair<F, S>& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs, rhs.first);
+  }
+  template <typename F, typename S>
+  bool operator()(const std::pair<F, S>& lhs, const key_type& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs);
+  }
+  template <typename F, typename S>
+  bool operator()(const value_type& lhs, const std::pair<F, S>& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs.first);
+  }
+  template <typename F, typename S>
+  bool operator()(const std::pair<F, S>& lhs, const value_type& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs.first);
+  }
+  template <typename FL, typename SL, typename FR, typename SR>
+  bool operator()(const std::pair<FL, SL>& lhs, const std::pair<FR, SR>& rhs) {
+    return static_cast<equality_storage&>(*this)(lhs.first, rhs.first);
+  }
+};
+static constexpr int8_t min_lookups = 4;
+template <typename T>
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+struct sherwood_v3_entry {
+  // NOLINTNEXTLINE(modernize-use-equals-default)
+  sherwood_v3_entry() {}
+  sherwood_v3_entry(int8_t distance_from_desired)
+      : distance_from_desired(distance_from_desired) {}
+  // NOLINTNEXTLINE(modernize-use-equals-default)
+  ~sherwood_v3_entry() {}
+  // distance_from_desired < 0 marks an empty slot; >= 0 marks an occupied one.
+  bool has_value() const {
+    return distance_from_desired >= 0;
+  }
+  bool is_empty() const {
+    return distance_from_desired < 0;
+  }
+  bool is_at_desired_position() const { // also true for empty (negative) slots
+    return distance_from_desired <= 0;
+  }
+  template <typename... Args>
+  void emplace(int8_t distance, Args&&... args) {
+    new (std::addressof(value)) T(std::forward<Args>(args)...); // placement new
+    distance_from_desired = distance;
+  }
+
+  void destroy_value() { // runs ~T() and marks the slot empty; links untouched
+    value.~T();
+    distance_from_desired = -1;
+  }
+  // Links for the doubly-linked list that records element order.
+  sherwood_v3_entry<T>* prev = nullptr;
+  sherwood_v3_entry<T>* next = nullptr;
+  int8_t distance_from_desired = -1;
+  static constexpr int8_t special_end_value = 0; // distance of the final slot
+  union { // T is constructed/destroyed only via emplace()/destroy_value()
+    T value;
+  };
+};
+
+inline int8_t log2(uint64_t value) { // integer log2, e.g. 1 -> 0 and 2 -> 1
+  static constexpr std::array<int8_t, 64> table = { // indexed by the 6-bit hash
+      63, 0,  58, 1,  59, 47, 53, 2,  60, 39, 48, 27, 54, 33, 42, 3,
+      61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4,
+      62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21,
+      56, 45, 25, 31, 35, 16, 9,  12, 44, 24, 15, 8,  23, 7,  6,  5};
+  value |= value >> 1; // smear the highest set bit into every lower bit
+  value |= value >> 2;
+  value |= value >> 4;
+  value |= value >> 8;
+  value |= value >> 16;
+  value |= value >> 32;
+  // isolate the top bit, multiply, keep the top 6 bits; 0 maps to table[0]
+  return table[((value - (value >> 1)) * 0x07EDD5E59A4E28C2) >> 58];
+}
+
+inline uint64_t next_power_of_two(uint64_t i) { // powers of two map to themselves
+  --i; // so an exact power of two is not rounded up; 0 wraps to all ones
+  i |= i >> 1; // smear the highest set bit into every lower bit
+  i |= i >> 2;
+  i |= i >> 4;
+  i |= i >> 8;
+  i |= i >> 16;
+  i |= i >> 32;
+  ++i; // all-ones below the top bit + 1 == next power of two (0 stays 0)
+  return i;
+}
+
+// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
+// (it takes CWG1558 into account and also works for older compilers)
+template <typename... Ts>
+struct make_void {
+  typedef void type;
+};
+template <typename... Ts>
+using void_t = typename make_void<Ts...>::type;
+
+template <typename T, typename = void>
+struct HashPolicySelector {
+  typedef fibonacci_hash_policy type;
+};
+template <typename T>
+struct HashPolicySelector<T, void_t<typename T::hash_policy>> {
+  typedef typename T::hash_policy type;
+};
+
+template <
+    typename T,
+    typename FindKey,
+    typename ArgumentHash,
+    typename Hasher,
+    typename ArgumentEqual,
+    typename Equal,
+    typename ArgumentAlloc,
+    typename EntryAlloc>
+class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
+  using Entry = detailv3::sherwood_v3_entry<T>;
+  using AllocatorTraits = std::allocator_traits<EntryAlloc>;
+  using EntryPointer = typename AllocatorTraits::pointer;
+
+ public:
+  struct convertible_to_iterator;
+
+  using value_type = T;
+  using size_type = uint64_t;
+  using difference_type = std::ptrdiff_t;
+  using hasher = ArgumentHash;
+  using key_equal = ArgumentEqual;
+  using allocator_type = EntryAlloc;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+
+  sherwood_v3_table() = default;
+  explicit sherwood_v3_table(
+      size_type bucket_count,
+      const ArgumentHash& hash = ArgumentHash(),
+      const ArgumentEqual& equal = ArgumentEqual(),
+      const ArgumentAlloc& alloc = ArgumentAlloc())
+      : EntryAlloc(alloc), Hasher(hash), Equal(equal) {
+    rehash(bucket_count);
+  }
+  sherwood_v3_table(size_type bucket_count, const ArgumentAlloc& alloc)
+      : sherwood_v3_table(
+            bucket_count,
+            ArgumentHash(),
+            ArgumentEqual(),
+            alloc) {}
+  sherwood_v3_table(
+      size_type bucket_count,
+      const ArgumentHash& hash,
+      const ArgumentAlloc& alloc)
+      : sherwood_v3_table(bucket_count, hash, ArgumentEqual(), alloc) {}
+  explicit sherwood_v3_table(const ArgumentAlloc& alloc) : EntryAlloc(alloc) {}
+  template <typename It>
+  sherwood_v3_table(
+      It first,
+      It last,
+      size_type bucket_count = 0,
+      const ArgumentHash& hash = ArgumentHash(),
+      const ArgumentEqual& equal = ArgumentEqual(),
+      const ArgumentAlloc& alloc = ArgumentAlloc())
+      : sherwood_v3_table(bucket_count, hash, equal, alloc) {
+    insert(first, last);
+  }
+  template <typename It>
+  sherwood_v3_table(
+      It first,
+      It last,
+      size_type bucket_count,
+      const ArgumentAlloc& alloc)
+      : sherwood_v3_table(
+            first,
+            last,
+            bucket_count,
+            ArgumentHash(),
+            ArgumentEqual(),
+            alloc) {}
+  template <typename It>
+  sherwood_v3_table(
+      It first,
+      It last,
+      size_type bucket_count,
+      const ArgumentHash& hash,
+      const ArgumentAlloc& alloc)
+      : sherwood_v3_table(
+            first,
+            last,
+            bucket_count,
+            hash,
+            ArgumentEqual(),
+            alloc) {}
+  sherwood_v3_table(
+      std::initializer_list<T> il,
+      size_type bucket_count = 0,
+      const ArgumentHash& hash = ArgumentHash(),
+      const ArgumentEqual& equal = ArgumentEqual(),
+      const ArgumentAlloc& alloc = ArgumentAlloc())
+      : sherwood_v3_table(bucket_count, hash, equal, alloc) {
+    if (bucket_count == 0)
+      rehash(il.size());
+    insert(il.begin(), il.end());
+  }
+  sherwood_v3_table(
+      std::initializer_list<T> il,
+      size_type bucket_count,
+      const ArgumentAlloc& alloc)
+      : sherwood_v3_table(
+            il,
+            bucket_count,
+            ArgumentHash(),
+            ArgumentEqual(),
+            alloc) {}
+  sherwood_v3_table(
+      std::initializer_list<T> il,
+      size_type bucket_count,
+      const ArgumentHash& hash,
+      const ArgumentAlloc& alloc)
+      : sherwood_v3_table(il, bucket_count, hash, ArgumentEqual(), alloc) {}
+  sherwood_v3_table(const sherwood_v3_table& other)
+      : sherwood_v3_table(
+            other,
+            AllocatorTraits::select_on_container_copy_construction(
+                other.get_allocator())) {}
+  sherwood_v3_table(const sherwood_v3_table& other, const ArgumentAlloc& alloc)
+      : EntryAlloc(alloc), // allocator comes from `alloc`, not from `other`
+        Hasher(other), // hasher and equality functors are copied from `other`
+        Equal(other),
+        _max_load_factor(other._max_load_factor) {
+    rehash_for_other_container(other); // size the table before copying
+    try {
+      insert(other.begin(), other.end()); // copies in other's iteration order
+    } catch (...) {
+      clear(); // the destructor will not run, so release everything here
+      deallocate_data(entries, num_slots_minus_one, max_lookups);
+      throw;
+    }
+  }
+  sherwood_v3_table(sherwood_v3_table&& other) noexcept
+      : EntryAlloc(std::move(other)),
+        Hasher(std::move(other)),
+        Equal(std::move(other)) {
+    swap_pointers(other);
+  }
+  sherwood_v3_table(
+      sherwood_v3_table&& other,
+      const ArgumentAlloc& alloc) noexcept
+      : EntryAlloc(alloc), Hasher(std::move(other)), Equal(std::move(other)) {
+    swap_pointers(other);
+  }
+  sherwood_v3_table& operator=(const sherwood_v3_table& other) {
+    if (this == std::addressof(other)) // self-assignment is a no-op
+      return *this;
+    // Destroy current elements first; storage is reused unless reset below.
+    clear();
+    if constexpr (AllocatorTraits::propagate_on_container_copy_assignment::
+                      value) {
+      if (static_cast<EntryAlloc&>(*this) !=
+          static_cast<const EntryAlloc&>(other)) {
+        reset_to_empty_state(); // free storage before the allocator changes
+      }
+      static_cast<EntryAlloc&>(*this) = other; // take over other's allocator
+    }
+    _max_load_factor = other._max_load_factor;
+    static_cast<Hasher&>(*this) = other; // copy hasher functor
+    static_cast<Equal&>(*this) = other; // copy equality functor
+    rehash_for_other_container(other);
+    insert(other.begin(), other.end()); // copies in other's iteration order
+    return *this;
+  }
+  sherwood_v3_table& operator=(sherwood_v3_table&& other) noexcept {
+    if (this == std::addressof(other)) // self-move is a no-op
+      return *this;
+    else if constexpr (AllocatorTraits::propagate_on_container_move_assignment::
+                           value) {
+      clear(); // allocator propagates: drop our contents, adopt other's
+      reset_to_empty_state();
+      static_cast<EntryAlloc&>(*this) = std::move(other);
+      swap_pointers(other);
+    } else if (
+        static_cast<EntryAlloc&>(*this) == static_cast<EntryAlloc&>(other)) {
+      swap_pointers(other); // equal allocators: exchanging storage is enough
+    } else {
+      clear(); // unequal allocators: move the elements one by one
+      _max_load_factor = other._max_load_factor;
+      rehash_for_other_container(other); // NOTE(review): may allocate; noexcept
+      for (T& elem : other)
+        emplace(std::move(elem));
+      other.clear();
+    }
+    static_cast<Hasher&>(*this) = std::move(other); // move hasher functor
+    static_cast<Equal&>(*this) = std::move(other); // move equality functor
+    return *this;
+  }
+  ~sherwood_v3_table() {
+    clear();
+    deallocate_data(entries, num_slots_minus_one, max_lookups);
+  }
+
+  const allocator_type& get_allocator() const {
+    return static_cast<const allocator_type&>(*this);
+  }
+  const ArgumentEqual& key_eq() const {
+    return static_cast<const ArgumentEqual&>(*this);
+  }
+  const ArgumentHash& hash_function() const {
+    return static_cast<const ArgumentHash&>(*this);
+  }
+
+  template <typename ValueType>
+  struct templated_iterator { // forward iterator that follows the `next` links
+    templated_iterator() = default;
+    templated_iterator(EntryPointer current) : current(current) {}
+    EntryPointer current = EntryPointer(); // entry this iterator refers to
+
+    using iterator_category = std::forward_iterator_tag;
+    using value_type = ValueType;
+    using difference_type = ptrdiff_t;
+    using pointer = ValueType*;
+    using reference = ValueType&;
+
+    friend bool operator==( // iterators are equal when they share an entry
+        const templated_iterator& lhs,
+        const templated_iterator& rhs) {
+      return lhs.current == rhs.current;
+    }
+    friend bool operator!=(
+        const templated_iterator& lhs,
+        const templated_iterator& rhs) {
+      return !(lhs == rhs);
+    }
+
+    templated_iterator& operator++() { // advance along the linked list
+      current = current->next;
+      return *this;
+    }
+    templated_iterator operator++(int) { // post-increment: returns the old value
+      templated_iterator copy(*this);
+      ++*this;
+      return copy;
+    }
+
+    ValueType& operator*() const {
+      return current->value;
+    }
+    ValueType* operator->() const {
+      return std::addressof(current->value);
+    }
+
+    // Conversion from iterator to const_iterator. The enable_if disables the
+    // operator when value_type is already const, because that would cause a
+    // lot of compiler warnings otherwise.
+    template <
+        class target_type = const value_type,
+        class = std::enable_if_t<
+            std::is_same_v<target_type, const value_type> &&
+            !std::is_same_v<target_type, value_type>>>
+    operator templated_iterator<target_type>() const {
+      return {current};
+    }
+  };
+  using iterator = templated_iterator<value_type>;
+  using const_iterator = templated_iterator<const value_type>;
+
+  iterator begin() {
+    return sentinel->next;
+  }
+  const_iterator begin() const {
+    return sentinel->next;
+  }
+  const_iterator cbegin() const {
+    return begin();
+  }
+  iterator end() {
+    return sentinel;
+  }
+  const_iterator end() const {
+    return sentinel;
+  }
+  const_iterator cend() const {
+    return end();
+  }
+
+  iterator find(const FindKey& key) { // returns end() when the key is absent
+    uint64_t index =
+        hash_policy.index_for_hash(hash_object(key), num_slots_minus_one);
+    EntryPointer it = entries + ptrdiff_t(index); // the key's desired slot
+    for (int8_t distance = 0; it->distance_from_desired >= distance;
+         ++distance, ++it) { // stop at an empty slot or one closer to its home
+      if (compares_equal(key, it->value))
+        return {it};
+    }
+    return end();
+  }
+  const_iterator find(const FindKey& key) const {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    return const_cast<sherwood_v3_table*>(this)->find(key);
+  }
+  uint64_t count(const FindKey& key) const {
+    return find(key) == end() ? 0 : 1;
+  }
+  std::pair<iterator, iterator> equal_range(const FindKey& key) {
+    iterator found = find(key);
+    if (found == end())
+      return {found, found};
+    else
+      return {found, std::next(found)};
+  }
+  std::pair<const_iterator, const_iterator> equal_range(
+      const FindKey& key) const {
+    const_iterator found = find(key);
+    if (found == end())
+      return {found, found};
+    else
+      return {found, std::next(found)};
+  }
+
+  template <typename Key, typename... Args>
+  std::pair<iterator, bool> emplace(Key&& key, Args&&... args) {
+    uint64_t index =
+        hash_policy.index_for_hash(hash_object(key), num_slots_minus_one);
+    EntryPointer current_entry = entries + ptrdiff_t(index); // desired slot
+    int8_t distance_from_desired = 0;
+    for (; current_entry->distance_from_desired >= distance_from_desired;
+         ++current_entry, ++distance_from_desired) {
+      // an existing key is returned as-is ({it, false}); its position in the
+      // ordering is unchanged and nothing is constructed
+      if (compares_equal(key, current_entry->value))
+        return {{current_entry}, false};
+    }
+    return emplace_new_key( // key absent: insert at the slot where probing ended
+        distance_from_desired,
+        current_entry,
+        std::forward<Key>(key),
+        std::forward<Args>(args)...);
+  }
+
+  std::pair<iterator, bool> insert(const value_type& value) {
+    return emplace(value);
+  }
+  std::pair<iterator, bool> insert(value_type&& value) {
+    return emplace(std::move(value));
+  }
+  template <typename... Args>
+  iterator emplace_hint(const_iterator /*unused*/, Args&&... args) {
+    return emplace(std::forward<Args>(args)...).first;
+  }
+  iterator insert(const_iterator /*unused*/, const value_type& value) {
+    return emplace(value).first;
+  }
+  iterator insert(const_iterator /*unused*/, value_type&& value) {
+    return emplace(std::move(value)).first;
+  }
+
+  template <typename It>
+  void insert(It begin, It end) {
+    for (; begin != end; ++begin) {
+      emplace(*begin);
+    }
+  }
+  void insert(std::initializer_list<value_type> il) {
+    insert(il.begin(), il.end());
+  }
+
+  void rehash(uint64_t num_buckets) {
+    num_buckets = std::max( // never go below what max_load_factor requires
+        num_buckets,
+        static_cast<uint64_t>(std::ceil(
+            static_cast<double>(num_elements) /
+            static_cast<double>(_max_load_factor))));
+    if (num_buckets == 0) { // only reachable when the table holds no elements
+      reset_to_empty_state();
+      return;
+    }
+    // presumably next_size_over adjusts num_buckets in place; TODO confirm
+    auto new_prime_index = hash_policy.next_size_over(num_buckets);
+    if (num_buckets == bucket_count()) // already at the requested size
+      return;
+    int8_t new_max_lookups = compute_max_lookups(num_buckets);
+    EntryPointer new_buckets( // buckets plus overflow slots past the last one
+        AllocatorTraits::allocate(*this, num_buckets + new_max_lookups));
+    EntryPointer special_end_item =
+        new_buckets + static_cast<ptrdiff_t>(num_buckets + new_max_lookups - 1);
+    for (EntryPointer it = new_buckets; it != special_end_item; ++it)
+      it->distance_from_desired = -1; // mark every regular slot empty
+    special_end_item->distance_from_desired = Entry::special_end_value;
+    std::swap(entries, new_buckets); // new_buckets now holds the OLD storage
+    std::swap(num_slots_minus_one, num_buckets); // num_buckets: OLD slots - 1
+    --num_slots_minus_one;
+    hash_policy.commit(new_prime_index);
+    int8_t old_max_lookups = max_lookups;
+    max_lookups = new_max_lookups;
+    num_elements = 0; // emplace() below counts the elements back up
+    // Remember the first element of the old list before the list is reset.
+    auto start = sentinel->next;
+    // point sentinel to itself so the list is empty while elements are re-added
+    reset_list();
+    // reinsert the old elements in their previous list order
+    for (EntryPointer it = start; it != sentinel;) {
+      auto next = it->next; // read before the old entry's value is destroyed
+      emplace(std::move(it->value));
+      it->destroy_value();
+      it = next;
+    }
+    // Release the old storage using the old slot count and lookup limit.
+    deallocate_data(new_buckets, num_buckets, old_max_lookups);
+  }
+
+  void reserve(uint64_t num_elements_) {
+    uint64_t required_buckets = num_buckets_for_reserve(num_elements_);
+    if (required_buckets > bucket_count())
+      rehash(required_buckets);
+  }
+
+  void replace_linked_list_position( // new_node takes to_be_replaced's list spot
+      EntryPointer to_be_replaced,
+      EntryPointer new_node) {
+    remove_from_list(new_node); // unlink new_node via its current prev/next
+    insert_after(new_node, to_be_replaced->prev); // link just before the old node
+    remove_from_list(to_be_replaced); // then unlink the old node
+  }
+
+  // The return value is a type that can be converted to an iterator.
+  // The reason for doing this is that it's not free to find the
+  // iterator pointing at the next element. If you care about the
+  // next iterator, turn the return value into an iterator.
+  convertible_to_iterator erase(const_iterator to_erase) {
+    EntryPointer current = to_erase.current;
+    remove_from_list(current); // unlink from the ordering list first
+    current->destroy_value();
+    --num_elements;
+    // Shift following displaced entries back by one slot to close the gap.
+    for (EntryPointer next = current + ptrdiff_t(1);
+         !next->is_at_desired_position();
+         ++current, ++next) {
+      // if an entry is being removed, and there are other entries with the
+      // same hash, the other entries get moved to their desired position by
+      // reinserting.
+      current->emplace(next->distance_from_desired - 1, std::move(next->value));
+      replace_linked_list_position(next, current); // keep the list order intact
+      next->destroy_value();
+    }
+    return {to_erase.current}; // the slot that held the erased element
+  }
+
+  iterator erase(const_iterator begin_it, const_iterator end_it) {
+    // whenever an entry is removed and there are other entries with the same
+    // hash, the other entries must get moved to their desired position.
+    // any reference to a moved entry is invalidated.
+    // here, we iterate through the range, and make sure that we update
+    // the pointer to our next entry in the list or the end of the iterator
+    // when it is invalidated.
+
+    auto curr_iter = begin_it.current;
+    auto next_iter = curr_iter->next; // list successor, read before unlinking
+    auto end_iter = end_it.current;
+
+    while (curr_iter != end_iter) {
+      remove_from_list(curr_iter);
+      curr_iter->destroy_value();
+      --num_elements;
+      // Same backward shift as in single-element erase.
+      for (EntryPointer next_hash_slot = curr_iter + ptrdiff_t(1);
+           !next_hash_slot->is_at_desired_position();
+           ++curr_iter, ++next_hash_slot) {
+        curr_iter->emplace(
+            next_hash_slot->distance_from_desired - 1,
+            std::move(next_hash_slot->value));
+        replace_linked_list_position(next_hash_slot, curr_iter);
+        next_hash_slot->destroy_value();
+
+        // we are invalidating next_iter or end_iter
+        if (next_hash_slot == end_iter) {
+          end_iter = curr_iter; // the end element now lives one slot earlier
+        } else if (next_hash_slot == next_iter) {
+          next_iter = curr_iter; // the next element now lives one slot earlier
+        }
+      }
+      curr_iter = next_iter;
+      next_iter = curr_iter->next;
+    }
+
+    return {end_iter}; // iterator to the (possibly relocated) end element
+  }
+
+  uint64_t erase(const FindKey& key) {
+    // Returns the number of entries removed: 0 if the key is absent, else 1.
+    const iterator match = find(key);
+    if (match == end()) {
+      return 0;
+    }
+    erase(match);
+    return 1;
+  }
+
+  void clear() { // destroys all elements; the allocated storage is kept
+    for (EntryPointer it = entries,
+                      end = it + // stops before the final special end slot
+             static_cast<ptrdiff_t>(num_slots_minus_one + max_lookups);
+         it != end;
+         ++it) {
+      if (it->has_value())
+        it->destroy_value();
+    }
+    reset_list(); // sentinel points to itself again: the ordering list is empty
+    num_elements = 0;
+  }
+
+  void shrink_to_fit() {
+    rehash_for_other_container(*this);
+  }
+
+  void swap(sherwood_v3_table& other) noexcept { // exchanges contents with other
+    using std::swap; // unqualified calls below can also find swaps via ADL
+    swap_pointers(other); // storage, sizes, hash policy and list sentinel
+    swap(static_cast<ArgumentHash&>(*this), static_cast<ArgumentHash&>(other));
+    swap(
+        static_cast<ArgumentEqual&>(*this), static_cast<ArgumentEqual&>(other));
+    if constexpr (AllocatorTraits::propagate_on_container_swap::value) // else discarded
+      swap(static_cast<EntryAlloc&>(*this), static_cast<EntryAlloc&>(other));
+  }
+
+  uint64_t size() const {
+    return num_elements;
+  }
+  uint64_t max_size() const {
+    return (AllocatorTraits::max_size(*this)) / sizeof(Entry);
+  }
+  uint64_t bucket_count() const {
+    return num_slots_minus_one ? num_slots_minus_one + 1 : 0;
+  }
+  size_type max_bucket_count() const {
+    return (AllocatorTraits::max_size(*this) - min_lookups) / sizeof(Entry);
+  }
+  uint64_t bucket(const FindKey& key) const {
+    return hash_policy.index_for_hash(hash_object(key), num_slots_minus_one);
+  }
+  float load_factor() const {
+    // Ratio of stored elements to buckets; bucket_count() can be 0, so guard.
+    const uint64_t buckets = bucket_count();
+    if (buckets == 0)
+      return 0;
+    return static_cast<float>(num_elements) / buckets;
+  }
+  void max_load_factor(float value) {
+    _max_load_factor = value;
+  }
+  float max_load_factor() const {
+    return _max_load_factor;
+  }
+
+  bool empty() const {
+    return num_elements == 0;
+  }
+
+ private:
+  EntryPointer entries = empty_default_table();
+  uint64_t num_slots_minus_one = 0;
+  typename HashPolicySelector<ArgumentHash>::type hash_policy;
+  int8_t max_lookups = detailv3::min_lookups - 1;
+  float _max_load_factor = 0.5f;
+  uint64_t num_elements = 0;
+  std::unique_ptr<sherwood_v3_entry<T>> sentinel_val;
+
+  // head of doubly linked list
+  EntryPointer sentinel = initSentinel();
+
+  EntryPointer initSentinel() {
+    // the sentinel is heap-allocated (owned by sentinel_val) instead of stored
+    // by value so that the hash map can be used with forward declared types
+    sentinel_val = std::make_unique<sherwood_v3_entry<T>>();
+    sentinel = sentinel_val.get(); // set before reset_list(), which reads it
+    reset_list(); // empty list: the sentinel links to itself
+    return sentinel;
+  }
+
+  EntryPointer empty_default_table() { // minimal storage for a table of 0 buckets
+    EntryPointer result =
+        AllocatorTraits::allocate(*this, detailv3::min_lookups);
+    EntryPointer special_end_item =
+        result + static_cast<ptrdiff_t>(detailv3::min_lookups - 1);
+    for (EntryPointer it = result; it != special_end_item; ++it)
+      it->distance_from_desired = -1; // all slots but the last start empty
+    special_end_item->distance_from_desired = Entry::special_end_value;
+    return result;
+  }
+
+  static int8_t compute_max_lookups(uint64_t num_buckets) {
+    int8_t desired = detailv3::log2(num_buckets);
+    return std::max(detailv3::min_lookups, desired);
+  }
+
+  uint64_t num_buckets_for_reserve(uint64_t num_elements_) const {
+    return static_cast<uint64_t>(std::ceil(
+        static_cast<double>(num_elements_) /
+        std::min(0.5, static_cast<double>(_max_load_factor))));
+  }
+  void rehash_for_other_container(const sherwood_v3_table& other) {
+    rehash(
+        std::min(num_buckets_for_reserve(other.size()), other.bucket_count()));
+  }
+
+  void swap_pointers(sherwood_v3_table& other) {
+    using std::swap;
+    swap(hash_policy, other.hash_policy);
+    swap(entries, other.entries);
+    swap(num_slots_minus_one, other.num_slots_minus_one);
+    swap(num_elements, other.num_elements);
+    swap(max_lookups, other.max_lookups);
+    swap(_max_load_factor, other._max_load_factor);
+    swap(sentinel, other.sentinel);
+    swap(sentinel_val, other.sentinel_val);
+  }
+
+  void reset_list() {
+    sentinel->next = sentinel;
+    sentinel->prev = sentinel;
+  }
+
+  void remove_from_list(EntryPointer elem) {
+    elem->prev->next = elem->next;
+    elem->next->prev = elem->prev;
+  }
+
+  void insert_after(EntryPointer new_elem, EntryPointer prev) {
+    auto next = prev->next;
+
+    prev->next = new_elem;
+    new_elem->prev = prev;
+
+    new_elem->next = next;
+    next->prev = new_elem;
+  }
+
+  void swap_adjacent_nodes(EntryPointer before, EntryPointer after) {
+    // sentinel stays constant, so before->prev cannot equal after
+    auto before_prev = before->prev;
+    auto after_next = after->next;
+
+    before_prev->next = after;
+    after->prev = before_prev;
+
+    after_next->prev = before;
+    before->next = after_next;
+
+    before->prev = after;
+    after->next = before;
+  }
+
+  void swap_positions(EntryPointer p1, EntryPointer p2) { // swap list positions
+    if (p1 == p2) {
+      return;
+    }
+    if (p1->next == p2) { // adjacent nodes need the dedicated relinking helper
+      return swap_adjacent_nodes(p1, p2);
+    } else if (p2->next == p1) {
+      return swap_adjacent_nodes(p2, p1);
+    }
+    // Non-adjacent: capture all four neighbours before rewriting any link.
+    auto p1_prev = p1->prev;
+    auto p1_next = p1->next;
+
+    auto p2_prev = p2->prev;
+    auto p2_next = p2->next;
+    // p2 takes p1's place between p1_prev and p1_next.
+    p1_prev->next = p2;
+    p2->prev = p1_prev;
+
+    p1_next->prev = p2;
+    p2->next = p1_next;
+    // p1 takes p2's place between p2_prev and p2_next.
+    p2_prev->next = p1;
+    p1->prev = p2_prev;
+
+    p2_next->prev = p1;
+    p1->next = p2_next;
+  }
+
+  void append_to_list(EntryPointer new_tail) {
+    insert_after(new_tail, sentinel->prev);
+  }
+
+  template <typename Key, typename... Args>
+  SKA_NOINLINE(std::pair<iterator, bool>)
+  emplace_new_key( // inserts a key that emplace() did not find
+      int8_t distance_from_desired,
+      EntryPointer current_entry,
+      Key&& key,
+      Args&&... args) {
+    using std::swap;
+    if (num_slots_minus_one == 0 || distance_from_desired == max_lookups ||
+        static_cast<double>(num_elements + 1) >
+            static_cast<double>(num_slots_minus_one + 1) *
+                static_cast<double>(_max_load_factor)) {
+      grow(); // no buckets, probe limit hit, or load factor exceeded: retry
+      return emplace(std::forward<Key>(key), std::forward<Args>(args)...);
+    } else if (current_entry->is_empty()) { // free slot: construct in place
+      current_entry->emplace(
+          distance_from_desired,
+          std::forward<Key>(key),
+          std::forward<Args>(args)...);
+      ++num_elements;
+      append_to_list(current_entry); // new elements go to the tail of the list
+      return {{current_entry}, true};
+    }
+    value_type to_insert(std::forward<Key>(key), std::forward<Args>(args)...);
+    swap(distance_from_desired, current_entry->distance_from_desired);
+    // We maintain the invariant that:
+    // - result.current contains the new value we're inserting
+    //   and is in the LinkedList position of to_insert
+    // - to_insert contains the value that represents the position of
+    //   result.current
+    swap(to_insert, current_entry->value);
+    iterator result = {current_entry};
+    for (++distance_from_desired, ++current_entry;; ++current_entry) {
+      if (current_entry->is_empty()) {
+        current_entry->emplace(distance_from_desired, std::move(to_insert));
+        append_to_list(current_entry);
+        // now we can swap back the displaced value to its correct position,
+        // putting the new value we're inserting at the tail of the list
+        swap_positions(current_entry, result.current);
+        ++num_elements;
+        return {result, true};
+      } else if (current_entry->distance_from_desired < distance_from_desired) {
+        swap(distance_from_desired, current_entry->distance_from_desired);
+        swap(to_insert, current_entry->value); // displace the closer-to-home entry
+        // to maintain our invariants we need to swap positions
+        // of result.current & current_entry:
+        swap_positions(result.current, current_entry);
+        ++distance_from_desired;
+      } else {
+        ++distance_from_desired;
+        if (distance_from_desired == max_lookups) {
+          // the displaced element gets put back into its correct position
+          // we grow the hash table, and then try again to reinsert the new
+          // element
+          swap(to_insert, result.current->value);
+          grow();
+          return emplace(std::move(to_insert));
+        }
+      }
+    }
+  }
+
+  // Asks rehash() for twice the current bucket count, and never for fewer
+  // than four buckets.
+  void grow() {
+    const auto doubled = 2 * bucket_count();
+    rehash(std::max(uint64_t(4), doubled));
+  }
+
+  // Hands the entry array starting at `begin` back to the allocator. The
+  // number of entries released is num_slots_minus_one_ + max_lookups_ + 1.
+  void deallocate_data(
+      EntryPointer begin,
+      uint64_t num_slots_minus_one_,
+      int8_t max_lookups_) {
+    const auto entry_count = num_slots_minus_one_ + max_lookups_ + 1;
+    AllocatorTraits::deallocate(*this, begin, entry_count);
+  }
+
+  // Frees the current entry array and puts the table back into its empty
+  // configuration: entries come from empty_default_table(), the slot count
+  // is zero, the hash policy is reset and max_lookups becomes
+  // detailv3::min_lookups - 1.
+  void reset_to_empty_state() {
+    // Must run first: it reads entries, num_slots_minus_one and max_lookups,
+    // all of which are overwritten by the statements below.
+    deallocate_data(entries, num_slots_minus_one, max_lookups);
+    entries = empty_default_table();
+    num_slots_minus_one = 0;
+    hash_policy.reset();
+    max_lookups = detailv3::min_lookups - 1;
+  }
+
+  // Hashes `key` by viewing this table as its Hasher and invoking it.
+  template <typename U>
+  uint64_t hash_object(const U& key) {
+    Hasher& hasher = static_cast<Hasher&>(*this);
+    return hasher(key);
+  }
+  // Const overload: hashes `key` through the const Hasher view of this table.
+  template <typename U>
+  uint64_t hash_object(const U& key) const {
+    const Hasher& hasher = static_cast<const Hasher&>(*this);
+    return hasher(key);
+  }
+  // Compares lhs and rhs by viewing this table as its Equal functor and
+  // invoking it on the pair.
+  template <typename L, typename R>
+  bool compares_equal(const L& lhs, const R& rhs) {
+    Equal& equal = static_cast<Equal&>(*this);
+    return equal(lhs, rhs);
+  }
+
+ public:
+  // Wraps an entry pointer and converts implicitly to iterator or
+  // const_iterator. When the entry holds a value the iterator refers to it
+  // directly; otherwise the iterator is advanced once with ++ before being
+  // returned.
+  struct convertible_to_iterator {
+    EntryPointer it;
+
+    operator iterator() {
+      if (!it->has_value())
+        return ++iterator{it};
+      return {it};
+    }
+    operator const_iterator() {
+      if (!it->has_value())
+        return ++const_iterator{it};
+      return {it};
+    }
+  };
+};
+} // namespace detailv3
+
+// Hash policy that maps a hash to a slot index with `hash % p`, where p is a
+// prime taken from a fixed table. Every prime has its own static modN
+// function whose divisor is a literal; the function currently in effect is
+// stored in current_mod_function.
+struct prime_number_hash_policy {
+  // One reduction function per supported table size. mod0 always returns 0
+  // and is the function in effect before commit() and after reset().
+  static uint64_t mod0(uint64_t /*unused*/) {
+    return 0llu;
+  }
+  static uint64_t mod2(uint64_t hash) {
+    return hash % 2llu;
+  }
+  static uint64_t mod3(uint64_t hash) {
+    return hash % 3llu;
+  }
+  static uint64_t mod5(uint64_t hash) {
+    return hash % 5llu;
+  }
+  static uint64_t mod7(uint64_t hash) {
+    return hash % 7llu;
+  }
+  static uint64_t mod11(uint64_t hash) {
+    return hash % 11llu;
+  }
+  static uint64_t mod13(uint64_t hash) {
+    return hash % 13llu;
+  }
+  static uint64_t mod17(uint64_t hash) {
+    return hash % 17llu;
+  }
+  static uint64_t mod23(uint64_t hash) {
+    return hash % 23llu;
+  }
+  static uint64_t mod29(uint64_t hash) {
+    return hash % 29llu;
+  }
+  static uint64_t mod37(uint64_t hash) {
+    return hash % 37llu;
+  }
+  static uint64_t mod47(uint64_t hash) {
+    return hash % 47llu;
+  }
+  static uint64_t mod59(uint64_t hash) {
+    return hash % 59llu;
+  }
+  static uint64_t mod73(uint64_t hash) {
+    return hash % 73llu;
+  }
+  static uint64_t mod97(uint64_t hash) {
+    return hash % 97llu;
+  }
+  static uint64_t mod127(uint64_t hash) {
+    return hash % 127llu;
+  }
+  static uint64_t mod151(uint64_t hash) {
+    return hash % 151llu;
+  }
+  static uint64_t mod197(uint64_t hash) {
+    return hash % 197llu;
+  }
+  static uint64_t mod251(uint64_t hash) {
+    return hash % 251llu;
+  }
+  static uint64_t mod313(uint64_t hash) {
+    return hash % 313llu;
+  }
+  static uint64_t mod397(uint64_t hash) {
+    return hash % 397llu;
+  }
+  static uint64_t mod499(uint64_t hash) {
+    return hash % 499llu;
+  }
+  static uint64_t mod631(uint64_t hash) {
+    return hash % 631llu;
+  }
+  static uint64_t mod797(uint64_t hash) {
+    return hash % 797llu;
+  }
+  static uint64_t mod1009(uint64_t hash) {
+    return hash % 1009llu;
+  }
+  static uint64_t mod1259(uint64_t hash) {
+    return hash % 1259llu;
+  }
+  static uint64_t mod1597(uint64_t hash) {
+    return hash % 1597llu;
+  }
+  static uint64_t mod2011(uint64_t hash) {
+    return hash % 2011llu;
+  }
+  static uint64_t mod2539(uint64_t hash) {
+    return hash % 2539llu;
+  }
+  static uint64_t mod3203(uint64_t hash) {
+    return hash % 3203llu;
+  }
+  static uint64_t mod4027(uint64_t hash) {
+    return hash % 4027llu;
+  }
+  static uint64_t mod5087(uint64_t hash) {
+    return hash % 5087llu;
+  }
+  static uint64_t mod6421(uint64_t hash) {
+    return hash % 6421llu;
+  }
+  static uint64_t mod8089(uint64_t hash) {
+    return hash % 8089llu;
+  }
+  static uint64_t mod10193(uint64_t hash) {
+    return hash % 10193llu;
+  }
+  static uint64_t mod12853(uint64_t hash) {
+    return hash % 12853llu;
+  }
+  static uint64_t mod16193(uint64_t hash) {
+    return hash % 16193llu;
+  }
+  static uint64_t mod20399(uint64_t hash) {
+    return hash % 20399llu;
+  }
+  static uint64_t mod25717(uint64_t hash) {
+    return hash % 25717llu;
+  }
+  static uint64_t mod32401(uint64_t hash) {
+    return hash % 32401llu;
+  }
+  static uint64_t mod40823(uint64_t hash) {
+    return hash % 40823llu;
+  }
+  static uint64_t mod51437(uint64_t hash) {
+    return hash % 51437llu;
+  }
+  static uint64_t mod64811(uint64_t hash) {
+    return hash % 64811llu;
+  }
+  static uint64_t mod81649(uint64_t hash) {
+    return hash % 81649llu;
+  }
+  static uint64_t mod102877(uint64_t hash) {
+    return hash % 102877llu;
+  }
+  static uint64_t mod129607(uint64_t hash) {
+    return hash % 129607llu;
+  }
+  static uint64_t mod163307(uint64_t hash) {
+    return hash % 163307llu;
+  }
+  static uint64_t mod205759(uint64_t hash) {
+    return hash % 205759llu;
+  }
+  static uint64_t mod259229(uint64_t hash) {
+    return hash % 259229llu;
+  }
+  static uint64_t mod326617(uint64_t hash) {
+    return hash % 326617llu;
+  }
+  static uint64_t mod411527(uint64_t hash) {
+    return hash % 411527llu;
+  }
+  static uint64_t mod518509(uint64_t hash) {
+    return hash % 518509llu;
+  }
+  static uint64_t mod653267(uint64_t hash) {
+    return hash % 653267llu;
+  }
+  static uint64_t mod823117(uint64_t hash) {
+    return hash % 823117llu;
+  }
+  static uint64_t mod1037059(uint64_t hash) {
+    return hash % 1037059llu;
+  }
+  static uint64_t mod1306601(uint64_t hash) {
+    return hash % 1306601llu;
+  }
+  static uint64_t mod1646237(uint64_t hash) {
+    return hash % 1646237llu;
+  }
+  static uint64_t mod2074129(uint64_t hash) {
+    return hash % 2074129llu;
+  }
+  static uint64_t mod2613229(uint64_t hash) {
+    return hash % 2613229llu;
+  }
+  static uint64_t mod3292489(uint64_t hash) {
+    return hash % 3292489llu;
+  }
+  static uint64_t mod4148279(uint64_t hash) {
+    return hash % 4148279llu;
+  }
+  static uint64_t mod5226491(uint64_t hash) {
+    return hash % 5226491llu;
+  }
+  static uint64_t mod6584983(uint64_t hash) {
+    return hash % 6584983llu;
+  }
+  static uint64_t mod8296553(uint64_t hash) {
+    return hash % 8296553llu;
+  }
+  static uint64_t mod10453007(uint64_t hash) {
+    return hash % 10453007llu;
+  }
+  static uint64_t mod13169977(uint64_t hash) {
+    return hash % 13169977llu;
+  }
+  static uint64_t mod16593127(uint64_t hash) {
+    return hash % 16593127llu;
+  }
+  static uint64_t mod20906033(uint64_t hash) {
+    return hash % 20906033llu;
+  }
+  static uint64_t mod26339969(uint64_t hash) {
+    return hash % 26339969llu;
+  }
+  static uint64_t mod33186281(uint64_t hash) {
+    return hash % 33186281llu;
+  }
+  static uint64_t mod41812097(uint64_t hash) {
+    return hash % 41812097llu;
+  }
+  static uint64_t mod52679969(uint64_t hash) {
+    return hash % 52679969llu;
+  }
+  static uint64_t mod66372617(uint64_t hash) {
+    return hash % 66372617llu;
+  }
+  static uint64_t mod83624237(uint64_t hash) {
+    return hash % 83624237llu;
+  }
+  static uint64_t mod105359939(uint64_t hash) {
+    return hash % 105359939llu;
+  }
+  static uint64_t mod132745199(uint64_t hash) {
+    return hash % 132745199llu;
+  }
+  static uint64_t mod167248483(uint64_t hash) {
+    return hash % 167248483llu;
+  }
+  static uint64_t mod210719881(uint64_t hash) {
+    return hash % 210719881llu;
+  }
+  static uint64_t mod265490441(uint64_t hash) {
+    return hash % 265490441llu;
+  }
+  static uint64_t mod334496971(uint64_t hash) {
+    return hash % 334496971llu;
+  }
+  static uint64_t mod421439783(uint64_t hash) {
+    return hash % 421439783llu;
+  }
+  static uint64_t mod530980861(uint64_t hash) {
+    return hash % 530980861llu;
+  }
+  static uint64_t mod668993977(uint64_t hash) {
+    return hash % 668993977llu;
+  }
+  static uint64_t mod842879579(uint64_t hash) {
+    return hash % 842879579llu;
+  }
+  static uint64_t mod1061961721(uint64_t hash) {
+    return hash % 1061961721llu;
+  }
+  static uint64_t mod1337987929(uint64_t hash) {
+    return hash % 1337987929llu;
+  }
+  static uint64_t mod1685759167(uint64_t hash) {
+    return hash % 1685759167llu;
+  }
+  static uint64_t mod2123923447(uint64_t hash) {
+    return hash % 2123923447llu;
+  }
+  static uint64_t mod2675975881(uint64_t hash) {
+    return hash % 2675975881llu;
+  }
+  static uint64_t mod3371518343(uint64_t hash) {
+    return hash % 3371518343llu;
+  }
+  static uint64_t mod4247846927(uint64_t hash) {
+    return hash % 4247846927llu;
+  }
+  static uint64_t mod5351951779(uint64_t hash) {
+    return hash % 5351951779llu;
+  }
+  static uint64_t mod6743036717(uint64_t hash) {
+    return hash % 6743036717llu;
+  }
+  static uint64_t mod8495693897(uint64_t hash) {
+    return hash % 8495693897llu;
+  }
+  static uint64_t mod10703903591(uint64_t hash) {
+    return hash % 10703903591llu;
+  }
+  static uint64_t mod13486073473(uint64_t hash) {
+    return hash % 13486073473llu;
+  }
+  static uint64_t mod16991387857(uint64_t hash) {
+    return hash % 16991387857llu;
+  }
+  static uint64_t mod21407807219(uint64_t hash) {
+    return hash % 21407807219llu;
+  }
+  static uint64_t mod26972146961(uint64_t hash) {
+    return hash % 26972146961llu;
+  }
+  static uint64_t mod33982775741(uint64_t hash) {
+    return hash % 33982775741llu;
+  }
+  static uint64_t mod42815614441(uint64_t hash) {
+    return hash % 42815614441llu;
+  }
+  static uint64_t mod53944293929(uint64_t hash) {
+    return hash % 53944293929llu;
+  }
+  static uint64_t mod67965551447(uint64_t hash) {
+    return hash % 67965551447llu;
+  }
+  static uint64_t mod85631228929(uint64_t hash) {
+    return hash % 85631228929llu;
+  }
+  static uint64_t mod107888587883(uint64_t hash) {
+    return hash % 107888587883llu;
+  }
+  static uint64_t mod135931102921(uint64_t hash) {
+    return hash % 135931102921llu;
+  }
+  static uint64_t mod171262457903(uint64_t hash) {
+    return hash % 171262457903llu;
+  }
+  static uint64_t mod215777175787(uint64_t hash) {
+    return hash % 215777175787llu;
+  }
+  static uint64_t mod271862205833(uint64_t hash) {
+    return hash % 271862205833llu;
+  }
+  static uint64_t mod342524915839(uint64_t hash) {
+    return hash % 342524915839llu;
+  }
+  static uint64_t mod431554351609(uint64_t hash) {
+    return hash % 431554351609llu;
+  }
+  static uint64_t mod543724411781(uint64_t hash) {
+    return hash % 543724411781llu;
+  }
+  static uint64_t mod685049831731(uint64_t hash) {
+    return hash % 685049831731llu;
+  }
+  static uint64_t mod863108703229(uint64_t hash) {
+    return hash % 863108703229llu;
+  }
+  static uint64_t mod1087448823553(uint64_t hash) {
+    return hash % 1087448823553llu;
+  }
+  static uint64_t mod1370099663459(uint64_t hash) {
+    return hash % 1370099663459llu;
+  }
+  static uint64_t mod1726217406467(uint64_t hash) {
+    return hash % 1726217406467llu;
+  }
+  static uint64_t mod2174897647073(uint64_t hash) {
+    return hash % 2174897647073llu;
+  }
+  static uint64_t mod2740199326961(uint64_t hash) {
+    return hash % 2740199326961llu;
+  }
+  static uint64_t mod3452434812973(uint64_t hash) {
+    return hash % 3452434812973llu;
+  }
+  static uint64_t mod4349795294267(uint64_t hash) {
+    return hash % 4349795294267llu;
+  }
+  static uint64_t mod5480398654009(uint64_t hash) {
+    return hash % 5480398654009llu;
+  }
+  static uint64_t mod6904869625999(uint64_t hash) {
+    return hash % 6904869625999llu;
+  }
+  static uint64_t mod8699590588571(uint64_t hash) {
+    return hash % 8699590588571llu;
+  }
+  static uint64_t mod10960797308051(uint64_t hash) {
+    return hash % 10960797308051llu;
+  }
+  static uint64_t mod13809739252051(uint64_t hash) {
+    return hash % 13809739252051llu;
+  }
+  static uint64_t mod17399181177241(uint64_t hash) {
+    return hash % 17399181177241llu;
+  }
+  static uint64_t mod21921594616111(uint64_t hash) {
+    return hash % 21921594616111llu;
+  }
+  static uint64_t mod27619478504183(uint64_t hash) {
+    return hash % 27619478504183llu;
+  }
+  static uint64_t mod34798362354533(uint64_t hash) {
+    return hash % 34798362354533llu;
+  }
+  static uint64_t mod43843189232363(uint64_t hash) {
+    return hash % 43843189232363llu;
+  }
+  static uint64_t mod55238957008387(uint64_t hash) {
+    return hash % 55238957008387llu;
+  }
+  static uint64_t mod69596724709081(uint64_t hash) {
+    return hash % 69596724709081llu;
+  }
+  static uint64_t mod87686378464759(uint64_t hash) {
+    return hash % 87686378464759llu;
+  }
+  static uint64_t mod110477914016779(uint64_t hash) {
+    return hash % 110477914016779llu;
+  }
+  static uint64_t mod139193449418173(uint64_t hash) {
+    return hash % 139193449418173llu;
+  }
+  static uint64_t mod175372756929481(uint64_t hash) {
+    return hash % 175372756929481llu;
+  }
+  static uint64_t mod220955828033581(uint64_t hash) {
+    return hash % 220955828033581llu;
+  }
+  static uint64_t mod278386898836457(uint64_t hash) {
+    return hash % 278386898836457llu;
+  }
+  static uint64_t mod350745513859007(uint64_t hash) {
+    return hash % 350745513859007llu;
+  }
+  static uint64_t mod441911656067171(uint64_t hash) {
+    return hash % 441911656067171llu;
+  }
+  static uint64_t mod556773797672909(uint64_t hash) {
+    return hash % 556773797672909llu;
+  }
+  static uint64_t mod701491027718027(uint64_t hash) {
+    return hash % 701491027718027llu;
+  }
+  static uint64_t mod883823312134381(uint64_t hash) {
+    return hash % 883823312134381llu;
+  }
+  static uint64_t mod1113547595345903(uint64_t hash) {
+    return hash % 1113547595345903llu;
+  }
+  static uint64_t mod1402982055436147(uint64_t hash) {
+    return hash % 1402982055436147llu;
+  }
+  static uint64_t mod1767646624268779(uint64_t hash) {
+    return hash % 1767646624268779llu;
+  }
+  static uint64_t mod2227095190691797(uint64_t hash) {
+    return hash % 2227095190691797llu;
+  }
+  static uint64_t mod2805964110872297(uint64_t hash) {
+    return hash % 2805964110872297llu;
+  }
+  static uint64_t mod3535293248537579(uint64_t hash) {
+    return hash % 3535293248537579llu;
+  }
+  static uint64_t mod4454190381383713(uint64_t hash) {
+    return hash % 4454190381383713llu;
+  }
+  static uint64_t mod5611928221744609(uint64_t hash) {
+    return hash % 5611928221744609llu;
+  }
+  static uint64_t mod7070586497075177(uint64_t hash) {
+    return hash % 7070586497075177llu;
+  }
+  static uint64_t mod8908380762767489(uint64_t hash) {
+    return hash % 8908380762767489llu;
+  }
+  static uint64_t mod11223856443489329(uint64_t hash) {
+    return hash % 11223856443489329llu;
+  }
+  static uint64_t mod14141172994150357(uint64_t hash) {
+    return hash % 14141172994150357llu;
+  }
+  static uint64_t mod17816761525534927(uint64_t hash) {
+    return hash % 17816761525534927llu;
+  }
+  static uint64_t mod22447712886978529(uint64_t hash) {
+    return hash % 22447712886978529llu;
+  }
+  static uint64_t mod28282345988300791(uint64_t hash) {
+    return hash % 28282345988300791llu;
+  }
+  static uint64_t mod35633523051069991(uint64_t hash) {
+    return hash % 35633523051069991llu;
+  }
+  static uint64_t mod44895425773957261(uint64_t hash) {
+    return hash % 44895425773957261llu;
+  }
+  static uint64_t mod56564691976601587(uint64_t hash) {
+    return hash % 56564691976601587llu;
+  }
+  static uint64_t mod71267046102139967(uint64_t hash) {
+    return hash % 71267046102139967llu;
+  }
+  static uint64_t mod89790851547914507(uint64_t hash) {
+    return hash % 89790851547914507llu;
+  }
+  static uint64_t mod113129383953203213(uint64_t hash) {
+    return hash % 113129383953203213llu;
+  }
+  static uint64_t mod142534092204280003(uint64_t hash) {
+    return hash % 142534092204280003llu;
+  }
+  static uint64_t mod179581703095829107(uint64_t hash) {
+    return hash % 179581703095829107llu;
+  }
+  static uint64_t mod226258767906406483(uint64_t hash) {
+    return hash % 226258767906406483llu;
+  }
+  static uint64_t mod285068184408560057(uint64_t hash) {
+    return hash % 285068184408560057llu;
+  }
+  static uint64_t mod359163406191658253(uint64_t hash) {
+    return hash % 359163406191658253llu;
+  }
+  static uint64_t mod452517535812813007(uint64_t hash) {
+    return hash % 452517535812813007llu;
+  }
+  static uint64_t mod570136368817120201(uint64_t hash) {
+    return hash % 570136368817120201llu;
+  }
+  static uint64_t mod718326812383316683(uint64_t hash) {
+    return hash % 718326812383316683llu;
+  }
+  static uint64_t mod905035071625626043(uint64_t hash) {
+    return hash % 905035071625626043llu;
+  }
+  static uint64_t mod1140272737634240411(uint64_t hash) {
+    return hash % 1140272737634240411llu;
+  }
+  static uint64_t mod1436653624766633509(uint64_t hash) {
+    return hash % 1436653624766633509llu;
+  }
+  static uint64_t mod1810070143251252131(uint64_t hash) {
+    return hash % 1810070143251252131llu;
+  }
+  static uint64_t mod2280545475268481167(uint64_t hash) {
+    return hash % 2280545475268481167llu;
+  }
+  static uint64_t mod2873307249533267101(uint64_t hash) {
+    return hash % 2873307249533267101llu;
+  }
+  static uint64_t mod3620140286502504283(uint64_t hash) {
+    return hash % 3620140286502504283llu;
+  }
+  static uint64_t mod4561090950536962147(uint64_t hash) {
+    return hash % 4561090950536962147llu;
+  }
+  static uint64_t mod5746614499066534157(uint64_t hash) {
+    return hash % 5746614499066534157llu;
+  }
+  static uint64_t mod7240280573005008577(uint64_t hash) {
+    return hash % 7240280573005008577llu;
+  }
+  static uint64_t mod9122181901073924329(uint64_t hash) {
+    return hash % 9122181901073924329llu;
+  }
+  static uint64_t mod11493228998133068689(uint64_t hash) {
+    return hash % 11493228998133068689llu;
+  }
+  static uint64_t mod14480561146010017169(uint64_t hash) {
+    return hash % 14480561146010017169llu;
+  }
+  static uint64_t mod18446744073709551557(uint64_t hash) {
+    return hash % 18446744073709551557llu;
+  }
+
+  // Pointer to one of the modN functions above.
+  using mod_function = uint64_t (*)(uint64_t);
+
+  // Rounds `size` up, in place, to the first entry of prime_list that is not
+  // less than it (or to the largest entry when `size` exceeds all searched
+  // entries) and returns the modN function for that prime. The policy object
+  // itself is not modified here; commit() stores the returned function.
+  mod_function next_size_over(uint64_t& size) const {
+    // prime numbers generated by the following method:
+    // 1. start with a prime p = 2
+    // 2. go to wolfram alpha and get p = NextPrime(2 * p)
+    // 3. repeat 2. until you overflow 64 bits
+    // you now have large gaps which you would hit if somebody called reserve()
+    // with an unlucky number.
+    // 4. to fill the gaps for every prime p go to wolfram alpha and get
+    // ClosestPrime(p * 2^(1/3)) and ClosestPrime(p * 2^(2/3)) and put those in
+    // the gaps
+    // 5. get PrevPrime(2^64) and put it at the end
+    // NOLINTNEXTLINE(*c-array*)
+    static constexpr const uint64_t prime_list[] = {
+        2llu,
+        3llu,
+        5llu,
+        7llu,
+        11llu,
+        13llu,
+        17llu,
+        23llu,
+        29llu,
+        37llu,
+        47llu,
+        59llu,
+        73llu,
+        97llu,
+        127llu,
+        151llu,
+        197llu,
+        251llu,
+        313llu,
+        397llu,
+        499llu,
+        631llu,
+        797llu,
+        1009llu,
+        1259llu,
+        1597llu,
+        2011llu,
+        2539llu,
+        3203llu,
+        4027llu,
+        5087llu,
+        6421llu,
+        8089llu,
+        10193llu,
+        12853llu,
+        16193llu,
+        20399llu,
+        25717llu,
+        32401llu,
+        40823llu,
+        51437llu,
+        64811llu,
+        81649llu,
+        102877llu,
+        129607llu,
+        163307llu,
+        205759llu,
+        259229llu,
+        326617llu,
+        411527llu,
+        518509llu,
+        653267llu,
+        823117llu,
+        1037059llu,
+        1306601llu,
+        1646237llu,
+        2074129llu,
+        2613229llu,
+        3292489llu,
+        4148279llu,
+        5226491llu,
+        6584983llu,
+        8296553llu,
+        10453007llu,
+        13169977llu,
+        16593127llu,
+        20906033llu,
+        26339969llu,
+        33186281llu,
+        41812097llu,
+        52679969llu,
+        66372617llu,
+        83624237llu,
+        105359939llu,
+        132745199llu,
+        167248483llu,
+        210719881llu,
+        265490441llu,
+        334496971llu,
+        421439783llu,
+        530980861llu,
+        668993977llu,
+        842879579llu,
+        1061961721llu,
+        1337987929llu,
+        1685759167llu,
+        2123923447llu,
+        2675975881llu,
+        3371518343llu,
+        4247846927llu,
+        5351951779llu,
+        6743036717llu,
+        8495693897llu,
+        10703903591llu,
+        13486073473llu,
+        16991387857llu,
+        21407807219llu,
+        26972146961llu,
+        33982775741llu,
+        42815614441llu,
+        53944293929llu,
+        67965551447llu,
+        85631228929llu,
+        107888587883llu,
+        135931102921llu,
+        171262457903llu,
+        215777175787llu,
+        271862205833llu,
+        342524915839llu,
+        431554351609llu,
+        543724411781llu,
+        685049831731llu,
+        863108703229llu,
+        1087448823553llu,
+        1370099663459llu,
+        1726217406467llu,
+        2174897647073llu,
+        2740199326961llu,
+        3452434812973llu,
+        4349795294267llu,
+        5480398654009llu,
+        6904869625999llu,
+        8699590588571llu,
+        10960797308051llu,
+        13809739252051llu,
+        17399181177241llu,
+        21921594616111llu,
+        27619478504183llu,
+        34798362354533llu,
+        43843189232363llu,
+        55238957008387llu,
+        69596724709081llu,
+        87686378464759llu,
+        110477914016779llu,
+        139193449418173llu,
+        175372756929481llu,
+        220955828033581llu,
+        278386898836457llu,
+        350745513859007llu,
+        441911656067171llu,
+        556773797672909llu,
+        701491027718027llu,
+        883823312134381llu,
+        1113547595345903llu,
+        1402982055436147llu,
+        1767646624268779llu,
+        2227095190691797llu,
+        2805964110872297llu,
+        3535293248537579llu,
+        4454190381383713llu,
+        5611928221744609llu,
+        7070586497075177llu,
+        8908380762767489llu,
+        11223856443489329llu,
+        14141172994150357llu,
+        17816761525534927llu,
+        22447712886978529llu,
+        28282345988300791llu,
+        35633523051069991llu,
+        44895425773957261llu,
+        56564691976601587llu,
+        71267046102139967llu,
+        89790851547914507llu,
+        113129383953203213llu,
+        142534092204280003llu,
+        179581703095829107llu,
+        226258767906406483llu,
+        285068184408560057llu,
+        359163406191658253llu,
+        452517535812813007llu,
+        570136368817120201llu,
+        718326812383316683llu,
+        905035071625626043llu,
+        1140272737634240411llu,
+        1436653624766633509llu,
+        1810070143251252131llu,
+        2280545475268481167llu,
+        2873307249533267101llu,
+        3620140286502504283llu,
+        4561090950536962147llu,
+        5746614499066534157llu,
+        7240280573005008577llu,
+        9122181901073924329llu,
+        11493228998133068689llu,
+        14480561146010017169llu,
+        18446744073709551557llu};
+    // Parallel table of reduction functions. It starts with mod0, so it is
+    // offset by one relative to prime_list.
+    // NOLINTNEXTLINE(*c-array*)
+    static constexpr uint64_t (*const mod_functions[])(uint64_t) = {
+        &mod0,
+        &mod2,
+        &mod3,
+        &mod5,
+        &mod7,
+        &mod11,
+        &mod13,
+        &mod17,
+        &mod23,
+        &mod29,
+        &mod37,
+        &mod47,
+        &mod59,
+        &mod73,
+        &mod97,
+        &mod127,
+        &mod151,
+        &mod197,
+        &mod251,
+        &mod313,
+        &mod397,
+        &mod499,
+        &mod631,
+        &mod797,
+        &mod1009,
+        &mod1259,
+        &mod1597,
+        &mod2011,
+        &mod2539,
+        &mod3203,
+        &mod4027,
+        &mod5087,
+        &mod6421,
+        &mod8089,
+        &mod10193,
+        &mod12853,
+        &mod16193,
+        &mod20399,
+        &mod25717,
+        &mod32401,
+        &mod40823,
+        &mod51437,
+        &mod64811,
+        &mod81649,
+        &mod102877,
+        &mod129607,
+        &mod163307,
+        &mod205759,
+        &mod259229,
+        &mod326617,
+        &mod411527,
+        &mod518509,
+        &mod653267,
+        &mod823117,
+        &mod1037059,
+        &mod1306601,
+        &mod1646237,
+        &mod2074129,
+        &mod2613229,
+        &mod3292489,
+        &mod4148279,
+        &mod5226491,
+        &mod6584983,
+        &mod8296553,
+        &mod10453007,
+        &mod13169977,
+        &mod16593127,
+        &mod20906033,
+        &mod26339969,
+        &mod33186281,
+        &mod41812097,
+        &mod52679969,
+        &mod66372617,
+        &mod83624237,
+        &mod105359939,
+        &mod132745199,
+        &mod167248483,
+        &mod210719881,
+        &mod265490441,
+        &mod334496971,
+        &mod421439783,
+        &mod530980861,
+        &mod668993977,
+        &mod842879579,
+        &mod1061961721,
+        &mod1337987929,
+        &mod1685759167,
+        &mod2123923447,
+        &mod2675975881,
+        &mod3371518343,
+        &mod4247846927,
+        &mod5351951779,
+        &mod6743036717,
+        &mod8495693897,
+        &mod10703903591,
+        &mod13486073473,
+        &mod16991387857,
+        &mod21407807219,
+        &mod26972146961,
+        &mod33982775741,
+        &mod42815614441,
+        &mod53944293929,
+        &mod67965551447,
+        &mod85631228929,
+        &mod107888587883,
+        &mod135931102921,
+        &mod171262457903,
+        &mod215777175787,
+        &mod271862205833,
+        &mod342524915839,
+        &mod431554351609,
+        &mod543724411781,
+        &mod685049831731,
+        &mod863108703229,
+        &mod1087448823553,
+        &mod1370099663459,
+        &mod1726217406467,
+        &mod2174897647073,
+        &mod2740199326961,
+        &mod3452434812973,
+        &mod4349795294267,
+        &mod5480398654009,
+        &mod6904869625999,
+        &mod8699590588571,
+        &mod10960797308051,
+        &mod13809739252051,
+        &mod17399181177241,
+        &mod21921594616111,
+        &mod27619478504183,
+        &mod34798362354533,
+        &mod43843189232363,
+        &mod55238957008387,
+        &mod69596724709081,
+        &mod87686378464759,
+        &mod110477914016779,
+        &mod139193449418173,
+        &mod175372756929481,
+        &mod220955828033581,
+        &mod278386898836457,
+        &mod350745513859007,
+        &mod441911656067171,
+        &mod556773797672909,
+        &mod701491027718027,
+        &mod883823312134381,
+        &mod1113547595345903,
+        &mod1402982055436147,
+        &mod1767646624268779,
+        &mod2227095190691797,
+        &mod2805964110872297,
+        &mod3535293248537579,
+        &mod4454190381383713,
+        &mod5611928221744609,
+        &mod7070586497075177,
+        &mod8908380762767489,
+        &mod11223856443489329,
+        &mod14141172994150357,
+        &mod17816761525534927,
+        &mod22447712886978529,
+        &mod28282345988300791,
+        &mod35633523051069991,
+        &mod44895425773957261,
+        &mod56564691976601587,
+        &mod71267046102139967,
+        &mod89790851547914507,
+        &mod113129383953203213,
+        &mod142534092204280003,
+        &mod179581703095829107,
+        &mod226258767906406483,
+        &mod285068184408560057,
+        &mod359163406191658253,
+        &mod452517535812813007,
+        &mod570136368817120201,
+        &mod718326812383316683,
+        &mod905035071625626043,
+        &mod1140272737634240411,
+        &mod1436653624766633509,
+        &mod1810070143251252131,
+        &mod2280545475268481167,
+        &mod2873307249533267101,
+        &mod3620140286502504283,
+        &mod4561090950536962147,
+        &mod5746614499066534157,
+        &mod7240280573005008577,
+        &mod9122181901073924329,
+        &mod11493228998133068689,
+        &mod14480561146010017169,
+        &mod18446744073709551557};
+    // The search range stops one short of the end so that, when `size` is
+    // larger than every searched prime, `found` lands on the last table entry
+    // rather than one past the end.
+    const uint64_t* found = std::lower_bound(
+        std::begin(prime_list), std::end(prime_list) - 1, size);
+    size = *found;
+    // +1 skips mod0: the function for prime_list[i] is mod_functions[i + 1].
+    return mod_functions[1 + found - prime_list];
+  }
+  // Makes new_mod_function the reduction used by index_for_hash() and
+  // keep_in_range().
+  void commit(mod_function new_mod_function) {
+    current_mod_function = new_mod_function;
+  }
+  // Goes back to mod0, under which every hash maps to index 0.
+  void reset() {
+    current_mod_function = &mod0;
+  }
+
+  // Reduces hash with the committed function; the slot count argument is
+  // ignored.
+  uint64_t index_for_hash(uint64_t hash, uint64_t /*num_slots_minus_one*/)
+      const {
+    return current_mod_function(hash);
+  }
+  // Returns index unchanged when it does not exceed num_slots_minus_one;
+  // otherwise reduces it with the committed function.
+  uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const {
+    return index > num_slots_minus_one ? current_mod_function(index) : index;
+  }
+
+ private:
+  // Reduction currently in effect; mod0 until commit() is called.
+  mod_function current_mod_function = &mod0;
+};
+
+// Stateless hash policy that turns a hash into an index by masking it with
+// num_slots_minus_one and sizes the table with detailv3::next_power_of_two().
+struct power_of_two_hash_policy {
+  // Keeps only the bits of hash that are set in num_slots_minus_one.
+  uint64_t index_for_hash(uint64_t hash, uint64_t num_slots_minus_one) const {
+    const uint64_t masked = hash & num_slots_minus_one;
+    return masked;
+  }
+  // Applies the same mask as index_for_hash() to an existing index.
+  uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const {
+    return index & num_slots_minus_one;
+  }
+  // Replaces size with detailv3::next_power_of_two(size). The returned value
+  // is always 0; commit() ignores it.
+  int8_t next_size_over(uint64_t& size) const {
+    const auto rounded = detailv3::next_power_of_two(size);
+    size = rounded;
+    return int8_t(0);
+  }
+  // Nothing to store for this policy.
+  void commit(int8_t /*unused*/) {}
+  // Nothing to reset for this policy.
+  void reset() {}
+};
+
+// Hash policy that multiplies the hash by a fixed 64-bit constant and keeps
+// the top bits of the product by shifting right by `shift`.
+struct fibonacci_hash_policy {
+  // Multiplies hash by the fixed constant and shifts the product right by
+  // the committed shift; the slot count argument is ignored.
+  uint64_t index_for_hash(uint64_t hash, uint64_t /*num_slots_minus_one*/)
+      const {
+    constexpr unsigned long long multiplier = 11400714819323198485ull;
+    const auto product = multiplier * hash;
+    return product >> shift;
+  }
+  // Masks an existing index with num_slots_minus_one.
+  uint64_t keep_in_range(uint64_t index, uint64_t num_slots_minus_one) const {
+    const uint64_t masked = index & num_slots_minus_one;
+    return masked;
+  }
+
+  // Replaces size with the larger of 2 and detailv3::next_power_of_two(size)
+  // and returns 64 - detailv3::log2(size) for the updated size, which is the
+  // value commit() expects as the new shift.
+  int8_t next_size_over(uint64_t& size) const {
+    const auto rounded = detailv3::next_power_of_two(size);
+    size = std::max(uint64_t(2), rounded);
+    const auto bits = detailv3::log2(size);
+    return static_cast<int8_t>(64 - bits);
+  }
+  // Stores the shift used by index_for_hash().
+  void commit(int8_t shift_) {
+    shift = shift_;
+  }
+  // Restores the initial shift of 63.
+  void reset() {
+    shift = 63;
+  }
+
+ private:
+  // Right-shift applied to the multiplied hash; starts at 63.
+  int8_t shift = 63;
+};
+
+// Map from K to V layered on detailv3::sherwood_v3_table with
+// std::pair<K, V> as the stored value type. On top of the inherited table
+// interface it adds operator[], at(), an argument-less emplace(),
+// insert_or_assign() and equality comparison.
+template <
+    typename K,
+    typename V,
+    typename H = std::hash<K>,
+    typename E = std::equal_to<K>,
+    typename A = std::allocator<std::pair<K, V>>>
+class order_preserving_flat_hash_map
+    : public detailv3::sherwood_v3_table<
+          std::pair<K, V>,
+          K,
+          H,
+          detailv3::KeyOrValueHasher<K, std::pair<K, V>, H>,
+          E,
+          detailv3::KeyOrValueEquality<K, std::pair<K, V>, E>,
+          A,
+          typename std::allocator_traits<A>::template rebind_alloc<
+              detailv3::sherwood_v3_entry<std::pair<K, V>>>> {
+  using Table = detailv3::sherwood_v3_table<
+      std::pair<K, V>,
+      K,
+      H,
+      detailv3::KeyOrValueHasher<K, std::pair<K, V>, H>,
+      E,
+      detailv3::KeyOrValueEquality<K, std::pair<K, V>, E>,
+      A,
+      typename std::allocator_traits<A>::template rebind_alloc<
+          detailv3::sherwood_v3_entry<std::pair<K, V>>>>;
+
+ public:
+  using key_type = K;
+  using mapped_type = V;
+
+  using Table::Table;
+  order_preserving_flat_hash_map() = default;
+
+  // Returns a reference to the mapped value for key. The lookup goes through
+  // emplace() with a convertible_to_value placeholder, which yields V() when
+  // a mapped value has to be constructed.
+  inline V& operator[](const K& key) {
+    auto outcome = emplace(key, convertible_to_value());
+    return outcome.first->second;
+  }
+  // Same as above, but the key is moved into emplace().
+  inline V& operator[](K&& key) {
+    auto outcome = emplace(std::move(key), convertible_to_value());
+    return outcome.first->second;
+  }
+  // Returns the mapped value for key; throws std::out_of_range when find()
+  // reports the key as missing.
+  V& at(const K& key) {
+    auto position = this->find(key);
+    if (!(position == this->end()))
+      return position->second;
+    throw std::out_of_range("Argument passed to at() was not in the map.");
+  }
+  // Const overload of at() with the same contract.
+  const V& at(const K& key) const {
+    auto position = this->find(key);
+    if (!(position == this->end()))
+      return position->second;
+    throw std::out_of_range("Argument passed to at() was not in the map.");
+  }
+
+  using Table::emplace;
+  // Emplaces a value-initialized key together with the V() placeholder.
+  std::pair<typename Table::iterator, bool> emplace() {
+    return emplace(key_type(), convertible_to_value());
+  }
+  // Emplaces (key, m). When emplace() reports that nothing was inserted, m is
+  // assigned to the mapped value of the returned entry instead. Returns the
+  // pair produced by emplace().
+  template <typename M>
+  std::pair<typename Table::iterator, bool> insert_or_assign(
+      const key_type& key,
+      M&& m) {
+    auto outcome = emplace(key, std::forward<M>(m));
+    const bool inserted = outcome.second;
+    if (inserted)
+      return outcome;
+    outcome.first->second = std::forward<M>(m);
+    return outcome;
+  }
+  // Rvalue-key overload of insert_or_assign(); the key is moved into
+  // emplace().
+  template <typename M>
+  std::pair<typename Table::iterator, bool> insert_or_assign(
+      key_type&& key,
+      M&& m) {
+    auto outcome = emplace(std::move(key), std::forward<M>(m));
+    const bool inserted = outcome.second;
+    if (inserted)
+      return outcome;
+    outcome.first->second = std::forward<M>(m);
+    return outcome;
+  }
+  // Hinted overload: the hint iterator is ignored and only the iterator part
+  // of the two-argument result is returned.
+  template <typename M>
+  typename Table::iterator insert_or_assign(
+      typename Table::const_iterator /*unused*/,
+      const key_type& key,
+      M&& m) {
+    auto outcome = insert_or_assign(key, std::forward<M>(m));
+    return outcome.first;
+  }
+  // Hinted overload taking an rvalue key; the hint iterator is ignored.
+  template <typename M>
+  typename Table::iterator insert_or_assign(
+      typename Table::const_iterator /*unused*/,
+      key_type&& key,
+      M&& m) {
+    auto outcome = insert_or_assign(std::move(key), std::forward<M>(m));
+    return outcome.first;
+  }
+
+  // Two maps are equal when their sizes match and every key of lhs is found
+  // in rhs with a mapped value that does not compare unequal (operator!=).
+  // The position of elements within the maps is not compared.
+  friend bool operator==(
+      const order_preserving_flat_hash_map& lhs,
+      const order_preserving_flat_hash_map& rhs) {
+    if (lhs.size() != rhs.size())
+      return false;
+    for (const typename Table::value_type& entry : lhs) {
+      auto match = rhs.find(entry.first);
+      if (match == rhs.end())
+        return false;
+      if (entry.second != match->second)
+        return false;
+    }
+    return true;
+  }
+  // Negation of operator==.
+  friend bool operator!=(
+      const order_preserving_flat_hash_map& lhs,
+      const order_preserving_flat_hash_map& rhs) {
+    const bool same = (lhs == rhs);
+    return !same;
+  }
+
+ private:
+  // Placeholder argument that converts to a value-initialized V on demand.
+  struct convertible_to_value {
+    operator V() const {
+      return V();
+    }
+  };
+};
+
+template <
+    typename T, // element type (also the key type)
+    typename H = std::hash<T>, // hash functor over T
+    typename E = std::equal_to<T>, // equality functor over T
+    typename A = std::allocator<T>> // allocator of the stored elements
+class flat_hash_set // set interface over sherwood_v3_table storing T directly
+    : public detailv3::sherwood_v3_table<
+          T,
+          T,
+          H,
+          detailv3::functor_storage<uint64_t, H>,
+          E,
+          detailv3::functor_storage<bool, E>,
+          A,
+          typename std::allocator_traits<A>::template rebind_alloc<
+              detailv3::sherwood_v3_entry<T>>> {
+  using Table = detailv3::sherwood_v3_table< // alias for the base class spelled out above
+      T,
+      T,
+      H,
+      detailv3::functor_storage<uint64_t, H>,
+      E,
+      detailv3::functor_storage<bool, E>,
+      A,
+      typename std::allocator_traits<A>::template rebind_alloc<
+          detailv3::sherwood_v3_entry<T>>>;
+
+ public:
+  using key_type = T;
+
+  using Table::Table; // inherit the base table's constructors
+  flat_hash_set() = default;
+
+  template <typename... Args>
+  std::pair<typename Table::iterator, bool> emplace(Args&&... args) { // generic form: builds a T from args, then emplaces it
+    return Table::emplace(T(std::forward<Args>(args)...));
+  }
+  std::pair<typename Table::iterator, bool> emplace(const key_type& arg) { // already a T: passed through without building a temporary
+    return Table::emplace(arg);
+  }
+  std::pair<typename Table::iterator, bool> emplace(key_type& arg) { // non-const lvalue T: passed through as-is
+    return Table::emplace(arg);
+  }
+  std::pair<typename Table::iterator, bool> emplace(const key_type&& arg) { // const rvalue T: std::move keeps it a const rvalue
+    return Table::emplace(std::move(arg));
+  }
+  std::pair<typename Table::iterator, bool> emplace(key_type&& arg) { // rvalue T: moved into the table's emplace
+    return Table::emplace(std::move(arg));
+  }
+
+  friend bool operator==(const flat_hash_set& lhs, const flat_hash_set& rhs) { // equal when sizes match and every lhs element is found in rhs
+    if (lhs.size() != rhs.size())
+      return false;
+    for (const T& value : lhs) {
+      if (rhs.find(value) == rhs.end())
+        return false;
+    }
+    return true;
+  }
+  friend bool operator!=(const flat_hash_set& lhs, const flat_hash_set& rhs) { // negation of operator==
+    return !(lhs == rhs);
+  }
+};
+
+template <typename T>
+struct power_of_two_std_hash : std::hash<T> { // std::hash<T> plus a nested hash_policy typedef
+  typedef ska_ordered::power_of_two_hash_policy hash_policy; // presumably read by the table to select its policy - confirm in sherwood_v3_table
+};
+
+} // namespace ska_ordered
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h
new file mode 100644
index 0000000000000000000000000000000000000000..e414de5aaab43b00062139b718067b14be4422ac
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overflows.h
@@ -0,0 +1,105 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/TypeSafeSignMath.h>
+#include <c10/util/complex.h>
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+namespace c10 {
+// In some versions of MSVC, the following warnings cause a compiler error:
+// C4146: unary minus operator applied to unsigned type, result still unsigned
+// C4804: unsafe use of type 'bool' in operation
+// They, together with C4018 (signed/unsigned mismatch), are disabled below.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#pragma warning(disable : 4804)
+#pragma warning(disable : 4018)
+#endif
+
+// The overflow checks may involve float to int conversion which may
+// trigger precision loss warning. Re-enable the warning once the code
+// is fixed. See T58053069.
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+
+// bool can be converted to any type, so a bool source never overflows.
+// Without specializing on bool, in pytorch_linux_trusty_py2_7_9_build:
+// `error: comparison of constant '255' with boolean expression is always false`
+// for `f > limit::max()` below
+template <typename To, typename From>
+std::enable_if_t<std::is_same_v<From, bool>, bool> overflows( // overload selected only when From is bool
+    From /*f*/,
+    bool strict_unsigned [[maybe_unused]] = false) { // accepted for signature parity with the other overloads; unused
+  return false;
+}
+
+// Integral (non-bool) source: skip isnan and isinf check for integral types
+template <typename To, typename From>
+std::enable_if_t<std::is_integral_v<From> && !std::is_same_v<From, bool>, bool>
+overflows(From f, bool strict_unsigned = false) { // returns true when f does not fit in To
+  using limit = std::numeric_limits<typename scalar_value_type<To>::type>; // limits of scalar_value_type<To>::type
+  if constexpr (!limit::is_signed && std::numeric_limits<From>::is_signed) { // signed source into unsigned target
+    // allow for negative numbers to wrap using two's complement arithmetic.
+    // For example, with uint8, this allows for `a - b` to be treated as
+    // `a + 255 * b`.
+    if (!strict_unsigned) {
+      return greater_than_max<To>(f) ||
+          (c10::is_negative(f) &&
+           -static_cast<uint64_t>(f) > static_cast<uint64_t>(limit::max())); // a negative f overflows only if its magnitude exceeds max
+    }
+  }
+  return c10::less_than_lowest<To>(f) || greater_than_max<To>(f); // plain range check; also the strict_unsigned path
+}
+
+template <typename To, typename From>
+std::enable_if_t<std::is_floating_point_v<From>, bool> overflows( // overload for floating-point sources
+    From f,
+    bool strict_unsigned [[maybe_unused]] = false) { // unused by this overload
+  using limit = std::numeric_limits<typename scalar_value_type<To>::type>; // limits of scalar_value_type<To>::type
+  if (limit::has_infinity && std::isinf(static_cast<double>(f))) { // an infinity is representable when the target has one
+    return false;
+  }
+  if (!limit::has_quiet_NaN && (f != f)) { // f != f holds only for NaN; overflow if the target cannot hold NaN
+    return true;
+  }
+  return f < limit::lowest() || f > limit::max(); // range check; a NaN compares false on both sides here
+}
+
+C10_CLANG_DIAGNOSTIC_POP()
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+template <typename To, typename From>
+std::enable_if_t<is_complex<From>::value, bool> overflows( // overload for complex sources
+    From f,
+    bool strict_unsigned = false) { // forwarded unchanged to the per-component checks
+  // casts from complex to real are considered to overflow if the
+  // imaginary component is non-zero
+  if (!is_complex<To>::value && f.imag() != 0) {
+    return true;
+  }
+  // Check for overflow componentwise
+  // (Technically, the imag overflow check is guaranteed to be false
+  // when !is_complex<To>, but any optimizer worth its salt will be
+  // able to figure it out.)
+  return overflows< // real part against To's scalar type
+             typename scalar_value_type<To>::type,
+             typename From::value_type>(f.real(), strict_unsigned) ||
+      overflows< // imaginary part against To's scalar type
+             typename scalar_value_type<To>::type,
+             typename From::value_type>(f.imag(), strict_unsigned);
+}
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c1571b57e808ab068dd5456e1ea83dfd9fd6342
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/overloaded.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <memory>
+namespace c10 {
+namespace detail {
+
+template <class... Ts>
+struct overloaded_t {}; // primary template; the specializations below handle one or more callables
+
+template <class T0>
+struct overloaded_t<T0> : T0 { // base case: a single callable
+  using T0::operator(); // expose T0's call operator
+  overloaded_t(T0 t0) : T0(std::move(t0)) {} // takes the callable by value and moves it into the base
+};
+template <class T0, class... Ts>
+struct overloaded_t<T0, Ts...> : T0, overloaded_t<Ts...> { // recursive case: inherits T0 and the rest of the pack
+  using T0::operator(); // bring both call operators into one scope so
+  using overloaded_t<Ts...>::operator(); // overload resolution considers all of them together
+  overloaded_t(T0 t0, Ts... ts)
+      : T0(std::move(t0)), overloaded_t<Ts...>(std::move(ts)...) {} // move each callable into its own base
+};
+
+} // namespace detail
+
+// Construct an overloaded callable combining multiple callables, e.g. lambdas
+template <class... Ts>
+detail::overloaded_t<Ts...> overloaded(Ts... ts) { // callables are taken by value
+  return {std::move(ts)...}; // list-initializes the result, moving each callable in
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h
new file mode 100644
index 0000000000000000000000000000000000000000..f457be5949a775e9ce3f4b8b39d8c4bbe95985b8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/python_stub.h
@@ -0,0 +1,9 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+struct _object; // forward declaration only; this header includes no Python headers
+using PyObject = _object; // lets code name PyObject (e.g. PyObject*) using just the forward declaration
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b48a5a89c503e4a3ddae1aee65695044d1a3384
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint32.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/qint32.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..47f7a9e42540c917299479e9bda73da37083e082
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/qint8.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/qint8.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4603a707c35a3a24eee27c4eea54c025f49454b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/quint4x2.h
@@ -0,0 +1,6 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <torch/headeronly/util/quint4x2.h>
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..60b2c344e0639fb6490a1c300cb77469f111bd62
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/signal_handler.h
@@ -0,0 +1,124 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <csignal>
+#include <cstdint>
+#include <mutex>
+
+#include <c10/macros/Export.h>
+
+#if defined(__APPLE__)
+#define C10_SUPPORTS_SIGNAL_HANDLER
+#elif defined(__linux__) && !defined(C10_DISABLE_SIGNAL_HANDLERS)
+#define C10_SUPPORTS_FATAL_SIGNAL_HANDLERS
+#define C10_SUPPORTS_SIGNAL_HANDLER
+#endif
+
+#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
+#include <pthread.h>
+#endif
+
+namespace c10 {
+
+class C10_API SignalHandler { // declaration only; member functions are defined outside this header
+ public:
+  enum class Action { NONE, STOP }; // possible reactions to a received signal
+
+  // Constructor. Specify what action to take when a signal is received.
+  SignalHandler(Action SIGINT_action, Action SIGHUP_action);
+
+  SignalHandler(const SignalHandler&) = delete; // non-copyable
+  SignalHandler(SignalHandler&&) = delete; // non-movable
+  SignalHandler& operator=(const SignalHandler&) = delete;
+  SignalHandler& operator=(SignalHandler&&) = delete;
+  ~SignalHandler();
+
+  Action CheckForSignals(); // presumably returns the configured action of a signal seen since the last call - confirm in the .cpp
+
+  bool GotSIGINT(); // NOTE(review): non-const; may update my_sigint_count_ - confirm in the .cpp
+  bool GotSIGHUP(); // NOTE(review): non-const; may update my_sighup_count_ - confirm in the .cpp
+
+  Action SIGINT_action_; // public data member; presumably the SIGINT_action constructor argument
+  Action SIGHUP_action_; // public data member; presumably the SIGHUP_action constructor argument
+  std::atomic<uint64_t> my_sigint_count_; // presumably this instance's record of SIGINTs already observed - confirm
+  std::atomic<uint64_t> my_sighup_count_; // presumably this instance's record of SIGHUPs already observed - confirm
+};
+
+#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
+class C10_API FatalSignalHandler { // declaration only; member functions are defined outside this header
+  // This works by setting up certain fatal signal handlers. Previous fatal
+  // signal handlers will still be called when the signal is raised. Defaults
+  // to being off.
+ public:
+  C10_API void setPrintStackTracesOnFatalSignal(bool print); // turns the stack-trace printing on or off
+  C10_API bool printStackTracesOnFatalSignal(); // reports whether stack-trace printing is currently on
+  static FatalSignalHandler& getInstance(); // static accessor; with the protected ctor this looks like a singleton - confirm in the .cpp
+  FatalSignalHandler(const FatalSignalHandler&) = delete; // non-copyable
+  FatalSignalHandler(FatalSignalHandler&&) = delete; // non-movable
+  FatalSignalHandler& operator=(const FatalSignalHandler&) = delete;
+  FatalSignalHandler& operator=(FatalSignalHandler&&) = delete;
+  virtual ~FatalSignalHandler() = default; // virtual: the class is meant to be derivable (see the virtual hook below)
+
+ protected:
+  explicit FatalSignalHandler(); // constructible only from derived classes and members
+
+ private:
+  void installFatalSignalHandlers();
+  void uninstallFatalSignalHandlers();
+  static void fatalSignalHandlerStatic(int signum); // static entry point with a plain signal-handler signature
+  void fatalSignalHandler(int signum);
+  virtual void fatalSignalHandlerPostProcess(); // overridable hook; when it runs is defined in the .cpp
+  struct sigaction* getPreviousSigaction(int signum);
+  const char* getSignalName(int signum);
+  void callPreviousSignalHandler(
+      struct sigaction* action,
+      int signum,
+      siginfo_t* info,
+      void* ctx);
+  void stacktraceSignalHandler(bool needsLock);
+  static void stacktraceSignalHandlerStatic( // static entry point with the three-argument sigaction handler signature
+      int signum,
+      siginfo_t* info,
+      void* ctx);
+  void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx);
+
+  // The mutex protects the bool.
+  std::mutex fatalSignalHandlersInstallationMutex;
+  bool fatalSignalHandlersInstalled; // NOTE(review): no default member initializer; presumably set by the constructor - confirm
+  // We need to hold a reference to call the previous SIGUSR2 handler in case
+  // we didn't signal it
+  struct sigaction previousSigusr2{};
+  // Flag dictating whether the SIGUSR2 handler falls back to previous handlers
+  // or is intercepted in order to print a stack trace.
+  std::atomic<bool> fatalSignalReceived; // NOTE(review): no default member initializer; presumably set by the constructor - confirm
+  // Global state set when a fatal signal is received so that backtracing
+  // threads know why they're printing a stacktrace.
+  const char* fatalSignalName; // NOTE(review): no default member initializer, unlike fatalSignum below - confirm
+  int fatalSignum = -1;
+  // This wait condition is used to wait for other threads to finish writing
+  // their stack trace when in fatal sig handler (we can't use pthread_join
+  // because there's no way to convert from a tid to a pthread_t).
+  std::condition_variable writingCond;
+  std::mutex writingMutex;
+  // used to indicate if the other thread responded to the signal
+  bool signalReceived; // NOTE(review): no default member initializer; presumably set by the constructor - confirm
+
+  struct signal_handler { // one table entry per handled signal
+    const char* name; // signal name
+    int signum; // signal number
+    struct sigaction previous; // presumably the handler that was installed before ours - confirm in the .cpp
+  };
+
+  // NOLINTNEXTLINE(*c-arrays*)
+  static signal_handler kSignalHandlers[]; // array of unknown bound; defined outside this header
+};
+
+#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h
new file mode 100644
index 0000000000000000000000000000000000000000..4030828469d45cdbef603bbb8588071a41b9b398
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint.h
@@ -0,0 +1,39 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) && \
+    !(defined(TORCH_DISABLE_SDT) && TORCH_DISABLE_SDT)
+
+#define TORCH_HAVE_SDT 1
+
+#include <c10/util/static_tracepoint_elfx86.h>
+
+#define TORCH_SDT(name, ...) \
+  TORCH_SDT_PROBE_N(         \
+      pytorch, name, 0, TORCH_SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__)
+// Use TORCH_SDT_DEFINE_SEMAPHORE(name) to define the semaphore
+// as global variable before using the TORCH_SDT_WITH_SEMAPHORE macro
+#define TORCH_SDT_WITH_SEMAPHORE(name, ...) \
+  TORCH_SDT_PROBE_N(                        \
+      pytorch, name, 1, TORCH_SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__)
+#define TORCH_SDT_IS_ENABLED(name) (TORCH_SDT_SEMAPHORE(pytorch, name) > 0)
+
+#else
+
+#define TORCH_HAVE_SDT 0
+
+#define TORCH_SDT(name, ...) \
+  do {                       \
+  } while (0)
+#define TORCH_SDT_WITH_SEMAPHORE(name, ...) \
+  do {                                      \
+  } while (0)
+#define TORCH_SDT_IS_ENABLED(name) (false)
+#define TORCH_SDT_DEFINE_SEMAPHORE(name)
+#define TORCH_SDT_DECLARE_SEMAPHORE(name)
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3afe767fee1e9cf92062b2ece5e2f0520dcb9e4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/static_tracepoint_elfx86.h
@@ -0,0 +1,149 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// clang-format off
+
+// Default constraint for the probe arguments as operands.
+#ifndef TORCH_SDT_ARG_CONSTRAINT
+#define TORCH_SDT_ARG_CONSTRAINT      "nor"
+#endif
+
+// Instruction to emit for the probe.
+#define TORCH_SDT_NOP                 nop
+
+// Note section properties.
+#define TORCH_SDT_NOTE_NAME           "stapsdt"
+#define TORCH_SDT_NOTE_TYPE           3
+
+// Semaphore variables are put in this section
+#define TORCH_SDT_SEMAPHORE_SECTION   ".probes"
+
+// Size of address depending on platform.
+#ifdef __LP64__
+#define TORCH_SDT_ASM_ADDR            .8byte
+#else
+#define TORCH_SDT_ASM_ADDR            .4byte
+#endif
+
+// Assembler helper Macros.
+#define TORCH_SDT_S(x)                #x
+#define TORCH_SDT_ASM_1(x)            TORCH_SDT_S(x) "\n"
+#define TORCH_SDT_ASM_2(a, b)         TORCH_SDT_S(a) "," TORCH_SDT_S(b) "\n"
+#define TORCH_SDT_ASM_3(a, b, c)      TORCH_SDT_S(a) "," TORCH_SDT_S(b) ","    \
+                                      TORCH_SDT_S(c) "\n"
+#define TORCH_SDT_ASM_STRING(x)       TORCH_SDT_ASM_1(.asciz TORCH_SDT_S(x))
+
+// Helper to determine the size of an argument.
+#define TORCH_SDT_IS_ARRAY_POINTER(x)  ((__builtin_classify_type(x) == 14) ||  \
+                                        (__builtin_classify_type(x) == 5))
+#define TORCH_SDT_ARGSIZE(x)  (TORCH_SDT_IS_ARRAY_POINTER(x)                   \
+                               ? sizeof(void*)                                 \
+                               : sizeof(x))
+
+// Format of each probe argument as an operand.
+// Size of the argument tagged with TORCH_SDT_Sn, with "n" constraint.
+// Value of the argument tagged with TORCH_SDT_An, with configured constraint.
+#define TORCH_SDT_ARG(n, x)                                                    \
+  [TORCH_SDT_S##n] "n"                ((size_t)TORCH_SDT_ARGSIZE(x)),          \
+  [TORCH_SDT_A##n] TORCH_SDT_ARG_CONSTRAINT (x)
+
+// Templates to append arguments as operands.
+#define TORCH_SDT_OPERANDS_0()        [__sdt_dummy] "g" (0)
+#define TORCH_SDT_OPERANDS_1(_1)      TORCH_SDT_ARG(1, _1)
+#define TORCH_SDT_OPERANDS_2(_1, _2)                                           \
+  TORCH_SDT_OPERANDS_1(_1), TORCH_SDT_ARG(2, _2)
+#define TORCH_SDT_OPERANDS_3(_1, _2, _3)                                       \
+  TORCH_SDT_OPERANDS_2(_1, _2), TORCH_SDT_ARG(3, _3)
+#define TORCH_SDT_OPERANDS_4(_1, _2, _3, _4)                                   \
+  TORCH_SDT_OPERANDS_3(_1, _2, _3), TORCH_SDT_ARG(4, _4)
+#define TORCH_SDT_OPERANDS_5(_1, _2, _3, _4, _5)                               \
+  TORCH_SDT_OPERANDS_4(_1, _2, _3, _4), TORCH_SDT_ARG(5, _5)
+#define TORCH_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6)                           \
+  TORCH_SDT_OPERANDS_5(_1, _2, _3, _4, _5), TORCH_SDT_ARG(6, _6)
+#define TORCH_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7)                       \
+  TORCH_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6), TORCH_SDT_ARG(7, _7)
+#define TORCH_SDT_OPERANDS_8(_1, _2, _3, _4, _5, _6, _7, _8)                   \
+  TORCH_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7), TORCH_SDT_ARG(8, _8)
+#define TORCH_SDT_OPERANDS_9(_1, _2, _3, _4, _5, _6, _7, _8, _9)               \
+  TORCH_SDT_OPERANDS_8(_1, _2, _3, _4, _5, _6, _7, _8), TORCH_SDT_ARG(9, _9)
+
+// Templates to reference the arguments from operands in note section.
+#define TORCH_SDT_ARGFMT(no)        %n[TORCH_SDT_S##no]@%[TORCH_SDT_A##no]
+#define TORCH_SDT_ARG_TEMPLATE_0    /*No arguments*/
+#define TORCH_SDT_ARG_TEMPLATE_1    TORCH_SDT_ARGFMT(1)
+#define TORCH_SDT_ARG_TEMPLATE_2    TORCH_SDT_ARG_TEMPLATE_1 TORCH_SDT_ARGFMT(2)
+#define TORCH_SDT_ARG_TEMPLATE_3    TORCH_SDT_ARG_TEMPLATE_2 TORCH_SDT_ARGFMT(3)
+#define TORCH_SDT_ARG_TEMPLATE_4    TORCH_SDT_ARG_TEMPLATE_3 TORCH_SDT_ARGFMT(4)
+#define TORCH_SDT_ARG_TEMPLATE_5    TORCH_SDT_ARG_TEMPLATE_4 TORCH_SDT_ARGFMT(5)
+#define TORCH_SDT_ARG_TEMPLATE_6    TORCH_SDT_ARG_TEMPLATE_5 TORCH_SDT_ARGFMT(6)
+#define TORCH_SDT_ARG_TEMPLATE_7    TORCH_SDT_ARG_TEMPLATE_6 TORCH_SDT_ARGFMT(7)
+#define TORCH_SDT_ARG_TEMPLATE_8    TORCH_SDT_ARG_TEMPLATE_7 TORCH_SDT_ARGFMT(8)
+#define TORCH_SDT_ARG_TEMPLATE_9    TORCH_SDT_ARG_TEMPLATE_8 TORCH_SDT_ARGFMT(9)
+
+// Resolvable by name macros
+// An attribute that marks a function or variable as needing to be resolvable
+// by name. This generally is needed if inline assembly refers to the variable
+// by string name.
+#ifdef __roar__
+#define TORCH_NAME_RESOLVABLE __attribute__((roar_resolvable_by_name))
+#else
+#define TORCH_NAME_RESOLVABLE
+#endif
+
+// Semaphore define, declare and probe note format
+
+#define TORCH_SDT_SEMAPHORE(provider, name)                                    \
+  torch_sdt_semaphore_##provider##_##name
+
+#define TORCH_SDT_DEFINE_SEMAPHORE(name)                                       \
+  extern "C" {                                                                 \
+    TORCH_NAME_RESOLVABLE                                                      \
+    volatile unsigned short TORCH_SDT_SEMAPHORE(pytorch, name)                 \
+    __attribute__((section(TORCH_SDT_SEMAPHORE_SECTION), used)) = 0;           \
+  }
+
+#define TORCH_SDT_DECLARE_SEMAPHORE(name)                                      \
+  extern "C" TORCH_NAME_RESOLVABLE volatile unsigned short                     \
+    TORCH_SDT_SEMAPHORE(pytorch, name)
+
+#define TORCH_SDT_SEMAPHORE_NOTE_0(provider, name)                             \
+  TORCH_SDT_ASM_1(     TORCH_SDT_ASM_ADDR 0) /*No Semaphore*/                  \
+
+#define TORCH_SDT_SEMAPHORE_NOTE_1(provider, name)                             \
+  TORCH_SDT_ASM_1(TORCH_SDT_ASM_ADDR TORCH_SDT_SEMAPHORE(provider, name))
+
+// Structure of note section for the probe.
+#define TORCH_SDT_NOTE_CONTENT(provider, name, has_semaphore, arg_template)    \
+  TORCH_SDT_ASM_1(990: TORCH_SDT_NOP)                                          \
+  TORCH_SDT_ASM_3(     .pushsection .note.stapsdt,"","note")                   \
+  TORCH_SDT_ASM_1(     .balign 4)                                              \
+  TORCH_SDT_ASM_3(     .4byte 992f-991f, 994f-993f, TORCH_SDT_NOTE_TYPE)       \
+  TORCH_SDT_ASM_1(991: .asciz TORCH_SDT_NOTE_NAME)                             \
+  TORCH_SDT_ASM_1(992: .balign 4)                                              \
+  TORCH_SDT_ASM_1(993: TORCH_SDT_ASM_ADDR 990b)                                \
+  TORCH_SDT_ASM_1(     TORCH_SDT_ASM_ADDR 0) /*Reserved for Base Address*/     \
+  TORCH_SDT_SEMAPHORE_NOTE_##has_semaphore(provider, name)                     \
+  TORCH_SDT_ASM_STRING(provider)                                               \
+  TORCH_SDT_ASM_STRING(name)                                                   \
+  TORCH_SDT_ASM_STRING(arg_template)                                           \
+  TORCH_SDT_ASM_1(994: .balign 4)                                              \
+  TORCH_SDT_ASM_1(     .popsection)
+
+// Main probe Macro.
+#define TORCH_SDT_PROBE(provider, name, has_semaphore, n, arglist)             \
+    __asm__ __volatile__ (                                                     \
+      TORCH_SDT_NOTE_CONTENT(                                                  \
+        provider, name, has_semaphore, TORCH_SDT_ARG_TEMPLATE_##n)             \
+      :: TORCH_SDT_OPERANDS_##n arglist                                        \
+    )                                                                          \
+
+// Helper Macros to handle variadic arguments.
+#define TORCH_SDT_NARG_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
+#define TORCH_SDT_NARG(...)                                                    \
+  TORCH_SDT_NARG_(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define TORCH_SDT_PROBE_N(provider, name, has_semaphore, N, ...)               \
+  TORCH_SDT_PROBE(provider, name, has_semaphore, N, (__VA_ARGS__))
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e74cffc5e6338d234846bac166d5fcac7db63b0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strides.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/ArrayRef.h>
+#include <c10/util/DimVector.h>
+#include <algorithm>
+
+namespace c10 {
+
+// Computes the contiguous strides of a tensor, given its sizes.
+inline DimVector contiguous_strides(const IntArrayRef sizes) { // returns one stride per entry of sizes
+  using Int = IntArrayRef::value_type; // presumably a signed 64-bit integer; the `i >= 0` loop below needs a signed type
+  const Int dims = static_cast<Int>(sizes.size());
+
+  // With this initialisation we get the case dim == 0 or 1 right
+  DimVector strides(dims, 1); // all ones; the innermost stride keeps this value
+
+  for (auto i = dims - 2; i >= 0; --i) { // fill from the second-to-last dimension outwards; skipped when dims < 2
+    // Strides can't be 0 even if sizes are 0.
+    strides[i] = strides[i + 1] * std::max(sizes[i + 1], Int{1}); // a size below 1 is treated as 1
+  }
+
+  return strides;
+}
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbcf0b1f3c95d2e0e572ae58b6e066efc893f582
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_utils.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <string>
+
+#if !defined(FBCODE_CAFFE2) && !defined(C10_NO_DEPRECATED)
+
+namespace c10 {
+
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::stod;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::stoi;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::stoll;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::stoull;
+// NOLINTNEXTLINE(misc-unused-using-decls)
+using std::to_string;
+
+} // namespace c10
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..559cde09f9c35071293f0ed62d481ea7f6940710
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/string_view.h
@@ -0,0 +1,648 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+#include <c10/macros/Macros.h>
+
+namespace c10 {
+
+/**
+ * Port of std::string_view with methods from C++20.
+ * Implemented following the interface definition in
+ * https://en.cppreference.com/w/cpp/string/basic_string_view
+ * See there for the API documentation.
+ *
+ * Difference: We don't have a Traits template parameter because
+ * std::char_traits isn't constexpr and we'd have to reimplement
+ * std::char_traits if we wanted to use it with our constexpr basic_string_view.
+ */
+template <class CharT>
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+class basic_string_view final {
+ public:
+  using value_type = CharT;
+  using pointer = CharT*;
+  using const_pointer = const CharT*;
+  using reference = CharT&;
+  using const_reference = const CharT&;
+  using const_iterator = const CharT*;
+  using iterator = const_iterator;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+  using reverse_iterator = const_reverse_iterator;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  static constexpr size_type npos = size_type(-1);
+
+  constexpr basic_string_view() noexcept : begin_(nullptr) {}
+
+  explicit constexpr basic_string_view(const_pointer str, size_type count)
+      : begin_(str), size_(count) {}
+
+  /* implicit */ constexpr basic_string_view(const_pointer str)
+      : basic_string_view(str, strlen_(str)) {}
+
+  /* implicit */ basic_string_view(const ::std::basic_string<CharT>& str)
+      : basic_string_view(str.data(), str.size()) {}
+
+  /* implicit */ constexpr basic_string_view(
+      const ::std::basic_string_view<CharT>& str)
+      : basic_string_view(str.data(), str.size()) {}
+
+  constexpr basic_string_view(const basic_string_view&) noexcept = default;
+
+  constexpr basic_string_view& operator=(
+      const basic_string_view& rhs) noexcept = default;
+
+  constexpr operator ::std::basic_string_view<CharT>() const {
+    return ::std::basic_string_view<CharT>(data(), size());
+  }
+
+  explicit operator ::std::basic_string<CharT>() const {
+    return ::std::basic_string<CharT>(data(), size());
+  }
+
+  constexpr const_iterator begin() const noexcept {
+    return cbegin();
+  }
+
+  constexpr const_iterator cbegin() const noexcept {
+    return begin_;
+  }
+
+  constexpr const_iterator end() const noexcept {
+    return cend();
+  }
+
+  constexpr const_iterator cend() const noexcept {
+    return begin_ + size_;
+  }
+
+  constexpr const_reverse_iterator rbegin() const noexcept {
+    return crbegin();
+  }
+
+  constexpr const_reverse_iterator crbegin() const noexcept {
+    return const_reverse_iterator(this->end());
+  }
+
+  constexpr const_reverse_iterator rend() const noexcept {
+    return crend();
+  }
+
+  constexpr const_reverse_iterator crend() const noexcept {
+    return const_reverse_iterator(this->begin());
+  }
+
+  friend constexpr const_iterator begin(basic_string_view sv) noexcept {
+    return sv.begin();
+  }
+
+  friend constexpr const_iterator end(basic_string_view sv) noexcept {
+    return sv.end();
+  }
+
+  constexpr const_reference operator[](size_type pos) const {
+    // TODO: split out
+    return at_(pos);
+  }
+
+  constexpr const_reference at(size_type pos) const {
+#if !defined( \
+    __CUDA_ARCH__) // CUDA doesn't like std::out_of_range in device code
+    return C10_UNLIKELY(pos >= size_)
+        ? (throw std::out_of_range(
+               "string_view::operator[] or string_view::at() out of range. Index: " +
+               std::to_string(pos) + ", size: " + std::to_string(size())),
+           at_(0))
+        : at_(pos);
+#else
+    return at_(pos);
+#endif
+  }
+
+  constexpr const_reference front() const {
+    return *begin_;
+  }
+
+  constexpr const_reference back() const {
+    return *(begin_ + size_ - 1);
+  }
+
+  constexpr const_pointer data() const noexcept {
+    return begin_;
+  }
+
+  constexpr size_type size() const noexcept {
+    return size_;
+  }
+
+  constexpr size_type length() const noexcept {
+    return size();
+  }
+
+  constexpr size_type max_size() const noexcept {
+    return std::numeric_limits<difference_type>::max();
+  }
+
+  [[nodiscard]] constexpr bool empty() const noexcept {
+    return size() == 0;
+  }
+
+  constexpr void remove_prefix(size_type n) {
+    if (n > size()) {
+      throw std::out_of_range(
+          "basic_string_view::remove_prefix: out of range. PrefixLength: " +
+          std::to_string(n) + ", size: " + std::to_string(size()));
+    }
+    begin_ += n;
+    size_ -= n;
+  }
+
+  constexpr void remove_suffix(size_type n) {
+    if (n > size()) {
+      throw std::out_of_range(
+          "basic_string_view::remove_suffix: out of range. SuffixLength: " +
+          std::to_string(n) + ", size: " + std::to_string(size()));
+    }
+    size_ -= n;
+  }
+
+  constexpr void swap(basic_string_view& sv) noexcept {
+    auto tmp = *this;
+    *this = sv;
+    sv = tmp;
+  }
+
+  size_type copy(pointer dest, size_type count, size_type pos = 0) const {
+    if (pos > size_) {
+      throw std::out_of_range(
+          "basic_string_view::copy: out of range. Index: " +
+          std::to_string(pos) + ", size: " + std::to_string(size()));
+    }
+    size_type copy_length = std::min(count, size_ - pos);
+    for (auto iter = begin() + pos, end = iter + copy_length; iter != end;) {
+      *(dest++) = *(iter++);
+    }
+    return copy_length;
+  }
+
+  constexpr basic_string_view substr(size_type pos = 0, size_type count = npos)
+      const {
+#if !defined( \
+    __CUDA_ARCH__) // CUDA doesn't like std::out_of_range in device code
+    return (pos > size_)
+        ? (throw std::out_of_range(
+               "basic_string_view::substr parameter out of bounds. Index: " +
+               std::to_string(pos) + ", size: " + std::to_string(size())),
+           substr_())
+        : substr_(pos, count);
+#else
+    return substr_(pos, count);
+#endif
+  }
+
+  constexpr int compare(basic_string_view rhs) const noexcept {
+    // Write it iteratively. This is faster.
+    for (size_t i = 0, end = std::min(size(), rhs.size()); i < end; ++i) {
+      if (at_(i) < rhs.at_(i)) {
+        return -1;
+      } else if (at_(i) > rhs.at_(i)) {
+        return 1;
+      }
+    }
+    if (size() < rhs.size()) {
+      return -1;
+    } else if (size() > rhs.size()) {
+      return 1;
+    }
+    return 0;
+  }
+
+  constexpr int compare(size_type pos1, size_type count1, basic_string_view v)
+      const {
+    return substr(pos1, count1).compare(v);
+  }
+
+  constexpr int compare(
+      size_type pos1,
+      size_type count1,
+      basic_string_view v,
+      size_type pos2,
+      size_type count2) const {
+    return substr(pos1, count1).compare(v.substr(pos2, count2));
+  }
+
+  constexpr int compare(const_pointer s) const {
+    return compare(basic_string_view(s));
+  }
+
+  constexpr int compare(size_type pos1, size_type count1, const_pointer s)
+      const {
+    return substr(pos1, count1).compare(basic_string_view(s));
+  }
+
+  constexpr int compare(
+      size_type pos1,
+      size_type count1,
+      const_pointer s,
+      size_type count2) const {
+    return substr(pos1, count1).compare(basic_string_view(s, count2));
+  }
+
+  friend constexpr bool operator==(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return lhs.equals_(rhs);
+  }
+
+  friend constexpr bool operator!=(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return !(lhs == rhs);
+  }
+
+  friend constexpr bool operator<(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return lhs.compare(rhs) < 0;
+  }
+
+  friend constexpr bool operator>=(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return !(lhs < rhs);
+  }
+
+  friend constexpr bool operator>(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return rhs < lhs;
+  }
+
+  friend constexpr bool operator<=(
+      basic_string_view lhs,
+      basic_string_view rhs) noexcept {
+    return !(lhs > rhs);
+  }
+
+  constexpr bool starts_with(basic_string_view prefix) const noexcept {
+    return (prefix.size() > size()) ? false
+                                    : prefix.equals_(substr_(0, prefix.size()));
+  }
+
+  constexpr bool starts_with(CharT prefix) const noexcept {
+    return !empty() && prefix == front();
+  }
+
+  constexpr bool starts_with(const_pointer prefix) const {
+    return starts_with(basic_string_view(prefix));
+  }
+
+  constexpr bool ends_with(basic_string_view suffix) const noexcept {
+    return (suffix.size() > size())
+        ? false
+        : suffix.equals_(substr_(size() - suffix.size(), suffix.size()));
+  }
+
+  constexpr bool ends_with(CharT suffix) const noexcept {
+    return !empty() && suffix == back();
+  }
+
+  constexpr bool ends_with(const_pointer suffix) const {
+    return ends_with(basic_string_view(suffix));
+  }
+
+  constexpr size_type find(basic_string_view v, size_type pos = 0)
+      const noexcept {
+    if (v.empty()) {
+      return pos <= size() ? pos : npos; // empty v matches at any valid pos
+    }
+    // Overflow-safe bound check: `pos + v.size()` wraps when pos is near npos.
+    if (v.size() <= size() && pos <= size() - v.size()) {
+      for (size_type cur = pos, end = size() - v.size(); cur <= end; ++cur) {
+        if (v.at_(0) == at_(cur) &&
+            v.substr_(1).equals_(substr_(cur + 1, v.size() - 1))) {
+          return cur; // first index >= pos where v occurs
+        }
+      }
+    }
+    return npos; // v does not occur at or after pos
+  }
+
+  constexpr size_type find(CharT ch, size_type pos = 0) const noexcept {
+    return find_first_if_(pos, charIsEqual_{ch});
+  }
+
+  constexpr size_type find(const_pointer s, size_type pos, size_type count)
+      const {
+    return find(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type find(const_pointer s, size_type pos = 0) const {
+    return find(basic_string_view(s), pos);
+  }
+
+  constexpr size_type rfind(basic_string_view v, size_type pos = npos)
+      const noexcept {
+    // Write it iteratively. This is faster.
+    if (v.empty()) {
+      return pos <= size() ? pos : size();
+    }
+
+    if (v.size() <= size()) {
+      pos = std::min(size() - v.size(), pos);
+      do {
+        if (v.at_(0) == at_(pos) &&
+            v.substr_(1).equals_(substr_(pos + 1, v.size() - 1))) {
+          return pos;
+        }
+      } while (pos-- > 0);
+    }
+    return npos;
+  }
+
+  constexpr size_type rfind(CharT ch, size_type pos = npos) const noexcept {
+    return find_last_if_(pos, charIsEqual_{ch});
+  }
+
+  constexpr size_type rfind(const_pointer s, size_type pos, size_type count)
+      const {
+    return rfind(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type rfind(const_pointer s, size_type pos = npos) const {
+    return rfind(basic_string_view(s), pos);
+  }
+
+  constexpr size_type find_first_of(basic_string_view v, size_type pos = 0)
+      const noexcept {
+    return find_first_if_(pos, stringViewContainsChar_{v});
+  }
+
+  constexpr size_type find_first_of(CharT ch, size_type pos = 0)
+      const noexcept {
+    return find_first_if_(pos, charIsEqual_{ch});
+  }
+
+  constexpr size_type find_first_of(
+      const_pointer s,
+      size_type pos,
+      size_type count) const {
+    return find_first_of(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type find_first_of(const_pointer s, size_type pos = 0) const {
+    return find_first_of(basic_string_view(s), pos);
+  }
+
+  constexpr size_type find_last_of(basic_string_view v, size_type pos = npos)
+      const noexcept {
+    return find_last_if_(pos, stringViewContainsChar_{v});
+  }
+
+  constexpr size_type find_last_of(CharT ch, size_type pos = npos)
+      const noexcept {
+    return find_last_if_(pos, charIsEqual_{ch});
+  }
+
+  constexpr size_type find_last_of(
+      const_pointer s,
+      size_type pos,
+      size_type count) const {
+    return find_last_of(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type find_last_of(const_pointer s, size_type pos = npos)
+      const {
+    return find_last_of(basic_string_view(s), pos);
+  }
+
+  constexpr size_type find_first_not_of(basic_string_view v, size_type pos = 0)
+      const noexcept {
+    return find_first_if_(pos, stringViewDoesNotContainChar_{v});
+  }
+
+  constexpr size_type find_first_not_of(CharT ch, size_type pos = 0)
+      const noexcept {
+    return find_first_if_(pos, charIsNotEqual_{ch});
+  }
+
+  constexpr size_type find_first_not_of(
+      const_pointer s,
+      size_type pos,
+      size_type count) const {
+    return find_first_not_of(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type find_first_not_of(const_pointer s, size_type pos = 0)
+      const {
+    return find_first_not_of(basic_string_view(s), pos);
+  }
+
+  constexpr size_type find_last_not_of(
+      basic_string_view v,
+      size_type pos = npos) const noexcept {
+    return find_last_if_(pos, stringViewDoesNotContainChar_{v});
+  }
+
+  constexpr size_type find_last_not_of(CharT ch, size_type pos = npos)
+      const noexcept {
+    return find_last_if_(pos, charIsNotEqual_{ch});
+  }
+
+  constexpr size_type find_last_not_of(
+      const_pointer s,
+      size_type pos,
+      size_type count) const {
+    return find_last_not_of(basic_string_view(s, count), pos);
+  }
+
+  constexpr size_type find_last_not_of(const_pointer s, size_type pos = npos)
+      const {
+    return find_last_not_of(basic_string_view(s), pos);
+  }
+
+ private:
+  static constexpr size_type strlen_(const_pointer str) noexcept {
+    const_pointer current = str;
+    while (*current != '\0') {
+      ++current;
+    }
+    return current - str;
+  }
+
+  constexpr const_reference at_(size_type pos) const noexcept {
+    return *(begin_ + pos);
+  }
+
+  constexpr basic_string_view substr_(size_type pos = 0, size_type count = npos)
+      const {
+    return basic_string_view{begin_ + pos, std::min(count, size() - pos)};
+  }
+
+  template <class Condition>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+  constexpr size_type find_first_if_(size_type pos, Condition&& condition)
+      const noexcept {
+    if (pos + 1 <= size()) {
+      for (size_type cur = pos; cur < size(); ++cur) {
+        if (condition(at_(cur))) {
+          return cur;
+        }
+      }
+    }
+    return npos;
+  }
+
+  template <class Condition>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+  constexpr size_type find_last_if_(size_type pos, Condition&& condition)
+      const noexcept {
+    // Write it iteratively. This is faster.
+    if (!empty()) {
+      pos = std::min(size() - 1, pos);
+      do {
+        if (condition(at_(pos))) {
+          return pos;
+        }
+      } while (pos-- > 0);
+    }
+    return npos;
+  }
+
+  constexpr bool equals_(basic_string_view rhs) const {
+    // Equality only (cheaper than compare()). memcmp takes a byte count, so
+    // the element count is scaled by sizeof(CharT) for wide character types.
+#if defined(__GNUC__) && !defined(__CUDACC__)
+    return size() == rhs.size() &&
+        0 == __builtin_memcmp(data(), rhs.data(), size() * sizeof(CharT));
+#else
+    if (size() != rhs.size()) {
+      return false;
+    }
+    // Yes, memcmp would be faster than this loop, but memcmp isn't constexpr
+    // and I didn't feel like implementing a constexpr memcmp variant.
+    // TODO At some point this should probably be done, including tricks
+    // like comparing one machine word instead of a byte per iteration.
+    for (typename basic_string_view<CharT>::size_type pos = 0; pos < size();
+         ++pos) {
+      if (at_(pos) != rhs.at_(pos)) {
+        return false;
+      }
+    }
+    return true;
+#endif
+  }
+
+  struct charIsEqual_ final {
+    CharT expected;
+    constexpr bool operator()(CharT actual) const noexcept {
+      return expected == actual;
+    }
+  };
+
+  struct charIsNotEqual_ final {
+    CharT expected;
+    constexpr bool operator()(CharT actual) const noexcept {
+      return expected != actual;
+    }
+  };
+
+  struct stringViewContainsChar_ final {
+    basic_string_view expected;
+    constexpr bool operator()(CharT ch) const noexcept {
+      return npos != expected.find(ch);
+    }
+  };
+
+  struct stringViewDoesNotContainChar_ final {
+    basic_string_view expected;
+    constexpr bool operator()(CharT ch) const noexcept {
+      return npos == expected.find(ch);
+    }
+  };
+
+  const_pointer begin_;
+  size_type size_{};
+};
+
+template <class CharT>
+inline std::basic_ostream<CharT>& operator<<(
+    std::basic_ostream<CharT>& stream,
+    basic_string_view<CharT> sv) {
+  // The rules for operator<< are quite complex, so lets defer to the
+  // STL implementation.
+  using std_string_type = ::std::basic_string_view<CharT>;
+  return stream << std_string_type(sv.data(), sv.size());
+}
+
+template <class CharT>
+constexpr inline void swap(
+    basic_string_view<CharT>& lhs,
+    basic_string_view<CharT>& rhs) noexcept {
+  lhs.swap(rhs);
+}
+using string_view = std::string_view;
+using c10_string_view = basic_string_view<char>;
+
+// NOTE: Stand-in for C++20 std::string_view::starts_with(string_view);
+// switch to the member function once C++20 is the minimum standard.
+constexpr bool starts_with(
+    const std::string_view s,
+    const std::string_view prefix) noexcept {
+  return prefix.size() <= s.size() && s.substr(0, prefix.size()) == prefix;
+}
+
+// NOTE: In C++20, this function should be replaced by string_view.starts_with
+constexpr bool starts_with(
+    const std::string_view s,
+    const char prefix) noexcept {
+  return !s.empty() && prefix == s.front();
+}
+
+// NOTE: Stand-in for C++20 std::string_view::ends_with(string_view);
+// switch to the member function once C++20 is the minimum standard.
+constexpr bool ends_with(
+    const std::string_view s,
+    const std::string_view suffix) noexcept {
+  return suffix.size() <= s.size() &&
+      s.substr(s.size() - suffix.size()) == suffix;
+}
+
+// NOTE: Replace with C++20 string_view.ends_with; true iff s ends in suffix.
+constexpr bool ends_with(const std::string_view s, const char suffix) noexcept {
+  return !s.empty() && suffix == s.back();
+}
+
+} // namespace c10
+
+namespace std {
+template <class CharT>
+struct hash<::c10::basic_string_view<CharT>> {
+  size_t operator()(::c10::basic_string_view<CharT> x) const {
+    // The standard says that std::string_view hashing must do the same as
+    // std::string hashing but leaves the details of std::string hashing
+    // up to the implementer. So, to be conformant, we need to reuse an
+    // existing STL type's hash function; this delegates to the hash of a
+    // std::basic_string_view over the same characters.
+
+    using std_string_type = ::std::basic_string_view<CharT>;
+    return ::std::hash<std_string_type>{}(std_string_type(x.data(), x.size()));
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e3d1a431c19958786bba8245d56bb12854fd5e3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/strong_type.h
@@ -0,0 +1,1669 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * strong_type C++14/17/20 strong typedef library
+ *
+ * Copyright (C) Björn Fahller
+ *
+ *  Use, modification and distribution is subject to the
+ *  Boost Software License, Version 1.0. (See accompanying
+ *  file LICENSE_1_0.txt or copy at
+ *  http://www.boost.org/LICENSE_1_0.txt)
+ *
+ * Project home: https://github.com/rollbear/strong_type
+ */
+
+#ifndef ROLLBEAR_STRONG_TYPE_HPP_INCLUDED
+#define ROLLBEAR_STRONG_TYPE_HPP_INCLUDED
+
+#include <functional>
+#include <istream>
+#include <ostream>
+#include <type_traits>
+#include <utility>
+
+#ifndef STRONG_HAS_STD_FORMAT
+#define STRONG_HAS_STD_FORMAT 0
+#endif
+
+#ifndef STRONG_HAS_FMT_FORMAT
+#define STRONG_HAS_FMT_FORMAT 0
+#endif
+
+#if STRONG_HAS_STD_FORMAT
+#include <format>
+#if !defined(__cpp_lib_format) || __cpp_lib_format < 201907
+#undef STRONG_HAS_STD_FORMAT
+#define STRONG_HAS_STD_FORMAT 0
+#endif
+#endif
+
+#if STRONG_HAS_FMT_FORMAT
+#include <fmt/format.h>
+#endif
+
+namespace strong
+{
+
+namespace impl
+{
+  template <typename T, typename ... V>
+  using WhenConstructible = std::enable_if_t<std::is_constructible_v<T, V...>>;
+}
+
+template <typename M, typename T>
+using modifier = typename M::template modifier<T>;
+
+struct uninitialized_t {};
+static constexpr uninitialized_t uninitialized{};
+
+struct default_constructible
+{
+  template <typename T>
+  class modifier
+  {
+  };
+};
+
+namespace impl {
+  template <typename T>
+  constexpr bool supports_default_construction(const ::strong::default_constructible::modifier<T>* /*unused*/)
+  {
+    return true;
+  }
+}
+
+template <typename T, typename Tag, typename ... M>
+class type : public modifier<M, type<T, Tag, M...>>...
+{
+public:
+  template <typename TT = T, typename = std::enable_if_t<std::is_trivially_constructible<TT>{}>>
+  explicit type(uninitialized_t /*unused*/)
+    noexcept
+  {
+  }
+  template <typename type_ = type,
+            bool = impl::supports_default_construction(static_cast<type_*>(nullptr))>
+  constexpr
+  type()
+    noexcept(noexcept(T{}))
+  : val{}
+  {
+  }
+
+  template <typename U,
+    typename = impl::WhenConstructible<T, std::initializer_list<U>>>
+  constexpr
+  explicit
+  type(
+    std::initializer_list<U> us
+  )
+    noexcept(noexcept(T{us}))
+  : val{us}
+  {
+  }
+  template <typename ... U,
+            typename = std::enable_if_t<std::is_constructible_v<T, U&&...> && (sizeof...(U) > 0)>>
+  constexpr
+  explicit
+  type(
+    U&& ... u)
+  noexcept(std::is_nothrow_constructible_v<T, U...>)
+  : val(std::forward<U>(u)...)
+  {}
+
+  friend constexpr void swap(type& a, type& b) noexcept(
+                                                        std::is_nothrow_move_constructible_v<T> &&
+                                                        std::is_nothrow_move_assignable_v<T>
+                                                      )
+  {
+    using std::swap;
+    swap(a.val, b.val);
+  }
+
+  [[nodiscard]]
+  constexpr T& value_of() & noexcept { return val;}
+  [[nodiscard]]
+  constexpr const T& value_of() const & noexcept { return val;}
+  [[nodiscard]]
+  constexpr T&& value_of() && noexcept { return std::move(val);}
+
+  [[nodiscard]]
+  friend constexpr T& value_of(type& t) noexcept { return t.val;}
+  [[nodiscard]]
+  friend constexpr const T& value_of(const type& t) noexcept { return t.val;}
+  [[nodiscard]]
+  friend constexpr T&& value_of(type&& t) noexcept { return std::move(t).val;}
+private:
+  T val;
+};
+
+namespace impl {
+  template <typename T, typename Tag, typename ... Ms>
+  constexpr bool is_strong_type_func(const strong::type<T, Tag, Ms...>* /*unused*/) { return true;}
+  constexpr bool is_strong_type_func(...) { return false;}
+  template <typename T, typename Tag, typename ... Ms>
+  constexpr T underlying_type(strong::type<T, Tag, Ms...>*);
+
+}
+
+template <typename T>
+struct is_strong_type : std::integral_constant<bool, impl::is_strong_type_func(static_cast<T *>(nullptr))> {};
+
+namespace impl {
+  template <typename T>
+  using WhenStrongType = std::enable_if_t<is_strong_type<std::decay_t<T>>::value>;
+  template <typename T>
+  using WhenNotStrongType = std::enable_if_t<!is_strong_type<std::decay_t<T>>::value>;
+}
+
+template <typename T, bool = is_strong_type<T>::value>
+struct underlying_type
+{
+  using type = decltype(impl::underlying_type(static_cast<T*>(nullptr)));
+};
+
+template <typename T>
+struct underlying_type<T, false>
+{
+  using type = T;
+};
+
+template <typename T>
+using underlying_type_t = typename underlying_type<T>::type;
+
+
+namespace impl {
+  template<
+    typename T,
+    typename = impl::WhenNotStrongType<T>>
+  constexpr
+  T &&
+  access(T &&t)
+  noexcept {
+    return std::forward<T>(t);
+  }
+  template <
+    typename T,
+    typename = impl::WhenStrongType<T>>
+  [[nodiscard]]
+  constexpr
+  auto
+  access(T&& t)
+  noexcept
+  -> decltype(value_of(std::forward<T>(t)))
+  {
+    return value_of(std::forward<T>(t));
+  }
+
+}
+struct equality
+{
+  template <typename T>
+  class modifier;
+};
+
+
+template <typename T, typename Tag, typename ... M>
+class equality::modifier<::strong::type<T, Tag, M...>>
+{
+  using type = ::strong::type<T, Tag, M...>;
+public:
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator==(
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() == std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() == std::declval<const T&>())
+  {
+    return value_of(lh) == value_of(rh);
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator!=(
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() != std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() != std::declval<const T&>())
+  {
+    return value_of(lh) != value_of(rh);
+  }
+};
+
+namespace impl
+{
+  template <typename T, typename Other>
+  class typed_equality
+  {
+  private:
+    using TT = underlying_type_t<T>;
+    using OT = underlying_type_t<Other>;
+  public:
+    // Friend operators comparing strong type T with Other in either operand
+    // order; impl::access unwraps Other when it is itself a strong type.
+    [[nodiscard]] friend constexpr
+    auto operator==(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() == std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() == std::declval<const OT&>())
+    {
+      return value_of(lh) == impl::access(rh);
+    }
+    // Other == T: same comparison with Other on the left-hand side; return
+    // type and noexcept mirror the underlying OT == TT expression.
+    [[nodiscard]] friend constexpr
+    auto operator==(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() == std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() == std::declval<const TT&>())
+    {
+      return impl::access(lh) == value_of(rh) ;
+    }
+    // T != Other. rh is bound by const reference, like every sibling
+    // overload, so Other is never copied and need not be copyable.
+    [[nodiscard]] friend constexpr
+    auto operator!=(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() != std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() != std::declval<const OT&>())
+    {
+      return value_of(lh) != impl::access(rh);
+    }
+    // Other != T: inequality with Other on the left-hand side; return type
+    // and noexcept mirror the underlying OT != TT expression.
+    [[nodiscard]] friend constexpr
+    auto operator!=(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() != std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() != std::declval<const TT&>())
+    {
+      return impl::access(lh) != value_of(rh) ;
+    }
+  };
+}
+template <typename ... Ts>
+struct equality_with
+{
+  template <typename T>
+  class modifier : public impl::typed_equality<T, Ts>...
+  {
+  };
+};
+
+namespace impl
+{
+  template <typename T, typename Other>
+  class typed_ordering
+  {
+  private:
+    using TT = underlying_type_t<T>;
+    using OT = underlying_type_t<Other>;
+  public:
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator<(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() < std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() < std::declval<const OT&>())
+    {
+      return value_of(lh) < impl::access(rh);
+    }
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator<(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() < std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() < std::declval<const TT&>())
+    {
+      return impl::access(lh) < value_of(rh) ;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator<=(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() <= std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() <= std::declval<const OT&>())
+    {
+      return value_of(lh) <= impl::access(rh);
+    }
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator<=(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() <= std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() <= std::declval<const TT&>())
+    {
+      return impl::access(lh) <= value_of(rh) ;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator>(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() > std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() > std::declval<const OT&>())
+    {
+      return value_of(lh) > impl::access(rh);
+    }
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator>(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() > std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() > std::declval<const TT&>())
+    {
+      return impl::access(lh) > value_of(rh) ;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator>=(const T& lh, const Other& rh)
+    noexcept(noexcept(std::declval<const TT&>() >= std::declval<const OT&>()))
+    -> decltype(std::declval<const TT&>() >= std::declval<const OT&>())
+    {
+      return value_of(lh) >= impl::access(rh);
+    }
+    [[nodiscard]]
+    friend
+    constexpr
+    auto operator>=(const Other& lh, const T& rh)
+    noexcept(noexcept(std::declval<const OT&>() >= std::declval<const TT&>()))
+    -> decltype(std::declval<const OT&>() >= std::declval<const TT&>())
+    {
+      return impl::access(lh) >= value_of(rh) ;
+    }
+  };
+}
+
+template <typename ... Ts>
+struct ordered_with
+{
+  template <typename T>
+  class modifier : public impl::typed_ordering<T, Ts>...
+  {
+  };
+};
+
+namespace impl
+{
+  template <typename T>
+  struct require_copy_constructible
+  {
+    static constexpr bool value = std::is_copy_constructible<underlying_type_t<T>>::value;
+    static_assert(value, "underlying type must be copy constructible");
+  };
+  template <typename T>
+  struct require_move_constructible
+  {
+    static constexpr bool value = std::is_move_constructible<underlying_type_t<T>>::value;
+    static_assert(value, "underlying type must be move constructible");
+  };
+  template <typename T>
+  struct require_copy_assignable
+  {
+    static constexpr bool value = std::is_copy_assignable<underlying_type_t<T>>::value;
+    static_assert(value, "underlying type must be copy assignable");
+  };
+  template <typename T>
+  struct require_move_assignable
+  {
+    static constexpr bool value = std::is_move_assignable<underlying_type_t<T>>::value;
+    static_assert(value, "underlying type must be move assignable");
+  };
+
+  template <bool> struct valid_type;
+  template <>
+  struct valid_type<true> {};
+
+  template <typename T>
+  struct require_semiregular
+    : valid_type<require_copy_constructible<T>::value &&
+                 require_move_constructible<T>::value &&
+                 require_copy_assignable<T>::value &&
+                 require_move_assignable<T>::value>
+  {
+  };
+
+}
+struct semiregular
+{
+  template <typename>
+  class modifier;
+};
+
+template <typename T, typename Tag, typename ... M>
+class semiregular::modifier<::strong::type<T, Tag, M...>>
+  : public default_constructible::modifier<T>
+  , private impl::require_semiregular<T>
+{
+};
+
+struct regular
+{
+  template <typename T>
+  class modifier
+    : public semiregular::modifier<T>
+    , public equality::modifier<T>
+  {
+  };
+};
+
+struct unique
+{
+  template <typename T>
+  class modifier // move-only modifier: copy operations are deleted, move operations defaulted
+    : private impl::valid_type<
+      impl::require_move_constructible<T>::value &&
+      impl::require_move_assignable<T>::value
+    >
+  {
+  public:
+    constexpr modifier() = default;
+    modifier(const modifier&) = delete; // no copy construction
+    constexpr modifier(modifier&&) = default;
+    modifier& operator=(const modifier&) = delete; // no copy assignment
+    constexpr modifier& operator=(modifier&&) = default;
+  };
+};
+struct ordered
+{
+  template <typename T>
+  class modifier; // primary template is declared only; see the strong::type specialization below
+};
+
+
+template <typename T, typename Tag, typename ... M>
+class ordered::modifier<::strong::type<T, Tag, M...>>
+{
+  using type = ::strong::type<T, Tag, M...>;
+public:
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator<( // each relational operator forwards to the same operator on the underlying values
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() < std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() < std::declval<const T&>()) // return type (and overload viability) follows T's operator<
+  {
+    return value_of(lh) < value_of(rh);
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator<=(
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() <= std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() <= std::declval<const T&>())
+  {
+    return value_of(lh) <= value_of(rh);
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator>(
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() > std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() > std::declval<const T&>())
+  {
+    return value_of(lh) > value_of(rh);
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+
+  auto
+  operator>=(
+    const type& lh,
+    const type& rh)
+  noexcept(noexcept(std::declval<const T&>() >= std::declval<const T&>()))
+  -> decltype(std::declval<const T&>() >= std::declval<const T&>())
+  {
+    return value_of(lh) >= value_of(rh);
+  }
+};
+
+struct ostreamable
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    friend
+    std::ostream&
+    operator<<( // stream insertion for the strong type T
+      std::ostream &os,
+      const T &t)
+    {
+      return os << value_of(t); // writes the underlying value and returns the stream
+    }
+  };
+};
+
+struct istreamable
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    friend
+    std::istream&
+    operator>>( // stream extraction for the strong type T
+      std::istream &is,
+      T &t)
+    {
+      return is >> value_of(t); // reads directly into the underlying value and returns the stream
+    }
+  };
+};
+
+struct iostreamable
+{
+  template <typename T>
+  class modifier // combines the ostreamable and istreamable modifiers by inheritance
+    : public ostreamable::modifier<T>
+    , public istreamable::modifier<T>
+  {
+  };
+};
+
+struct incrementable
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    friend
+    constexpr
+    T&
+    operator++(T& t) // pre-increment: increments the underlying value, returns t itself
+    noexcept(noexcept(++std::declval<T&>().value_of()))
+    {
+      ++value_of(t);
+      return t;
+    }
+
+    friend
+    constexpr
+    T
+    operator++(T& t, int) // post-increment: returns a copy taken before the increment
+    {
+      auto copy = t;
+      ++t;
+      return copy;
+    }
+  };
+};
+
+struct decrementable
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    friend
+    constexpr
+    T&
+    operator--(T& t) // pre-decrement: decrements the underlying value, returns t itself
+    noexcept(noexcept(--std::declval<T&>().value_of()))
+    {
+      --value_of(t);
+      return t;
+    }
+
+    friend
+    constexpr
+    T
+    operator--(T& t, int) // post-decrement: returns a copy taken before the decrement
+    {
+      auto copy = t;
+      --t;
+      return copy;
+    }
+  };
+};
+
+struct bicrementable
+{
+  template <typename T>
+  class modifier // combines the incrementable and decrementable modifiers by inheritance
+    : public incrementable::modifier<T>
+    , public decrementable::modifier<T>
+  {
+  };
+};
+
+struct boolean
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    explicit constexpr operator bool() const // explicit conversion only
+    noexcept(noexcept(static_cast<bool>(value_of(std::declval<const T&>()))))
+    {
+      const auto& self = static_cast<const T&>(*this); // downcast from the modifier base to the strong type T
+      return static_cast<bool>(value_of(self));
+    }
+  };
+};
+
+struct hashable
+{
+  template <typename T>
+  class modifier{}; // empty marker base; the std::hash specialization in this file tests for it with is_base_of
+};
+
+struct difference
+{
+  template <typename T>
+  class modifier; // primary template is declared only; see the strong::type specialization below
+};
+
+template <typename T, typename Tag, typename ... M>
+class difference::modifier<::strong::type<T, Tag, M...>>
+: public ordered::modifier<::strong::type<T, Tag, M...>>
+, public equality::modifier<::strong::type<T, Tag, M...>>
+{
+  using type = ::strong::type<T, Tag, M...>;
+public:
+  friend
+  constexpr
+  type& operator+=(type& lh, const type& rh) // strong += strong
+  noexcept(noexcept(value_of(lh) += value_of(rh)))
+  {
+    value_of(lh) += value_of(rh);
+    return lh;
+  }
+
+  friend
+  constexpr
+  type& operator-=(type& lh, const type& rh) // strong -= strong
+    noexcept(noexcept(value_of(lh) -= value_of(rh)))
+  {
+    value_of(lh) -= value_of(rh);
+    return lh;
+  }
+
+  friend
+  constexpr
+  type& operator*=(type& lh, const T& rh) // right operand is the raw underlying type, not a strong type
+  noexcept(noexcept(value_of(lh) *= rh))
+  {
+    value_of(lh) *= rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  type& operator/=(type& lh, const T& rh) // right operand is the raw underlying type
+    noexcept(noexcept(value_of(lh) /= rh))
+  {
+    value_of(lh) /= rh;
+    return lh;
+  }
+
+  template <typename TT = T, typename = decltype(std::declval<TT&>()%= std::declval<const TT&>())> // only viable when T supports %=
+  friend
+  constexpr
+  type& operator%=(type& lh, const T& rh)
+    noexcept(noexcept(value_of(lh) %= rh))
+  {
+    value_of(lh)%= rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  type operator+(type lh, const type& rh) // lh is taken by value and reused as the result
+  {
+    lh += rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  type operator-(type lh, const type& rh)
+  {
+    lh -= rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  type operator*(type lh, const T& rh)
+  {
+    lh *= rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  type operator*(const T& lh, type rh) // raw value on the left; implemented as rh *= lh
+  {
+    rh *= lh;
+    return rh;
+  }
+
+  friend
+  constexpr
+  type operator/(type lh, const T& rh)
+  {
+    lh /= rh;
+    return lh;
+  }
+
+  friend
+  constexpr
+  T operator/(const type& lh, const type& rh) // strong / strong yields the raw underlying type T
+  {
+    return value_of(lh) / value_of(rh);
+  }
+
+  template <typename TT = T, typename = decltype(std::declval<TT&>() %= std::declval<const TT&>())> // only viable when T supports %=
+  friend
+  constexpr
+  type operator%(type lh, const T& rh)
+    noexcept(noexcept(lh%= rh))
+  {
+      lh %= rh;
+      return lh;
+  }
+
+  template <typename TT = T, typename = decltype(std::declval<TT>() % std::declval<TT>())> // only viable when T supports %
+  friend
+  constexpr
+  T operator%(type lh, type rh) // strong % strong yields the raw underlying type T
+    noexcept(noexcept(value_of(lh) % value_of(rh)))
+  {
+      return value_of(lh) % value_of(rh);
+  }
+};
+
+template <typename D = void>
+struct affine_point
+{
+  template <typename T>
+  class modifier; // primary template is declared only; see the strong::type specialization below
+};
+
+namespace impl
+{
+  template <typename ...>
+  using void_t = void;
+
+  template <typename T, typename = void>
+  struct subtractable : std::false_type {}; // fallback when `const T& - const T&` is not well-formed
+
+  template <typename T>
+  struct subtractable<T, void_t<decltype(std::declval<const T&>() - std::declval<const T&>())>>
+  : std::true_type {};
+}
+
+
+template <typename D>
+template <typename T, typename Tag, typename ... M>
+class affine_point<D>::modifier<::strong::type<T, Tag, M...>>
+{
+  using type = ::strong::type<T, Tag, M...>;
+  static_assert(impl::subtractable<T>::value, "it must be possible to subtract instances of your underlying type");
+  using base_diff_type = decltype(std::declval<const T&>() - std::declval<const T&>()); // whatever T - T yields
+public:
+  using difference = std::conditional_t<std::is_same<D, void>{}, strong::type<base_diff_type, Tag, strong::difference>, D>; // D when given, otherwise a strong difference type generated from base_diff_type
+  static_assert(std::is_constructible_v<difference, base_diff_type>, "");
+  [[nodiscard]]
+  friend
+  constexpr
+  difference
+  operator-( // point - point yields a difference
+    const type& lh,
+    const type& rh)
+  {
+    return difference(value_of(lh) - value_of(rh));
+  }
+
+  friend
+  constexpr
+  type&
+  operator+=( // point += difference
+    type& lh,
+    const difference& d)
+  noexcept(noexcept(value_of(lh) += impl::access(d)))
+  {
+    value_of(lh) += impl::access(d);
+    return lh;
+  }
+
+  friend
+  constexpr
+  type&
+  operator-=( // point -= difference
+    type& lh,
+    const difference& d)
+  noexcept(noexcept(value_of(lh) -= impl::access(d)))
+  {
+    value_of(lh) -= impl::access(d);
+    return lh;
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  type
+  operator+( // point + difference; lh is taken by value and reused as the result
+    type lh,
+    const difference& d)
+  {
+    return lh += d;
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  type
+  operator+( // difference + point
+    const difference& d,
+    type rh)
+  {
+    return rh+= d;
+  }
+
+  [[nodiscard]]
+  friend
+  constexpr
+  type
+  operator-( // point - difference
+    type lh,
+    const difference& d)
+  {
+    return lh -= d;
+  }
+};
+
+
+struct pointer
+{
+  template <typename T>
+  class modifier; // primary template is declared only; see the strong::type specialization below
+};
+
+template <typename T, typename Tag, typename ... M>
+class pointer::modifier<::strong::type<T, Tag, M...>>
+{
+  using type = strong::type<T, Tag, M...>;
+public:
+  template <typename TT = T>
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator==( // strong == nullptr; only viable when the underlying comparison is well-formed
+    const type& t,
+    std::nullptr_t)
+  noexcept(noexcept(std::declval<const TT&>() == nullptr))
+  -> decltype(std::declval<const TT&>() == nullptr)
+  {
+    return value_of(t) == nullptr;
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator==( // nullptr == strong
+    std::nullptr_t,
+    const type& t)
+  noexcept(noexcept(nullptr == std::declval<const TT&>()))
+  -> decltype(nullptr == std::declval<const TT&>())
+  {
+    return nullptr == value_of(t); // same operand order as the noexcept/decltype check above
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator!=( // strong != nullptr
+    const type& t,
+    std::nullptr_t)
+  noexcept(noexcept(std::declval<const TT&>() != nullptr))
+  -> decltype(std::declval<const TT&>() != nullptr)
+  {
+    return value_of(t) != nullptr;
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  friend
+  constexpr
+  auto
+  operator!=( // nullptr != strong
+    std::nullptr_t,
+    const type& t)
+  noexcept(noexcept(nullptr != std::declval<const TT&>()))
+  -> decltype(nullptr != std::declval<const TT&>())
+  {
+    return nullptr != value_of(t); // same operand order as the noexcept/decltype check above
+  }
+
+  [[nodiscard]]
+  constexpr
+  decltype(*std::declval<const T&>())
+  operator*() // dereferences the underlying value
+  const
+  {
+    auto& self = static_cast<const type&>(*this); // downcast from the modifier base to the strong type
+    return *value_of(self);
+  }
+
+  [[nodiscard]]
+  constexpr
+  decltype(&(*std::declval<const T&>())) operator->() const { return &operator*();} // address of whatever operator* returns
+};
+
+struct arithmetic
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator-( // unary minus: builds a new T from the negated underlying value
+      const T &lh)
+    {
+      return T{-value_of(lh)};
+    }
+
+    friend
+    constexpr
+    T&
+    operator+=( // compound operators act on the underlying values and return lh
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) += value_of(rh)))
+    {
+      value_of(lh) += value_of(rh);
+      return lh;
+    }
+
+    friend
+    constexpr
+    T&
+    operator-=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) -= value_of(rh)))
+    {
+      value_of(lh) -= value_of(rh);
+      return lh;
+    }
+
+    friend
+    constexpr
+    T&
+    operator*=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) *= value_of(rh)))
+    {
+      value_of(lh) *= value_of(rh);
+      return lh;
+    }
+
+    friend
+    constexpr
+    T&
+    operator/=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) /= value_of(rh)))
+    {
+      value_of(lh) /= value_of(rh);
+      return lh;
+    }
+
+    template <typename TT = T, typename = decltype(value_of(std::declval<TT>()) % value_of(std::declval<TT>()))> // viability is tested with binary %, although the body uses %=
+    friend
+    constexpr
+    T&
+    operator%=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) %= value_of(rh)))
+    {
+      value_of(lh) %= value_of(rh);
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator+( // binary operators take lh by value and apply the matching compound operator to it
+      T lh,
+      const T &rh)
+    {
+      lh += rh;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator-(
+      T lh,
+      const T &rh)
+    {
+      lh -= rh;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator*(
+      T lh,
+      const T &rh)
+    {
+      lh *= rh;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator/(
+      T lh,
+      const T &rh)
+    {
+      lh /= rh;
+      return lh;
+    }
+
+    template <typename TT = T, typename = decltype(value_of(std::declval<TT>()) % value_of(std::declval<TT>()))> // only viable when the underlying values support %
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator%(
+      T lh,
+      const T &rh)
+    {
+      lh %= rh;
+      return lh;
+    }
+
+  };
+};
+
+
+struct bitarithmetic
+{
+  template <typename T>
+  class modifier
+  {
+  public:
+    friend
+    constexpr
+    T&
+    operator&=( // compound bit operators act on the underlying values and return lh
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) &= value_of(rh)))
+    {
+      value_of(lh) &= value_of(rh);
+      return lh;
+    }
+
+    friend
+    constexpr
+    T&
+    operator|=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) |= value_of(rh)))
+    {
+      value_of(lh) |= value_of(rh);
+      return lh;
+    }
+
+    friend
+    constexpr
+    T&
+    operator^=(
+      T &lh,
+      const T &rh)
+    noexcept(noexcept(value_of(lh) ^= value_of(rh)))
+    {
+      value_of(lh) ^= value_of(rh);
+      return lh;
+    }
+
+    template <typename C>
+    friend
+    constexpr
+    T&
+    operator<<=( // shift count may be any type C; it is passed to the underlying <<= unchanged
+      T &lh,
+      C c)
+    noexcept(noexcept(value_of(lh) <<= c))
+    {
+      value_of(lh) <<= c;
+      return lh;
+    }
+
+    template <typename C>
+    friend
+    constexpr
+    T&
+    operator>>=( // shift count may be any type C; it is passed to the underlying >>= unchanged
+      T &lh,
+      C c)
+    noexcept(noexcept(value_of(lh) >>= c))
+    {
+      value_of(lh) >>= c;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator~( // bitwise complement
+      const T &lh)
+    {
+      auto v = value_of(lh); // copy of the underlying value
+      v = ~v; // assigning back to v converts the result of ~ to the underlying type
+      return T(v);
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator&( // binary operators take lh by value and apply the matching compound operator to it
+      T lh,
+      const T &rh)
+    {
+      lh &= rh;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator|(
+      T lh,
+      const T &rh)
+    {
+      lh |= rh;
+      return lh;
+    }
+
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator^(
+      T lh,
+      const T &rh)
+    {
+      lh ^= rh;
+      return lh;
+    }
+
+    template <typename C>
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator<<(
+      T lh,
+      C c)
+    {
+      lh <<= c;
+      return lh;
+    }
+
+    template <typename C>
+    [[nodiscard]]
+    friend
+    constexpr
+    T
+    operator>>(
+      T lh,
+      C c)
+    {
+      lh >>= c;
+      return lh;
+    }
+  };
+};
+template <typename I = void>
+struct indexed
+{
+  template <typename T>
+  class modifier; // primary template is declared only; specialized out of class further down
+};
+
+template <>
+struct indexed<void> { // I == void: the index type is a template parameter of each accessor
+  template<typename>
+  class modifier;
+
+  template <typename T, typename Tag, typename ... Ms>
+  class modifier<type<T, Tag, Ms...>> {
+    using ref = T&;
+    using cref = const T&;
+    using rref = T&&;
+    using type = strong::type<T, Tag, Ms...>;
+  public:
+    template<typename I>
+    [[nodiscard]]
+    auto
+    operator[]( // const lvalue overload
+      const I &i)
+    const &
+    noexcept(noexcept(std::declval<cref>()[impl::access(i)]))
+    -> decltype(std::declval<cref>()[impl::access(i)]) {
+      auto& self = static_cast<const type&>(*this); // downcast from the modifier base to the strong type
+      return value_of(self)[impl::access(i)]; // the index goes through impl::access before use
+    }
+
+    template<typename I>
+    [[nodiscard]]
+    auto
+    operator[]( // mutable lvalue overload
+      const I &i)
+    &
+    noexcept(noexcept(std::declval<ref>()[impl::access(i)]))
+    -> decltype(std::declval<ref>()[impl::access(i)]) {
+      auto& self = static_cast<type&>(*this);
+      return value_of(self)[impl::access(i)];
+    }
+
+    template<typename I>
+    [[nodiscard]]
+    auto
+    operator[]( // rvalue overload
+      const I &i)
+    &&
+    noexcept(noexcept(std::declval<rref>()[impl::access(i)]))
+    -> decltype(std::declval<rref>()[impl::access(i)]) {
+      auto& self = static_cast<type&>(*this);
+      return value_of(std::move(self))[impl::access(i)]; // value_of receives an rvalue here
+    }
+
+    template<typename I, typename C = cref>
+    [[nodiscard]]
+    auto
+    at( // const lvalue overload; only viable when the underlying type has a matching at()
+      const I &i)
+    const &
+    -> decltype(std::declval<C>().at(impl::access(i))) {
+      auto& self = static_cast<const type&>(*this);
+      return value_of(self).at(impl::access(i));
+    }
+
+    template<typename I, typename R = ref>
+    [[nodiscard]]
+    auto
+    at( // mutable lvalue overload
+      const I &i)
+    &
+    -> decltype(std::declval<R>().at(impl::access(i))) {
+      auto& self = static_cast<type&>(*this);
+      return value_of(self).at(impl::access(i));
+    }
+
+    template<typename I, typename R = rref>
+    [[nodiscard]]
+    auto
+    at( // rvalue overload
+      const I &i)
+    &&
+    -> decltype(std::declval<R>().at(impl::access(i))) {
+      auto& self = static_cast<type&>(*this);
+      return value_of(std::move(self)).at(impl::access(i));
+    }
+  };
+};
+
+template <typename I>
+template <typename T, typename Tag, typename ... M>
+class indexed<I>::modifier<type<T, Tag, M...>> // I != void: every accessor takes exactly `const I&`
+{
+  using type = ::strong::type<T, Tag, M...>;
+public:
+  [[nodiscard]]
+  auto
+  operator[]( // const lvalue overload
+    const I& i)
+  const &
+  noexcept(noexcept(std::declval<const T&>()[impl::access(i)]))
+  -> decltype(std::declval<const T&>()[impl::access(i)])
+  {
+    auto& self = static_cast<const type&>(*this); // downcast from the modifier base to the strong type
+    return value_of(self)[impl::access(i)];
+  }
+
+  [[nodiscard]]
+  auto
+  operator[]( // mutable lvalue overload
+    const I& i)
+  &
+  noexcept(noexcept(std::declval<T&>()[impl::access(i)]))
+  -> decltype(std::declval<T&>()[impl::access(i)])
+  {
+    auto& self = static_cast<type&>(*this);
+    return value_of(self)[impl::access(i)];
+  }
+
+  [[nodiscard]]
+  auto
+  operator[]( // rvalue overload
+    const I& i)
+  &&
+  noexcept(noexcept(std::declval<T&&>()[impl::access(i)]))
+  -> decltype(std::declval<T&&>()[impl::access(i)])
+  {
+    auto& self = static_cast<type&>(*this);
+    return value_of(std::move(self))[impl::access(i)]; // value_of receives an rvalue here
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  auto
+  at( // const lvalue overload; only viable when the underlying type has a matching at()
+    const I& i)
+  const &
+  -> decltype(std::declval<const TT&>().at(impl::access(i)))
+  {
+    auto& self = static_cast<const type&>(*this);
+    return value_of(self).at(impl::access(i));
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  auto
+  at( // mutable lvalue overload
+    const I& i)
+  &
+  -> decltype(std::declval<TT&>().at(impl::access(i)))
+  {
+    auto& self = static_cast<type&>(*this);
+    return value_of(self).at(impl::access(i));
+  }
+
+  template <typename TT = T>
+  [[nodiscard]]
+  auto
+  at( // rvalue overload
+    const I& i)
+  &&
+  -> decltype(std::declval<TT&&>().at(impl::access(i)))
+  {
+    auto& self = static_cast<type&>(*this);
+    return value_of(std::move(self)).at(impl::access(i));
+  }
+};
+
+class iterator
+{
+public:
+  template <typename I, typename category = typename std::iterator_traits<underlying_type_t<I>>::iterator_category>
+  class modifier // primary template: dereference, equality and increment for any iterator category
+    : public pointer::modifier<I>
+    , public equality::modifier<I>
+    , public incrementable::modifier<I>
+  {
+  public:
+    using difference_type = typename std::iterator_traits<underlying_type_t<I>>::difference_type; // member types mirror the underlying iterator's traits
+    using value_type = typename std::iterator_traits<underlying_type_t<I>>::value_type;
+    using pointer = typename std::iterator_traits<underlying_type_t<I>>::pointer; // was ::value_type, which reported the element type instead of the pointer type
+    using reference = typename std::iterator_traits<underlying_type_t<I>>::reference;
+    using iterator_category = typename std::iterator_traits<underlying_type_t<I>>::iterator_category;
+  };
+
+  template <typename I>
+  class modifier<I, std::bidirectional_iterator_tag> // bidirectional: forward capabilities plus decrement
+    : public modifier<I, std::forward_iterator_tag>
+      , public decrementable::modifier<I>
+  {
+  };
+  template <typename I>
+  class modifier<I, std::random_access_iterator_tag> // random access: adds difference arithmetic, indexing and ordering
+    : public modifier<I, std::bidirectional_iterator_tag>
+      , public affine_point<typename std::iterator_traits<underlying_type_t<I>>::difference_type>::template modifier<I>
+      , public indexed<>::modifier<I>
+      , public ordered::modifier<I>
+  {
+  };
+};
+
+class range
+{
+public:
+  template <typename R>
+  class modifier; // primary template is declared only; see the strong::type specialization below
+};
+
+template <typename T, typename Tag, typename ... M>
+class range::modifier<type<T, Tag, M...>>
+{
+  using type = ::strong::type<T, Tag, M...>;
+  using r_iterator = decltype(std::declval<T&>().begin()); // underlying mutable iterator type
+  using r_const_iterator = decltype(std::declval<const T&>().begin()); // underlying const iterator type
+public:
+  using iterator = ::strong::type<r_iterator, Tag, strong::iterator>; // strong iterator sharing this range's Tag
+  using const_iterator = ::strong::type<r_const_iterator, Tag, strong::iterator>;
+
+  iterator
+  begin() // wraps the underlying begin() in the strong iterator type
+  noexcept(noexcept(std::declval<T&>().begin()))
+  {
+    auto& self = static_cast<type&>(*this); // downcast from the modifier base to the strong type
+    return iterator{value_of(self).begin()};
+  }
+
+  iterator
+  end() // wraps the underlying end()
+  noexcept(noexcept(std::declval<T&>().end()))
+  {
+    auto& self = static_cast<type&>(*this);
+    return iterator{value_of(self).end()};
+  }
+
+  const_iterator
+  cbegin() // calls begin() on the const underlying value, not cbegin()
+    const
+  noexcept(noexcept(std::declval<const T&>().begin()))
+  {
+    auto& self = static_cast<const type&>(*this);
+    return const_iterator{value_of(self).begin()};
+  }
+
+  const_iterator
+  cend() // calls end() on the const underlying value, not cend()
+    const
+  noexcept(noexcept(std::declval<const T&>().end()))
+  {
+    auto& self = static_cast<const type&>(*this);
+    return const_iterator{value_of(self).end()};
+  }
+
+  const_iterator
+  begin() // const overload; same body as cbegin()
+  const
+  noexcept(noexcept(std::declval<const T&>().begin()))
+  {
+    auto& self = static_cast<const type&>(*this);
+    return const_iterator{value_of(self).begin()};
+  }
+
+  const_iterator
+  end() // const overload; same body as cend()
+  const
+  noexcept(noexcept(std::declval<const T&>().end()))
+  {
+    auto& self = static_cast<const type&>(*this);
+    return const_iterator{value_of(self).end()};
+  }
+};
+
+namespace impl {
+
+  template<typename T, typename D>
+  struct converter // base supplying an explicit conversion from the strong type T to D
+  {
+    constexpr explicit operator D() const
+    noexcept(noexcept(static_cast<D>(std::declval<const underlying_type_t<T>&>())))
+    {
+      auto& self = static_cast<const T&>(*this); // downcast from this base to the strong type T
+      return static_cast<D>(value_of(self));
+    }
+  };
+  template<typename T, typename D>
+  struct implicit_converter // same as converter, but the conversion operator is not explicit
+  {
+    constexpr operator D() const
+    noexcept(noexcept(static_cast<D>(std::declval<const underlying_type_t<T>&>())))
+    {
+      auto& self = static_cast<const T&>(*this);
+      return static_cast<D>(value_of(self));
+    }
+  };
+}
+template <typename ... Ts>
+struct convertible_to
+{
+  template <typename T>
+  struct modifier : impl::converter<T, Ts>... // one explicit conversion operator per type in Ts
+  {
+  };
+};
+
+template <typename ... Ts>
+struct implicitly_convertible_to
+{
+  template <typename T>
+  struct modifier : impl::implicit_converter<T, Ts>... // one implicit conversion operator per type in Ts
+  {
+  };
+
+};
+
+struct formattable
+{
+    template <typename T>
+    class modifier{}; // empty marker base; the formatter specializations in this file test for it with is_base_of
+};
+
+}
+
+namespace std {
+template <typename T, typename Tag, typename ... M>
+struct hash<::strong::type<T, Tag, M...>>
+  : std::conditional_t< // base is hash<T> when the strong type derives from hashable::modifier, otherwise std::false_type
+    std::is_base_of<
+      ::strong::hashable::modifier<
+        ::strong::type<T, Tag, M...>
+      >,
+      ::strong::type<T, Tag, M...>
+    >::value,
+    hash<T>,
+    std::false_type>
+{
+  using type = ::strong::type<T, Tag, M...>;
+  decltype(auto)
+  operator()(
+    const ::strong::hashable::modifier<type>& t) // parameter is the modifier base, not the strong type itself
+  const
+  noexcept(noexcept(std::declval<hash<T>>()(value_of(std::declval<const type&>()))))
+  {
+    auto& tt = static_cast<const type&>(t); // downcast from the modifier base to the strong type
+    return hash<T>::operator()(value_of(tt)); // hashes the underlying value with hash<T>
+  }
+};
+
+#if STRONG_HAS_STD_FORMAT
+template<typename T, typename Tag, typename... M, typename Char>
+struct formatter<::strong::type<T, Tag, M...>, Char, // NOTE(review): std::formatter is specified with two template parameters; confirm this three-argument form compiles on the targeted standard library
+                 std::enable_if_t<
+                     std::is_base_of<
+                         ::strong::formattable::modifier<
+                             ::strong::type<T, Tag, M...>
+                             >,
+                         ::strong::type<T, Tag, M...>
+                         >::value
+                     >>
+    : formatter<T, Char> // was formatter<T>; the base must match the formatter<T, Char>::format call below
+{
+  using type = ::strong::type<T, Tag, M...>;
+  template<typename FormatContext>
+  constexpr
+  decltype(auto)
+  format(const ::strong::formattable::modifier<type>& t, FormatContext& fc)
+      noexcept(noexcept(std::declval<formatter<T, Char>>().format(value_of(std::declval<const type&>()), fc)))
+  {
+    const auto& tt = static_cast<const type&>(t); // downcast from the modifier base to the strong type
+    return formatter<T, Char>::format(value_of(tt), fc); // formats the underlying value with the base formatter
+  }
+};
+#endif
+
+}
+
+#if STRONG_HAS_FMT_FORMAT
+namespace fmt
+{
+template<typename T, typename Tag, typename... M, typename Char>
+struct formatter<::strong::type<T, Tag, M...>, Char, // enabled only for strong types that derive from formattable::modifier
+                 std::enable_if_t<
+                   std::is_base_of<
+                     ::strong::formattable::modifier<
+                       ::strong::type<T, Tag, M...>
+                     >,
+                     ::strong::type<T, Tag, M...>
+                   >::value
+                 >>
+  : formatter<T, Char> // was formatter<T>; the base must match the formatter<T, Char>::format call below
+{
+  using type = ::strong::type<T, Tag, M...>;
+  template<typename FormatContext>
+  constexpr
+  decltype(auto)
+  format(const ::strong::formattable::modifier<type>& t, FormatContext& fc)
+      noexcept(noexcept(std::declval<formatter<T, Char>>().format(value_of(std::declval<const type&>()), fc)))
+  {
+    const auto& tt = static_cast<const type&>(t); // downcast from the modifier base to the strong type
+    return formatter<T, Char>::format(value_of(tt), fc); // formats the underlying value with the base formatter
+  }
+};
+}
+#endif
+#endif //ROLLBEAR_STRONG_TYPE_HPP_INCLUDED
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cda361bc8f17f673fb6735b76261b82d821f26d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/util/thread_name.h
@@ -0,0 +1,18 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <string>
+
+#include <c10/macros/Export.h>
+
+namespace c10 {
+
+C10_API void setThreadName(std::string name); // declaration only; takes the name by value
+
+C10_API std::string getThreadName(); // declaration only; returns the name by value
+
+} // namespace c10
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fe6ecf7e59c18eaa8cd6afc37aa06a2045cf8aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUCachingAllocator.h
@@ -0,0 +1,121 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/AllocatorConfig.h>
+#include <c10/core/CachingDeviceAllocator.h>
+#include <c10/xpu/XPUStream.h>
+
+namespace c10::xpu::XPUCachingAllocator {
+
+class XPUAllocator : public DeviceAllocator { // abstract interface: adds three pure virtuals on top of DeviceAllocator
+ public:
+  virtual void init(c10::DeviceIndex device_count) = 0;
+  virtual void* raw_alloc(size_t nbytes) = 0;
+  virtual void raw_delete(void* ptr) = 0;
+};
+
+C10_XPU_API extern std::atomic<XPUAllocator*> allocator; // NOTE(review): <atomic> and <functional> are not included directly in this header; presumably provided transitively - confirm
+
+inline XPUAllocator* get() { // atomically loads the current allocator pointer
+  return allocator.load();
+}
+
+inline void init(c10::DeviceIndex device_count) { // this and the inline helpers below forward to the allocator returned by get()
+  get()->init(device_count);
+}
+
+inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
+  get()->emptyCache(mempool_id);
+}
+
+inline void resetPeakStats(DeviceIndex device) {
+  get()->resetPeakStats(device);
+}
+
+inline void resetAccumulatedStats(DeviceIndex device) {
+  get()->resetAccumulatedStats(device);
+}
+
+inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
+    DeviceIndex device) {
+  return get()->getDeviceStats(device);
+}
+
+inline void* raw_alloc(size_t size) {
+  return get()->raw_alloc(size);
+}
+
+inline void raw_delete(void* ptr) {
+  get()->raw_delete(ptr);
+}
+
+inline void recordStream(const DataPtr& dataPtr, XPUStream stream) {
+  get()->recordStream(dataPtr, stream);
+}
+
+C10_XPU_API void enablePeerAccess( // the remaining functions are declared here only; their definitions are not in this header
+    c10::DeviceIndex dev,
+    c10::DeviceIndex dev_to_access);
+
+C10_XPU_API double getMemoryFraction(DeviceIndex device);
+
+C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
+
+C10_XPU_API void createOrIncrefPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id,
+    XPUAllocator* allocator = nullptr); // optional allocator; defaults to nullptr
+
+C10_XPU_API void beginAllocateToPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id,
+    std::function<bool(sycl::queue*)> filter); // caller-supplied predicate over a sycl::queue pointer
+
+C10_XPU_API void endAllocateToPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
+C10_XPU_API void releasePool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
+C10_XPU_API int getPoolUseCount(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
+} // namespace c10::xpu::XPUCachingAllocator
+
+namespace c10::xpu {
+
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+struct C10_XPU_API MemPool { // move-only type: copy operations deleted, move operations defaulted
+  MemPool(
+      XPUCachingAllocator::XPUAllocator* allocator = nullptr,
+      bool is_user_created = true,
+      bool use_on_oom = false);
+  MemPool(const MemPool&) = delete;
+  MemPool(MemPool&&) = default;
+  MemPool& operator=(const MemPool&) = delete;
+  MemPool& operator=(MemPool&&) = default;
+  ~MemPool(); // defined outside this header
+
+  MempoolId_t id(); // the accessors below are declared here only
+  XPUCachingAllocator::XPUAllocator* allocator();
+  int use_count();
+  c10::DeviceIndex device();
+  static MempoolId_t graph_pool_handle(bool is_user_created = true);
+
+ private:
+  static std::atomic<CaptureId_t> uid_; // two class-wide atomic counters; their exact roles are not visible in this header
+  static std::atomic<CaptureId_t> uuid_;
+  XPUCachingAllocator::XPUAllocator* allocator_;
+  bool is_user_created_;
+  MempoolId_t id_;
+  c10::DeviceIndex device_;
+};
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b85a34f0bc3d032fe403c5e758cfbba252b27871
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUDeviceProp.h
@@ -0,0 +1,212 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/xpu/XPUMacros.h>
+#include <sycl/sycl.hpp>
+
+namespace c10::xpu {
+
+#define AT_FORALL_XPU_DEVICE_PROPERTIES(_)                                     \
+  /* the device name of this SYCL device. */                                   \
+  _(name)                                                                      \
+                                                                               \
+  /* the device type associated with the device. */                            \
+  _(device_type)                                                               \
+                                                                               \
+  /* the vendor of this SYCL device. */                                        \
+  _(vendor)                                                                    \
+                                                                               \
+  /* a backend-defined driver version as a std::string. */                     \
+  _(driver_version)                                                            \
+                                                                               \
+  /* the SYCL version as a std::string in the form <major>.<minor> */          \
+  _(version)                                                                   \
+                                                                               \
+  /* true if the SYCL device is available. Otherwise, return false. */         \
+  _(is_available)                                                              \
+                                                                               \
+  /* the maximum size in bytes of the arguments that can be passed to a        \
+   * kernel. */                                                                \
+  _(max_parameter_size)                                                        \
+                                                                               \
+  /* the number of parallel compute units available to the device. */          \
+  _(max_compute_units)                                                         \
+                                                                               \
+  /* the maximum dimensions that specify the global and local work-item IDs    \
+   * used by the data parallel execution model. */                             \
+  _(max_work_item_dimensions)                                                  \
+                                                                               \
+  /* the maximum number of workitems that are permitted in a work-group        \
+   * executing a kernel on a single compute unit. */                           \
+  _(max_work_group_size)                                                       \
+                                                                               \
+  /* the maximum number of subgroups in a work-group for any kernel executed   \
+   * on the device. */                                                         \
+  _(max_num_sub_groups)                                                        \
+                                                                               \
+  /* a std::vector of size_t containing the set of sub-group sizes  supported  \
+   * by the device. */                                                         \
+  _(sub_group_sizes)                                                           \
+                                                                               \
+  /* the maximum configured clock frequency of this SYCL device in MHz. */     \
+  _(max_clock_frequency)                                                       \
+                                                                               \
+  /* the default compute device address space size specified as an unsigned    \
+   * integer value in bits. Must return either 32 or 64. */                    \
+  _(address_bits)                                                              \
+                                                                               \
+  /* the maximum size of memory object allocation in bytes. */                 \
+  _(max_mem_alloc_size)                                                        \
+                                                                               \
+  /* the minimum value in bits of the largest supported SYCL built-in data     \
+   * type if this SYCL device is not of device type                            \
+   * sycl::info::device_type::custom. */                                       \
+  _(mem_base_addr_align)                                                       \
+                                                                               \
+  /* a std::vector of info::fp_config describing the half/single/double        \
+   * precision floating-point capability of this SYCL device. */               \
+  _(half_fp_config)                                                            \
+  _(single_fp_config)                                                          \
+  _(double_fp_config)                                                          \
+                                                                               \
+  /* the size of global device memory in bytes. */                             \
+  _(global_mem_size)                                                           \
+                                                                               \
+  /* the type of global memory cache supported. */                             \
+  _(global_mem_cache_type)                                                     \
+                                                                               \
+  /* the size of global memory cache in bytes. */                              \
+  _(global_mem_cache_size)                                                     \
+                                                                               \
+  /* the size of global memory cache line in bytes. */                         \
+  _(global_mem_cache_line_size)                                                \
+                                                                               \
+  /* the type of local memory supported. */                                    \
+  _(local_mem_type)                                                            \
+                                                                               \
+  /* the size of local memory arena in bytes. */                               \
+  _(local_mem_size)                                                            \
+                                                                               \
+  /* the maximum number of sub-devices that can be created when this device is \
+   * partitioned. */                                                           \
+  _(partition_max_sub_devices)                                                 \
+                                                                               \
+  /* the resolution of device timer in nanoseconds. */                         \
+  _(profiling_timer_resolution)                                                \
+                                                                               \
+  /* the preferred native vector width size for built-in scalar types that can \
+   * be put into vectors. */                                                   \
+  _(preferred_vector_width_char)                                               \
+  _(preferred_vector_width_short)                                              \
+  _(preferred_vector_width_int)                                                \
+  _(preferred_vector_width_long)                                               \
+  _(preferred_vector_width_float)                                              \
+  _(preferred_vector_width_double)                                             \
+  _(preferred_vector_width_half)                                               \
+                                                                               \
+  /* the native ISA vector width. The vector width is defined as the number of \
+   * scalar elements that can be stored in the vector. */                      \
+  _(native_vector_width_char)                                                  \
+  _(native_vector_width_short)                                                 \
+  _(native_vector_width_int)                                                   \
+  _(native_vector_width_long)                                                  \
+  _(native_vector_width_float)                                                 \
+  _(native_vector_width_double)                                                \
+  _(native_vector_width_half)
+
+#define AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(_)                                \
+  /* the number of EUs associated with the Intel GPU. */                      \
+  _(gpu_eu_count, gpu_eu_count, 512)                                          \
+                                                                              \
+  /* the number of EUs in a subslice. */                                      \
+  _(gpu_eu_count_per_subslice, gpu_eu_count_per_subslice, 8)                  \
+                                                                              \
+  /* the simd width of EU of GPU. */                                          \
+  _(gpu_eu_simd_width, gpu_eu_simd_width, 8)                                  \
+                                                                              \
+  /* the number of hardware threads per EU of GPU. */                         \
+  _(gpu_hw_threads_per_eu, gpu_hw_threads_per_eu, 8)                          \
+                                                                              \
+  /* the device identifier of the Intel GPU, also known as the product ID. */ \
+  _(device_id, device_id, 0)                                                  \
+                                                                              \
+  /* the device descriptor for device Universal Unique ID, 16 bytes*/         \
+  _(uuid, device_info_uuid, (std::array<unsigned char, 16>{}))
+
+#define AT_FORALL_XPU_DEVICE_ASPECT(_)                  \
+  /* sycl::half is supported on device. */              \
+  _(fp16)                                               \
+                                                        \
+  /* double is supported on device. */                  \
+  _(fp64)                                               \
+                                                        \
+  /* 64-bit atomic operation is supported on device. */ \
+  _(atomic64)
+
+#define AT_FORALL_XPU_EXP_CL_ASPECT(_)                                         \
+  /* conversion between single-precision 32-bit floating-point values and      \
+   * 16-bit bfloat16 values is supported on device. */                         \
+  _(bfloat16_conversions)                                                      \
+                                                                               \
+  /* specialized hardware to compute MMA is supported on device. */            \
+  _(subgroup_matrix_multiply_accumulate)                                       \
+                                                                               \
+  /* specialized hardware to compute MMA for 32-bit floating-point is          \
+   * supported on device. */                                                   \
+  _(subgroup_matrix_multiply_accumulate_tensor_float32)                        \
+                                                                               \
+  /* block read operations for efficient matrix multiplication is supported on \
+   * device. */                                                                \
+  _(subgroup_2d_block_io)
+
+#define AT_FORALL_XPU_EXP_DEVICE_PROPERTIES(_)       \
+  /* the device architecture of this SYCL device. */ \
+  _(architecture)
+
+#define _DEFINE_SYCL_PROP(ns, property, member) \
+  ns::property::return_type member;
+
+#define DEFINE_DEVICE_PROP(property) \
+  _DEFINE_SYCL_PROP(sycl::info::device, property, property)
+
+#define DEFINE_PLATFORM_PROP(property, member) \
+  _DEFINE_SYCL_PROP(sycl::info::platform, property, member)
+
+#define DEFINE_EXT_DEVICE_PROP(property, ...) \
+  _DEFINE_SYCL_PROP(sycl::ext::intel::info::device, property, property)
+
+#define DEFINE_DEVICE_ASPECT(member) bool has_##member;
+
+#define DEFINE_EXP_DEVICE_PROP(property) \
+  _DEFINE_SYCL_PROP(                     \
+      sycl::ext::oneapi::experimental::info::device, property, property)
+
+struct C10_XPU_API DeviceProp {
+  AT_FORALL_XPU_DEVICE_PROPERTIES(DEFINE_DEVICE_PROP);
+
+  // the platform name.
+  DEFINE_PLATFORM_PROP(name, platform_name);
+
+  AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);
+
+  AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
+
+  AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
+
+#if SYCL_COMPILER_VERSION >= 20250000
+  AT_FORALL_XPU_EXP_DEVICE_PROPERTIES(DEFINE_EXP_DEVICE_PROP);
+#endif
+};
+
+#undef _DEFINE_SYCL_PROP
+#undef DEFINE_DEVICE_PROP
+#undef DEFINE_PLATFORM_PROP
+#undef DEFINE_EXT_DEVICE_PROP
+#undef DEFINE_DEVICE_ASPECT
+#undef DEFINE_EXP_DEVICE_PROP
+
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..596fdfcc0ff06ccdb4395c3989e987836804eddc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUEvent.h
@@ -0,0 +1,183 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/xpu/XPUStream.h>
+
+namespace c10::xpu {
+
+/*
+ * XPUEvents are movable, non-copyable wrappers around a SYCL event. An XPUEvent
+ * is constructed lazily when first recorded. It has a device, and this device
+ * is acquired from the first recording stream. Later streams that record the
+ * event must match the same device.
+ *
+ * Currently, XPUEvent does NOT support exporting an inter-process event from
+ * another process via inter-process communication (IPC). This means that
+ * inter-process communication for event handles between different processes is
+ * not available. This could impact some applications that rely on cross-process
+ * synchronization and communication.
+ */
+struct XPUEvent {
+  // Constructors
+  XPUEvent(bool enable_timing = false) noexcept
+      : enable_timing_{enable_timing} {}
+
+  ~XPUEvent() {
+    if (isCreated()) {
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_deletion(
+            c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+    }
+  }
+
+  C10_DISABLE_COPY_AND_ASSIGN(XPUEvent);
+
+  XPUEvent(XPUEvent&& other) = default;
+  XPUEvent& operator=(XPUEvent&& other) = default;
+
+  operator sycl::event&() const {
+    return event();
+  }
+
+  std::optional<c10::Device> device() const {
+    if (isCreated()) {
+      return c10::Device(c10::kXPU, device_index_);
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  inline bool isCreated() const {
+    return (event_.get() != nullptr);
+  }
+
+  DeviceIndex device_index() const {
+    return device_index_;
+  }
+
+  sycl::event& event() const {
+    return *event_;
+  }
+
+  bool query() const {
+    using namespace sycl::info;
+    if (!isCreated()) {
+      return true;
+    }
+
+    return event().get_info<event::command_execution_status>() ==
+        event_command_status::complete;
+  }
+
+  void record() {
+    record(getCurrentXPUStream());
+  }
+
+  void recordOnce(const XPUStream& stream) {
+    if (!isCreated()) {
+      record(stream);
+    }
+  }
+
+  void record(const XPUStream& stream) {
+    if (!isCreated()) {
+      device_index_ = stream.device_index();
+      assignEvent(stream.queue());
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_creation(
+            c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+    } else {
+      TORCH_CHECK(
+          device_index_ == stream.device_index(),
+          "Event device ",
+          device_index_,
+          " does not match recording stream's device ",
+          stream.device_index(),
+          ".");
+      reassignEvent(stream.queue());
+    }
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_record(
+          c10::kXPU,
+          reinterpret_cast<uintptr_t>(event_.get()),
+          reinterpret_cast<uintptr_t>(&stream.queue()));
+    }
+  }
+
+  void block(const XPUStream& stream) {
+    if (isCreated()) {
+      std::vector<sycl::event> event_list{event()};
+      // Make this stream wait until event_ is completed.
+      stream.queue().ext_oneapi_submit_barrier(event_list);
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_wait(
+            c10::kXPU,
+            reinterpret_cast<uintptr_t>(event_.get()),
+            reinterpret_cast<uintptr_t>(&stream.queue()));
+      }
+    }
+  }
+
+  double elapsed_time(const XPUEvent& other) const {
+    TORCH_CHECK(
+        isCreated() && other.isCreated(),
+        "Both events must be recorded before calculating elapsed time.");
+    TORCH_CHECK(
+        query() && other.query(),
+        "Both events must be completed before calculating elapsed time.");
+    TORCH_CHECK(
+        enable_timing_ && other.enable_timing_,
+        "Both events must be created with argument 'enable_timing=True'.");
+
+    using namespace sycl::info::event_profiling;
+    // Block until both of the recorded events are completed.
+    uint64_t end_time_ns = other.event().get_profiling_info<command_end>();
+    uint64_t start_time_ns = event().get_profiling_info<command_end>();
+    // Return the elapsed time in milliseconds.
+    return 1e-6 *
+        (static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
+  }
+
+  void synchronize() const {
+    if (isCreated()) {
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_synchronization(
+            c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+      event().wait_and_throw();
+    }
+  }
+
+ private:
+  void assignEvent(sycl::queue& queue) {
+    if (enable_timing_) {
+      event_ = std::make_unique<sycl::event>(
+          sycl::ext::oneapi::experimental::submit_profiling_tag(queue));
+    } else {
+      event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
+    }
+  }
+
+  void reassignEvent(sycl::queue& queue) {
+    event_.reset();
+    assignEvent(queue);
+  }
+
+  bool enable_timing_ = false;
+  c10::DeviceIndex device_index_ = -1;
+  // Only need to track the last event, as events in an in-order queue are
+  // executed sequentially.
+  std::unique_ptr<sycl::event> event_;
+};
+
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5d6d56a1560728c6604d04aaa2aa75c4c615aae
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUException.h
@@ -0,0 +1,27 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/util/Exception.h>
+#include <sycl/sycl.hpp>
+
+namespace c10::xpu {
+
+static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
+  if (el.size() == 0) { // empty list: return normally without throwing
+    return;
+  }
+  for (const auto& e : el) { // emit one warning per reported SYCL exception
+    try {
+      std::rethrow_exception(e);
+    } catch (sycl::exception& e) { // any other exception type escapes the loop
+      TORCH_WARN("SYCL Exception: ", e.what());
+    }
+  }
+  throw; // NOTE(review): no handler is active in this lambda at this point; a bare throw with no exception being handled calls std::terminate() -- confirm this is intended
+};
+
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5017a054d32448a372290fcab2adfdea3e7fb36
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUFunctions.h
@@ -0,0 +1,50 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/xpu/XPUDeviceProp.h>
+#include <c10/xpu/XPUMacros.h>
+
+// The naming convention used here matches the naming convention of torch.xpu
+
+namespace c10::xpu {
+
+// Log a warning only once if no devices are detected.
+C10_XPU_API DeviceIndex device_count();
+
+// Throws an error if no devices are detected.
+C10_XPU_API DeviceIndex device_count_ensure_non_zero();
+
+C10_XPU_API DeviceIndex current_device();
+
+C10_XPU_API void set_device(DeviceIndex device);
+
+C10_XPU_API DeviceIndex exchange_device(DeviceIndex device);
+
+C10_XPU_API DeviceIndex maybe_exchange_device(DeviceIndex to_device);
+
+C10_XPU_API sycl::device& get_raw_device(DeviceIndex device);
+
+C10_XPU_API sycl::context& get_device_context();
+
+C10_XPU_API void get_device_properties(
+    DeviceProp* device_prop,
+    DeviceIndex device);
+
+C10_XPU_API DeviceIndex get_device_idx_from_pointer(void* ptr);
+
+// TORCH_CHECK that device_index is in [0, device_count()); count is read once.
+static inline void check_device_index(DeviceIndex device_index) {
+  const auto num_devices = static_cast<int>(c10::xpu::device_count());
+  const auto index = static_cast<int>(device_index);
+  TORCH_CHECK(
+      index >= 0 && index < num_devices,
+      "The device index is out of range. It must be in [0, ",
+      num_devices, "), but got ", index, ".");
+}
+
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..437dda44bfc4826d05389b530a7cd54083ec8f08
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUGraphsC10Utils.h
@@ -0,0 +1,47 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/xpu/XPUStream.h>
+#include <iostream>
+
+// XPU Graphs utils used by c10 and aten.
+using namespace sycl::ext::oneapi::experimental;
+namespace c10::xpu {
+
+static_assert(
+    int8_t(queue_state::executing) == 0,
+    "unexpected int(queue_state::executing) value");
+static_assert(
+    int8_t(queue_state::recording) == 1,
+    "unexpected int(queue_state::recording) value");
+
+enum class CaptureStatus : int8_t {
+  Executing = int8_t(queue_state::executing),
+  Recording = int8_t(queue_state::recording)
+};
+
+// Write the symbolic name of `status` to `os` and return `os`. A value that
+// is not a known enumerator trips an internal assert that reports it.
+inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
+  switch (status) {
+    case CaptureStatus::Executing:
+      return os << "Executing";
+    case CaptureStatus::Recording:
+      return os << "Recording";
+    default:
+      TORCH_INTERNAL_ASSERT(
+          false, "Unknown XPU graph CaptureStatus: ", int(status));
+  }
+  return os;
+}
+
+// Report the current XPU stream's queue state as a CaptureStatus.
+inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
+  return CaptureStatus(getCurrentXPUStream().queue().ext_oneapi_get_state());
+}
+
+} // namespace c10::xpu
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h
new file mode 100644
index 0000000000000000000000000000000000000000..43a42c2a6f8a47a2276268e58edc176ec5f6a781
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUMacros.h
@@ -0,0 +1,38 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif
+
+// See c10/macros/Export.h for a detailed explanation of what the function
+// of these macros is.  We need one set of macros for every separate library
+// we build.
+
+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
+#if defined(__GNUC__)
+#define C10_XPU_EXPORT __attribute__((__visibility__("default")))
+#else // defined(__GNUC__)
+#define C10_XPU_EXPORT
+#endif // defined(__GNUC__)
+#define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32
+
+// This one is being used by libc10_xpu.so
+#ifdef C10_XPU_BUILD_MAIN_LIB
+#define C10_XPU_API C10_XPU_EXPORT
+#else
+#define C10_XPU_API C10_XPU_IMPORT
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..df79df4945aa93da62b5faf0bf931a44cd09bf2d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/XPUStream.h
@@ -0,0 +1,217 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/Stream.h>
+#include <c10/core/impl/GPUTrace.h>
+#include <c10/xpu/XPUFunctions.h>
+
+namespace c10::xpu {
+
+/*
+ * Note [Stream Management]
+ *
+ * An XPUStream is an abstraction of an actual SYCL queue in which SYCL kernels
+ * can execute. Currently, there are several pools per device to manage SYCL
+ * queues, and a device's pools are lazily created.
+ *
+ * There are two pools per device. The first pool contains "normal priority"
+ * queues. The second pool contains "high priority" queues. There are 32 queues
+ * per pool per device, and when a queue is requested one of these queues is
+ * returned round-robin. That is, the first queue requested is at index 0, the
+ * second at index 1... to index 31, then index 0 again.
+ *
+ * This means that if 33 queues are requested, the first and last queues
+ * requested are actually the same queue (under the covers) and kernels enqueued
+ * on them cannot run concurrently.
+ *
+ * It is safe to enqueue a kernel on the same queue from two different
+ * threads, as the SYCL specification describes.
+ */
+
+static constexpr int max_compile_time_stream_priorities = 3;
+
+/*
+ * This serves as a wrapper around c10::Stream and acts as a representation for
+ * a SYCL queue, which allows asynchronous execution of XPU tasks.
+ */
+class C10_XPU_API XPUStream {
+ public:
+  enum Unchecked { UNCHECKED };
+
+  /// Construct a XPUStream from a Stream. This construction is checked, and
+  /// will raise an error if the Stream is not, in fact, a XPU stream.
+  explicit XPUStream(Stream stream) : stream_(stream) {
+    TORCH_CHECK(stream_.device_type() == DeviceType::XPU);
+  }
+
+  /// Construct a XPUStream from a Stream with no error checking.
+  explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
+
+  bool operator==(const XPUStream& other) const noexcept {
+    return unwrap() == other.unwrap();
+  }
+
+  bool operator!=(const XPUStream& other) const noexcept {
+    return unwrap() != other.unwrap();
+  }
+
+  /// Implicit conversion to sycl::queue&.
+  operator sycl::queue&() const {
+    return queue();
+  }
+
+  /// Implicit conversion to sycl::queue*.
+  operator sycl::queue*() const {
+    return &queue();
+  }
+
+  /// Implicit conversion to Stream (a.k.a., forget that the stream is a
+  /// XPU stream).
+  operator Stream() const {
+    return unwrap();
+  }
+
+  /// Get the XPU device type that this stream is associated with.
+  DeviceType device_type() const {
+    return DeviceType::XPU;
+  }
+
+  /// Get the XPU device index that this stream is associated with.
+  DeviceIndex device_index() const {
+    return stream_.device_index();
+  }
+
+  /// Get the full Device that this stream is associated with. The Device is
+  /// guaranteed to be a XPU device.
+  Device device() const {
+    return Device(DeviceType::XPU, device_index());
+  }
+
+  /// Return the stream ID corresponding to this particular stream. StreamId is
+  /// a int64_t representation generated by its type and index.
+  StreamId id() const {
+    return stream_.id();
+  }
+
+  /// Return true if all enqueued tasks in this stream have been completed,
+  /// otherwise return false.
+  bool query() const {
+    return queue().ext_oneapi_empty();
+  }
+
+  /// Performs a blocking wait for the completion of all enqueued tasks in this
+  /// stream.
+  void synchronize() const {
+    queue().wait_and_throw();
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_stream_synchronization(
+          c10::kXPU, reinterpret_cast<uintptr_t>(&queue()));
+    }
+  }
+
+  /// Return the priority that this stream is associated with. Lower numbers
+  /// represent higher priority.
+  int priority() const;
+
+  /// Explicit conversion to sycl::queue&.
+  sycl::queue& queue() const;
+
+  /// Explicit conversion to Stream.
+  Stream unwrap() const {
+    return stream_;
+  }
+
+  /// Reversibly pack a XPUStream into a struct representation. The XPUStream
+  /// can be unpacked using unpack3().
+  struct c10::StreamData3 pack3() const {
+    return stream_.pack3();
+  }
+
+  /// Unpack a XPUStream from the 3 fields generated by pack3().
+  static XPUStream unpack3(
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
+    return XPUStream(Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+  /// Return the range of priority **supported by PyTorch**.
+  static std::tuple<int, int> priority_range() {
+    // See Note [XPU Stream priorities]
+    return std::make_tuple(1, -max_compile_time_stream_priorities + 2);
+  }
+
+ private:
+  Stream stream_;
+};
+
+/**
+ * Get a stream from the pool in a round-robin fashion.
+ *
+ * You can request a stream from the highest priority pool by setting
+ * isHighPriority to true for a specific device.
+ */
+C10_XPU_API XPUStream
+getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
+
+/**
+ * Get a stream from the pool in a round-robin fashion.
+ *
+ * You can request a stream by setting a priority value for a specific device.
+ * The lower the priority number, the higher the priority.
+ */
+C10_XPU_API XPUStream
+getStreamFromPool(const int priority, DeviceIndex device = -1);
+
+/**
+ * Get an XPUStream from an external SYCL queue.
+ *
+ * This function allows interoperability with other libraries by enabling
+ * the use of an external SYCL queue that was not created by PyTorch. This
+ * can be useful for data exchange or other operations where integration
+ * with non-PyTorch queues is required.
+ *
+ * NOTE: It is the user's responsibility to ensure that the referenced SYCL
+ * queue remains alive while the corresponding XPUStream, or any c10::Stream
+ * derived from it, is in use. The different SYCL queue pointers will result in
+ * distinct XPUStream instances, even if the SYCL queues they dereference are
+ * equivalent.
+ */
+C10_XPU_API XPUStream
+getStreamFromExternal(sycl::queue* ext_queue, DeviceIndex device_index);
+
+/**
+ * Get the current XPU stream, for the passed XPU device, or for the current
+ * device if no device index is passed.
+ */
+C10_XPU_API XPUStream getCurrentXPUStream(DeviceIndex device = -1);
+
+/**
+ * Set the current stream on the device of the passed in stream to be the passed
+ * in stream.
+ */
+C10_XPU_API void setCurrentXPUStream(XPUStream stream);
+
+C10_XPU_API std::ostream& operator<<(std::ostream& stream, const XPUStream& s);
+
+/**
+ * Block all reserved SYCL queues in the stream pools on the device, and wait
+ * for their synchronizations.
+ */
+C10_XPU_API void syncStreamsOnDevice(DeviceIndex device = -1);
+
+} // namespace c10::xpu
+
+namespace std {
+template <>
+struct hash<c10::xpu::XPUStream> {
+  size_t operator()(c10::xpu::XPUStream s) const noexcept {
+    return std::hash<c10::Stream>{}(s.unwrap());
+  }
+};
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d700f946ebe76abf99c2641448f5b4e2c3241eb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/impl/XPUGuardImpl.h
@@ -0,0 +1,223 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/core/DeviceGuard.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/GPUTrace.h>
+#include <c10/xpu/XPUCachingAllocator.h>
+#include <c10/xpu/XPUFunctions.h>
+#include <c10/xpu/XPUStream.h>
+
+#include <vector>
+
+namespace c10::xpu::impl {
+
+/**
+ * DeviceGuardImplInterface implementation for XPU devices.
+ *
+ * Device and stream operations forward to the c10::xpu helper functions.
+ * Events are heap-allocated sycl::event objects that are passed around as
+ * opaque void* handles; a null handle means "no event recorded".
+ */
+struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr DeviceType static_type = kXPU;
+
+  XPUGuardImpl() = default;
+
+  // Constructs the guard impl, rejecting any device type other than kXPU.
+  explicit XPUGuardImpl(DeviceType t) {
+    TORCH_CHECK(
+        t == kXPU, "XPUGuardImpl initialized with non-XPU DeviceType: ", t);
+  }
+
+  // Always kXPU.
+  DeviceType type() const override {
+    return kXPU;
+  }
+
+  // Makes `d` the current XPU device and returns the previously current one.
+  Device exchangeDevice(Device d) const override {
+    TORCH_CHECK(d.is_xpu(), "Expected a XPU device, but got ", d);
+    const auto old_device_index = c10::xpu::exchange_device(d.index());
+    return Device(kXPU, old_device_index);
+  }
+
+  // Returns the current XPU device.
+  Device getDevice() const override {
+    const auto device = c10::xpu::current_device();
+    return Device(kXPU, device);
+  }
+
+  // Makes `d` the current XPU device after checking that it is an XPU device.
+  void setDevice(Device d) const override {
+    TORCH_CHECK(d.is_xpu(), "Expected a XPU device, but got ", d);
+    c10::xpu::set_device(d.index());
+  }
+
+  // Same as setDevice() but without the device-type check.
+  void uncheckedSetDevice(Device d) const noexcept override {
+    c10::xpu::set_device(d.index());
+  }
+
+  // Returns the current XPU stream of device `d` as a generic Stream.
+  Stream getStream(Device d) const override {
+    return getCurrentXPUStream(d.index()).unwrap();
+  }
+
+  // Returns a stream from the pool of device `d` with the given priority.
+  Stream getNewStream(Device d, int priority = 0) const override {
+    return getStreamFromPool(priority, d.index());
+  }
+
+  // Returns a stream from the pool of device `d`; `isHighPriority` is
+  // forwarded to getStreamFromPool().
+  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
+      const override {
+    return getStreamFromPool(isHighPriority, d.index());
+  }
+
+  // Makes `s` the current stream on its device and returns the stream that
+  // was current on that device before.
+  // NB: These do NOT set the current device
+  Stream exchangeStream(Stream s) const override {
+    const XPUStream stream(s);
+    const auto old_stream = getCurrentXPUStream(s.device().index());
+    setCurrentXPUStream(stream);
+    return old_stream.unwrap();
+  }
+
+  // Number of XPU devices reported by c10::xpu::device_count().
+  DeviceIndex deviceCount() const noexcept override {
+    return c10::xpu::device_count();
+  }
+
+  // Event-related functions
+
+  // Deletes the sycl::event behind `event`; a null handle is a no-op. The
+  // deletion is reported to the GPU trace hook when one is installed.
+  void destroyEvent(void* event, const DeviceIndex device_index)
+      const noexcept override {
+    if (!event)
+      return;
+
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_deletion(
+          c10::kXPU, reinterpret_cast<uintptr_t>(event));
+    }
+
+    delete reinterpret_cast<sycl::event*>(event);
+  }
+
+  // Records a new event on `stream` and stores it in `*event`, replacing (and
+  // deleting) any event that `*event` held before. `device_index` must be -1
+  // or equal to the stream's device index.
+  void record(
+      void** event,
+      const Stream& stream,
+      const DeviceIndex device_index,
+      const EventFlag flag) const override {
+    TORCH_CHECK(
+        device_index == -1 || device_index == stream.device_index(),
+        "Event device index ",
+        device_index,
+        " does not match recording stream's device index ",
+        stream.device_index(),
+        ".");
+
+    auto* xpu_event = reinterpret_cast<sycl::event*>(*event);
+    const XPUStream xpu_stream{stream};
+
+    // Delete the event previously recorded.
+    if (xpu_event) {
+      delete xpu_event;
+      // Clear the caller's handle right away: if creating the replacement
+      // event below throws, the caller must not be left holding a pointer to
+      // the event that was just freed.
+      *event = nullptr;
+    }
+#if SYCL_COMPILER_VERSION >= 20250000
+    if (flag == EventFlag::BACKEND_DEFAULT) {
+      // Use the profiling tag to record the event to enable timing feature.
+      xpu_event =
+          new sycl::event(sycl::ext::oneapi::experimental::submit_profiling_tag(
+              xpu_stream.queue()));
+    } else {
+      xpu_event =
+          new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
+    }
+#else
+    xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
+#endif
+    *event = reinterpret_cast<void*>(xpu_event);
+
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_record(
+          c10::kXPU,
+          reinterpret_cast<uintptr_t>(xpu_event),
+          reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
+    }
+  }
+
+  // Submits a barrier on `stream`'s queue that depends on `event`; a null
+  // handle is a no-op. The wait is reported to the GPU trace hook.
+  void block(void* event, const Stream& stream) const override {
+    if (!event)
+      return;
+    auto* xpu_event = reinterpret_cast<sycl::event*>(event);
+    std::vector<sycl::event> event_list{*xpu_event};
+    const XPUStream xpu_stream(stream);
+    xpu_stream.queue().ext_oneapi_submit_barrier(event_list);
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_wait(
+          c10::kXPU,
+          reinterpret_cast<uintptr_t>(xpu_event),
+          reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
+    }
+  }
+
+  // Returns true when `event` is null or its command execution status is
+  // `complete`.
+  bool queryEvent(void* event) const override {
+    using namespace sycl::info;
+    if (!event)
+      return true;
+    auto* xpu_event = reinterpret_cast<sycl::event*>(event);
+    return xpu_event->get_info<event::command_execution_status>() ==
+        event_command_status::complete;
+  }
+
+  // Returns the time in milliseconds between the `command_end` profiling
+  // timestamps of `start_event` and `end_event`. Both handles must be
+  // non-null. Raises a not-implemented error when built with a SYCL compiler
+  // older than 2025.0.0.
+  double elapsedTime(
+      void* start_event,
+      void* end_event,
+      const DeviceIndex device_index) const override {
+#if SYCL_COMPILER_VERSION < 20250000
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "elapsedTime requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer.");
+#endif
+    TORCH_CHECK(
+        start_event && end_event,
+        "Both events must be recorded before calculating elapsed time.");
+    auto* xpu_start_event = reinterpret_cast<sycl::event*>(start_event);
+    auto* xpu_end_event = reinterpret_cast<sycl::event*>(end_event);
+
+    using namespace sycl::info::event_profiling;
+    // Block until both of the recorded events are completed.
+    uint64_t end_time_ns = xpu_end_event->get_profiling_info<command_end>();
+    uint64_t start_time_ns = xpu_start_event->get_profiling_info<command_end>();
+    // Return the elapsed time in milliseconds (timestamps are nanoseconds).
+    return 1e-6 *
+        (static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
+  }
+
+  // Stream-related functions
+
+  // Forwards to XPUStream::query() for `stream`.
+  bool queryStream(const Stream& stream) const override {
+    const XPUStream xpu_stream{stream};
+    return xpu_stream.query();
+  }
+
+  // Forwards to XPUStream::synchronize() for `stream`.
+  void synchronizeStream(const Stream& stream) const override {
+    const XPUStream xpu_stream{stream};
+    xpu_stream.synchronize();
+  }
+
+  // Waits on `event` via sycl::event::wait_and_throw(); a null handle is a
+  // no-op. The trace hook is notified before the wait starts.
+  void synchronizeEvent(void* event) const override {
+    if (!event)
+      return;
+    auto* xpu_event = reinterpret_cast<sycl::event*>(event);
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_synchronization(
+          c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
+    }
+    xpu_event->wait_and_throw();
+  }
+
+  // Notifies the trace hook, then synchronizes device `device_index` through
+  // c10::xpu::syncStreamsOnDevice().
+  void synchronizeDevice(const c10::DeviceIndex device_index) const override {
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_device_synchronization(c10::kXPU);
+    }
+    c10::xpu::syncStreamsOnDevice(device_index);
+  }
+
+  // Forwards `data_ptr` and `stream` to XPUCachingAllocator::recordStream().
+  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
+      const override {
+    const XPUStream xpu_stream{stream};
+    XPUCachingAllocator::recordStream(data_ptr, xpu_stream);
+  }
+};
+
+} // namespace c10::xpu::impl
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..336c8349121389fd6dc64732ef50977e1cb2e0d2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/c10/xpu/test/impl/XPUTest.h
@@ -0,0 +1,26 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <gtest/gtest.h>
+
+#include <c10/util/irange.h>
+
+// Fills hostData[0 .. numel) with the ascending sequence 0, 1, 2, ...
+static inline void initHostData(int* hostData, int numel) {
+  for (int idx = 0; idx < numel; ++idx) {
+    hostData[idx] = idx;
+  }
+}
+
+// Sets every element of hostData[0 .. numel) to zero.
+static inline void clearHostData(int* hostData, int numel) {
+  int idx = 0;
+  while (idx < numel) {
+    hostData[idx] = 0;
+    ++idx;
+  }
+}
+
+// Expects (via gtest EXPECT_EQ) that hostData[i] == i for every i in
+// [0, numel).
+static inline void validateHostData(int* hostData, int numel) {
+  for (int i = 0; i < numel; ++i) {
+    EXPECT_EQ(hostData[i], i);
+  }
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8de86b9ed8e3fc25a3e6efe20bc36f5b29336c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/common.h
@@ -0,0 +1,66 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_CORE_COMMON_H_
+#define CAFFE2_CORE_COMMON_H_
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <set>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+#if defined(_MSC_VER)
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+// Macros used during the build of this caffe2 instance. This header file
+// is automatically generated by the cmake script during build.
+#include "caffe2/core/macros.h"
+
+#include <c10/macros/Macros.h>
+
+namespace caffe2 {
+
+// Using statements for common classes that we refer to in caffe2 very often.
+// Note that we only place it inside caffe2 so the global namespace is not
+// polluted.
+/* using override */
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+// On MSVC, define NOMINMAX unless it is already defined (presumably so the Windows headers do not define min/max macros).
+#if (defined _MSC_VER && !defined NOMINMAX)
+#define NOMINMAX
+#endif
+
+using std::make_unique;
+
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+using ::round;
+#else
+using std::round;
+#endif // defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+
+// Returns which setting Caffe2 was configured and built with (exported from
+// CMake)
+TORCH_API const std::map<string, string>& GetBuildOptions();
+
+} // namespace caffe2
+
+#endif // CAFFE2_CORE_COMMON_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae86a3366590c8538b92cc7e92191365ef3545c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/macros.h
@@ -0,0 +1,75 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Automatically generated header file for caffe2 macros. These
+// macros are used to build the Caffe2 binary, and if you are
+// building a dependent library, they will need to be set as well
+// for your program to link correctly.
+
+#pragma once
+
+#define CAFFE2_BUILD_SHARED_LIBS
+/* #undef CAFFE2_FORCE_FALLBACK_CUDA_MPI */
+/* #undef CAFFE2_HAS_MKL_DNN */
+/* #undef CAFFE2_HAS_MKL_SGEMM_PACK */
+#define CAFFE2_PERF_WITH_AVX
+#define CAFFE2_PERF_WITH_AVX2
+/* #undef CAFFE2_THREADPOOL_MAIN_IMBALANCE */
+/* #undef CAFFE2_THREADPOOL_STATS */
+/* #undef CAFFE2_USE_ACCELERATE */
+#define CAFFE2_USE_CUDNN
+/* #undef CAFFE2_USE_EIGEN_FOR_BLAS */
+/* #undef CAFFE2_USE_FBCODE */
+/* #undef CAFFE2_USE_GOOGLE_GLOG */
+/* #undef CAFFE2_USE_LITE_PROTO */
+#define CAFFE2_USE_MKL
+#define USE_MKLDNN
+/* #undef CAFFE2_USE_NVTX */
+/* #undef CAFFE2_USE_ITT */
+
+#ifndef EIGEN_MPL2_ONLY
+#define EIGEN_MPL2_ONLY
+#endif
+
+// Useful build settings that are recorded in the compiled binary
+// torch.__config__.show()
+#define CAFFE2_BUILD_STRINGS { \
+  {"TORCH_VERSION", "2.10.0"}, \
+  {"CXX_COMPILER", "/opt/rh/gcc-toolset-13/root/usr/bin/c++"}, \
+  {"CXX_FLAGS", " -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_FBGEMM_GENAI -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow"}, \
+  {"BUILD_TYPE", "Release"}, \
+  {"BLAS_INFO", "mkl"}, \
+  {"LAPACK_INFO", "mkl"}, \
+  {"USE_CUDA", "ON"}, \
+  {"USE_ROCM", "OFF"}, \
+  {"CUDA_VERSION", "12.8"}, \
+  {"ROCM_VERSION", ""}, \
+  {"USE_CUDNN", "ON"}, \
+  {"COMMIT_SHA", "449b1768410104d3ed79d3bcfe4ba1d65c7f22c0"}, \
+  {"CUDNN_VERSION", "9.10.2"}, \
+  {"USE_NCCL", "1"}, \
+  {"USE_MPI", "OFF"}, \
+  {"USE_GFLAGS", "OFF"}, \
+  {"USE_GLOG", "OFF"}, \
+  {"USE_GLOO", "ON"}, \
+  {"USE_NNPACK", "ON"}, \
+  {"USE_OPENMP", "ON"}, \
+  {"FORCE_FALLBACK_CUDA_MPI", ""}, \
+  {"HAS_MKL_DNN", ""}, \
+  {"HAS_MKL_SGEMM_PACK", ""}, \
+  {"PERF_WITH_AVX", "1"}, \
+  {"PERF_WITH_AVX2", "1"}, \
+  {"USE_ACCELERATE", ""}, \
+  {"USE_EIGEN_FOR_BLAS", ""}, \
+  {"USE_LITE_PROTO", ""}, \
+  {"USE_MKL", "ON"}, \
+  {"USE_MKLDNN", "ON"}, \
+  {"USE_NVTX", ""}, \
+  {"USE_ITT", ""}, \
+  {"USE_ROCM_KERNEL_ASSERT", "OFF"}, \
+  {"USE_CUSPARSELT", "1"}, \
+  {"USE_XPU", "OFF"}, \
+  {"USE_XCCL", "OFF"}, \
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..54ff81fc25e27eb38cc23e497b692f321b71c6b4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/core/timer.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_CORE_TIMER_H_
+#define CAFFE2_CORE_TIMER_H_
+
+#include <chrono>
+
+#include "caffe2/core/common.h"
+
+namespace caffe2 {
+
+/**
+ * @brief A minimal stopwatch for measuring elapsed time.
+ *
+ * Thin wrapper around std::chrono::high_resolution_clock, intended as a
+ * utility for testing code. The timer starts on construction and can be
+ * restarted with Start(); each accessor reports the time since the last start
+ * as a float in the unit its name indicates.
+ */
+class Timer {
+ public:
+  using clock = std::chrono::high_resolution_clock;
+  using ns = std::chrono::nanoseconds;
+
+  Timer() {
+    Start();
+  }
+
+  /**
+   * @brief Starts a timer.
+   */
+  inline void Start() {
+    start_time_ = clock::now();
+  }
+
+  /**
+   * @brief Returns the elapsed time in nanoseconds.
+   */
+  inline float NanoSeconds() {
+    const auto elapsed = clock::now() - start_time_;
+    const auto elapsed_ns = std::chrono::duration_cast<ns>(elapsed);
+    return static_cast<float>(elapsed_ns.count());
+  }
+
+  /**
+   * @brief Returns the elapsed time in milliseconds.
+   */
+  inline float MilliSeconds() {
+    const float elapsed_ns = NanoSeconds();
+    return elapsed_ns / 1000000.f;
+  }
+
+  /**
+   * @brief Returns the elapsed time in microseconds.
+   */
+  inline float MicroSeconds() {
+    const float elapsed_ns = NanoSeconds();
+    return elapsed_ns / 1000.f;
+  }
+
+  /**
+   * @brief Returns the elapsed time in seconds.
+   */
+  inline float Seconds() {
+    const float elapsed_ns = NanoSeconds();
+    return elapsed_ns / 1000000000.f;
+  }
+
+ protected:
+  // Time point captured by the most recent Start() call.
+  std::chrono::time_point<clock> start_time_;
+  C10_DISABLE_COPY_AND_ASSIGN(Timer);
+};
+}
+
+#endif  // CAFFE2_CORE_TIMER_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c7c0b7ec332ff3e66c897806c6e26dbbb7dee9d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/batch_box_cox_vec.h
@@ -0,0 +1,326 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+#include <cmath>
+#include <vector>
+#include <c10/util/irange.h>
+#include <folly/SingletonThreadLocal.h>
+#include <caffe2/perfkernels/common.h>
+#include "vectorizer.h"
+#include <mkl.h>
+
+namespace caffe2::details {
+
+namespace {
+// Grows v from its current n entries to K * n entries. Tile k (1 <= k < K)
+// holds the first n entries with k * D added to each one; the first n entries
+// themselves are left unchanged.
+void TileIndicesInPlace(std::vector<int>& v, const std::size_t D, const std::size_t K) {
+  const std::size_t base_count = v.size();
+  v.resize(K * base_count);
+  for (std::size_t tile = 1; tile < K; ++tile) {
+    const std::size_t dst_begin = tile * base_count;
+    for (std::size_t idx = 0; idx < base_count; ++idx) {
+      v[dst_begin + idx] = v[idx] + tile * D;
+    }
+  }
+}
+
+// MKL VML function templates: PackV<T> / UnpackV<T> are only declared here; the macros below generate the float and double specializations, each of which forwards its arguments unchanged to the named MKL routine.
+template <typename T>
+void PackV(const int N, const T* a, const int* ia, T* y); // specialized below for float and double
+template <typename T>
+void UnpackV(const int N, const T* a, T* y, const int* iy); // specialized below for float and double
+
+#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
+  template <>                                                   \
+  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
+    OriginalFunc(N, a, ia, y);                                  \
+  }
+DELEGATE_PACKV_FUNCTION(float, vsPackV) // PackV<float> forwards to vsPackV
+DELEGATE_PACKV_FUNCTION(double, vdPackV) // PackV<double> forwards to vdPackV
+#undef DELEGATE_PACKV_FUNCTION
+
+#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
+  template <>                                                     \
+  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
+    OriginalFunc(N, a, y, iy);                                    \
+  }
+DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV) // UnpackV<float> forwards to vsUnpackV
+DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV) // UnpackV<double> forwards to vdUnpackV
+#undef DELEGATE_UNPACKV_FUNCTION
+
+#ifndef FAST_VECTORIZED_KERNEL
+// Box-Cox transform for lambda1 == 0:
+//   output_data[j] = log(max(self_data[j] + lambda2_data[j], k_eps))
+// for every j in [0, D). Full vectors are processed with at::vec::Vectorized;
+// the remaining elements go through the scalar tail loop.
+template <typename T>
+void box_cox_zero_lambda(
+    size_t D,
+    const T* const self_data,
+    const T* const lambda2_data,
+    T k_eps,
+    T* const output_data) {
+  // Index with size_t, the type of D: an int index would overflow (undefined
+  // behavior) once D exceeds INT_MAX and forces a signed/unsigned comparison.
+  size_t j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr size_t VLEN = static_cast<size_t>(Vec::size());
+  auto k_eps_vec = Vec(k_eps);
+  // Note the strict '<': when D is an exact multiple of VLEN the last full
+  // vector is left to the scalar loop below.
+  for (; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(self_data + j);
+    auto lambda2 = Vec::loadu(lambda2_data + j);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto res = max.log();
+    res.store(output_data + j);
+  }
+  // Scalar tail.
+  for (; j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
+  }
+}
+
+// One vector's worth of the Box-Cox transform for lambda1 != 0. The input is
+// clamped as max(data + lambda2, k_eps), raised to lambda1, and combined with
+// r = fast_recieprocal(lambda1) through fmsub(pow, r, r).
+template <typename T>
+at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
+    at::vec::Vectorized<T> data,
+    at::vec::Vectorized<T> lambda1,
+    at::vec::Vectorized<T> lambda2,
+    at::vec::Vectorized<T> k_eps) {
+  auto clamped = at::vec::max(data + lambda2, k_eps);
+  auto inv_lambda1 = at::vec::fast_recieprocal(lambda1);
+  auto powered = clamped.pow(lambda1);
+  return at::vec::fmsub(powered, inv_lambda1, inv_lambda1);
+}
+
+// Box-Cox transform for lambda1 != 0 over D elements: applies
+// box_cox_nonzero_lambda_impl to data_ptr / lambda1_ptr / lambda2_ptr and
+// writes the result to out. Full vectors are handled in the main loop; the
+// remainder is handled with one partial (count-limited) load and store.
+template <typename T>
+void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  // Index with int64_t, the type of D: an int index would overflow (undefined
+  // behavior) once D exceeds INT_MAX.
+  int64_t j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr int64_t VLEN = Vec::size();
+  auto k_eps_vec = Vec(k_eps);
+  // Note the strict '<': when D is an exact multiple of VLEN the last full
+  // vector is processed by the partial load/store branch below.
+  for (; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(data_ptr + j);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j);
+    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
+    res.store(out + j);
+  }
+  if (j < D) {
+    auto remaining = D - j;
+    auto data = Vec::loadu(data_ptr + j, remaining);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
+    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
+    res.store(out + j, remaining);
+  }
+}
+#else
+// Box-Cox transform for lambda1 == 0 (FAST_VECTORIZED_KERNEL variant):
+//   output_data[j] = log(max(self_data[j] + lambda2_data[j], k_eps))
+// for every j in [0, D), written as a plain loop annotated with VECTOR_LOOP.
+template <typename T>
+void box_cox_zero_lambda(
+    size_t D,
+    const T* const self_data,
+    const T* const lambda2_data,
+    T k_eps,
+    T* const output_data) {
+  // Index with size_t, the type of D: `auto j=0` deduces int, which would
+  // overflow (undefined behavior) once D exceeds INT_MAX.
+  VECTOR_LOOP for (size_t j = 0; j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
+  }
+}
+
+// Box-Cox transform for lambda1 != 0 (FAST_VECTORIZED_KERNEL variant):
+//   out[j] = pow(max(data_ptr[j] + lambda2_ptr[j], k_eps), lambda1_ptr[j]) * r - r
+// with r = 1 / lambda1_ptr[j], for every j in [0, D).
+template <typename T>
+void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  // Index with int64_t, the type of D: `auto j=0` deduces int, which would
+  // overflow (undefined behavior) once D exceeds INT_MAX.
+  VECTOR_LOOP for (int64_t j = 0; j < D; ++j) {
+    FAST_MATH
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda1 = lambda1_ptr[j];
+    auto lambda_over_1 = 1 / lambda1;
+    if constexpr (std::is_same<T, float>::value) {
+      // Two Newton-Raphson refinement steps, r <- r * (2 - r * lambda1),
+      // applied to the reciprocal for float only.
+      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
+      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
+    }
+    auto pow = std::pow(max, lambda1);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
+  }
+}
+#endif // FAST_VECTORIZED_KERNEL
+
+// Handles rows that mix lambda1 == 0 and lambda1 != 0 columns. In two passes
+// it packs the selected entries of self_data into `buffer` (PackV),
+// transforms them in place, and unpacks them into output_data (UnpackV) at
+// the same indices: first the `nonzeros` indices with lambda1 / lambda2, then
+// the `zeros` indices with lambda2_z_.
+template <typename T>
+void box_cox_mixed_lambda(
+    const T* const self_data,
+    const std::vector<int>& nonzeros,
+    const std::vector<int>& zeros,
+    const T* const lambda1,
+    const T* const lambda2,
+    const T* const lambda2_z_,
+    T k_eps,
+    T* const buffer,
+    T* const output_data) {
+  // Pass 1: entries whose lambda1 is non-zero.
+  const auto nonzero_count = nonzeros.size();
+  const int* const nonzero_idx = nonzeros.data();
+  PackV(nonzero_count, self_data, nonzero_idx, buffer);
+  box_cox_nonzero_lambda<T>(
+      nonzero_count, buffer, lambda1, lambda2, k_eps, buffer);
+  UnpackV(nonzero_count, buffer, output_data, nonzero_idx);
+
+  // Pass 2: entries whose lambda1 is zero; `buffer` is reused.
+  const auto zero_count = zeros.size();
+  const int* const zero_idx = zeros.data();
+  PackV(zero_count, self_data, zero_idx, buffer);
+  box_cox_zero_lambda<T>(zero_count, buffer, lambda2_z_, k_eps, buffer);
+  UnpackV(zero_count, buffer, output_data, zero_idx);
+}
+
+// Resizes b to K * D elements and fills it with K back-to-back copies of the
+// D-element array a.
+template <typename T>
+void TileArrayIntoVector(
+    const T* const a,
+    const size_t D,
+    const int K,
+    std::vector<T>& b) {
+  b.resize(K * D);
+  auto dst = b.begin();
+  for (int tile = 0; tile < K; ++tile) {
+    // std::copy returns the position just past the copied tile, which is
+    // where the next tile starts.
+    dst = std::copy(a, a + D, dst);
+  }
+}
+
+// Applies the Box-Cox transform to N rows of D contiguous elements each.
+//
+//   self_data     input, N * D elements
+//   lambda1_data  per-column lambda1, D elements
+//   lambda2_data  per-column lambda2, D elements
+//   output_data   output, N * D elements
+//   block_size    target number of elements per kernel call; rows are
+//                 processed K = min(N, ceil(block_size / D)) at a time
+//
+// Column j uses ((x + lambda2)^lambda1 - 1) / lambda1 when lambda1[j] != 0 and
+// ln(x + lambda2) when lambda1[j] == 0, with x + lambda2 clamped from below
+// by k_eps = 1e-6. Three cases are dispatched: all columns non-zero, all
+// columns zero, and a mix of both.
+template <typename T>
+void compute_batch_box_cox_vec_fma(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const T* self_data,
+    const T* __restrict lambda1_data,
+    const T* __restrict lambda2_data,
+    T* output_data) {
+  // Nothing to transform for an empty batch or zero-width rows. Returning
+  // early also keeps the computation of K below from dividing by D == 0.
+  if (N == 0 || D == 0) {
+    return;
+  }
+
+  constexpr T k_eps = static_cast<T>(1e-6);
+
+  FOLLY_DECLARE_REUSED(zeros, std::vector<int>);
+  FOLLY_DECLARE_REUSED(nonzeros, std::vector<int>);
+  // Don't bother calling reserve; calls after the first will get a
+  // correctly-sized allocation anyway.
+  // Split the column indices by whether lambda1 is exactly zero.
+  for (const auto j : c10::irange(D)) {
+    if (lambda1_data[j] == 0) {
+      zeros.push_back(j);
+    } else {
+      nonzeros.push_back(j);
+    }
+  }
+
+  // Process K rows at a time for effective vectorization with small rows.
+  const auto K = std::min(N, (block_size + D - 1) / D);
+
+  FOLLY_DECLARE_REUSED(lambda1_, std::vector<T>);
+  FOLLY_DECLARE_REUSED(lambda2_, std::vector<T>);
+  FOLLY_DECLARE_REUSED(lambda2_z_, std::vector<T>);
+
+  if (nonzeros.size() == D) {
+    // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0
+    size_t i = 0;
+    if (K > 1) {
+      // Tile the lambdas K times so K rows can go through one kernel call.
+      TileArrayIntoVector(lambda1_data, D, K, lambda1_);
+      TileArrayIntoVector(lambda2_data, D, K, lambda2_);
+      DCHECK_EQ(K * D, lambda1_.size());
+      DCHECK_EQ(K * D, lambda2_.size());
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_nonzero_lambda<T>(
+            K * D,
+            self_data,
+            lambda1_.data(),
+            lambda2_.data(),
+            k_eps,
+            output_data);
+      }
+    }
+    // Remaining rows (all rows when K <= 1) are processed one at a time.
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_nonzero_lambda<T>(
+          D, self_data, lambda1_data, lambda2_data, k_eps, output_data);
+    }
+  } else if (zeros.size() == D) {
+    // ln(x + lambda2), if lambda1 == 0
+    size_t i = 0;
+    if (K > 1) {
+      TileArrayIntoVector(lambda2_data, D, K, lambda2_z_);
+      DCHECK_EQ(K * D, lambda2_z_.size());
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_zero_lambda<T>(
+            K * D, self_data, lambda2_z_.data(), k_eps, output_data);
+      }
+    }
+    // Remaining rows (all rows when K <= 1) are processed one at a time.
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_zero_lambda<T>(
+          D, self_data, lambda2_data, k_eps, output_data);
+    }
+  } else {
+    // mix zeros and nonzeros
+    const size_t n = nonzeros.size();
+    if (K > 1) {
+      // Tile with a zero offset first: the repeated indices are used below to
+      // pack K copies of the per-column lambdas.
+      TileIndicesInPlace(nonzeros, 0, K);
+      TileIndicesInPlace(zeros, 0, K);
+    }
+
+    FOLLY_DECLARE_REUSED(buffer, std::vector<T>);
+
+    buffer.resize(std::max(nonzeros.size(), zeros.size()));
+    lambda1_.resize(nonzeros.size());
+    lambda2_.resize(nonzeros.size());
+    lambda2_z_.resize(zeros.size());
+    PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data());
+    PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data());
+    PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data());
+
+    size_t i = 0;
+    if (K > 1) {
+      // Truncate to original size, and re-tile with offsets this time.
+      nonzeros.resize(n);
+      DCHECK_GT(D, n);
+      zeros.resize(D - n);
+      TileIndicesInPlace(nonzeros, D, K);
+      TileIndicesInPlace(zeros, D, K);
+      DCHECK_EQ(nonzeros.size(), lambda1_.size());
+      DCHECK_EQ(nonzeros.size(), lambda2_.size());
+      DCHECK_EQ(zeros.size(), lambda2_z_.size());
+
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_mixed_lambda<T>(
+            self_data,
+            nonzeros,
+            zeros,
+            lambda1_.data(),
+            lambda2_.data(),
+            lambda2_z_.data(),
+            k_eps,
+            buffer.data(),
+            output_data);
+      }
+      // Truncate to original size.
+      nonzeros.resize(n);
+      zeros.resize(D - n);
+    }
+    // Remaining rows (all rows when K <= 1) are processed one at a time with
+    // the single-row index lists.
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_mixed_lambda<T>(
+          self_data,
+          nonzeros,
+          zeros,
+          lambda1_.data(),
+          lambda2_.data(),
+          lambda2_z_.data(),
+          k_eps,
+          buffer.data(),
+          output_data);
+    }
+  }
+}
+}  // namespace
+
+}   // namespace caffe2::details
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..f927b1ac74631203bfb9ac4bf869d0e2fa7b0a7c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/common.h
@@ -0,0 +1,145 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// !!!! PLEASE READ !!!!
+// Minimize (transitively) included headers from _avx*.cc because some of the
+// functions defined in the headers compiled with platform dependent compiler
+// options can be reused by other translation units generating illegal
+// instruction run-time error.
+
+// Common utilities for writing performance kernels and easy dispatching of
+// different backends.
+/*
+The general workflow shall be as follows, say we want to
+implement a functionality called void foo(int a, float b).
+
+In foo.h, do:
+   void foo(int a, float b);
+
+In foo_avx512.cc, do:
+   void foo__avx512(int a, float b) {
+     [actual avx512 implementation]
+   }
+
+In foo_avx2.cc, do:
+   void foo__avx2(int a, float b) {
+     [actual avx2 implementation]
+   }
+
+In foo_avx.cc, do:
+   void foo__avx(int a, float b) {
+     [actual avx implementation]
+   }
+
+In foo.cc, do:
+   // The base implementation should *always* be provided.
+   void foo__base(int a, float b) {
+     [base, possibly slow implementation]
+   }
+   decltype(foo__base) foo__avx512;
+   decltype(foo__base) foo__avx2;
+   decltype(foo__base) foo__avx;
+   void foo(int a, float b) {
+     // You should always order things by their preference, faster
+     // implementations earlier in the function.
+     AVX512_DO(foo, a, b);
+     AVX2_DO(foo, a, b);
+     AVX_DO(foo, a, b);
+     BASE_DO(foo, a, b);
+   }
+
+*/
+// Details: this functionality basically covers the cases for both build time
+// and run time architecture support.
+//
+// During build time:
+//    The build system should provide flags CAFFE2_PERF_WITH_AVX512,
+//    CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that correspond to the
+//    __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
+//    compiler provides. Note that we do not use the compiler flags but rely on
+//    the build system flags, because the common files (like foo.cc above) will
+//    always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
+//    and __AVX__.
+// During run time:
+//    we use cpuinfo to identify cpu support and run the proper functions.
+
+#pragma once
+#if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \
+    defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX)
+#include <cpuinfo.h>
+#endif
+
+// DO macros: these should be used in your entry function, similar to foo()
+// above, that routes implementations based on CPU capability.
+
+#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);
+
+#ifdef CAFFE2_PERF_WITH_SVE
+#define SVE_DO(funcname, ...)                                               \
+  {                                                                         \
+    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_arm_sve(); \
+    if (isDo) {                                                             \
+      return funcname##__sve(__VA_ARGS__);                                  \
+    }                                                                       \
+  }
+#else // CAFFE2_PERF_WITH_SVE
+#define SVE_DO(funcname, ...)
+#endif // CAFFE2_PERF_WITH_SVE
+
+#ifdef CAFFE2_PERF_WITH_AVX512
+#define AVX512_DO(funcname, ...)                                   \
+  {                                                                \
+    static const bool isDo = cpuinfo_initialize() &&               \
+        cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512dq() && \
+        cpuinfo_has_x86_avx512vl();                                \
+    if (isDo) {                                                    \
+      return funcname##__avx512(__VA_ARGS__);                      \
+    }                                                              \
+  }
+#else // CAFFE2_PERF_WITH_AVX512
+#define AVX512_DO(funcname, ...)
+#endif // CAFFE2_PERF_WITH_AVX512
+
+#ifdef CAFFE2_PERF_WITH_AVX2
+#define AVX2_DO(funcname, ...)                                               \
+  {                                                                          \
+    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2(); \
+    if (isDo) {                                                              \
+      return funcname##__avx2(__VA_ARGS__);                                  \
+    }                                                                        \
+  }
+#define AVX2_FMA_DO(funcname, ...)                                             \
+  {                                                                            \
+    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2() && \
+        cpuinfo_has_x86_fma3();                                                \
+    if (isDo) {                                                                \
+      return funcname##__avx2_fma(__VA_ARGS__);                                \
+    }                                                                          \
+  }
+#else // CAFFE2_PERF_WITH_AVX2
+#define AVX2_DO(funcname, ...)
+#define AVX2_FMA_DO(funcname, ...)
+#endif // CAFFE2_PERF_WITH_AVX2
+
+#ifdef CAFFE2_PERF_WITH_AVX
+#define AVX_DO(funcname, ...)                                               \
+  {                                                                         \
+    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx(); \
+    if (isDo) {                                                             \
+      return funcname##__avx(__VA_ARGS__);                                  \
+    }                                                                       \
+  }
+#define AVX_F16C_DO(funcname, ...)                                            \
+  {                                                                           \
+    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx() && \
+        cpuinfo_has_x86_f16c();                                               \
+    if (isDo) {                                                               \
+      return funcname##__avx_f16c(__VA_ARGS__);                               \
+    }                                                                         \
+  }
+#else // CAFFE2_PERF_WITH_AVX
+#define AVX_DO(funcname, ...)
+#define AVX_F16C_DO(funcname, ...)
+#endif // CAFFE2_PERF_WITH_AVX
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h
new file mode 100644
index 0000000000000000000000000000000000000000..45eb7106de95e6ae73e4a99b020339aadb7fc527
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/perfkernels/embedding_lookup_idx.h
@@ -0,0 +1,62 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstdint>
+
+namespace caffe2 {
+
+// clang-format off
+/**
+ * Embedding lookup with reduction.
+ *
+ * `input` of size data_size * block_size
+ * `indices` of size index_size
+ * `offsets` of size output_size (NOTE(review): the pseudocode below reads offsets[i+1] -- confirm whether output_size + 1 entries are required)
+ * `weights` nullptr or array of size index_size
+ * `out` of size output_size * block_size
+ *
+ * Behavior is roughly equivalent to pseudocode:
+ *
+ * pos = 0
+ * for (i = 0..output_size-1)
+ *   for (k = 0..block_size-1)
+ *     out[i*block_size + k] = 0
+ *   start_offset = offsets[i]
+ *   end_offset = offsets[i+1]
+ *   length = end_offset - start_offset
+ *   for (j = start_offset..end_offset-1)
+ *     for (k = 0..block_size-1)
+ *       out[i*block_size + k] += input[indices[pos]*block_size + k] *
+ *           (weights ? weights[IS_WEIGHT_POSITIONAL ? j - start_offset : pos] : 1.0)
+ *     pos += 1
+ *   if (normalize_by_lengths && length > 0)
+ *     for (k = 0..block_size-1)
+ *       out[i*block_size + k] /= length
+ *
+ * TODO: make this API also take "offsets" rather than "lengths" to match the
+ *       API for PyTorch's EmbeddingBag (NOTE(review): the signature below already takes `offsets` -- this TODO may be stale)
+ */
+// clang-format on
+template <
+    typename IndexType,
+    typename InType,
+    typename OutType,
+    bool IS_WEIGHT_POSITIONAL = false>
+void EmbeddingLookupIdx(
+    const std::int64_t block_size,
+    const std::int64_t output_size,
+    const std::int64_t index_size,
+    const std::int64_t data_size,
+    const InType* input,
+    const IndexType* indices,
+    const IndexType* offsets,
+    const float* weights, // optional, can be null for non-weighted sum
+    const float* scale_bias, // optional scale & bias params for uint8 input
+    bool normalize_by_lengths,
+    OutType* out);
+
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h
new file mode 100644
index 0000000000000000000000000000000000000000..5586b37e59707104b1138b4f806200ded8466e87
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/crc_alt.h
@@ -0,0 +1,1348 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// //////////////////////////////////////////////////////////
+// Crc32.h
+// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved.
+// Slicing-by-16 contributed by Bulat Ziganshin
+// Tableless bytewise CRC contributed by Hagai Gold
+// see http://create.stephan-brumme.com/disclaimer.html
+//
+
+// if running on an embedded system, you might consider shrinking the
+// big Crc32Lookup table by undefining these lines:
+#define CRC32_USE_LOOKUP_TABLE_BYTE
+#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_4
+#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_8
+#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+// - crc32_bitwise  doesn't need it at all
+// - crc32_halfbyte has its own small lookup table
+// - crc32_1byte_tableless and crc32_1byte_tableless2 don't need it at all
+// - crc32_1byte    needs only Crc32Lookup[0]
+// - crc32_4bytes   needs only Crc32Lookup[0..3]
+// - crc32_8bytes   needs only Crc32Lookup[0..7]
+// - crc32_4x8bytes needs only Crc32Lookup[0..7]
+// - crc32_16bytes  needs all of Crc32Lookup
+// using the aforementioned #defines the table is automatically fitted to your needs
+
+// uint8_t, uint32_t, int32_t
+#include <stdint.h>
+// size_t
+#include <cstddef>
+
+// crc32_fast selects the fastest compiled-in algorithm depending on flags (CRC32_USE_LOOKUP_...)
+/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs; pass an earlier result as previousCrc32 to continue a running checksum
+uint32_t crc32_fast    (const void* data, size_t length, uint32_t previousCrc32 = 0);
+
+/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA))
+uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);
+
+/// compute CRC32 (bitwise algorithm, no lookup table)
+uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
+/// compute CRC32 (half-byte algorithm, private 16-entry table)
+uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);
+
+#ifdef CRC32_USE_LOOKUP_TABLE_BYTE
+/// compute CRC32 (standard algorithm, one table lookup per byte)
+uint32_t crc32_1byte   (const void* data, size_t length, uint32_t previousCrc32 = 0);
+#endif
+
+/// compute CRC32 (byte algorithm) without lookup tables, shift/multiply variant
+uint32_t crc32_1byte_tableless (const void* data, size_t length, uint32_t previousCrc32 = 0);
+/// compute CRC32 (byte algorithm) without lookup tables, per-bit mask variant
+uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32 = 0);
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4
+/// compute CRC32 (Slicing-by-4 algorithm)
+uint32_t crc32_4bytes  (const void* data, size_t length, uint32_t previousCrc32 = 0);
+#endif
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8
+/// compute CRC32 (Slicing-by-8 algorithm)
+uint32_t crc32_8bytes  (const void* data, size_t length, uint32_t previousCrc32 = 0);
+/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times
+uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32 = 0);
+#endif
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+/// compute CRC32 (Slicing-by-16 algorithm)
+uint32_t crc32_16bytes (const void* data, size_t length, uint32_t previousCrc32 = 0);
+/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks); prefetchAhead is the byte distance ahead of the read position that gets prefetched
+uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32 = 0, size_t prefetchAhead = 256);
+#endif
+
+// //////////////////////////////////////////////////////////
+// Crc32.cpp
+// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved.
+// Slicing-by-16 contributed by Bulat Ziganshin
+// Tableless bytewise CRC contributed by Hagai Gold
+// see http://create.stephan-brumme.com/disclaimer.html
+//
+
+// if running on an embedded system, you might consider shrinking the
+// big Crc32Lookup table:
+// - crc32_bitwise  doesn't need it at all
+// - crc32_halfbyte has its own small lookup table
+// - crc32_1byte    needs only Crc32Lookup[0]
+// - crc32_4bytes   needs only Crc32Lookup[0..3]
+// - crc32_8bytes   needs only Crc32Lookup[0..7]
+// - crc32_4x8bytes needs only Crc32Lookup[0..7]
+// - crc32_16bytes  needs all of Crc32Lookup
+
+
+#ifndef __LITTLE_ENDIAN // fallback value, used only if nothing defined it before
+  #define __LITTLE_ENDIAN 1234
+#endif
+#ifndef __BIG_ENDIAN // fallback value, used only if nothing defined it before
+  #define __BIG_ENDIAN    4321
+#endif
+
+// define endianness (__BYTE_ORDER) and, on Windows, the PREFETCH macro
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  // Windows always little endian
+  #define __BYTE_ORDER __LITTLE_ENDIAN
+
+  // intrinsics / prefetching
+  #if defined(_M_ARM64)
+    #include <intrin.h>
+  #else
+    #include <xmmintrin.h>
+  #endif
+
+  #ifdef __MINGW32__
+    #define PREFETCH(location) __builtin_prefetch(location)
+  #else
+    #if defined(_M_ARM64)
+      #define PREFETCH(location) __prefetch(location)
+    #else
+      #define PREFETCH(location) _mm_prefetch(location, _MM_HINT_T0)
+    #endif
+  #endif
+#elif defined(__APPLE__)
+  #include <TargetConditionals.h>
+    #if TARGET_IPHONE_SIMULATOR
+      #define __BYTE_ORDER __LITTLE_ENDIAN
+    #elif TARGET_OS_IPHONE
+      #define __BYTE_ORDER __LITTLE_ENDIAN
+    #elif TARGET_OS_MAC
+      #include <machine/endian.h>
+      #if defined(__BIG_ENDIAN__)
+          #define __BYTE_ORDER __BIG_ENDIAN
+      #endif
+      #if defined(__LITTLE_ENDIAN__)
+        #define __BYTE_ORDER __LITTLE_ENDIAN
+      #endif
+    #else
+      # error "Unknown Apple platform"
+    #endif
+#elif defined(__ARMEB__)
+  #define __BYTE_ORDER __BIG_ENDIAN
+#elif (defined(__BYTE_ORDER__) and !defined(__BYTE_ORDER)) // derive __BYTE_ORDER from __BYTE_ORDER__ when only the latter exists
+    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        #define __BYTE_ORDER __BIG_ENDIAN
+    #else
+        #define __BYTE_ORDER __LITTLE_ENDIAN
+    #endif
+#else
+  // defines __BYTE_ORDER as __LITTLE_ENDIAN or __BIG_ENDIAN
+  #include <sys/param.h>
+#endif
+
+// intrinsics / prefetching: __GNUC__ compilers use the builtin, everything else keeps an earlier PREFETCH or gets a no-op
+#ifdef __GNUC__
+  #define PREFETCH(location) __builtin_prefetch(location)
+#else
+#ifndef PREFETCH
+  // no prefetching: the macro expands to an empty statement
+  #define PREFETCH(location) ;
+#endif
+#endif
+
+// abort if byte order is undefined (none of the branches above set __BYTE_ORDER)
+#ifndef __BYTE_ORDER
+#error undefined byte order, compile with -D__BYTE_ORDER=1234 (if little endian) or -D__BYTE_ORDER=4321 (big endian)
+#endif
+
+
+namespace
+{
+  /// reflected CRC32 polynomial (the one zlib uses)
+  const uint32_t Polynomial = 0xEDB88320;
+
+  /// reverse the byte order of a 32 bit value
+  static inline uint32_t swap(uint32_t value)
+  {
+  #if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap32(value);
+  #else
+    const uint32_t lowest = (value << 24);               // byte 0 -> byte 3
+    const uint32_t lower  = (value <<  8) & 0x00FF0000;  // byte 1 -> byte 2
+    const uint32_t upper  = (value >>  8) & 0x0000FF00;  // byte 2 -> byte 1
+    return lowest | lower | upper | (value >> 24);       // byte 3 -> byte 0
+  #endif
+  }
+
+  /// number of 256-entry tables in Crc32Lookup, chosen by the widest enabled slicing variant
+  #if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16)
+  const size_t MaxSlice = 16;
+  #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8)
+  const size_t MaxSlice = 8;
+  #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4)
+  const size_t MaxSlice = 4;
+  #elif defined(CRC32_USE_LOOKUP_TABLE_BYTE)
+  const size_t MaxSlice = 1;
+  #else
+    #define NO_LUT // don't need Crc32Lookup at all
+  #endif
+
+} // anonymous namespace
+
+#ifndef NO_LUT
+/// forward declaration, table is at the end of this file
+extern const uint32_t Crc32Lookup[MaxSlice][256]; // extern is needed to keep compiler happy
+#endif
+
+
+/// compute CRC32 bit by bit (no lookup table involved)
+uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
+{
+  const uint8_t* bytes = (const uint8_t*) data;
+  uint32_t state = ~previousCrc32; // initial inversion, same as previousCrc32 ^ 0xFFFFFFFF
+
+  for (size_t pos = 0; pos < length; pos++)
+  {
+    // fold the next input byte into the low eight bits
+    state ^= bytes[pos];
+
+    // shift those eight bits out again, one at a time
+    for (int bit = 0; bit < 8; bit++)
+    {
+      // all ones if the bit about to be shifted out is set, zero otherwise;
+      // masking with it replaces the slower "if (state & 1)" branch
+      const uint32_t mask = -int32_t(state & 1);
+
+      state = (state >> 1) ^ (mask & Polynomial);
+    }
+  }
+
+  // final inversion, same as state ^ 0xFFFFFFFF
+  return ~state;
+}
+
+
+/// compute CRC32 four bits at a time (needs only a 16-entry table)
+uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
+{
+  /// CRC contribution of every possible nibble, same as crc32Lookup[0][16*i]
+  static const uint32_t NibbleTable[16] =
+  {
+    0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C,
+    0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C
+  };
+
+  const uint8_t* bytes = (const uint8_t*) data;
+  uint32_t state = ~previousCrc32; // initial inversion, same as previousCrc32 ^ 0xFFFFFFFF
+
+  for (size_t pos = 0; pos < length; pos++)
+  {
+    const uint8_t byte = bytes[pos];
+    state = NibbleTable[(state ^  byte      ) & 0x0F] ^ (state >> 4); // low nibble first
+    state = NibbleTable[(state ^ (byte >> 4)) & 0x0F] ^ (state >> 4); // then the high nibble
+  }
+
+  return ~state; // final inversion, same as state ^ 0xFFFFFFFF
+}
+
+
+#ifdef CRC32_USE_LOOKUP_TABLE_BYTE
+/// compute CRC32 one byte at a time with a single lookup into Crc32Lookup[0]
+uint32_t crc32_1byte(const void* data, size_t length, uint32_t previousCrc32)
+{
+  const uint8_t* bytes = (const uint8_t*) data;
+  uint32_t state = ~previousCrc32; // initial inversion, same as previousCrc32 ^ 0xFFFFFFFF
+
+  for (size_t pos = 0; pos < length; pos++)
+    state = Crc32Lookup[0][(state & 0xFF) ^ bytes[pos]] ^ (state >> 8);
+
+  return ~state; // final inversion, same as state ^ 0xFFFFFFFF
+}
+#endif
+
+
+/// compute CRC32 (byte algorithm) without lookup tables, using shifts and multiplications by constants instead
+uint32_t crc32_1byte_tableless(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint8_t* current = (const uint8_t*) data;
+
+  while (length-- != 0)
+  {
+    uint8_t s = uint8_t(crc) ^ *current++; // low byte of the running crc combined with the next input byte
+
+    // Hagai Gold made me aware of this table-less algorithm and sent me the code
+
+    // polynomial 0xEDB88320 can be written in binary as 11101101101110001000001100100000b
+    // reverse the bits (or just assume bit 0 is the first one)
+    // and we have bits set at position 0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 16, 22, 23, 26
+    // => those are the shift offsets:
+    //crc = (crc >> 8) ^
+    //       t ^
+    //      (t >>  1) ^ (t >>  2) ^ (t >>  4) ^ (t >>  5) ^  // == y
+    //      (t >>  7) ^ (t >>  8) ^ (t >> 10) ^ (t >> 11) ^  // == y >> 6
+    //      (t >> 12) ^ (t >> 16) ^                          // == z
+    //      (t >> 22) ^ (t >> 26) ^                          // == z >> 10
+    //      (t >> 23);
+
+    // the fastest I can come up with (multiplying by a sum of powers of two stands in for several shifts):
+    uint32_t low = (s ^ (s << 6)) & 0xFF;
+    uint32_t a   = (low * ((1 << 23) + (1 << 14) + (1 << 2)));
+    crc = (crc >> 8) ^
+          (low * ((1 << 24) + (1 << 16) + (1 << 8))) ^
+           a ^
+          (a >> 1) ^
+          (low * ((1 << 20) + (1 << 12)           )) ^
+          (low << 19) ^
+          (low << 17) ^
+          (low >>  2);
+
+    // Hagai's code (kept for reference, not compiled):
+    /*uint32_t t = (s ^ (s << 6)) << 24;
+    // some temporaries to optimize XOR
+    uint32_t x = (t >> 1) ^ (t >> 2);
+    uint32_t y = x ^ (x >> 3);
+    uint32_t z = (t >> 12) ^ (t >> 16);
+    crc = (crc >> 8) ^
+           t ^ (t >> 23) ^
+           y ^ (y >>  6) ^
+           z ^ (z >> 10);*/
+  }
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+
+
+/// compute CRC32 (byte algorithm) without lookup tables: each of the eight low bits selects a polynomial term through an all-ones/all-zeros mask
+uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t crc = ~previousCrc32; // unsigned on purpose: left-shifting a negative int32_t is undefined before C++20
+  const uint8_t* current = (const uint8_t*) data;
+
+  while (length-- != 0)
+  {
+    crc ^= *current++;
+    // (0u - bit) is 0xFFFFFFFF when that bit of crc is set and 0 otherwise (well-defined unsigned wrap-around)
+    uint32_t c = ((0u - ( crc       & 1u)) & ((Polynomial >> 7)  ^ (Polynomial >> 1))) ^
+                 ((0u - ((crc >> 1) & 1u)) & ((Polynomial >> 6)  ^  Polynomial)) ^
+                 ((0u - ((crc >> 2) & 1u)) &  (Polynomial >> 5)) ^
+                 ((0u - ((crc >> 3) & 1u)) &  (Polynomial >> 4)) ^
+                 ((0u - ((crc >> 4) & 1u)) &  (Polynomial >> 3)) ^
+                 ((0u - ((crc >> 5) & 1u)) &  (Polynomial >> 2)) ^
+                 ((0u - ((crc >> 6) & 1u)) &  (Polynomial >> 1)) ^
+                 ((0u - ((crc >> 7) & 1u)) &   Polynomial);
+
+    crc = (crc >> 8) ^ c; // drop the consumed low byte and mix in its contribution
+  }
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4
+/// compute CRC32 (Slicing-by-4 algorithm): one 32 bit word and four table lookups per loop iteration
+uint32_t crc32_4bytes(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t  crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint32_t* current = (const uint32_t*) data; // NOTE(review): data is read through a uint32_t pointer; assumes unaligned 32 bit loads are acceptable on the target -- confirm
+
+  // process four bytes at once (Slicing-by-4)
+  while (length >= 4)
+  {
+#if __BYTE_ORDER == __BIG_ENDIAN // crc is byte-swapped before it is combined with the loaded word
+    uint32_t one = *current++ ^ swap(crc);
+    crc = Crc32Lookup[0][ one      & 0xFF] ^
+          Crc32Lookup[1][(one>> 8) & 0xFF] ^
+          Crc32Lookup[2][(one>>16) & 0xFF] ^
+          Crc32Lookup[3][(one>>24) & 0xFF];
+#else
+    uint32_t one = *current++ ^ crc;
+    crc = Crc32Lookup[0][(one>>24) & 0xFF] ^
+          Crc32Lookup[1][(one>>16) & 0xFF] ^
+          Crc32Lookup[2][(one>> 8) & 0xFF] ^
+          Crc32Lookup[3][ one      & 0xFF];
+#endif
+
+    length -= 4;
+  }
+
+  const uint8_t* currentChar = (const uint8_t*) current;
+  // remaining 0 to 3 bytes (standard algorithm)
+  while (length-- != 0)
+    crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++];
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+#endif
+
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8
+/// compute CRC32 (Slicing-by-8 algorithm): two 32 bit words and eight table lookups per loop iteration
+uint32_t crc32_8bytes(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint32_t* current = (const uint32_t*) data; // NOTE(review): data is read through a uint32_t pointer; assumes unaligned 32 bit loads are acceptable on the target -- confirm
+
+  // process eight bytes at once (Slicing-by-8)
+  while (length >= 8)
+  {
+#if __BYTE_ORDER == __BIG_ENDIAN // crc is byte-swapped before it is combined with the first word
+    uint32_t one = *current++ ^ swap(crc);
+    uint32_t two = *current++;
+    crc = Crc32Lookup[0][ two      & 0xFF] ^
+          Crc32Lookup[1][(two>> 8) & 0xFF] ^
+          Crc32Lookup[2][(two>>16) & 0xFF] ^
+          Crc32Lookup[3][(two>>24) & 0xFF] ^
+          Crc32Lookup[4][ one      & 0xFF] ^
+          Crc32Lookup[5][(one>> 8) & 0xFF] ^
+          Crc32Lookup[6][(one>>16) & 0xFF] ^
+          Crc32Lookup[7][(one>>24) & 0xFF];
+#else
+    uint32_t one = *current++ ^ crc;
+    uint32_t two = *current++;
+    crc = Crc32Lookup[0][(two>>24) & 0xFF] ^
+          Crc32Lookup[1][(two>>16) & 0xFF] ^
+          Crc32Lookup[2][(two>> 8) & 0xFF] ^
+          Crc32Lookup[3][ two      & 0xFF] ^
+          Crc32Lookup[4][(one>>24) & 0xFF] ^
+          Crc32Lookup[5][(one>>16) & 0xFF] ^
+          Crc32Lookup[6][(one>> 8) & 0xFF] ^
+          Crc32Lookup[7][ one      & 0xFF];
+#endif
+
+    length -= 8;
+  }
+
+  const uint8_t* currentChar = (const uint8_t*) current;
+  // remaining 0 to 7 bytes (standard algorithm)
+  while (length-- != 0)
+    crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++];
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+
+
+/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times so that 32 bytes are consumed per outer iteration
+uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint32_t* current = (const uint32_t*) data; // NOTE(review): data is read through a uint32_t pointer; assumes unaligned 32 bit loads are acceptable on the target -- confirm
+
+  // enabling optimization (at least -O2) automatically unrolls the inner for-loop
+  const size_t Unroll = 4;
+  const size_t BytesAtOnce = 8 * Unroll;
+
+  // process 4x eight bytes at once (Slicing-by-8)
+  while (length >= BytesAtOnce)
+  {
+    for (size_t unrolling = 0; unrolling < Unroll; unrolling++)
+    {
+#if __BYTE_ORDER == __BIG_ENDIAN // crc is byte-swapped before it is combined with the first word
+      uint32_t one = *current++ ^ swap(crc);
+      uint32_t two = *current++;
+      crc = Crc32Lookup[0][ two      & 0xFF] ^
+            Crc32Lookup[1][(two>> 8) & 0xFF] ^
+            Crc32Lookup[2][(two>>16) & 0xFF] ^
+            Crc32Lookup[3][(two>>24) & 0xFF] ^
+            Crc32Lookup[4][ one      & 0xFF] ^
+            Crc32Lookup[5][(one>> 8) & 0xFF] ^
+            Crc32Lookup[6][(one>>16) & 0xFF] ^
+            Crc32Lookup[7][(one>>24) & 0xFF];
+#else
+      uint32_t one = *current++ ^ crc;
+      uint32_t two = *current++;
+      crc = Crc32Lookup[0][(two>>24) & 0xFF] ^
+            Crc32Lookup[1][(two>>16) & 0xFF] ^
+            Crc32Lookup[2][(two>> 8) & 0xFF] ^
+            Crc32Lookup[3][ two      & 0xFF] ^
+            Crc32Lookup[4][(one>>24) & 0xFF] ^
+            Crc32Lookup[5][(one>>16) & 0xFF] ^
+            Crc32Lookup[6][(one>> 8) & 0xFF] ^
+            Crc32Lookup[7][ one      & 0xFF];
+#endif
+
+    }
+
+    length -= BytesAtOnce;
+  }
+
+  const uint8_t* currentChar = (const uint8_t*) current;
+  // remaining 0 to 31 bytes (standard algorithm)
+  while (length-- != 0)
+    crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++];
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8
+
+
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+/// compute CRC32 (Slicing-by-16 algorithm): four 32 bit words and sixteen table lookups per step, four steps per outer iteration
+uint32_t crc32_16bytes(const void* data, size_t length, uint32_t previousCrc32)
+{
+  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint32_t* current = (const uint32_t*) data; // NOTE(review): data is read through a uint32_t pointer; assumes unaligned 32 bit loads are acceptable on the target -- confirm
+
+  // enabling optimization (at least -O2) automatically unrolls the inner for-loop
+  const size_t Unroll = 4;
+  const size_t BytesAtOnce = 16 * Unroll;
+
+  while (length >= BytesAtOnce)
+  {
+    for (size_t unrolling = 0; unrolling < Unroll; unrolling++)
+    {
+#if __BYTE_ORDER == __BIG_ENDIAN // crc is byte-swapped before it is combined with the first word
+    uint32_t one   = *current++ ^ swap(crc);
+    uint32_t two   = *current++;
+    uint32_t three = *current++;
+    uint32_t four  = *current++;
+    crc  = Crc32Lookup[ 0][ four         & 0xFF] ^
+           Crc32Lookup[ 1][(four  >>  8) & 0xFF] ^
+           Crc32Lookup[ 2][(four  >> 16) & 0xFF] ^
+           Crc32Lookup[ 3][(four  >> 24) & 0xFF] ^
+           Crc32Lookup[ 4][ three        & 0xFF] ^
+           Crc32Lookup[ 5][(three >>  8) & 0xFF] ^
+           Crc32Lookup[ 6][(three >> 16) & 0xFF] ^
+           Crc32Lookup[ 7][(three >> 24) & 0xFF] ^
+           Crc32Lookup[ 8][ two          & 0xFF] ^
+           Crc32Lookup[ 9][(two   >>  8) & 0xFF] ^
+           Crc32Lookup[10][(two   >> 16) & 0xFF] ^
+           Crc32Lookup[11][(two   >> 24) & 0xFF] ^
+           Crc32Lookup[12][ one          & 0xFF] ^
+           Crc32Lookup[13][(one   >>  8) & 0xFF] ^
+           Crc32Lookup[14][(one   >> 16) & 0xFF] ^
+           Crc32Lookup[15][(one   >> 24) & 0xFF];
+#else
+    uint32_t one   = *current++ ^ crc;
+    uint32_t two   = *current++;
+    uint32_t three = *current++;
+    uint32_t four  = *current++;
+    crc  = Crc32Lookup[ 0][(four  >> 24) & 0xFF] ^
+           Crc32Lookup[ 1][(four  >> 16) & 0xFF] ^
+           Crc32Lookup[ 2][(four  >>  8) & 0xFF] ^
+           Crc32Lookup[ 3][ four         & 0xFF] ^
+           Crc32Lookup[ 4][(three >> 24) & 0xFF] ^
+           Crc32Lookup[ 5][(three >> 16) & 0xFF] ^
+           Crc32Lookup[ 6][(three >>  8) & 0xFF] ^
+           Crc32Lookup[ 7][ three        & 0xFF] ^
+           Crc32Lookup[ 8][(two   >> 24) & 0xFF] ^
+           Crc32Lookup[ 9][(two   >> 16) & 0xFF] ^
+           Crc32Lookup[10][(two   >>  8) & 0xFF] ^
+           Crc32Lookup[11][ two          & 0xFF] ^
+           Crc32Lookup[12][(one   >> 24) & 0xFF] ^
+           Crc32Lookup[13][(one   >> 16) & 0xFF] ^
+           Crc32Lookup[14][(one   >>  8) & 0xFF] ^
+           Crc32Lookup[15][ one          & 0xFF];
+#endif
+    }
+
+    length -= BytesAtOnce;
+  }
+
+  const uint8_t* currentChar = (const uint8_t*) current;
+  // remaining 0 to 63 bytes (standard algorithm)
+  while (length-- != 0)
+    crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++];
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+
+
+/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks); prefetchAhead is the byte distance ahead of the read position that gets prefetched
+uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32, size_t prefetchAhead)
+{
+  // CRC code is identical to crc32_16bytes (including unrolling), only added prefetching
+  // 256 bytes look-ahead seems to be the sweet spot on Core i7 CPUs
+
+  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
+  const uint32_t* current = (const uint32_t*) data; // NOTE(review): data is read through a uint32_t pointer; assumes unaligned 32 bit loads are acceptable on the target -- confirm
+
+  // enabling optimization (at least -O2) automatically unrolls the for-loop
+  const size_t Unroll = 4;
+  const size_t BytesAtOnce = 16 * Unroll;
+
+  while (length >= BytesAtOnce && length - BytesAtOnce >= prefetchAhead) // same as length >= BytesAtOnce + prefetchAhead, but the sum cannot wrap around for a huge prefetchAhead
+  {
+    PREFETCH(((const char*) current) + prefetchAhead);
+
+    for (size_t unrolling = 0; unrolling < Unroll; unrolling++)
+    {
+#if __BYTE_ORDER == __BIG_ENDIAN // crc is byte-swapped before it is combined with the first word
+    uint32_t one   = *current++ ^ swap(crc);
+    uint32_t two   = *current++;
+    uint32_t three = *current++;
+    uint32_t four  = *current++;
+    crc  = Crc32Lookup[ 0][ four         & 0xFF] ^
+           Crc32Lookup[ 1][(four  >>  8) & 0xFF] ^
+           Crc32Lookup[ 2][(four  >> 16) & 0xFF] ^
+           Crc32Lookup[ 3][(four  >> 24) & 0xFF] ^
+           Crc32Lookup[ 4][ three        & 0xFF] ^
+           Crc32Lookup[ 5][(three >>  8) & 0xFF] ^
+           Crc32Lookup[ 6][(three >> 16) & 0xFF] ^
+           Crc32Lookup[ 7][(three >> 24) & 0xFF] ^
+           Crc32Lookup[ 8][ two          & 0xFF] ^
+           Crc32Lookup[ 9][(two   >>  8) & 0xFF] ^
+           Crc32Lookup[10][(two   >> 16) & 0xFF] ^
+           Crc32Lookup[11][(two   >> 24) & 0xFF] ^
+           Crc32Lookup[12][ one          & 0xFF] ^
+           Crc32Lookup[13][(one   >>  8) & 0xFF] ^
+           Crc32Lookup[14][(one   >> 16) & 0xFF] ^
+           Crc32Lookup[15][(one   >> 24) & 0xFF];
+#else
+    uint32_t one   = *current++ ^ crc;
+    uint32_t two   = *current++;
+    uint32_t three = *current++;
+    uint32_t four  = *current++;
+    crc  = Crc32Lookup[ 0][(four  >> 24) & 0xFF] ^
+           Crc32Lookup[ 1][(four  >> 16) & 0xFF] ^
+           Crc32Lookup[ 2][(four  >>  8) & 0xFF] ^
+           Crc32Lookup[ 3][ four         & 0xFF] ^
+           Crc32Lookup[ 4][(three >> 24) & 0xFF] ^
+           Crc32Lookup[ 5][(three >> 16) & 0xFF] ^
+           Crc32Lookup[ 6][(three >>  8) & 0xFF] ^
+           Crc32Lookup[ 7][ three        & 0xFF] ^
+           Crc32Lookup[ 8][(two   >> 24) & 0xFF] ^
+           Crc32Lookup[ 9][(two   >> 16) & 0xFF] ^
+           Crc32Lookup[10][(two   >>  8) & 0xFF] ^
+           Crc32Lookup[11][ two          & 0xFF] ^
+           Crc32Lookup[12][(one   >> 24) & 0xFF] ^
+           Crc32Lookup[13][(one   >> 16) & 0xFF] ^
+           Crc32Lookup[14][(one   >>  8) & 0xFF] ^
+           Crc32Lookup[15][ one          & 0xFF];
+#endif
+    }
+
+    length -= BytesAtOnce;
+  }
+
+  const uint8_t* currentChar = (const uint8_t*) current;
+  // remaining bytes, fewer than BytesAtOnce + prefetchAhead (standard algorithm)
+  while (length-- != 0)
+    crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++];
+
+  return ~crc; // same as crc ^ 0xFFFFFFFF
+}
+#endif
+
+
+/// compute CRC32 with the widest slicing variant that was compiled in (half-byte routine if no table is enabled)
+uint32_t crc32_fast(const void* data, size_t length, uint32_t previousCrc32)
+{
+#if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16)
+  return crc32_16bytes(data, length, previousCrc32);  // 16 tables available
+#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8)
+  return crc32_8bytes(data, length, previousCrc32);   // 8 tables available
+#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4)
+  return crc32_4bytes(data, length, previousCrc32);   // 4 tables available
+#elif defined(CRC32_USE_LOOKUP_TABLE_BYTE)
+  return crc32_1byte(data, length, previousCrc32);    // single table available
+#else
+  return crc32_halfbyte(data, length, previousCrc32); // no Crc32Lookup at all
+#endif
+}
+
+
+/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA))
+uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
+{
+  // based on Mark Adler's crc_combine from
+  // https://github.com/madler/pigz/blob/master/pigz.c
+
+  // main idea:
+  // - if you have two equally-sized blocks A and B,
+  //   then you can create a block C = A ^ B
+  //   which has the property crc(C) = crc(A) ^ crc(B)
+  // - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
+  //   and   prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
+  //   then there exists a C' = A' ^ B'
+  // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
+  // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
+  // - the trick is to compute crc(A') based on crc(A)
+  //                       and crc(B') based on crc(B)
+  // - since B' starts with many zeros, the crc of those initial zeros is still zero
+  // - that means crc(B') = crc(B)
+  // - unfortunately the trailing zeros of A' change the crc, so usually crc(A') != crc(A)
+  // - the following code is a fast algorithm to compute crc(A')
+  // - starting with crc(A) and appending length(B) zeros, needing just log2(length(B)) iterations
+  // - the details are explained by the original author at
+  //   https://stackoverflow.com/questions/23122312/crc-calculation-of-a-mostly-static-data-stream/23126768
+  //
+  // notes:
+  // - I squeezed everything into one function to keep global namespace clean (original code had two helper functions)
+  // - most original comments are still in place, I added comments where these helper functions were made inline code
+  // - performance-wise there isn't any difference to the original zlib/pigz code
+
+  // degenerated case: nothing was appended, so crcA already is the result
+  if (lengthB == 0)
+    return crcA;
+
+  /// CRC32 => 32 bits
+  const uint32_t CrcBits = 32;
+
+  uint32_t odd [CrcBits]; // odd-power-of-two  zeros operator
+  uint32_t even[CrcBits]; // even-power-of-two zeros operator
+
+  // put operator for one zero bit in odd
+  odd[0] = Polynomial;    // CRC-32 polynomial
+  for (uint32_t i = 1; i < CrcBits; i++)
+    odd[i] = 1 << (i - 1);
+
+  // put operator for two zero bits in even
+  // same as gf2_matrix_square(even, odd);
+  for (uint32_t i = 0; i < CrcBits; i++)
+  {
+    uint32_t vec = odd[i];
+    even[i] = 0;
+    for (int j = 0; vec != 0; j++, vec >>= 1)
+      if (vec & 1)
+        even[i] ^= odd[j];
+  }
+  // put operator for four zero bits in odd
+  // same as gf2_matrix_square(odd, even);
+  for (uint32_t i = 0; i < CrcBits; i++)
+  {
+    uint32_t vec = even[i];
+    odd[i] = 0;
+    for (int j = 0; vec != 0; j++, vec >>= 1)
+      if (vec & 1)
+        odd[i] ^= even[j];
+  }
+
+  // the following loop becomes much shorter if I keep swapping even and odd
+  uint32_t* a = even;
+  uint32_t* b = odd;
+  // apply lengthB zero bytes to crcA, consuming one bit of lengthB per iteration
+  for (; lengthB > 0; lengthB >>= 1)
+  {
+    // same as gf2_matrix_square(a, b);
+    for (uint32_t i = 0; i < CrcBits; i++)
+    {
+      uint32_t vec = b[i];
+      a[i] = 0;
+      for (int j = 0; vec != 0; j++, vec >>= 1)
+        if (vec & 1)
+          a[i] ^= b[j];
+    }
+
+    // apply zeros operator for this bit
+    if (lengthB & 1)
+    {
+      // same as crcA = gf2_matrix_times(a, crcA);
+      uint32_t sum = 0;
+      for (int i = 0; crcA != 0; i++, crcA >>= 1)
+        if (crcA & 1)
+          sum ^= a[i];
+      crcA = sum;
+    }
+
+    // switch even and odd
+    uint32_t* t = a; a = b; b = t;
+  }
+
+  // return combined crc
+  return crcA ^ crcB;
+}
+
+
+// //////////////////////////////////////////////////////////
+// constants
+
+
+#ifndef NO_LUT
+/// look-up table, already declared above
+const uint32_t Crc32Lookup[MaxSlice][256] =
+{
+  //// same algorithm as crc32_bitwise
+  //for (int i = 0; i <= 0xFF; i++)
+  //{
+  //  uint32_t crc = i;
+  //  for (int j = 0; j < 8; j++)
+  //    crc = (crc >> 1) ^ ((crc & 1) * Polynomial);
+  //  Crc32Lookup[0][i] = crc;
+  //}
+  //// ... and the following slicing-by-8 algorithm (from Intel):
+  //// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
+  //// http://sourceforge.net/projects/slicing-by-8/
+  //for (int slice = 1; slice < MaxSlice; slice++)
+  //  Crc32Lookup[slice][i] = (Crc32Lookup[slice - 1][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[slice - 1][i] & 0xFF];
+  {
+    // note: the first number of every second row corresponds to the half-byte look-up table !
+    0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3,
+    0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91,
+    0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7,
+    0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5,
+    0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B,
+    0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59,
+    0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F,
+    0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D,
+    0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433,
+    0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01,
+    0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457,
+    0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65,
+    0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB,
+    0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9,
+    0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F,
+    0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD,
+    0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683,
+    0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1,
+    0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7,
+    0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5,
+    0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B,
+    0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79,
+    0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F,
+    0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D,
+    0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713,
+    0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21,
+    0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777,
+    0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45,
+    0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB,
+    0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9,
+    0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF,
+    0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D,
+  }
+
+#if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16)
+  // beyond this point only relevant for Slicing-by-4, Slicing-by-8 and Slicing-by-16
+  ,{
+    0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7,
+    0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF,
+    0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496,
+    0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E,
+    0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265,
+    0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D,
+    0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034,
+    0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C,
+    0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2,
+    0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA,
+    0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93,
+    0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B,
+    0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60,
+    0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768,
+    0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31,
+    0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539,
+    0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C,
+    0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484,
+    0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD,
+    0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5,
+    0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E,
+    0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026,
+    0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F,
+    0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277,
+    0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189,
+    0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81,
+    0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8,
+    0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0,
+    0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B,
+    0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23,
+    0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A,
+    0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72,
+  },
+
+  {
+    0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685,
+    0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D,
+    0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5,
+    0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D,
+    0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065,
+    0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD,
+    0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315,
+    0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD,
+    0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45,
+    0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD,
+    0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835,
+    0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D,
+    0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5,
+    0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D,
+    0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5,
+    0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D,
+    0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05,
+    0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD,
+    0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75,
+    0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD,
+    0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5,
+    0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D,
+    0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895,
+    0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D,
+    0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5,
+    0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D,
+    0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5,
+    0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D,
+    0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625,
+    0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D,
+    0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555,
+    0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED,
+  },
+
+  {
+    0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9,
+    0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056,
+    0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26,
+    0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9,
+    0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787,
+    0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68,
+    0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018,
+    0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7,
+    0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084,
+    0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B,
+    0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B,
+    0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4,
+    0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA,
+    0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755,
+    0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825,
+    0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA,
+    0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82,
+    0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D,
+    0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D,
+    0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2,
+    0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC,
+    0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953,
+    0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623,
+    0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC,
+    0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF,
+    0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50,
+    0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120,
+    0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF,
+    0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981,
+    0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E,
+    0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E,
+    0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1,
+  }
+#endif // defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16)
+#if defined (CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16)
+  // beyond this point only relevant for Slicing-by-8 and Slicing-by-16
+  ,{
+    0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10,
+    0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1,
+    0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92,
+    0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053,
+    0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314,
+    0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5,
+    0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496,
+    0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57,
+    0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459,
+    0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98,
+    0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB,
+    0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A,
+    0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D,
+    0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C,
+    0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF,
+    0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E,
+    0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82,
+    0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743,
+    0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00,
+    0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1,
+    0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386,
+    0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847,
+    0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404,
+    0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5,
+    0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB,
+    0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A,
+    0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349,
+    0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888,
+    0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF,
+    0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E,
+    0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D,
+    0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C,
+  },
+
+  {
+    0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8,
+    0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5,
+    0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223,
+    0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E,
+    0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E,
+    0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3,
+    0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715,
+    0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578,
+    0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4,
+    0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9,
+    0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F,
+    0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22,
+    0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2,
+    0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F,
+    0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79,
+    0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14,
+    0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460,
+    0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D,
+    0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB,
+    0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496,
+    0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156,
+    0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B,
+    0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD,
+    0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0,
+    0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C,
+    0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61,
+    0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97,
+    0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA,
+    0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A,
+    0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957,
+    0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1,
+    0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC,
+  },
+
+  {
+    0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E,
+    0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9,
+    0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240,
+    0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27,
+    0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712,
+    0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975,
+    0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC,
+    0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB,
+    0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7,
+    0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590,
+    0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739,
+    0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E,
+    0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B,
+    0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C,
+    0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5,
+    0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2,
+    0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C,
+    0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B,
+    0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2,
+    0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5,
+    0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0,
+    0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387,
+    0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E,
+    0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49,
+    0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105,
+    0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62,
+    0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB,
+    0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC,
+    0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899,
+    0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE,
+    0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457,
+    0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30,
+  },
+
+  {
+    0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919,
+    0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC,
+    0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832,
+    0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387,
+    0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F,
+    0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA,
+    0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64,
+    0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1,
+    0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4,
+    0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041,
+    0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF,
+    0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A,
+    0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2,
+    0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217,
+    0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889,
+    0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C,
+    0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3,
+    0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776,
+    0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8,
+    0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D,
+    0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95,
+    0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520,
+    0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE,
+    0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B,
+    0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E,
+    0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B,
+    0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05,
+    0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0,
+    0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78,
+    0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD,
+    0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53,
+    0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6,
+  }
+#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 || CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+  // beyond this point only relevant for Slicing-by-16
+  ,{
+    0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9,
+    0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1,
+    0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8,
+    0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0,
+    0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A,
+    0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72,
+    0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B,
+    0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03,
+    0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE,
+    0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6,
+    0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF,
+    0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7,
+    0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D,
+    0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75,
+    0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C,
+    0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04,
+    0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86,
+    0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E,
+    0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7,
+    0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF,
+    0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25,
+    0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D,
+    0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54,
+    0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C,
+    0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81,
+    0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99,
+    0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0,
+    0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8,
+    0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22,
+    0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A,
+    0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53,
+    0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B,
+  },
+
+  {
+    0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79,
+    0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D,
+    0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91,
+    0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65,
+    0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9,
+    0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D,
+    0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941,
+    0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5,
+    0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9,
+    0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D,
+    0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31,
+    0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5,
+    0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09,
+    0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD,
+    0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1,
+    0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15,
+    0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278,
+    0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C,
+    0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390,
+    0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364,
+    0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8,
+    0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C,
+    0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040,
+    0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4,
+    0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8,
+    0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C,
+    0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430,
+    0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4,
+    0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608,
+    0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC,
+    0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0,
+    0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714,
+  },
+
+  {
+    0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583,
+    0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6,
+    0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148,
+    0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D,
+    0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54,
+    0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11,
+    0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F,
+    0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA,
+    0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C,
+    0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29,
+    0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7,
+    0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2,
+    0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB,
+    0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE,
+    0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770,
+    0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635,
+    0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C,
+    0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159,
+    0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7,
+    0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592,
+    0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB,
+    0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E,
+    0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00,
+    0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45,
+    0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3,
+    0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6,
+    0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38,
+    0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D,
+    0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624,
+    0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761,
+    0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF,
+    0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA,
+  },
+
+  {
+    0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F,
+    0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71,
+    0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473,
+    0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D,
+    0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277,
+    0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489,
+    0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B,
+    0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975,
+    0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F,
+    0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881,
+    0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383,
+    0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D,
+    0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587,
+    0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379,
+    0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B,
+    0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85,
+    0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F,
+    0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091,
+    0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93,
+    0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D,
+    0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97,
+    0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69,
+    0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B,
+    0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695,
+    0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F,
+    0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761,
+    0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63,
+    0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D,
+    0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67,
+    0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99,
+    0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B,
+    0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165,
+  },
+
+  {
+    0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658,
+    0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535,
+    0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082,
+    0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF,
+    0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD,
+    0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0,
+    0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77,
+    0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A,
+    0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3,
+    0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E,
+    0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129,
+    0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244,
+    0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06,
+    0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B,
+    0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC,
+    0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1,
+    0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F,
+    0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022,
+    0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595,
+    0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8,
+    0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA,
+    0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7,
+    0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60,
+    0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D,
+    0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4,
+    0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189,
+    0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E,
+    0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753,
+    0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911,
+    0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C,
+    0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB,
+    0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6,
+  },
+
+  {
+    0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0,
+    0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6,
+    0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC,
+    0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A,
+    0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218,
+    0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E,
+    0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74,
+    0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042,
+    0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31,
+    0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307,
+    0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D,
+    0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B,
+    0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9,
+    0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF,
+    0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985,
+    0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3,
+    0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522,
+    0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14,
+    0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E,
+    0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778,
+    0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA,
+    0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC,
+    0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196,
+    0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0,
+    0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3,
+    0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5,
+    0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF,
+    0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089,
+    0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B,
+    0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D,
+    0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667,
+    0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851,
+  },
+
+  {
+    0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A,
+    0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D,
+    0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055,
+    0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2,
+    0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184,
+    0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03,
+    0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB,
+    0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C,
+    0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467,
+    0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0,
+    0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28,
+    0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF,
+    0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9,
+    0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E,
+    0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6,
+    0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931,
+    0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1,
+    0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326,
+    0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE,
+    0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69,
+    0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F,
+    0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8,
+    0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70,
+    0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7,
+    0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC,
+    0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B,
+    0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93,
+    0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714,
+    0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42,
+    0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5,
+    0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D,
+    0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A,
+  },
+
+  {
+    0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875,
+    0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A,
+    0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB,
+    0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4,
+    0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308,
+    0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47,
+    0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96,
+    0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9,
+    0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F,
+    0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0,
+    0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011,
+    0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E,
+    0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2,
+    0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD,
+    0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C,
+    0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423,
+    0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581,
+    0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE,
+    0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F,
+    0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450,
+    0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC,
+    0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3,
+    0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062,
+    0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D,
+    0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B,
+    0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34,
+    0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5,
+    0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA,
+    0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806,
+    0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749,
+    0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698,
+    0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7,
+  }
+#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_16
+};
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..398b9e97c0bb68bb2fe2e3e223c641b7dd114acb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/file_adapter.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <fstream>
+#include <memory>
+
+#include "caffe2/serialize/istream_adapter.h"
+#include "caffe2/serialize/read_adapter_interface.h"
+
+namespace caffe2 {
+namespace serialize {
+
+class TORCH_API FileAdapter final : public ReadAdapterInterface { // FILE*-backed
+ public:
+  C10_DISABLE_COPY_AND_ASSIGN(FileAdapter);
+  explicit FileAdapter(const std::string& file_name); // defined out of line
+  size_t size() const override;
+  size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
+      const override; // `what` defaults to ""; its use is not visible here
+  ~FileAdapter() override;
+
+ private:
+  // RAII wrapper around a C FILE pointer; the destructor closes it.
+  struct RAIIFile {
+    FILE* fp_; // NOTE(review): FILE needs <cstdio>, not included directly here
+    explicit RAIIFile(const std::string& file_name);
+    ~RAIIFile();
+  };
+
+  RAIIFile file_; // presumably the file named in the constructor - TODO confirm
+  // Size of the opened file, in bytes.
+  uint64_t size_;
+};
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..394898e5ed08ec4c62c8868ae12cf846ad7bf22f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/in_memory_adapter.h
@@ -0,0 +1,35 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <caffe2/serialize/read_adapter_interface.h>
+#include <cstring>
+
+namespace caffe2 {
+namespace serialize {
+
+class MemoryReadAdapter final : public caffe2::serialize::ReadAdapterInterface {
+ public:
+  explicit MemoryReadAdapter(const void* data, off_t size)
+      : data_(data), size_(size) {} // stores the pointer; no copy is made
+
+  size_t size() const override {
+    return size_; // off_t member converted to size_t on return
+  }
+
+  size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
+      const override { // NOTE(review): pos + n is never checked against size_
+    (void)what; // unused here; the cast silences unused-parameter warnings
+    memcpy(buf, (int8_t*)(data_) + pos, n); // C-style cast also drops const
+    return n; // always reports the full requested length
+  }
+
+ private:
+  const void* data_; // not owned: this class never frees it
+  off_t size_; // byte count supplied by the caller
+};
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef3436b6fece5e661fa4977cafb8d8534f2235fd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/inline_container.h
@@ -0,0 +1,315 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <istream>
+#include <mutex>
+#include <ostream>
+#include <unordered_set>
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Backend.h>
+
+#include "caffe2/serialize/istream_adapter.h"
+#include "caffe2/serialize/read_adapter_interface.h"
+#include "caffe2/serialize/versions.h"
+
+extern "C" {
+typedef struct mz_zip_archive mz_zip_archive;
+}
+
+// PyTorch containers are a special zip archive with the following layout
+// archive_name.zip contains:
+//    archive_name/
+//        version # a file with a single decimal number written in ascii,
+//                # used to establish the version of the archive format
+//        model.json # overall model description, this is a json output of
+//                   # ModelDef from torch.proto
+//        # the following names are by convention only, model.json will
+//        # refer to these files by full names
+//        tensors/
+//          0 # flat storage for tensor data, meta-data about shapes, etc. is
+//            # in model.json
+//          1
+//          ...
+//        # code entries will only exist for modules that have methods attached
+//        code/
+//          archive_name.py # serialized torch script code (python syntax,
+//                          # using PythonPrint)
+//          archive_name_my_submodule.py # submodules have separate files
+//
+// The PyTorchStreamWriter also ensures additional useful properties for these
+// files
+// 1. All files are stored uncompressed.
+// 2. All files in the archive are aligned to 64 byte boundaries such that
+//    it is possible to mmap the entire file and get an aligned pointer to
+//    tensor data.
+// 3. We universally write in ZIP64 format for consistency.
+
+// The PyTorchStreamReader also provides additional properties:
+// 1. It can read zip files that are created with common
+//    zip tools. This means that even though our writer doesn't compress files,
+//    the reader can still read files that were compressed.
+// 2. It provides a getRecordOffset function which returns the offset into the
+//    raw file where file data lives. If the file was written with
+//    PyTorchStreamWriter it is guaranteed to be 64 byte aligned.
+
+// PyTorchReader/Writer handle checking the version number on the archive format
+// and ensure that all files are written to an archive_name directory so they
+// unzip cleanly.
+
+// When developing this format we want to pay particular attention to the
+// following use cases:
+//
+// -- Reading --
+// 1) Reading with full random access
+//   a) Reading with file api's such as fread()
+//   b) mmaping the file and jumping around the mapped region
+// 2) Reading with 1-pass sequential access
+//      -> A reader will need to build up a data structure of parsed structures
+//         as it reads
+//
+// -- Writing --
+// 1) Writing with full random access
+// 2) Writing with 1-pass sequential access
+//      -> We must take care not to require updating values that have already
+//         been written. We place the variable-length index at the end and do
+//         not put any index into the header to fulfill this constraint.
+
+// The model.json, which contains all the metadata information,
+// should be written as the last file. One reason is that the size of tensor
+// data is usually stable. As long as the shape and type of the tensor do not
+// change, the size of the data won't change. On the other hand, the size of the
+// serialized model is likely to change, so we store it as the last record, and
+// we don't need to move previous records when updating the model data.
+
+// The zip format is sufficiently flexible to handle the above use-case.
+// It puts its central directory at the end of the archive and we write
+// model.json as the last file when writing after we have accumulated all
+// other information.
+
+namespace caffe2 {
+namespace serialize {
+
+static constexpr const char* kSerializationIdRecordName =
+    ".data/serialization_id";
+
+struct MzZipReaderIterWrapper;
+
+class TORCH_API ChunkRecordIterator {
+ public:
+  ~ChunkRecordIterator();
+
+  // Reads at most `chunkSize` bytes into `buf` and returns the number of bytes
+  // actually read.
+  size_t next(void* buf);
+  size_t recordSize() const {
+    return recordSize_; // value fixed at construction (const member)
+  }
+
+ private:
+  ChunkRecordIterator( // private: only friends can construct an iterator
+      size_t recordSize,
+      size_t chunkSize,
+      std::unique_ptr<MzZipReaderIterWrapper> iter);
+
+  const size_t recordSize_;
+  const size_t chunkSize_;
+  size_t offset_; // presumably the current position in the record - TODO confirm
+  std::unique_ptr<MzZipReaderIterWrapper> iter_; // type is only forward-declared
+
+  friend class PyTorchStreamReader; // grants access to the private constructor
+};
+
+class TORCH_API PyTorchStreamReader final { // reads records from an archive
+ public:
+  explicit PyTorchStreamReader(const std::string& file_name);
+  explicit PyTorchStreamReader(std::istream* in);
+  explicit PyTorchStreamReader(std::shared_ptr<ReadAdapterInterface> in);
+
+  // Returns (data pointer, size) for the record `name`.
+  // Pass `allocator` to override the default CPU allocator.
+  std::tuple<at::DataPtr, size_t> getRecord(
+      const std::string& name,
+      std::optional<at::Allocator*> allocator = std::nullopt);
+  // Multi-threaded overload of getRecord that also takes additionalReaders.
+  std::tuple<at::DataPtr, size_t> getRecord(
+      const std::string& name,
+      std::vector<std::shared_ptr<ReadAdapterInterface>>& additionalReaders,
+      std::optional<at::Allocator*> allocator = std::nullopt);
+  // In-place variant: writes into caller-provided memory at `dst`.
+  size_t getRecord(const std::string& name, void* dst, size_t n);
+  // In-place, multi-threaded variant.
+  // When additionalReaders is empty, the default behavior is to call
+  // getRecord(name, dst, n) with the default reader. This approach can be used
+  // for reading large tensors.
+  size_t getRecord(
+      const std::string& name,
+      void* dst,
+      size_t n,
+      std::vector<std::shared_ptr<ReadAdapterInterface>>& additionalReaders);
+  size_t getRecord(
+      const std::string& name,
+      void* dst,
+      size_t n,
+      size_t chunk_size,
+      void* buf,
+      const std::function<void(void*, const void*, size_t)>& memcpy_func =
+          nullptr); // nullptr default yields an empty std::function
+
+  // Reads one record concurrently with multiple readers.
+  // additionalReaders are extra clients that access the underlying record at
+  // different offsets and write to different chunks of the buffer. Example from
+  // the original note: for a tensor of size 10 and two additionalReaders, the
+  // default reader handles [0,4) and the additional readers handle [4,8) and
+  // [8,10), each writing to the matching range of the buffer.
+  // NOTE(review): the original text assigned [8,10) to both the default and an
+  // additional reader; the split above is one reading of it - confirm in .cpp.
+  // When additionalReaders is empty, the default behavior is to call
+  // getRecord(name) with the default reader. Useful for reading large tensors.
+  size_t getRecordMultiReaders(
+      const std::string& name,
+      std::vector<std::shared_ptr<ReadAdapterInterface>>& additionalReaders,
+      void* dst,
+      size_t n);
+
+  size_t getRecordSize(const std::string& name);
+  size_t getRecordHeaderOffset(const std::string& name);
+  size_t getRecordOffset(const std::string& name);
+  size_t getRecordOffsetNoRead(
+      size_t cursor,
+      std::string filename,
+      size_t size,
+      uint64_t alignment);
+  bool hasRecord(const std::string& name);
+  std::vector<std::string> getAllRecords();
+
+  ChunkRecordIterator createChunkReaderIter(
+      const std::string& name,
+      const size_t recordSize,
+      const size_t chunkSize);
+
+  ~PyTorchStreamReader();
+  uint64_t version() const {
+    return version_; // int64_t member converted to uint64_t on return
+  }
+  const std::string& serializationId() {
+    return serialization_id_;
+  }
+
+  void setShouldLoadDebugSymbol(bool should_load_debug_symbol) {
+    load_debug_symbol_ = should_load_debug_symbol;
+  }
+  void setAdditionalReaderSizeThreshold(const size_t& size) {
+    additional_reader_size_threshold_ = size;
+  }
+
+ private:
+  void init();
+  size_t read(uint64_t pos, char* buf, size_t n);
+  void valid(const char* what, const char* info = "");
+  size_t getRecordID(const std::string& name);
+
+  friend size_t
+  istream_read_func(void* pOpaque, uint64_t file_ofs, void* pBuf, size_t n);
+  std::unique_ptr<mz_zip_archive> ar_; // type is only forward-declared here
+  std::string archive_name_;
+  std::string archive_name_plus_slash_;
+  std::shared_ptr<ReadAdapterInterface> in_;
+  int64_t version_; // signed, although version() returns uint64_t
+  std::mutex reader_lock_;
+  bool load_debug_symbol_ = true;
+  std::string serialization_id_;
+  size_t additional_reader_size_threshold_; // no in-class default - TODO confirm
+};
+
+class TORCH_API PyTorchStreamWriter final { // writes records to an archive
+ public:
+  explicit PyTorchStreamWriter(
+      const std::string& archive_name,
+      bool compute_crc32 = true,
+      uint64_t alignment = 64); // presumably record alignment in bytes
+  explicit PyTorchStreamWriter(
+      const std::function<size_t(const void*, size_t)> writer_func,
+      bool compute_crc32 = true,
+      uint64_t alignment = 64); // same defaults as the archive_name overload
+
+  void setMinVersion(const uint64_t version);
+
+  void writeRecord(
+      const std::string& name,
+      const void* data,
+      size_t size,
+      bool compress = false); // compression is off unless requested
+  void writeEndOfFile();
+
+  const std::unordered_set<std::string>& getAllWrittenRecords();
+
+  bool finalized() const {
+    return finalized_; // starts false (see the member initializer below)
+  }
+
+  const std::string& archiveName() {
+    return archive_name_;
+  }
+
+  const std::string& serializationId() {
+    return serialization_id_;
+  }
+
+  ~PyTorchStreamWriter();
+
+ private:
+  void setup(const std::string& file_name);
+  void valid(const char* what, const char* info = "");
+  void writeSerializationId();
+  size_t current_pos_ = 0;
+  std::unordered_set<std::string> files_written_;
+  std::unique_ptr<mz_zip_archive> ar_; // type is only forward-declared here
+  std::string archive_name_;
+  std::string archive_name_plus_slash_;
+  std::string padding_;
+  std::ofstream file_stream_;
+  std::function<size_t(const void*, size_t)> writer_func_;
+  uint64_t combined_uncomp_crc32_ = 0;
+  std::string serialization_id_;
+  bool compute_crc32_; // no in-class default; a constructor parameter exists
+  uint64_t alignment_; // no in-class default; a constructor parameter exists
+
+  // Starts at kMinProducedFileFormatVersion and will be updated when the model
+  // has operators that have valid upgraders.
+  uint64_t version_ = kMinProducedFileFormatVersion;
+  bool finalized_ = false;
+  bool err_seen_ = false;
+  friend size_t ostream_write_func(
+      void* pOpaque,
+      uint64_t file_ofs,
+      const void* pBuf,
+      size_t n);
+};
+
+namespace detail {
+
+// Padding to append to the local user extra data entry so that the data begins
+// on an aligned boundary (original text: kFieldAlignment; see `alignment`).
+size_t getPadding(
+    size_t cursor,
+    size_t filename_size,
+    size_t size,
+    std::string& padding_buf, // presumably receives the padding - TODO confirm
+    uint64_t alignment);
+
+std::tuple<size_t, size_t>
+getOffset(size_t cursor, size_t filename_size, size_t size, uint64_t alignment);
+
+} // namespace detail
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e205be7f1ceef1ffc92f686d1cd464f60899ae3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/istream_adapter.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <istream>
+
+#include "c10/macros/Macros.h"
+#include "caffe2/serialize/read_adapter_interface.h"
+
+namespace caffe2 {
+namespace serialize {
+
+// ReadAdapterInterface implementation backed by a std::istream.
+class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
+ public:
+  C10_DISABLE_COPY_AND_ASSIGN(IStreamAdapter);
+  explicit IStreamAdapter(std::istream* istream);
+  size_t size() const override;
+  size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
+      const override; // `what` defaults to ""; its use is not visible here
+  ~IStreamAdapter() override;
+
+ private:
+  std::istream* istream_; // raw pointer; ownership is not expressed here
+  void validate(const char* what) const;
+};
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc4b4505f4b786a0c8088e7ecc2253b877a20298
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/read_adapter_interface.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "c10/macros/Macros.h"
+
+namespace caffe2 {
+namespace serialize {
+
+// Abstract interface for the (file/stream/memory) reader used by
+// PyTorchStreamReader. Implementing this interface lets the reader support
+// data sources other than a standard istream.
+class TORCH_API ReadAdapterInterface {
+ public:
+  virtual size_t size() const = 0;
+  virtual size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
+      const = 0;
+  virtual ~ReadAdapterInterface();
+};
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h
new file mode 100644
index 0000000000000000000000000000000000000000..f21f4db27caa05bf69b3f05fdcf93ccf241d0944
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/serialize/versions.h
@@ -0,0 +1,138 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cstdint>
+
+namespace caffe2 {
+namespace serialize {
+
+constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L;
+
+constexpr uint64_t kMaxSupportedFileFormatVersion = 0xAL;
+
+// Versions (i.e. why was the version number bumped?)
+
+// Note [Dynamic Versions and torch.jit.save vs. torch.save]
+//
+// Our versioning scheme has a "produced file format version" which
+// describes how an archive is to be read. The version written in an archive
+// is at least this current produced file format version, but may be greater
+// if it includes certain symbols. We refer to these conditional versions
+// as "dynamic," since they are identified at runtime.
+//
+// Dynamic versioning is useful when an operator's semantics are updated.
+// When using torch.jit.save we want those semantics to be preserved. If
+// we bumped the produced file format version on every change, however,
+// then older versions of PyTorch couldn't read even simple archives, like
+// a single tensor, from newer versions of PyTorch. Instead, we
+// assign dynamic versions to these changes that override the
+// produced file format version as needed. That is, when the semantics
+// of torch.div changed it was assigned dynamic version 4, and when
+// torch.jit.saving modules that use torch.div those archives also have
+// (at least) version 4. This prevents earlier versions of PyTorch
+// from accidentally performing the wrong kind of division. Modules
+// that don't use torch.div or other operators with dynamic versions
+// can write the produced file format version, and these programs will
+// run as expected on earlier versions of PyTorch.
+//
+// While torch.jit.save attempts to preserve operator semantics,
+// torch.save does not. torch.save is analogous to pickling Python, so
+// a function that uses torch.div will have different behavior if torch.saved
+// and torch.loaded across PyTorch versions. From a technical perspective,
+// torch.save ignores dynamic versioning.
+
+// 1. Initial version
+// 2. Removed op_version_set version numbers
+// 3. Added type tags to pickle serialization of container types
+// 4. (Dynamic) Stopped integer division using torch.div
+//      (a versioned symbol preserves the historic behavior of versions 1--3)
+// 5. (Dynamic) Stops torch.full inferring a floating point dtype
+//      when given bool or integer fill values.
+// 6. Write version string to `./data/version` instead of `version`.
+
+// [12/15/2021]
+// kProducedFileFormatVersion is set to 7 from 3 due to a different
+// interpretation of what file format version is.
+// Whenever a new upgrader is introduced,
+// this number should be bumped.
+// The reasons the version was bumped in the past:
+//     1. aten::div is changed at version 4
+//     2. aten::full is changed at version 5
+//     3. torch.package uses version 6
+//     4. Introduce new upgrader design and set the version number to 7 to
+//        mark this change
+// --------------------------------------------------
+// We describe new operator version bump reasons here:
+// 1) [01/24/2022]
+//     We bump the version number to 8 to update aten::linspace
+//     and aten::linspace.out to error out when steps is not
+//     provided. (see: https://github.com/pytorch/pytorch/issues/55951)
+// 2) [01/30/2022]
+//     Bump the version number to 9 to update aten::logspace
+//     and aten::logspace.out to error out when steps is not
+//     provided. (see: https://github.com/pytorch/pytorch/issues/55951)
+// 3) [02/11/2022]
+//     Bump the version number to 10 to update aten::gelu
+//     and aten::gelu.out to support the new approximate kwarg.
+//     (see: https://github.com/pytorch/pytorch/pull/61439)
+constexpr uint64_t kProducedFileFormatVersion = 0xAL;
+
+// Absolute minimum version we will write packages. This
+// means that every package from now on will always be
+// greater than this number.
+constexpr uint64_t kMinProducedFileFormatVersion = 0x3L;
+
+// The version we write when the archive contains bytecode.
+// It must be higher than or equal to kProducedFileFormatVersion,
+// because TorchScript changes are likely to introduce bytecode changes.
+// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion
+// should be increased too. The relationship is:
+// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion
+//   >= kProducedFileFormatVersion
+// If a format change is forward compatible (still readable by older
+// executables), we will not increment the version number, to minimize the
+// risk of breaking existing clients. TODO: A better way would be to allow
+// the caller that creates a model to specify a maximum version that its
+// clients can accept.
+// Versions:
+//  0x1L: Initial version
+//  0x2L: (Comment missing)
+//  0x3L: (Comment missing)
+//  0x4L: (update) Added schema to function tuple. Forward-compatible change.
+//  0x5L: (update) Bytecode now shares constant tensor files from
+//  torchscript, and only serializes extra tensors that are not in the
+//  torchscript constant table. Also updates the tensor storage schema to adapt
+//  to the unified format: the root key of tensor storage changes from {index}
+//  to {the_pointer_value_the_tensor.storage}, for example:
+//  `140245072983168.storage`. Forward-compatible change.
+//  0x6L: Implicit operator versioning using the number of specified arguments.
+//  Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for
+//  details.
+//  0x7L: Enable support for operators with default arguments plus out
+//  arguments. See https://github.com/pytorch/pytorch/pull/63651 for
+//  details.
+//  0x8L: Emit promoted operators as instructions. See
+//  https://github.com/pytorch/pytorch/pull/71662 for details.
+//  0x9L: Change serialization format from pickle to flatbuffer. This version
+//  is to serve migration. v8 pickle and v9 flatbuffer are the same. Refer to
+//  the summary of https://github.com/pytorch/pytorch/pull/75201 for more details.
+constexpr uint64_t kProducedBytecodeVersion = 0x8L;
+
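+// NOTE(review): left disabled; the relationship documented above does not
+// currently hold (kProducedBytecodeVersion 0x8 < kProducedFileFormatVersion 0xA).
+// static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion,
+//     "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion.");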
+
+// Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion
+// for limited backward/forward compatibility support of bytecode. If
+// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion
+// (in loader), we should support this model_version. For example, we provide a
+// wrapper to handle an updated operator.
+constexpr uint64_t kMinSupportedBytecodeVersion = 0x4L;
+constexpr uint64_t kMaxSupportedBytecodeVersion = 0x9L;
+
+} // namespace serialize
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h
new file mode 100644
index 0000000000000000000000000000000000000000..8041a2723c8603b05b26956126e37eff436ac905
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/fixed_divisor.h
@@ -0,0 +1,137 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_UTILS_FIXED_DIVISOR_H_
+#define CAFFE2_UTILS_FIXED_DIVISOR_H_
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+// See Note [hip-clang differences to hcc]
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__) || \
+    (defined(__clang__) && defined(__CUDA__))
+#define FIXED_DIVISOR_DECL inline __host__ __device__
+#else
+#define FIXED_DIVISOR_DECL inline
+#endif
+
+namespace caffe2 {
+
+// Utility class for quickly calculating quotients and remainders for
+// a known integer divisor
+template <typename T>
+class FixedDivisor {};
+
+// Specialization for 32-bit signed integers. Works for any positive divisor,
+// 1 to INT_MAX. One 64-bit multiplication and one 64-bit shift are used to
+// calculate the result (under USE_ROCM a plain `n / d` is used instead).
+template <>
+class FixedDivisor<std::int32_t> {
+ public:
+  FixedDivisor() = default; // Divides by 1; see the member defaults below.
+
+  explicit FixedDivisor(const std::int32_t d) : d_(d) {
+#if !defined(USE_ROCM)
+    CalcSignedMagic();
+#endif // USE_ROCM
+  }
+
+  FIXED_DIVISOR_DECL std::int32_t d() const {
+    return d_;
+  }
+
+#if !defined(USE_ROCM)
+  FIXED_DIVISOR_DECL std::uint64_t magic() const {
+    return magic_;
+  }
+
+  FIXED_DIVISOR_DECL int shift() const {
+    return shift_;
+  }
+#endif // USE_ROCM
+
+  /// Calculates `q = n / d`.
+  FIXED_DIVISOR_DECL std::int32_t Div(const std::int32_t n) const {
+#if defined(USE_ROCM)
+    return n / d_;
+#else // USE_ROCM
+    // In lieu of a mulhi instruction being available, perform the
+    // work in uint64
+    return (std::int32_t)((magic_ * (std::uint64_t)n) >> shift_);
+#endif // USE_ROCM
+  }
+
+  /// Calculates `r = n % d`.
+  FIXED_DIVISOR_DECL std::int32_t Mod(const std::int32_t n) const {
+    return n - d_ * Div(n);
+  }
+
+  /// Calculates `q = n / d` and `r = n % d` together.
+  FIXED_DIVISOR_DECL void
+  DivMod(const std::int32_t n, std::int32_t* q, std::int32_t* r) const {
+    *q = Div(n);
+    *r = n - d_ * *q;
+  }
+
+ private:
+#if !defined(USE_ROCM)
+  // Calculates magic multiplicative value and shift amount for calculating `q =
+  // n / d` for signed 32-bit integers.
+  // Implementation taken from Hacker's Delight section 10.
+  void CalcSignedMagic() {
+    if (d_ == 1) {
+      magic_ = UINT64_C(0x1) << 32;
+      shift_ = 32;
+      return;
+    }
+
+    const std::uint32_t two31 = UINT32_C(0x80000000);
+    const std::uint32_t ad = std::abs(d_);
+    const std::uint32_t t = two31 + ((std::uint32_t)d_ >> 31);
+    const std::uint32_t anc = t - 1 - t % ad; // Absolute value of nc.
+    std::uint32_t p = 31; // Init. p.
+    std::uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|.
+    std::uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|).
+    std::uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|.
+    std::uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|).
+    std::uint32_t delta = 0;
+    do {
+      ++p;
+      q1 <<= 1; // Update q1 = 2**p/|nc|.
+      r1 <<= 1; // Update r1 = rem(2**p, |nc|).
+      if (r1 >= anc) { // (Must be an unsigned
+        ++q1; // comparison here).
+        r1 -= anc;
+      }
+      q2 <<= 1; // Update q2 = 2**p/|d|.
+      r2 <<= 1; // Update r2 = rem(2**p, |d|).
+      if (r2 >= ad) { // (Must be an unsigned
+        ++q2; // comparison here).
+        r2 -= ad;
+      }
+      delta = ad - r2;
+    } while (q1 < delta || (q1 == delta && r1 == 0));
+    std::int32_t magic = q2 + 1;
+    if (d_ < 0) {
+      magic = -magic;
+    }
+    shift_ = p;
+    magic_ = (std::uint64_t)(std::uint32_t)magic;
+  }
+#endif // USE_ROCM
+
+  std::int32_t d_ = 1;
+
+#if !defined(USE_ROCM)
+  std::uint64_t magic_ = UINT64_C(0x1) << 32; // same values CalcSignedMagic()
+  int shift_ = 32; // yields for d_ == 1, so default construction is usable.
+#endif // USE_ROCM
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_UTILS_FIXED_DIVISOR_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h
new file mode 100644
index 0000000000000000000000000000000000000000..29b58072e159b1ca826fc4b6d8631e7590943969
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/proto_wrap.h
@@ -0,0 +1,42 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_UTILS_PROTO_WRAP_H_
+#define CAFFE2_UTILS_PROTO_WRAP_H_
+
+#include <c10/util/Logging.h>
+
+namespace caffe2 {
+
+// A wrapper function to shut down protobuf library (this is needed in ASAN
+// testing and valgrind cases to avoid protobuf appearing to "leak" memory).
+TORCH_API void ShutdownProtobufLibrary();
+
+// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited()
+// function used to avoid duplicated global variable in the case when protobuf
+// is built with hidden visibility.
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited();
+} // namespace caffe2
+
+namespace ONNX_NAMESPACE {
+
+// ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
+// used to avoid duplicated global variable in the case when protobuf
+// is built with hidden visibility.
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited();
+
+} // namespace ONNX_NAMESPACE
+
+namespace torch {
+
+// torch-namespace wrapper for protobuf's GetEmptyStringAlreadyInited()
+// function, used to avoid a duplicated global variable in the case when
+// protobuf is built with hidden visibility.
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited();
+
+void ShutdownProtobufLibrary(); // NOTE(review): no TORCH_API here, unlike caffe2::ShutdownProtobufLibrary - confirm intended
+
+} // namespace torch
+#endif // CAFFE2_UTILS_PROTO_WRAP_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8d2d49efdb0ca402a6e6b60c0c6de7db9249684
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/string_utils.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <c10/macros/Export.h>
+
+namespace caffe2 {
+
+TORCH_API std::vector<std::string>
+split(char separator, const std::string& string, bool ignore_empty = false);
+
+TORCH_API std::string trim(const std::string& str);
+
+TORCH_API size_t editDistance(
+    const std::string& s1,
+    const std::string& s2,
+    size_t max_distance = 0);
+
+// Returns true iff `str` begins with `prefix` (an empty prefix always matches).
+TORCH_API inline bool StartsWith(
+    const std::string& str,
+    const std::string& prefix) {
+  // compare() clamps the count to str's size, so a too-short str cannot match.
+  return str.compare(0, prefix.size(), prefix) == 0;
+}
+
+// Returns true iff `full` ends with `ending` (an empty ending always matches).
+TORCH_API inline bool EndsWith(
+    const std::string& full,
+    const std::string& ending) {
+  const auto full_len = full.size();
+  const auto tail_len = ending.size();
+  if (full_len < tail_len) { // too short to contain the suffix
+    return false;
+  }
+  return full.compare(full_len - tail_len, tail_len, ending) == 0;
+}
+
+TORCH_API int32_t editDistanceHelper(
+    const char* s1,
+    size_t s1_len,
+    const char* s2,
+    size_t s2_len,
+    std::vector<size_t>& current,
+    std::vector<size_t>& previous,
+    std::vector<size_t>& previous1,
+    size_t max_distance);
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3769ec59ebdc60be60a685779aa4c3903e0f721
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPool.h
@@ -0,0 +1,84 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_UTILS_THREADPOOL_H_
+#define CAFFE2_UTILS_THREADPOOL_H_
+
+#include "ThreadPoolCommon.h"
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "c10/util/Flags.h"
+#include "caffe2/core/common.h"
+
+//
+// A work-stealing threadpool loosely based off of pthreadpool
+//
+
+namespace caffe2 {
+
+struct Task;
+class WorkersPool;
+
+constexpr size_t kCacheLineSize = 64;
+
+// A threadpool with the given number of threads.
+// NOTE: the kCacheLineSize alignment is present only for cache
+// performance, and is not strictly enforced (for example, when
+// the object is created on the heap). Thus, in order to avoid
+// misaligned intrinsics, no SSE instructions shall be involved in
+// the ThreadPool implementation.
+// Note: alignas is disabled because some compilers do not deal with
+// TORCH_API and alignas annotations at the same time.
+class TORCH_API /*alignas(kCacheLineSize)*/ ThreadPool {
+ public:
+  static ThreadPool* createThreadPool(int numThreads); // NOTE(review): raw pointer; presumably owned by the caller - confirm
+  static std::unique_ptr<ThreadPool> defaultThreadPool();
+  virtual ~ThreadPool() = default;
+  // Returns the number of threads currently in use
+  virtual int getNumThreads() const = 0;
+  virtual void setNumThreads(size_t numThreads) = 0;
+
+  // Sets the minimum work size (range) for which to invoke the
+  // threadpool; work sizes smaller than this will just be run on the
+  // main (calling) thread
+  void setMinWorkSize(size_t size) {
+    std::lock_guard<std::mutex> guard(executionMutex_);
+    minWorkSize_ = size;
+  }
+
+  size_t getMinWorkSize() const {
+    return minWorkSize_; // NOTE(review): read without executionMutex_, unlike setMinWorkSize()
+  }
+  virtual void run(const std::function<void(int, size_t)>& fn, size_t range) = 0;
+
+  // Run an arbitrary function in a thread-safe manner accessing the Workers
+  // Pool
+  virtual void withPool(const std::function<void(WorkersPool*)>& fn) = 0;
+
+ protected:
+  static size_t defaultNumThreads_;
+  mutable std::mutex executionMutex_; // taken by setMinWorkSize()
+  size_t minWorkSize_; // NOTE(review): no initializer here; presumably set by the implementing subclass - confirm
+};
+
+size_t getDefaultNumThreads();
+} // namespace caffe2
+
+C10_DECLARE_bool(caffe2_threadpool_force_inline);
+
+// Whether or not threadpool caps apply to Android
+C10_DECLARE_int(caffe2_threadpool_android_cap);
+
+// Whether or not threadpool caps apply to iOS and MacOS
+C10_DECLARE_int(caffe2_threadpool_ios_cap);
+C10_DECLARE_int(caffe2_threadpool_macos_cap);
+
+C10_DECLARE_int(pthreadpool_size);
+#endif // CAFFE2_UTILS_THREADPOOL_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bd04aa595c383ea8c1e0cb833e81e5478bc879b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/ThreadPoolCommon.h
@@ -0,0 +1,25 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#ifndef CAFFE2_UTILS_THREADPOOL_COMMON_H_
+#define CAFFE2_UTILS_THREADPOOL_COMMON_H_
+
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+// caffe2 depends upon NNPACK, which depends upon this threadpool, so
+// unfortunately we can't reference core/common.h here
+
+// This is copied from core/common.h's definition of C10_MOBILE
+// Define enabled when building for iOS or Android devices
+#if defined(__ANDROID__)
+#define C10_ANDROID 1
+#elif (defined(__APPLE__) &&                                            \
+       (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
+#define C10_IOS 1
+#endif // ANDROID / IOS
+
+#endif  // CAFFE2_UTILS_THREADPOOL_COMMON_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4adbac9b3c1b3a9672b511cb24dda1c48a4622e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/WorkersPool.h
@@ -0,0 +1,383 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <thread>
+#include "c10/util/thread_name.h"
+#include <c10/util/irange.h>
+#include <c10/util/Logging.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+namespace caffe2 {
+
+// Uses code derived from gemmlowp,
+// https://github.com/google/gemmlowp/blob/6c91e1ed0c2eff1182d804310b92911fe9c18019/internal/multi_thread_gemm.h
+// Changes:
+// - allocation-free execute()
+// - Use RAII where possible.
+// - Run the first task on the main thread (since that is the largest task).
+// - removed custom allocator.
+// - Removed some ifdef's
+// - cache-line align Worker.
+// - use std::atomic instead of volatile and custom barriers.
+// - use std::mutex/std::condition_variable instead of raw pthreads.
+
+constexpr size_t kGEMMLOWPCacheLineSize = 64;
+
+template <typename T>
+struct AllocAligned {
+  // Constructs a T in kGEMMLOWPCacheLineSize-aligned storage; returns nullptr if no storage was obtained
+  template <typename... Args>
+  static T* alloc(Args&&... args) {
+    void* p = nullptr;
+
+#if defined(__ANDROID__)
+    p = memalign(kGEMMLOWPCacheLineSize, sizeof(T));
+#elif defined(_MSC_VER)
+    p = _aligned_malloc(sizeof(T), kGEMMLOWPCacheLineSize);
+#else
+    auto res = posix_memalign(&p, kGEMMLOWPCacheLineSize, sizeof(T));
+    (void)res; // return code is ignored; failure is detected through `p` below
+#endif
+
+    if (p) {
+      return new (p) T(std::forward<Args>(args)...); // placement-new into the aligned storage
+    }
+
+    return nullptr;
+  }
+
+  // Free a T previously allocated via AllocAligned<T>::alloc()
+  static void release(T* p) {
+    if (p) {
+      p->~T(); // run the destructor explicitly before freeing the raw storage
+#if defined(_MSC_VER)
+      _aligned_free((void*)p);
+#else
+      free((void*)p);
+#endif
+    }
+  }
+};
+
+// Deleter object for unique_ptr for an aligned object
+template <typename T>
+struct AlignedDeleter {
+  void operator()(T* p) const { AllocAligned<T>::release(p); }
+};
+
+// make_unique-style factory built on AllocAligned<T>::alloc(); the result is empty if alloc() returns nullptr
+template <typename T>
+struct MakeAligned {
+  template <typename... Args>
+  static std::unique_ptr<T, AlignedDeleter<T>> make(Args&&... args) {
+    return std::unique_ptr<T, AlignedDeleter<T>>(
+        AllocAligned<T>::alloc(std::forward<Args>(args)...));
+  }
+};
+
+const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;
+
+#if defined(_MSC_VER)
+#define GEMMLOWP_NOP __nop();
+#else
+#define GEMMLOWP_NOP "nop\n"
+#endif
+
+#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
+#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
+#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
+#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)
+
+inline int Do256NOPs() { // NOTE: despite the name, this issues 64 NOPs (GEMMLOWP_NOP64)
+#if defined(_MSC_VER)
+  GEMMLOWP_NOP64;
+#else
+  asm volatile(GEMMLOWP_NOP64);
+#endif
+  return 64; // number of NOPs issued; callers add this to their running count
+}
+
+#undef GEMMLOWP_STRING_CONCAT_4
+#undef GEMMLOWP_NOP256
+#undef GEMMLOWP_NOP64
+#undef GEMMLOWP_NOP16
+#undef GEMMLOWP_NOP4
+#undef GEMMLOWP_NOP
+
+// Waits until *var != initial_value.
+//
+// Returns the new value of *var. The guarantee here is that
+// the return value is different from initial_value, and that that
+// new value has been taken by *var at some point during the
+// execution of this function. There is no guarantee that this is
+// still the value of *var when this function returns, since *var is
+// not assumed to be guarded by any lock.
+//
+// First does some busy-waiting for a fixed number of no-op cycles,
+// then falls back to passive waiting for the given condvar, guarded
+// by the given mutex.
+//
+// The idea of doing some initial busy-waiting is to help get
+// better and more consistent multithreading benefits for small GEMM sizes.
+// Busy-waiting helps ensure that if we need to wake up soon after having
+// started waiting, then we can wake up quickly (as opposed to, say,
+// having to wait to be scheduled again by the OS). On the other hand,
+// we must still eventually revert to passive waiting for longer waits
+// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
+// so as to avoid permanently spinning.
+//
+template <typename T>
+T WaitForVariableChange(std::atomic<T>* var,
+                        T initial_value,
+                        std::condition_variable* cond,
+                        std::mutex* mutex) {
+  // If we are on a platform that supports it, spin for some time.
+  {
+    int nops = 0; // NOPs spent spinning so far, capped by kMaxBusyWaitNOPs
+    // First, trivial case where the variable already changed value.
+    T new_value = var->load(std::memory_order_relaxed);
+    if (new_value != initial_value) {
+      std::atomic_thread_fence(std::memory_order_acquire); // fence issued only once a change was observed
+      return new_value;
+    }
+    // Then try busy-waiting.
+    while (nops < kMaxBusyWaitNOPs) {
+      nops += Do256NOPs();
+      new_value = var->load(std::memory_order_relaxed);
+      if (new_value != initial_value) {
+        std::atomic_thread_fence(std::memory_order_acquire);
+        return new_value;
+      }
+    }
+  }
+
+  // Finally, do real passive waiting.
+  {
+    std::unique_lock<std::mutex> g(*mutex);
+    T new_value = var->load(std::memory_order_relaxed);
+    // Handle spurious wakeups: the predicate re-reads *var on every wakeup.
+    cond->wait(g, [&]() {
+      new_value = var->load(std::memory_order_relaxed);
+      return new_value != initial_value;
+    });
+    TORCH_DCHECK_NE(static_cast<size_t>(new_value), static_cast<size_t>(initial_value));
+    return new_value; // NOTE(review): no explicit acquire fence on this path; presumably the mutex provides ordering - confirm
+  }
+}
+
+// BlockingCounter allows a single thread to block until N events have
+// happened. The master thread uses it to wait until every worker thread
+// has finished its work.
+class BlockingCounter {
+ public:
+  // (Re)arms the counter: Wait() will block until DecrementCount() has been
+  // called initial_count times.
+  void Reset(std::size_t initial_count) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    TORCH_DCHECK_EQ(count_, 0);
+    count_ = initial_count;
+  }
+
+  // Decrements the counter by one. Returns true, after signalling the
+  // waiting thread, when this call brought the counter down to zero;
+  // returns false while the counter is still nonzero.
+  bool DecrementCount() {
+    // fetch_sub yields the previous value, so 1 means "now zero".
+    const bool hit_zero = count_.fetch_sub(1, std::memory_order_relaxed) == 1;
+    if (hit_zero) {
+      std::lock_guard<std::mutex> lock(mutex_);
+      cond_.notify_one();
+    }
+    return hit_zero;
+  }
+
+  // Blocks until the counter armed by Reset() has been decremented
+  // all the way down to zero.
+  void Wait() {
+    for (size_t pending = count_.load(std::memory_order_relaxed); pending != 0;
+         pending = count_.load(std::memory_order_relaxed)) {
+      WaitForVariableChange(&count_, pending, &cond_, &mutex_);
+    }
+  }
+
+ private:
+  std::condition_variable cond_;
+  std::mutex mutex_;
+  std::atomic<std::size_t> count_{0};
+};
+
+// A workload for a worker: an abstract unit of work executed through Run().
+struct Task {
+  Task() = default;
+  virtual ~Task() = default;
+  virtual void Run() = 0; // invoked by Worker::ThreadFunc() and WorkersPool::Execute()
+};
+
+// A worker thread. Owns one std::thread that runs ThreadFunc() until told to exit.
+class alignas(kGEMMLOWPCacheLineSize) Worker {
+ public:
+  enum class State : uint8_t {
+    ThreadStartup, // The initial state before the thread main loop runs.
+    Ready, // Is not working, has not yet received new work to do.
+    HasWork, // Has work to do.
+    ExitAsSoonAsPossible // Should exit at earliest convenience.
+  };
+
+  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
+      : task_(nullptr),
+        state_(State::ThreadStartup),
+        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
+    thread_ = std::make_unique<std::thread>([this]() { // the thread starts running immediately
+      c10::setThreadName("pt_thread_pool");
+      this->ThreadFunc();
+    });
+  }
+
+  ~Worker() {
+    ChangeState(State::ExitAsSoonAsPossible);
+    thread_->join(); // blocks until ThreadFunc() returns
+  }
+
+  // Changes State; may be called from either the worker thread
+  // or the master thread; however, not all state transitions are legal,
+  // which is guarded by assertions.
+  void ChangeState(State new_state) {
+    std::lock_guard<std::mutex> g(state_mutex_);
+    DCHECK(new_state != state_.load(std::memory_order_relaxed));
+    switch (state_.load(std::memory_order_relaxed)) {
+    case State::ThreadStartup:
+      DCHECK(new_state == State::Ready);
+      break;
+    case State::Ready:
+      DCHECK(new_state == State::HasWork || new_state == State::ExitAsSoonAsPossible);
+      break;
+    case State::HasWork:
+      DCHECK(new_state == State::Ready || new_state == State::ExitAsSoonAsPossible);
+      break;
+    case State::ExitAsSoonAsPossible:
+    default:
+      abort(); // no transition out of ExitAsSoonAsPossible (or an unknown state) is accepted
+    }
+    state_.store(new_state, std::memory_order_relaxed);
+    state_cond_.notify_one();
+    if (new_state == State::Ready) {
+      counter_to_decrement_when_ready_->DecrementCount(); // called while state_mutex_ is still held
+    }
+  }
+
+  // Thread entry point.
+  void ThreadFunc() {
+    c10::setThreadName("CaffeWorkersPool"); // second setThreadName call; the constructor lambda already set a name
+    ChangeState(State::Ready);
+
+    // Thread main loop
+    while (true) {
+      // Get a state to act on
+      // In the 'Ready' state, we have nothing to do but to wait until
+      // we switch to another state.
+      State state_to_act_upon =
+          WaitForVariableChange(&state_, State::Ready, &state_cond_, &state_mutex_);
+
+      // We now have a state to act on, so act.
+      switch (state_to_act_upon) {
+      case State::HasWork:
+        // Got work to do! So do it, and then revert to 'Ready' state.
+        DCHECK(task_.load());
+        (*task_).Run();
+        task_ = nullptr; // cleared before signalling Ready so StartWork()'s DCHECK(!task_) holds
+        ChangeState(State::Ready);
+        break;
+      case State::ExitAsSoonAsPossible:
+        return;
+      case State::Ready:
+      case State::ThreadStartup:
+      default:
+        abort(); // WaitForVariableChange never returns Ready; ThreadStartup is not expected here
+      }
+    }
+  }
+
+  static void* ThreadFunc(void* arg) { // static trampoline: casts arg to Worker* and runs the member ThreadFunc()
+    static_cast<Worker*>(arg)->ThreadFunc();
+    return nullptr;
+  }
+
+  // Called by the master thread to give this worker work to do.
+  // It is only legal to call this if the worker is in the Ready state.
+  void StartWork(Task* task) {
+    DCHECK(!task_.load());
+    task_ = task;
+    DCHECK(state_.load(std::memory_order_acquire) == State::Ready);
+    ChangeState(State::HasWork);
+  }
+
+ private:
+  // The underlying thread.
+  std::unique_ptr<std::thread> thread_;
+
+  // The task to be worked on.
+  std::atomic<Task*> task_;
+
+  // The condition variable and mutex guarding state changes.
+  std::condition_variable state_cond_;
+  std::mutex state_mutex_;
+
+  // The state enum tells if we're currently working, waiting for work, etc.
+  std::atomic<State> state_;
+
+  // pointer to the master's thread BlockingCounter object, to notify the
+  // master thread of when this worker switches to the 'Ready' state.
+  BlockingCounter* const counter_to_decrement_when_ready_;
+};
+
+class WorkersPool { // Owns the Worker objects and fans a batch of Tasks out to them.
+ public:
+  WorkersPool() = default;
+
+  void Execute(const std::vector<std::shared_ptr<Task>>& tasks) {
+    CAFFE_ENFORCE_GE(tasks.size(), 1);
+    // One of the tasks will be run on the current thread.
+    int workers_count = tasks.size() - 1; // NOTE(review): size_t narrowed to int
+    CreateWorkers(workers_count);
+    TORCH_DCHECK_LE(workers_count, (int)workers_.size());
+    counter_to_decrement_when_ready_.Reset(workers_count);
+    for (const auto task : c10::irange(1, tasks.size())) { // tasks[i] goes to workers_[i - 1]
+      workers_[task - 1]->StartWork(tasks[task].get());
+    }
+    // Execute the first task (tasks.front()) immediately on the current thread.
+    auto& task = tasks.front();
+    task->Run();
+    // Wait for the workers submitted above to finish.
+    counter_to_decrement_when_ready_.Wait();
+  }
+
+ private:
+  // Ensures that the pool has at least the given count of workers.
+  // If any new worker has to be created, this function waits for it to
+  // be ready.
+  void CreateWorkers(std::size_t workers_count) {
+    if (workers_.size() >= workers_count) {
+      return;
+    }
+    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
+    while (workers_.size() < workers_count) { // NOTE(review): make() may yield an empty pointer; not checked here
+      workers_.push_back(MakeAligned<Worker>::make(&counter_to_decrement_when_ready_));
+    }
+    counter_to_decrement_when_ready_.Wait();
+  }
+
+  C10_DISABLE_COPY_AND_ASSIGN(WorkersPool);
+  std::vector<std::unique_ptr<Worker, AlignedDeleter<Worker>>> workers_;
+  // The BlockingCounter used to wait for the workers.
+  BlockingCounter counter_to_decrement_when_ready_;
+};
+} // namespace caffe2
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb9a01d3bd2ec1bc12d5290b965f18d9bb0cbfb4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool-cpp.h
@@ -0,0 +1,60 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#ifdef USE_PTHREADPOOL
+
+#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
+#include <caffe2/utils/threadpool/pthreadpool.h>
+#else
+#include <pthreadpool.h>
+#endif
+
+#include <functional>
+#include <memory>
+#include <mutex>
+
+namespace caffe2 {
+
+class PThreadPool final {
+ public:
+  explicit PThreadPool(size_t thread_count);
+  ~PThreadPool() = default;  // threadpool_ is a unique_ptr, so it releases the handle itself.
+
+  PThreadPool(const PThreadPool&) = delete;  // Not copyable.
+  PThreadPool& operator=(const PThreadPool&) = delete;
+
+  PThreadPool(PThreadPool&&) = delete;  // Not movable.
+  PThreadPool& operator=(PThreadPool&&) = delete;
+
+  size_t get_thread_count() const;
+  void set_thread_count(size_t thread_count);
+
+  // Run, in parallel, function fn(task_id) over task_id in range [0, range).
+  // This function is blocking.  All input is processed by the time it returns.
+  void run(const std::function<void(size_t)>& fn, size_t range);
+
+ private:
+  friend pthreadpool_t pthreadpool_();  // Grants caffe2::pthreadpool_() access to the private members below.
+
+ private:
+  mutable std::mutex mutex_;  // mutable, so const member functions may lock it.
+  std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool_;  // Owned pool handle; the deleter has the type of pthreadpool_destroy.
+};
+
+// Return a singleton instance of PThreadPool for ATen/TH multithreading.
+PThreadPool* pthreadpool();
+PThreadPool* pthreadpool(size_t thread_count);
+
+// Exposes the underlying implementation of PThreadPool.
+// Only for use in external libraries so as to unify threading across
+// internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK)
+// use cases.
+pthreadpool_t pthreadpool_();
+
+} // namespace caffe2
+
+#endif /* USE_PTHREADPOOL */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff7ff896b589dff1f51abd155e685fe2ee231750
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/pthreadpool.h
@@ -0,0 +1,198 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// pthreadpool header from https://github.com/Maratyszcza/pthreadpool
+// for NNPACK
+#ifndef CAFFE2_UTILS_PTHREADPOOL_H_
+#define CAFFE2_UTILS_PTHREADPOOL_H_
+
+#include "ThreadPoolCommon.h"
+
+#include <stddef.h> // for size_t
+#include <stdint.h> // for uint32_t
+
+#if defined(USE_PTHREADPOOL)
+// This is a hack.
+// Mainly introduced here because
+// 1. NNPACK can be compiled to use internal legacy threadpool implementation because much of C2 depends on that.
+// 2. Then if we want to use NNPACK in PyTorch, which uses new pthreadpool, then we will supply new pthreadpool pointer
+//    to NNPACK. This will not work if NNPACK is compiled with internal legacy threadpool. Thus this guard
+//    along with changes in pthreadpool_impl.cc allows us to override that behavior.
+//    It enables us to use NNPACK from pytorch using `caffe2::pthreadpool_()`
+namespace caffe2 {
+class WithCastToNewThreadPool {  // Scoped helper for the override described above; members are only declared here.
+  public:
+    explicit WithCastToNewThreadPool(bool use_new_threadpool);
+    ~WithCastToNewThreadPool();
+  private:
+    bool use_new_threadpool_;  // NOTE(review): presumably the state the destructor restores - confirm in pthreadpool_impl.cc.
+};
+}
+#endif
+
+typedef struct pthreadpool* legacy_pthreadpool_t;
+
+typedef void (*legacy_pthreadpool_function_1d_t)(void*, size_t);
+typedef void (*legacy_pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_2d_t)(void*, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
+typedef void (*legacy_pthreadpool_function_3d_tiled_t)(
+    void*,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t);
+typedef void (*legacy_pthreadpool_function_4d_tiled_t)(
+    void*,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Creates a thread pool with the specified number of threads.
+ *
+ * @param[in]  threads_count  The number of threads in the thread pool.
+ *    A value of 0 has special interpretation: it creates a thread for each
+ *    processor core available in the system.
+ *
+ * @returns  A pointer to an opaque thread pool object.
+ *    On error the function returns NULL and sets errno accordingly.
+ */
+
+// Returns internal threadpool impl.
+legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count);
+
+/**
+ * Queries the number of threads in a thread pool.
+ *
+ * @param[in]  threadpool  The thread pool to query.
+ *
+ * @returns  The number of threads in the thread pool.
+ */
+size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool);
+
+/**
+ * Processes items in parallel using threads from a thread pool.
+ *
+ * When the call returns, all items have been processed and the thread pool is
+ * ready for a new task.
+ *
+ * @note If multiple threads call this function with the same thread pool, the
+ *    calls are serialized.
+ *
+ * @param[in]  threadpool  The thread pool to use for parallelisation.
+ * @param[in]  function    The function to call for each item.
+ * @param[in]  argument    The first argument passed to the @a function.
+ * @param[in]  range       The number of items to process. The @a function
+ *    will be called once for each item.
+ */
+void legacy_pthreadpool_compute_1d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_t function,
+    void* argument,
+    size_t range);
+
+void legacy_pthreadpool_parallelize_1d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_t function,
+    void* argument,
+    size_t range,
+    uint32_t flags);
+
+void legacy_pthreadpool_compute_1d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_1d_tiled_t function,
+    void* argument,
+    size_t range,
+    size_t tile);
+
+void legacy_pthreadpool_compute_2d(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j);
+
+void legacy_pthreadpool_compute_2d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_2d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t tile_i,
+    size_t tile_j);
+
+void legacy_pthreadpool_compute_3d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_3d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t range_k,
+    size_t tile_i,
+    size_t tile_j,
+    size_t tile_k);
+
+void legacy_pthreadpool_compute_4d_tiled(
+    legacy_pthreadpool_t threadpool,
+    legacy_pthreadpool_function_4d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t range_k,
+    size_t range_l,
+    size_t tile_i,
+    size_t tile_j,
+    size_t tile_k,
+    size_t tile_l);
+
+/**
+ * Terminates threads in the thread pool and releases associated resources.
+ *
+ * @warning  Accessing the thread pool after a call to this function constitutes
+ *    undefined behaviour and may cause data corruption.
+ *
+ * @param[in,out]  threadpool  The thread pool to destroy.
+ */
+void legacy_pthreadpool_destroy(legacy_pthreadpool_t threadpool);
+
+#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
+
+#define pthreadpool_t legacy_pthreadpool_t
+#define pthreadpool_function_1d_t legacy_pthreadpool_function_1d_t
+#define pthreadpool_function_1d_tiled_t legacy_pthreadpool_function_1d_tiled_t
+#define pthreadpool_function_2d_t legacy_pthreadpool_function_2d_t
+#define pthreadpool_function_2d_tiled_t legacy_pthreadpool_function_2d_tiled_t
+#define pthreadpool_function_3d_tiled_t legacy_pthreadpool_function_3d_tiled_t
+#define pthreadpool_function_4d_tiled_t legacy_pthreadpool_function_4d_tiled_t
+#define pthreadpool_create legacy_pthreadpool_create
+#define pthreadpool_destroy legacy_pthreadpool_destroy
+#define pthreadpool_get_threads_count legacy_pthreadpool_get_threads_count
+#define pthreadpool_compute_1d legacy_pthreadpool_compute_1d
+#define pthreadpool_parallelize_1d legacy_pthreadpool_parallelize_1d
+#define pthreadpool_compute_1d_tiled legacy_pthreadpool_compute_1d_tiled
+#define pthreadpool_compute_2d legacy_pthreadpool_compute_2d
+#define pthreadpool_compute_2d_tiled legacy_pthreadpool_compute_2d_tiled
+#define pthreadpool_compute_3d_tiled legacy_pthreadpool_compute_3d_tiled
+#define pthreadpool_compute_4d_tiled legacy_pthreadpool_compute_4d_tiled
+
+#endif /* USE_INTERNAL_PTHREADPOOL_IMPL */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // CAFFE2_UTILS_PTHREADPOOL_H_
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb76646e6f61bdc1540bacd7dcbf88b4aa09b5f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/caffe2/utils/threadpool/thread_pool_guard.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <c10/macros/Macros.h>
+
+namespace caffe2 {
+
+// A RAII, thread local (!) guard that sets the _NoPThreadPoolGuard flag to true
+// upon construction, and sets it back to the original value upon destruction.
+struct TORCH_API _NoPThreadPoolGuard {
+  static bool is_enabled();
+  static void set_enabled(bool enabled);
+
+  _NoPThreadPoolGuard(): prev_mode_(_NoPThreadPoolGuard::is_enabled()) {  // Save the current flag value before changing it.
+      _NoPThreadPoolGuard::set_enabled(true);  // The guard always switches the flag on.
+  }
+  ~_NoPThreadPoolGuard() {
+      _NoPThreadPoolGuard::set_enabled(prev_mode_);  // Restore the value saved at construction.
+  }
+  private:
+    bool prev_mode_;  // Flag value observed when this guard was constructed.
+};
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d54052e15455f00eb41246585ecd9e0470508e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.pb.h
@@ -0,0 +1,414 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/any.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fany_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fany_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fany_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class Any;
+class AnyDefaultTypeInternal;
+PROTOBUF_EXPORT extern AnyDefaultTypeInternal _Any_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Any* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Any>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT Any PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Any) */ {
+ public:
+  inline Any() : Any(nullptr) {}
+  virtual ~Any();
+
+  Any(const Any& from);
+  Any(Any&& from) noexcept
+    : Any() {
+    *this = ::std::move(from);
+  }
+
+  inline Any& operator=(const Any& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Any& operator=(Any&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Any& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Any* internal_default_instance() {
+    return reinterpret_cast<const Any*>(
+               &_Any_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  // implements Any -----------------------------------------------
+
+  void PackFrom(const ::PROTOBUF_NAMESPACE_ID::Message& message) {
+    _any_metadata_.PackFrom(message);
+  }
+  void PackFrom(const ::PROTOBUF_NAMESPACE_ID::Message& message,
+                const std::string& type_url_prefix) {
+    _any_metadata_.PackFrom(message, type_url_prefix);
+  }
+  bool UnpackTo(::PROTOBUF_NAMESPACE_ID::Message* message) const {
+    return _any_metadata_.UnpackTo(message);
+  }
+  static bool GetAnyFieldDescriptors(
+      const ::PROTOBUF_NAMESPACE_ID::Message& message,
+      const ::PROTOBUF_NAMESPACE_ID::FieldDescriptor** type_url_field,
+      const ::PROTOBUF_NAMESPACE_ID::FieldDescriptor** value_field);
+  template <typename T, class = typename std::enable_if<!std::is_convertible<T, const ::PROTOBUF_NAMESPACE_ID::Message&>::value>::type>
+  void PackFrom(const T& message) {
+    _any_metadata_.PackFrom<T>(message);
+  }
+  template <typename T, class = typename std::enable_if<!std::is_convertible<T, const ::PROTOBUF_NAMESPACE_ID::Message&>::value>::type>
+  void PackFrom(const T& message,
+                const std::string& type_url_prefix) {
+    _any_metadata_.PackFrom<T>(message, type_url_prefix);}
+  template <typename T, class = typename std::enable_if<!std::is_convertible<T, const ::PROTOBUF_NAMESPACE_ID::Message&>::value>::type>
+  bool UnpackTo(T* message) const {
+    return _any_metadata_.UnpackTo<T>(message);
+  }
+  template<typename T> bool Is() const {
+    return _any_metadata_.Is<T>();
+  }
+  static bool ParseAnyTypeUrl(const string& type_url,
+                              std::string* full_type_name);
+  friend void swap(Any& a, Any& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Any* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Any* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Any* New() const final {
+    return CreateMaybeMessage<Any>(nullptr);
+  }
+
+  Any* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Any>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Any& from);
+  void MergeFrom(const Any& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Any* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Any";
+  }
+  protected:
+  explicit Any(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fany_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fany_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kTypeUrlFieldNumber = 1,
+    kValueFieldNumber = 2,
+  };
+  // string type_url = 1;
+  void clear_type_url();
+  const std::string& type_url() const;
+  void set_type_url(const std::string& value);
+  void set_type_url(std::string&& value);
+  void set_type_url(const char* value);
+  void set_type_url(const char* value, size_t size);
+  std::string* mutable_type_url();
+  std::string* release_type_url();
+  void set_allocated_type_url(std::string* type_url);
+  private:
+  const std::string& _internal_type_url() const;
+  void _internal_set_type_url(const std::string& value);
+  std::string* _internal_mutable_type_url();
+  public:
+
+  // bytes value = 2;
+  void clear_value();
+  const std::string& value() const;
+  void set_value(const std::string& value);
+  void set_value(std::string&& value);
+  void set_value(const char* value);
+  void set_value(const void* value, size_t size);
+  std::string* mutable_value();
+  std::string* release_value();
+  void set_allocated_value(std::string* value);
+  private:
+  const std::string& _internal_value() const;
+  void _internal_set_value(const std::string& value);
+  std::string* _internal_mutable_value();
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Any)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr type_url_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr value_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata _any_metadata_;
+  friend struct ::TableStruct_google_2fprotobuf_2fany_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// Any
+
+// string type_url = 1;
+inline void Any::clear_type_url() {
+  type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Any::type_url() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Any.type_url)
+  return _internal_type_url();
+}
+inline void Any::set_type_url(const std::string& value) {
+  _internal_set_type_url(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Any.type_url)
+}
+inline std::string* Any::mutable_type_url() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Any.type_url)
+  return _internal_mutable_type_url();
+}
+inline const std::string& Any::_internal_type_url() const {
+  return type_url_.Get();
+}
+inline void Any::_internal_set_type_url(const std::string& value) {
+  
+  type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Any::set_type_url(std::string&& value) {
+  
+  type_url_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Any.type_url)
+}
+inline void Any::set_type_url(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  
+  type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Any.type_url)
+}
+inline void Any::set_type_url(const char* value,
+    size_t size) {
+  
+  type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Any.type_url)
+}
+inline std::string* Any::_internal_mutable_type_url() {
+  
+  return type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Any::release_type_url() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Any.type_url)
+  return type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Any::set_allocated_type_url(std::string* type_url) {
+  if (type_url != nullptr) {
+    
+  } else {
+    
+  }
+  type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), type_url,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Any.type_url)
+}
+
+// bytes value = 2;
+inline void Any::clear_value() {
+  value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Any::value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Any.value)
+  return _internal_value();
+}
+inline void Any::set_value(const std::string& value) {
+  _internal_set_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Any.value)
+}
+inline std::string* Any::mutable_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Any.value)
+  return _internal_mutable_value();
+}
+inline const std::string& Any::_internal_value() const {
+  return value_.Get();
+}
+inline void Any::_internal_set_value(const std::string& value) {
+  
+  value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Any::set_value(std::string&& value) {
+  
+  value_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Any.value)
+}
+inline void Any::set_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  
+  value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Any.value)
+}
+inline void Any::set_value(const void* value,
+    size_t size) {
+  
+  value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Any.value)
+}
+inline std::string* Any::_internal_mutable_value() {
+  
+  return value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Any::release_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Any.value)
+  return value_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Any::set_allocated_value(std::string* value) {
+  if (value != nullptr) {
+    
+  } else {
+    
+  }
+  value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Any.value)
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fany_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b5c902661b1330f34e1ad49c3e7d291d895bda5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/api.pb.h
@@ -0,0 +1,1505 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/api.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+#include <google/protobuf/source_context.pb.h>
+#include <google/protobuf/type.pb.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fapi_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fapi_2eproto {  // Static table declarations for api.proto; Api, Method and Mixin each declare this struct a friend.
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[3]  // Fixed size 3; this header declares three message classes (presumably one entry each -- confirm).
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];  // Every member is only declared here; no definitions are visible in this part of the header.
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fapi_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class Api;
+class ApiDefaultTypeInternal;
+PROTOBUF_EXPORT extern ApiDefaultTypeInternal _Api_default_instance_;
+class Method;
+class MethodDefaultTypeInternal;
+PROTOBUF_EXPORT extern MethodDefaultTypeInternal _Method_default_instance_;
+class Mixin;
+class MixinDefaultTypeInternal;
+PROTOBUF_EXPORT extern MixinDefaultTypeInternal _Mixin_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Api* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Api>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Method* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Method>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Mixin* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Mixin>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT Api PROTOBUF_FINAL :  // Message class for "google.protobuf.Api", generated from google/protobuf/api.proto.
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Api) */ {
+ public:
+  inline Api() : Api(nullptr) {}  // Delegates to the protected arena constructor with a null arena.
+  virtual ~Api();
+  // Copy constructor, followed by a move constructor that default-constructs and then move-assigns.
+  Api(const Api& from);
+  Api(Api&& from) noexcept
+    : Api() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment is implemented through CopyFrom().
+  inline Api& operator=(const Api& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Api& operator=(Api&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Same arena on both sides: swap contents (self-assignment is skipped).
+      if (this != &from) InternalSwap(&from);
+    } else {  // Arenas differ: copy instead of swapping.
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Static descriptor / reflection lookups; each reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Api& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Api* internal_default_instance() {
+    return reinterpret_cast<const Api*>(
+               &_Api_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;  // Used by GetMetadataStatic() as the index into file_level_metadata.
+  // Swap(): swaps in place when both objects report the same arena, otherwise goes through GenericSwap().
+  friend void swap(Api& a, Api& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Api* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Api* other) {  // Arena equality is only checked with GOOGLE_DCHECK before swapping.
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+  // New() creates an Api through CreateMaybeMessage, with a null arena or the given arena.
+  inline Api* New() const final {
+    return CreateMaybeMessage<Api>(nullptr);
+  }
+
+  Api* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Api>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Api& from);
+  void MergeFrom(const Api& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Api* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Api";
+  }
+  protected:
+  explicit Api(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Target of the delegating default constructor above.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the api.proto descriptor table, then returns this message's metadata entry.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the numbers in the per-field comments below.
+  enum : int {
+    kMethodsFieldNumber = 2,
+    kOptionsFieldNumber = 3,
+    kMixinsFieldNumber = 6,
+    kNameFieldNumber = 1,
+    kVersionFieldNumber = 4,
+    kSourceContextFieldNumber = 5,
+    kSyntaxFieldNumber = 7,
+  };
+  // repeated .google.protobuf.Method methods = 2;
+  int methods_size() const;
+  private:
+  int _internal_methods_size() const;
+  public:
+  void clear_methods();
+  PROTOBUF_NAMESPACE_ID::Method* mutable_methods(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >*
+      mutable_methods();
+  private:
+  const PROTOBUF_NAMESPACE_ID::Method& _internal_methods(int index) const;
+  PROTOBUF_NAMESPACE_ID::Method* _internal_add_methods();
+  public:
+  const PROTOBUF_NAMESPACE_ID::Method& methods(int index) const;
+  PROTOBUF_NAMESPACE_ID::Method* add_methods();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >&
+      methods() const;
+
+  // repeated .google.protobuf.Option options = 3;
+  int options_size() const;
+  private:
+  int _internal_options_size() const;
+  public:
+  void clear_options();
+  PROTOBUF_NAMESPACE_ID::Option* mutable_options(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >*
+      mutable_options();
+  private:
+  const PROTOBUF_NAMESPACE_ID::Option& _internal_options(int index) const;
+  PROTOBUF_NAMESPACE_ID::Option* _internal_add_options();
+  public:
+  const PROTOBUF_NAMESPACE_ID::Option& options(int index) const;
+  PROTOBUF_NAMESPACE_ID::Option* add_options();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >&
+      options() const;
+
+  // repeated .google.protobuf.Mixin mixins = 6;
+  int mixins_size() const;
+  private:
+  int _internal_mixins_size() const;
+  public:
+  void clear_mixins();
+  PROTOBUF_NAMESPACE_ID::Mixin* mutable_mixins(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >*
+      mutable_mixins();
+  private:
+  const PROTOBUF_NAMESPACE_ID::Mixin& _internal_mixins(int index) const;
+  PROTOBUF_NAMESPACE_ID::Mixin* _internal_add_mixins();
+  public:
+  const PROTOBUF_NAMESPACE_ID::Mixin& mixins(int index) const;
+  PROTOBUF_NAMESPACE_ID::Mixin* add_mixins();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >&
+      mixins() const;
+
+  // string name = 1;
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // string version = 4;
+  void clear_version();
+  const std::string& version() const;
+  void set_version(const std::string& value);
+  void set_version(std::string&& value);
+  void set_version(const char* value);
+  void set_version(const char* value, size_t size);
+  std::string* mutable_version();
+  std::string* release_version();
+  void set_allocated_version(std::string* version);
+  private:
+  const std::string& _internal_version() const;
+  void _internal_set_version(const std::string& value);
+  std::string* _internal_mutable_version();
+  public:
+
+  // .google.protobuf.SourceContext source_context = 5;
+  bool has_source_context() const;
+  private:
+  bool _internal_has_source_context() const;
+  public:
+  void clear_source_context();
+  const PROTOBUF_NAMESPACE_ID::SourceContext& source_context() const;
+  PROTOBUF_NAMESPACE_ID::SourceContext* release_source_context();
+  PROTOBUF_NAMESPACE_ID::SourceContext* mutable_source_context();
+  void set_allocated_source_context(PROTOBUF_NAMESPACE_ID::SourceContext* source_context);
+  private:
+  const PROTOBUF_NAMESPACE_ID::SourceContext& _internal_source_context() const;
+  PROTOBUF_NAMESPACE_ID::SourceContext* _internal_mutable_source_context();
+  public:
+  void unsafe_arena_set_allocated_source_context(
+      PROTOBUF_NAMESPACE_ID::SourceContext* source_context);
+  PROTOBUF_NAMESPACE_ID::SourceContext* unsafe_arena_release_source_context();
+
+  // .google.protobuf.Syntax syntax = 7;
+  void clear_syntax();
+  PROTOBUF_NAMESPACE_ID::Syntax syntax() const;
+  void set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value);
+  private:
+  PROTOBUF_NAMESPACE_ID::Syntax _internal_syntax() const;
+  void _internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Api)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method > methods_;  // Read and written by the inline methods accessors defined after the class.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option > options_;  // Read and written by the inline options accessors defined after the class.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin > mixins_;  // Presumably backs the mixins accessors -- their definitions are not in view, confirm.
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;  // Used by the inline name accessors defined after the class.
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr version_;  // Used by the inline version accessors defined after the class.
+  PROTOBUF_NAMESPACE_ID::SourceContext* source_context_;  // Presumably backs the source_context accessors -- confirm against their definitions.
+  int syntax_;  // Declared as int although the syntax accessors use PROTOBUF_NAMESPACE_ID::Syntax; the conversion is not visible here.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // Returned by GetCachedSize().
+  friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT Method PROTOBUF_FINAL :  // Message class for "google.protobuf.Method", generated from google/protobuf/api.proto.
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Method) */ {
+ public:
+  inline Method() : Method(nullptr) {}  // Delegates to the protected arena constructor with a null arena.
+  virtual ~Method();
+  // Copy constructor, followed by a move constructor that default-constructs and then move-assigns.
+  Method(const Method& from);
+  Method(Method&& from) noexcept
+    : Method() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment is implemented through CopyFrom().
+  inline Method& operator=(const Method& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Method& operator=(Method&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Same arena on both sides: swap contents (self-assignment is skipped).
+      if (this != &from) InternalSwap(&from);
+    } else {  // Arenas differ: copy instead of swapping.
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Static descriptor / reflection lookups; each reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Method& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Method* internal_default_instance() {
+    return reinterpret_cast<const Method*>(
+               &_Method_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    1;  // Used by GetMetadataStatic() as the index into file_level_metadata.
+  // Swap(): swaps in place when both objects report the same arena, otherwise goes through GenericSwap().
+  friend void swap(Method& a, Method& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Method* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Method* other) {  // Arena equality is only checked with GOOGLE_DCHECK before swapping.
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+  // New() creates a Method through CreateMaybeMessage, with a null arena or the given arena.
+  inline Method* New() const final {
+    return CreateMaybeMessage<Method>(nullptr);
+  }
+
+  Method* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Method>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Method& from);
+  void MergeFrom(const Method& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Method* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Method";
+  }
+  protected:
+  explicit Method(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Target of the delegating default constructor above.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the api.proto descriptor table, then returns this message's metadata entry.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the numbers in the per-field comments below.
+  enum : int {
+    kOptionsFieldNumber = 6,
+    kNameFieldNumber = 1,
+    kRequestTypeUrlFieldNumber = 2,
+    kResponseTypeUrlFieldNumber = 4,
+    kRequestStreamingFieldNumber = 3,
+    kResponseStreamingFieldNumber = 5,
+    kSyntaxFieldNumber = 7,
+  };
+  // repeated .google.protobuf.Option options = 6;
+  int options_size() const;
+  private:
+  int _internal_options_size() const;
+  public:
+  void clear_options();
+  PROTOBUF_NAMESPACE_ID::Option* mutable_options(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >*
+      mutable_options();
+  private:
+  const PROTOBUF_NAMESPACE_ID::Option& _internal_options(int index) const;
+  PROTOBUF_NAMESPACE_ID::Option* _internal_add_options();
+  public:
+  const PROTOBUF_NAMESPACE_ID::Option& options(int index) const;
+  PROTOBUF_NAMESPACE_ID::Option* add_options();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >&
+      options() const;
+
+  // string name = 1;
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // string request_type_url = 2;
+  void clear_request_type_url();
+  const std::string& request_type_url() const;
+  void set_request_type_url(const std::string& value);
+  void set_request_type_url(std::string&& value);
+  void set_request_type_url(const char* value);
+  void set_request_type_url(const char* value, size_t size);
+  std::string* mutable_request_type_url();
+  std::string* release_request_type_url();
+  void set_allocated_request_type_url(std::string* request_type_url);
+  private:
+  const std::string& _internal_request_type_url() const;
+  void _internal_set_request_type_url(const std::string& value);
+  std::string* _internal_mutable_request_type_url();
+  public:
+
+  // string response_type_url = 4;
+  void clear_response_type_url();
+  const std::string& response_type_url() const;
+  void set_response_type_url(const std::string& value);
+  void set_response_type_url(std::string&& value);
+  void set_response_type_url(const char* value);
+  void set_response_type_url(const char* value, size_t size);
+  std::string* mutable_response_type_url();
+  std::string* release_response_type_url();
+  void set_allocated_response_type_url(std::string* response_type_url);
+  private:
+  const std::string& _internal_response_type_url() const;
+  void _internal_set_response_type_url(const std::string& value);
+  std::string* _internal_mutable_response_type_url();
+  public:
+
+  // bool request_streaming = 3;
+  void clear_request_streaming();
+  bool request_streaming() const;
+  void set_request_streaming(bool value);
+  private:
+  bool _internal_request_streaming() const;
+  void _internal_set_request_streaming(bool value);
+  public:
+
+  // bool response_streaming = 5;
+  void clear_response_streaming();
+  bool response_streaming() const;
+  void set_response_streaming(bool value);
+  private:
+  bool _internal_response_streaming() const;
+  void _internal_set_response_streaming(bool value);
+  public:
+
+  // .google.protobuf.Syntax syntax = 7;
+  void clear_syntax();
+  PROTOBUF_NAMESPACE_ID::Syntax syntax() const;
+  void set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value);
+  private:
+  PROTOBUF_NAMESPACE_ID::Syntax _internal_syntax() const;
+  void _internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Method)
+ private:
+  class _Internal;
+  // Data members below presumably back the same-named accessors; their definitions are not in view -- confirm.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option > options_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr request_type_url_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr response_type_url_;
+  bool request_streaming_;
+  bool response_streaming_;
+  int syntax_;  // Declared as int although the syntax accessors use PROTOBUF_NAMESPACE_ID::Syntax; the conversion is not visible here.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // Returned by GetCachedSize().
+  friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT Mixin PROTOBUF_FINAL :  // Message class for "google.protobuf.Mixin", generated from google/protobuf/api.proto.
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Mixin) */ {
+ public:
+  inline Mixin() : Mixin(nullptr) {}  // Delegates to the protected arena constructor with a null arena.
+  virtual ~Mixin();
+  // Copy constructor, followed by a move constructor that default-constructs and then move-assigns.
+  Mixin(const Mixin& from);
+  Mixin(Mixin&& from) noexcept
+    : Mixin() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment is implemented through CopyFrom().
+  inline Mixin& operator=(const Mixin& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Mixin& operator=(Mixin&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Same arena on both sides: swap contents (self-assignment is skipped).
+      if (this != &from) InternalSwap(&from);
+    } else {  // Arenas differ: copy instead of swapping.
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Static descriptor / reflection lookups; each reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Mixin& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Mixin* internal_default_instance() {
+    return reinterpret_cast<const Mixin*>(
+               &_Mixin_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    2;  // Used by GetMetadataStatic() as the index into file_level_metadata.
+  // Swap(): swaps in place when both objects report the same arena, otherwise goes through GenericSwap().
+  friend void swap(Mixin& a, Mixin& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Mixin* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Mixin* other) {  // Arena equality is only checked with GOOGLE_DCHECK before swapping.
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+  // New() creates a Mixin through CreateMaybeMessage, with a null arena or the given arena.
+  inline Mixin* New() const final {
+    return CreateMaybeMessage<Mixin>(nullptr);
+  }
+
+  Mixin* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Mixin>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Mixin& from);
+  void MergeFrom(const Mixin& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Mixin* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Mixin";
+  }
+  protected:
+  explicit Mixin(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Target of the delegating default constructor above.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the api.proto descriptor table, then returns this message's metadata entry.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fapi_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fapi_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the numbers in the per-field comments below.
+  enum : int {
+    kNameFieldNumber = 1,
+    kRootFieldNumber = 2,
+  };
+  // string name = 1;
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // string root = 2;
+  void clear_root();
+  const std::string& root() const;
+  void set_root(const std::string& value);
+  void set_root(std::string&& value);
+  void set_root(const char* value);
+  void set_root(const char* value, size_t size);
+  std::string* mutable_root();
+  std::string* release_root();
+  void set_allocated_root(std::string* root);
+  private:
+  const std::string& _internal_root() const;
+  void _internal_set_root(const std::string& value);
+  std::string* _internal_mutable_root();
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Mixin)
+ private:
+  class _Internal;
+  // Data members below presumably back the same-named accessors; their definitions are not in view -- confirm.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr root_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // Returned by GetCachedSize().
+  friend struct ::TableStruct_google_2fprotobuf_2fapi_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// Api
+
+// string name = 1;
+// Clears the name field by calling ClearToEmpty() on its ArenaStringPtr.
+inline void Api::clear_name() {
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.ClearToEmpty(empty_default, GetArena());
+}
+// Read-only view of the name field.
+inline const std::string& Api::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.name)
+  return name_.Get();  // the same expression _internal_name() evaluates
+}
+// Stores `value` in the name field (const-reference overload).
+inline void Api::set_name(const std::string& value) {
+  // Same call that _internal_set_name() makes, written out in place.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+  // @@protoc_insertion_point(field_set:google.protobuf.Api.name)
+}
+// Returns a writable pointer for the name field.
+inline std::string* Api::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.name)
+  // Same call that _internal_mutable_name() makes, written out in place.
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+// Accessor for name_ without an insertion-point marker.
+inline const std::string& Api::_internal_name() const {
+  const std::string& current = name_.Get();
+  return current;
+}
+// Passes `value` to name_.Set() together with the shared empty-string default and this message's arena.
+inline void Api::_internal_set_name(const std::string& value) {
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.Set(empty_default, value, GetArena());
+}
+// Stores `value` in the name field (rvalue overload; the argument is forwarded with std::move).
+inline void Api::set_name(std::string&& value) {
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.Set(empty_default, ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Api.name)
+}
+// Stores a std::string built from the C string `value`; a null pointer trips the GOOGLE_DCHECK.
+inline void Api::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.Set(empty_default, ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Api.name)
+}
+// Stores a std::string built from the first `size` chars at `value`.
+inline void Api::set_name(const char* value, size_t size) {
+  // `value` is already a const char*, so the original's cast to that same type is omitted.
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.Set(empty_default, ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Api.name)
+}
+// Returns the pointer produced by name_.Mutable() for this message's arena.
+inline std::string* Api::_internal_mutable_name() {
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  std::string* const writable = name_.Mutable(empty_default, GetArena());
+  return writable;
+}
+// Returns whatever name_.Release() yields for this message's arena.
+inline std::string* Api::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Api.name)
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  std::string* const released = name_.Release(empty_default, GetArena());
+  return released;
+}
+// Forwards `name` (null or non-null alike) to name_.SetAllocated().
+inline void Api::set_allocated_name(std::string* name) {
+  // The generated null / non-null branches had empty bodies, so the call is unconditional.
+  const auto* const empty_default = &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+  name_.SetAllocated(empty_default, name, GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Api.name)
+}
+
+// repeated .google.protobuf.Method methods = 2;
+// Number of elements currently held in methods_.
+inline int Api::_internal_methods_size() const {
+  const int count = methods_.size();
+  return count;
+}
+// Public element count for the repeated methods field.
+inline int Api::methods_size() const {
+  return methods_.size();  // the same expression _internal_methods_size() evaluates
+}
+// Clears the repeated methods field by calling Clear() on its container.
+inline void Api::clear_methods() {
+  this->methods_.Clear();
+}
+// Returns the pointer methods_.Mutable(index) yields for element `index`.
+inline PROTOBUF_NAMESPACE_ID::Method* Api::mutable_methods(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.methods)
+  PROTOBUF_NAMESPACE_ID::Method* const element = methods_.Mutable(index);
+  return element;
+}
+// Returns the address of the methods_ container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >*
+Api::mutable_methods() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.methods)
+  return &this->methods_;
+}
+// Const reference to element `index` of methods_, as returned by methods_.Get().
+inline const PROTOBUF_NAMESPACE_ID::Method& Api::_internal_methods(int index) const {
+  const PROTOBUF_NAMESPACE_ID::Method& element = methods_.Get(index);
+  return element;
+}
+// Public read access to element `index` of the repeated methods field.
+inline const PROTOBUF_NAMESPACE_ID::Method& Api::methods(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.methods)
+  return methods_.Get(index);  // the same expression _internal_methods(index) evaluates
+}
+// Calls methods_.Add() and returns the resulting element pointer.
+inline PROTOBUF_NAMESPACE_ID::Method* Api::_internal_add_methods() {
+  PROTOBUF_NAMESPACE_ID::Method* const added = methods_.Add();
+  return added;
+}
+// Public entry point for adding an element to the repeated methods field.
+inline PROTOBUF_NAMESPACE_ID::Method* Api::add_methods() {
+  // @@protoc_insertion_point(field_add:google.protobuf.Api.methods)
+  return methods_.Add();  // the same expression _internal_add_methods() evaluates
+}
+// Const reference to the whole methods_ container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Method >&
+Api::methods() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.Api.methods)
+  return this->methods_;
+}
+
+// repeated .google.protobuf.Option options = 3;
+// Number of elements currently held in options_.
+inline int Api::_internal_options_size() const {
+  const int count = options_.size();
+  return count;
+}
+// Public element count for the repeated options field.
+inline int Api::options_size() const {
+  return options_.size();  // the same expression _internal_options_size() evaluates
+}
+// Returns the pointer options_.Mutable(index) yields for element `index`.
+inline PROTOBUF_NAMESPACE_ID::Option* Api::mutable_options(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.options)
+  PROTOBUF_NAMESPACE_ID::Option* const element = options_.Mutable(index);
+  return element;
+}
+// Returns the address of the options_ container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >*
+Api::mutable_options() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.options)
+  return &this->options_;
+}
+// Const reference to element `index` of options_, as returned by options_.Get().
+inline const PROTOBUF_NAMESPACE_ID::Option& Api::_internal_options(int index) const {
+  const PROTOBUF_NAMESPACE_ID::Option& element = options_.Get(index);
+  return element;
+}
+// Public read access to element `index` of the repeated options field.
+inline const PROTOBUF_NAMESPACE_ID::Option& Api::options(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.options)
+  return options_.Get(index);  // the same expression _internal_options(index) evaluates
+}
+inline PROTOBUF_NAMESPACE_ID::Option* Api::_internal_add_options() {
+  return options_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::Option* Api::add_options() {
+  // @@protoc_insertion_point(field_add:google.protobuf.Api.options)
+  return _internal_add_options();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >&
+Api::options() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.Api.options)
+  return options_;
+}
+
+// string version = 4;
+inline void Api::clear_version() {  // Resets version_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  version_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Api::version() const {  // Public read accessor; forwards to _internal_version().
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.version)
+  return _internal_version();
+}
+inline void Api::set_version(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_version().
+  _internal_set_version(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Api.version)
+}
+inline std::string* Api::mutable_version() {  // Public mutable accessor; forwards to _internal_mutable_version().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.version)
+  return _internal_mutable_version();
+}
+inline const std::string& Api::_internal_version() const {  // Reads the stored string via version_.Get().
+  return version_.Get();
+}
+inline void Api::_internal_set_version(const std::string& value) {
+  // Stores `value` via version_.Set, handing over the empty-string default and the owning arena.
+  version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Api::set_version(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  version_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Api.version)
+}
+inline void Api::set_version(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Api.version)
+}
+inline void Api::set_version(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  version_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Api.version)
+}
+inline std::string* Api::_internal_mutable_version() {
+  // Returns the pointer produced by version_.Mutable for this message's arena.
+  return version_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Api::release_version() {  // Returns whatever version_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Api.version)
+  return version_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Api::set_allocated_version(std::string* version) {  // Passes `version` (nullptr allowed by this code path) to version_.SetAllocated.
+  if (version != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  version_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), version,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Api.version)
+}
+
+// .google.protobuf.SourceContext source_context = 5;
+inline bool Api::_internal_has_source_context() const {  // True only for a non-default instance whose source_context_ pointer is set.
+  return this != internal_default_instance() && source_context_ != nullptr;
+}
+inline bool Api::has_source_context() const {  // Public presence check; forwards to _internal_has_source_context().
+  return _internal_has_source_context();
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceContext& Api::_internal_source_context() const {
+  const PROTOBUF_NAMESPACE_ID::SourceContext* p = source_context_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::SourceContext*>(  // Unset field: fall back to the global default instance instead of dereferencing null.
+      &PROTOBUF_NAMESPACE_ID::_SourceContext_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceContext& Api::source_context() const {  // Public getter; forwards to _internal_source_context().
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.source_context)
+  return _internal_source_context();
+}
+inline void Api::unsafe_arena_set_allocated_source_context(
+    PROTOBUF_NAMESPACE_ID::SourceContext* source_context) {  // Installs `source_context` as-is; no arena comparison or copy is performed here.
+  if (GetArena() == nullptr) {  // Only a message without an arena deletes the previous submessage.
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context_);
+  }
+  source_context_ = source_context;
+  if (source_context) {
+    // Empty in the generated output.
+  } else {
+    // Empty in the generated output.
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Api.source_context)
+}
+inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::release_source_context() {
+  // Detaches the submessage: the field is nulled and the previous pointer (or a duplicate, see below) is returned.
+  PROTOBUF_NAMESPACE_ID::SourceContext* temp = source_context_;
+  source_context_ = nullptr;
+  if (GetArena() != nullptr) {  // With an arena, the result of DuplicateIfNonNull is returned instead (presumably a heap copy -- confirm).
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::unsafe_arena_release_source_context() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Api.source_context)
+  // Same detach as release_source_context(), but without the arena duplication step.
+  PROTOBUF_NAMESPACE_ID::SourceContext* temp = source_context_;
+  source_context_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::_internal_mutable_source_context() {
+  // Creates the submessage on first access, using this message's arena, then returns it.
+  if (source_context_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::SourceContext>(GetArena());
+    source_context_ = p;
+  }
+  return source_context_;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceContext* Api::mutable_source_context() {  // Public mutable accessor; forwards to _internal_mutable_source_context().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.source_context)
+  return _internal_mutable_source_context();
+}
+inline void Api::set_allocated_source_context(PROTOBUF_NAMESPACE_ID::SourceContext* source_context) {  // Replaces the submessage with `source_context`; nullptr clears the field.
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {  // Without an arena, the previous submessage is deleted here.
+    delete reinterpret_cast< ::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context_);
+  }
+  if (source_context) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_context)->GetArena();
+    if (message_arena != submessage_arena) {  // Arena mismatch: GetOwnedMessage supplies the pointer this message should hold.
+      source_context = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, source_context, submessage_arena);
+    }
+    // Empty in the generated output.
+  } else {
+    // Empty in the generated output.
+  }
+  source_context_ = source_context;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Api.source_context)
+}
+
+// repeated .google.protobuf.Mixin mixins = 6;
+inline int Api::_internal_mixins_size() const {  // Element count as reported by mixins_.size().
+  return mixins_.size();
+}
+inline int Api::mixins_size() const {  // Public size getter; forwards to _internal_mixins_size().
+  return _internal_mixins_size();
+}
+inline void Api::clear_mixins() {  // Empties the field by calling mixins_.Clear().
+  mixins_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::Mixin* Api::mutable_mixins(int index) {  // Mutable pointer to element `index`, as returned by mixins_.Mutable(index).
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Api.mixins)
+  return mixins_.Mutable(index);  // NOTE(review): any bounds handling lives in RepeatedPtrField::Mutable, not visible here.
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >*
+Api::mutable_mixins() {  // Exposes the address of the mixins_ container itself.
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.Api.mixins)
+  return &mixins_;
+}
+inline const PROTOBUF_NAMESPACE_ID::Mixin& Api::_internal_mixins(int index) const {  // Read-only element access via mixins_.Get(index).
+  return mixins_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::Mixin& Api::mixins(int index) const {  // Public indexed getter; forwards to _internal_mixins().
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.mixins)
+  return _internal_mixins(index);
+}
+inline PROTOBUF_NAMESPACE_ID::Mixin* Api::_internal_add_mixins() {  // Returns the pointer produced by mixins_.Add().
+  return mixins_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::Mixin* Api::add_mixins() {  // Public append; forwards to _internal_add_mixins().
+  // @@protoc_insertion_point(field_add:google.protobuf.Api.mixins)
+  return _internal_add_mixins();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Mixin >&
+Api::mixins() const {  // Read-only reference to the whole mixins_ container.
+  // @@protoc_insertion_point(field_list:google.protobuf.Api.mixins)
+  return mixins_;
+}
+
+// .google.protobuf.Syntax syntax = 7;
+inline void Api::clear_syntax() {  // Resets the stored value to 0.
+  syntax_ = 0;
+}
+inline PROTOBUF_NAMESPACE_ID::Syntax Api::_internal_syntax() const {  // Converts the stored syntax_ value to the Syntax enum type.
+  return static_cast< PROTOBUF_NAMESPACE_ID::Syntax >(syntax_);
+}
+inline PROTOBUF_NAMESPACE_ID::Syntax Api::syntax() const {  // Public getter; forwards to _internal_syntax().
+  // @@protoc_insertion_point(field_get:google.protobuf.Api.syntax)
+  return _internal_syntax();
+}
+inline void Api::_internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) {
+  // Plain assignment; no validation of `value` happens here.
+  syntax_ = value;
+}
+inline void Api::set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) {  // Public setter; forwards to _internal_set_syntax().
+  _internal_set_syntax(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Api.syntax)
+}
+
+// -------------------------------------------------------------------
+
+// Method
+
+// string name = 1;
+inline void Method::clear_name() {  // Resets name_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Method::name() const {  // Public read accessor; forwards to _internal_name().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.name)
+  return _internal_name();
+}
+inline void Method::set_name(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_name().
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.name)
+}
+inline std::string* Method::mutable_name() {  // Public mutable accessor; forwards to _internal_mutable_name().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Method.name)
+  return _internal_mutable_name();
+}
+inline const std::string& Method::_internal_name() const {  // Reads the stored string via name_.Get().
+  return name_.Get();
+}
+inline void Method::_internal_set_name(const std::string& value) {
+  // Stores `value` via name_.Set, handing over the empty-string default and the owning arena.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Method::set_name(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.name)
+}
+inline void Method::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Method.name)
+}
+inline void Method::set_name(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.name)
+}
+inline std::string* Method::_internal_mutable_name() {
+  // Returns the pointer produced by name_.Mutable for this message's arena.
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Method::release_name() {  // Returns whatever name_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Method.name)
+  return name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Method::set_allocated_name(std::string* name) {  // Passes `name` (nullptr allowed by this code path) to name_.SetAllocated.
+  if (name != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.name)
+}
+
+// string request_type_url = 2;
+inline void Method::clear_request_type_url() {  // Resets request_type_url_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  request_type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Method::request_type_url() const {  // Public read accessor; forwards to _internal_request_type_url().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.request_type_url)
+  return _internal_request_type_url();
+}
+inline void Method::set_request_type_url(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_request_type_url().
+  _internal_set_request_type_url(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.request_type_url)
+}
+inline std::string* Method::mutable_request_type_url() {  // Public mutable accessor; forwards to _internal_mutable_request_type_url().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Method.request_type_url)
+  return _internal_mutable_request_type_url();
+}
+inline const std::string& Method::_internal_request_type_url() const {  // Reads the stored string via request_type_url_.Get().
+  return request_type_url_.Get();
+}
+inline void Method::_internal_set_request_type_url(const std::string& value) {
+  // Stores `value` via request_type_url_.Set, handing over the empty-string default and the owning arena.
+  request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Method::set_request_type_url(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  request_type_url_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.request_type_url)
+}
+inline void Method::set_request_type_url(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Method.request_type_url)
+}
+inline void Method::set_request_type_url(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  request_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.request_type_url)
+}
+inline std::string* Method::_internal_mutable_request_type_url() {
+  // Returns the pointer produced by request_type_url_.Mutable for this message's arena.
+  return request_type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Method::release_request_type_url() {  // Returns whatever request_type_url_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Method.request_type_url)
+  return request_type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Method::set_allocated_request_type_url(std::string* request_type_url) {  // Passes `request_type_url` (nullptr allowed by this code path) to request_type_url_.SetAllocated.
+  if (request_type_url != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  request_type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), request_type_url,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.request_type_url)
+}
+
+// bool request_streaming = 3;
+inline void Method::clear_request_streaming() {  // Resets the flag to false.
+  request_streaming_ = false;
+}
+inline bool Method::_internal_request_streaming() const {  // Direct read of request_streaming_.
+  return request_streaming_;
+}
+inline bool Method::request_streaming() const {  // Public getter; forwards to _internal_request_streaming().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.request_streaming)
+  return _internal_request_streaming();
+}
+inline void Method::_internal_set_request_streaming(bool value) {
+  // Plain assignment of the new flag value.
+  request_streaming_ = value;
+}
+inline void Method::set_request_streaming(bool value) {  // Public setter; forwards to _internal_set_request_streaming().
+  _internal_set_request_streaming(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.request_streaming)
+}
+
+// string response_type_url = 4;
+inline void Method::clear_response_type_url() {  // Resets response_type_url_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  response_type_url_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Method::response_type_url() const {  // Public read accessor; forwards to _internal_response_type_url().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.response_type_url)
+  return _internal_response_type_url();
+}
+inline void Method::set_response_type_url(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_response_type_url().
+  _internal_set_response_type_url(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.response_type_url)
+}
+inline std::string* Method::mutable_response_type_url() {  // Public mutable accessor; forwards to _internal_mutable_response_type_url().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Method.response_type_url)
+  return _internal_mutable_response_type_url();
+}
+inline const std::string& Method::_internal_response_type_url() const {  // Reads the stored string via response_type_url_.Get().
+  return response_type_url_.Get();
+}
+inline void Method::_internal_set_response_type_url(const std::string& value) {
+  // Stores `value` via response_type_url_.Set, handing over the empty-string default and the owning arena.
+  response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Method::set_response_type_url(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  response_type_url_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Method.response_type_url)
+}
+inline void Method::set_response_type_url(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Method.response_type_url)
+}
+inline void Method::set_response_type_url(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  response_type_url_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Method.response_type_url)
+}
+inline std::string* Method::_internal_mutable_response_type_url() {
+  // Returns the pointer produced by response_type_url_.Mutable for this message's arena.
+  return response_type_url_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Method::release_response_type_url() {  // Returns whatever response_type_url_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Method.response_type_url)
+  return response_type_url_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Method::set_allocated_response_type_url(std::string* response_type_url) {  // Passes `response_type_url` (nullptr allowed by this code path) to response_type_url_.SetAllocated.
+  if (response_type_url != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  response_type_url_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), response_type_url,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Method.response_type_url)
+}
+
+// bool response_streaming = 5;
+inline void Method::clear_response_streaming() {  // Resets the flag to false.
+  response_streaming_ = false;
+}
+inline bool Method::_internal_response_streaming() const {  // Direct read of response_streaming_.
+  return response_streaming_;
+}
+inline bool Method::response_streaming() const {  // Public getter; forwards to _internal_response_streaming().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.response_streaming)
+  return _internal_response_streaming();
+}
+inline void Method::_internal_set_response_streaming(bool value) {
+  // Plain assignment of the new flag value.
+  response_streaming_ = value;
+}
+inline void Method::set_response_streaming(bool value) {  // Public setter; forwards to _internal_set_response_streaming().
+  _internal_set_response_streaming(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.response_streaming)
+}
+
+// repeated .google.protobuf.Option options = 6;
+inline int Method::_internal_options_size() const {  // Element count as reported by options_.size().
+  return options_.size();
+}
+inline int Method::options_size() const {  // Public size getter; forwards to _internal_options_size().
+  return _internal_options_size();
+}
+inline PROTOBUF_NAMESPACE_ID::Option* Method::mutable_options(int index) {  // Mutable pointer to element `index`, as returned by options_.Mutable(index).
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Method.options)
+  return options_.Mutable(index);  // NOTE(review): any bounds handling lives in RepeatedPtrField::Mutable, not visible here.
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >*
+Method::mutable_options() {  // Exposes the address of the options_ container itself.
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.Method.options)
+  return &options_;
+}
+inline const PROTOBUF_NAMESPACE_ID::Option& Method::_internal_options(int index) const {  // Read-only element access via options_.Get(index).
+  return options_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::Option& Method::options(int index) const {  // Public indexed getter; forwards to _internal_options().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.options)
+  return _internal_options(index);
+}
+inline PROTOBUF_NAMESPACE_ID::Option* Method::_internal_add_options() {  // Returns the pointer produced by options_.Add().
+  return options_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::Option* Method::add_options() {  // Public append; forwards to _internal_add_options().
+  // @@protoc_insertion_point(field_add:google.protobuf.Method.options)
+  return _internal_add_options();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Option >&
+Method::options() const {  // Read-only reference to the whole options_ container.
+  // @@protoc_insertion_point(field_list:google.protobuf.Method.options)
+  return options_;
+}
+
+// .google.protobuf.Syntax syntax = 7;
+inline void Method::clear_syntax() {  // Resets the stored value to 0.
+  syntax_ = 0;
+}
+inline PROTOBUF_NAMESPACE_ID::Syntax Method::_internal_syntax() const {  // Converts the stored syntax_ value to the Syntax enum type.
+  return static_cast< PROTOBUF_NAMESPACE_ID::Syntax >(syntax_);
+}
+inline PROTOBUF_NAMESPACE_ID::Syntax Method::syntax() const {  // Public getter; forwards to _internal_syntax().
+  // @@protoc_insertion_point(field_get:google.protobuf.Method.syntax)
+  return _internal_syntax();
+}
+inline void Method::_internal_set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) {
+  // Plain assignment; no validation of `value` happens here.
+  syntax_ = value;
+}
+inline void Method::set_syntax(PROTOBUF_NAMESPACE_ID::Syntax value) {  // Public setter; forwards to _internal_set_syntax().
+  _internal_set_syntax(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Method.syntax)
+}
+
+// -------------------------------------------------------------------
+
+// Mixin
+
+// string name = 1;
+inline void Mixin::clear_name() {  // Resets name_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Mixin::name() const {  // Public read accessor; forwards to _internal_name().
+  // @@protoc_insertion_point(field_get:google.protobuf.Mixin.name)
+  return _internal_name();
+}
+inline void Mixin::set_name(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_name().
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Mixin.name)
+}
+inline std::string* Mixin::mutable_name() {  // Public mutable accessor; forwards to _internal_mutable_name().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Mixin.name)
+  return _internal_mutable_name();
+}
+inline const std::string& Mixin::_internal_name() const {  // Reads the stored string via name_.Get().
+  return name_.Get();
+}
+inline void Mixin::_internal_set_name(const std::string& value) {
+  // Stores `value` via name_.Set, handing over the empty-string default and the owning arena.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Mixin::set_name(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Mixin.name)
+}
+inline void Mixin::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Mixin.name)
+}
+inline void Mixin::set_name(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Mixin.name)
+}
+inline std::string* Mixin::_internal_mutable_name() {
+  // Returns the pointer produced by name_.Mutable for this message's arena.
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Mixin::release_name() {  // Returns whatever name_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Mixin.name)
+  return name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Mixin::set_allocated_name(std::string* name) {  // Passes `name` (nullptr allowed by this code path) to name_.SetAllocated.
+  if (name != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Mixin.name)
+}
+
+// string root = 2;
+inline void Mixin::clear_root() {  // Resets root_ through ClearToEmpty, passing the shared empty-string default and this message's arena.
+  root_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& Mixin::root() const {  // Public read accessor; forwards to _internal_root().
+  // @@protoc_insertion_point(field_get:google.protobuf.Mixin.root)
+  return _internal_root();
+}
+inline void Mixin::set_root(const std::string& value) {  // Const-reference setter; the store happens in _internal_set_root().
+  _internal_set_root(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Mixin.root)
+}
+inline std::string* Mixin::mutable_root() {  // Public mutable accessor; forwards to _internal_mutable_root().
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Mixin.root)
+  return _internal_mutable_root();
+}
+inline const std::string& Mixin::_internal_root() const {  // Reads the stored string via root_.Get().
+  return root_.Get();
+}
+inline void Mixin::_internal_set_root(const std::string& value) {
+  // Stores `value` via root_.Set, handing over the empty-string default and the owning arena.
+  root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Mixin::set_root(std::string&& value) {
+  // Rvalue overload: `value` is passed on with std::move so Set receives an rvalue.
+  root_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Mixin.root)
+}
+inline void Mixin::set_root(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // Guards against a null pointer before std::string(value) is constructed below.
+  // C-string overload: builds a temporary std::string from `value` and stores it.
+  root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Mixin.root)
+}
+inline void Mixin::set_root(const char* value,
+    size_t size) {
+  // Pointer+length overload: copies exactly `size` bytes; unlike the C-string overload there is no null DCHECK here.
+  root_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is const char* -> const char*, i.e. a no-op.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Mixin.root)
+}
+inline std::string* Mixin::_internal_mutable_root() {
+  // Returns the pointer produced by root_.Mutable for this message's arena.
+  return root_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Mixin::release_root() {  // Returns whatever root_.Release yields. NOTE(review): presumably hands ownership to the caller -- confirm in arenastring.h.
+  // @@protoc_insertion_point(field_release:google.protobuf.Mixin.root)
+  return root_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void Mixin::set_allocated_root(std::string* root) {  // Passes `root` (nullptr allowed by this code path) to root_.SetAllocated.
+  if (root != nullptr) {
+    // Empty in the generated output: nothing extra happens for a non-null argument.
+  } else {
+    // Empty as well; both cases fall through to SetAllocated below.
+  }
+  root_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), root,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Mixin.root)
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fapi_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h
new file mode 100644
index 0000000000000000000000000000000000000000..33adc15cad401fbeb880476d3965a301232a5777
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/arena.h
@@ -0,0 +1,741 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file defines an Arena allocator for better allocation performance.
+
+#ifndef GOOGLE_PROTOBUF_ARENA_H__
+#define GOOGLE_PROTOBUF_ARENA_H__
+
+
+#include <limits>
+#include <type_traits>
+#include <utility>
+#ifdef max
+#undef max  // Visual Studio defines this macro
+#endif
+#if defined(_MSC_VER) && !defined(_LIBCPP_STD_VER) && !_HAS_EXCEPTIONS
+// Work around bugs in MSVC <typeinfo> header when _HAS_EXCEPTIONS=0.
+#include <exception>
+#include <typeinfo>
+namespace std {
+using type_info = ::type_info;
+}
+#else
+#include <typeinfo>
+#endif
+
+#include <type_traits>
+#include <google/protobuf/arena_impl.h>
+#include <google/protobuf/port.h>
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+
+struct ArenaOptions;  // defined below
+
+}  // namespace protobuf
+}  // namespace google
+
+namespace google {
+namespace protobuf {
+
+class Arena;    // defined below
+class Message;  // defined in message.h
+class MessageLite;
+template <typename Key, typename T>
+class Map;
+
+namespace arena_metrics {
+
+void EnableArenaMetrics(ArenaOptions* options);
+
+}  // namespace arena_metrics
+
+namespace internal {
+
+struct ArenaStringPtr;  // defined in arenastring.h
+class LazyField;        // defined in lazy_field.h
+class EpsCopyInputStream;  // defined in parse_context.h
+
+template <typename Type>
+class GenericTypeHandler;  // defined in repeated_field.h
+
+// Templated cleanup methods.
+template <typename T>
+void arena_destruct_object(void* object) {
+  reinterpret_cast<T*>(object)->~T();  // runs ~T() in place; does not free the storage
+}
+template <typename T>
+void arena_delete_object(void* object) {
+  delete reinterpret_cast<T*>(object);  // destroys and frees; `object` is treated as a T*
+}
+inline void arena_free(void* object, size_t size) {
+#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation)
+  ::operator delete(object, size);  // sized deallocation when the compiler advertises it
+#else
+  (void)size;  // unused on this path; the cast silences the unused-parameter warning
+  ::operator delete(object);
+#endif
+}
+
+}  // namespace internal
+
+// ArenaOptions provides optional additional parameters to arena construction
+// that control its block-allocation behavior.
+struct ArenaOptions {
+  // This defines the size of the first block requested from the system malloc.
+  // Subsequent block sizes will increase in a geometric series up to a maximum.
+  size_t start_block_size;
+
+  // This defines the maximum block size requested from system malloc (unless an
+  // individual arena allocation request occurs with a size larger than this
+  // maximum). Requested block sizes increase up to this value, then remain
+  // here.
+  size_t max_block_size;
+
+  // An initial block of memory for the arena to use, or NULL for none. If
+  // provided, the block must live at least as long as the arena itself. The
+  // creator of the Arena retains ownership of the block after the Arena is
+  // destroyed.
+  char* initial_block;
+
+  // The size of the initial block, if provided.
+  size_t initial_block_size;
+
+  // A function pointer to an alloc method that returns memory blocks of size
+  // requested. By default, it points to the global ::operator new.
+  //
+  // NOTE: block_alloc and dealloc functions are expected to behave like
+  // malloc and free, including Asan poisoning.
+  void* (*block_alloc)(size_t);
+  // A function pointer to a dealloc method that takes ownership of the blocks
+  // from the arena. By default, it points to internal::arena_free, a wrapper
+  // that calls ::operator delete.
+  void (*block_dealloc)(void*, size_t);
+
+  ArenaOptions()
+      : start_block_size(kDefaultStartBlockSize),
+        max_block_size(kDefaultMaxBlockSize),
+        initial_block(NULL),
+        initial_block_size(0),
+        block_alloc(&::operator new),
+        block_dealloc(&internal::arena_free),
+        on_arena_init(NULL),
+        on_arena_reset(NULL),
+        on_arena_destruction(NULL),
+        on_arena_allocation(NULL) {}
+
+ private:
+  // Hooks for adding external functionality such as user-specific metrics
+  // collection, specific debugging abilities, etc.
+  // Init hook (if set) will always be called at Arena init time. Init hook may
+  // return a pointer to a cookie to be stored in the arena. Reset and
+  // destruction hooks will then be called with the same cookie pointer. This
+  // allows us to save an external object per arena instance and use it on the
+  // other hooks (Note: If init hook returns NULL, the other hooks will NOT be
+  // called on this arena instance).
+  // on_arena_reset and on_arena_destruction also receive the space used in the
+  // arena just before the reset.
+  void* (*on_arena_init)(Arena* arena);
+  void (*on_arena_reset)(Arena* arena, void* cookie, uint64 space_used);
+  void (*on_arena_destruction)(Arena* arena, void* cookie, uint64 space_used);
+
+  // type_info is promised to be static - its lifetime extends to
+  // match program's lifetime (It is given by typeid operator).
+  // Note: typeid(void) will be passed as allocated_type every time we
+  // intentionally want to avoid monitoring an allocation. (i.e. internal
+  // allocations for managing the arena)
+  void (*on_arena_allocation)(const std::type_info* allocated_type,
+                              uint64 alloc_size, void* cookie);
+
+  // Constants define default starting block size and max block size for
+  // arena allocator behavior -- see descriptions above.
+  static const size_t kDefaultStartBlockSize = 256;
+  static const size_t kDefaultMaxBlockSize = 8192;
+
+  friend void arena_metrics::EnableArenaMetrics(ArenaOptions*);
+  friend class Arena;
+  friend class ArenaOptionsTestFriend;
+};
+
+// Support for non-RTTI environments. (The metrics hooks API uses type
+// information.)
+#if PROTOBUF_RTTI
+#define RTTI_TYPE_ID(type) (&typeid(type))
+#else
+#define RTTI_TYPE_ID(type) (NULL)
+#endif
+
+// Arena allocator. Arena allocation replaces ordinary (heap-based) allocation
+// with new/delete, and improves performance by aggregating allocations into
+// larger blocks and freeing allocations all at once. Protocol messages are
+// allocated on an arena by using Arena::CreateMessage<T>(Arena*), below, and
+// are automatically freed when the arena is destroyed.
+//
+// This is a thread-safe implementation: multiple threads may allocate from the
+// arena concurrently. Destruction is not thread-safe and the destructing
+// thread must synchronize with users of the arena first.
+//
+// An arena provides two allocation interfaces: CreateMessage<T>, which works
+// for arena-enabled proto2 message types as well as other types that satisfy
+// the appropriate protocol (described below), and Create<T>, which works for
+// any arbitrary type T. CreateMessage<T> is better when the type T supports it,
+// because this interface (i) passes the arena pointer to the created object so
+// that its sub-objects and internal allocations can use the arena too, and (ii)
+// elides the object's destructor call when possible. Create<T> does not place
+// any special requirements on the type T, and will invoke the object's
+// destructor when the arena is destroyed.
+//
+// The arena message allocation protocol, required by
+// CreateMessage<T>(Arena* arena, Args&&... args), is as follows:
+//
+// - The type T must have (at least) two constructors: a constructor callable
+//   with `args` (without `arena`), called when a T is allocated on the heap;
+//   and a constructor callable with `Arena* arena, Args&&... args`, called when
+//   a T is allocated on an arena. If the second constructor is called with a
+//   NULL arena pointer, it must be equivalent to invoking the first
+//   (`args`-only) constructor.
+//
+// - The type T must have a particular type trait: a nested type
+//   |InternalArenaConstructable_|. This is usually a typedef to |void|. If no
+//   such type trait exists, then the instantiation CreateMessage<T> will fail
+//   to compile.
+//
+// - The type T *may* have the type trait |DestructorSkippable_|. If this type
+//   trait is present in the type, then its destructor will not be called if and
+//   only if it was passed a non-NULL arena pointer. If this type trait is not
+//   present on the type, then its destructor is always called when the
+//   containing arena is destroyed.
+//
+// This protocol is implemented by all arena-enabled proto2 message classes as
+// well as protobuf container types like RepeatedPtrField and Map. The protocol
+// is internal to protobuf and is not guaranteed to be stable. Non-proto types
+// should not rely on this protocol.
+class PROTOBUF_EXPORT PROTOBUF_ALIGNAS(8) Arena final {
+ public:
+  // Arena constructor taking custom options. See ArenaOptions above for
+  // descriptions of the options available.
+  explicit Arena(const ArenaOptions& options) : impl_(options) {
+    Init(options);
+  }
+
+  // Block overhead.  Use this as a guide for how much to over-allocate the
+  // initial block if you want an allocation of size N to fit inside it.
+  //
+  // WARNING: if you allocate multiple objects, it is difficult to guarantee
+  // that a series of allocations will fit in the initial block, especially if
+  // Arena changes its alignment guarantees in the future!
+  static const size_t kBlockOverhead = internal::ArenaImpl::kBlockHeaderSize +
+                                       internal::ArenaImpl::kSerialArenaSize;
+
+  // Default constructor with sensible default options, tuned for average
+  // use-cases.
+  Arena() : impl_(ArenaOptions()) { Init(ArenaOptions()); }
+
+  ~Arena() {
+    if (hooks_cookie_) {  // set by Init(): the init hook's return value, or NULL
+      CallDestructorHooks();
+    }
+  }
+
+  void Init(const ArenaOptions& options) {
+    on_arena_allocation_ = options.on_arena_allocation;
+    on_arena_reset_ = options.on_arena_reset;
+    on_arena_destruction_ = options.on_arena_destruction;
+    // Call the initialization hook; whatever it returns is kept as the cookie.
+    if (options.on_arena_init != NULL) {
+      hooks_cookie_ = options.on_arena_init(this);
+    } else {
+      hooks_cookie_ = NULL;  // no init hook: AllocHook() and ~Arena() then skip their hooks
+    }
+  }
+
+  // API to create proto2 message objects on the arena. If the arena passed in
+  // is NULL, then a heap allocated object is returned. Type T must be a message
+  // defined in a .proto file with cc_enable_arenas set to true, otherwise a
+  // compilation error will occur.
+  //
+  // RepeatedField and RepeatedPtrField may also be instantiated directly on an
+  // arena with this method.
+  //
+  // This function also accepts any type T that satisfies the arena message
+  // allocation protocol, documented above.
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateMessage(Arena* arena, Args&&... args) {
+    static_assert(
+        InternalHelper<T>::is_arena_constructable::value,
+        "CreateMessage can only construct types that are ArenaConstructable");
+    // We must delegate to CreateMaybeMessage() and NOT CreateMessageInternal()
+    // because protobuf generated classes specialize CreateMaybeMessage() and we
+    // need to use that specialization for code size reasons.
+    return Arena::CreateMaybeMessage<T>(arena, std::forward<Args>(args)...);
+  }
+
+  // API to create any objects on the arena. Note that only the object will
+  // be created on the arena; the underlying ptrs (in case of a proto2 message)
+  // will be still heap allocated. Proto messages should usually be allocated
+  // with CreateMessage<T>() instead.
+  //
+  // Note that even if T satisfies the arena message construction protocol
+  // (InternalArenaConstructable_ trait and optional DestructorSkippable_
+  // trait), as described above, this function does not follow the protocol;
+  // instead, it treats T as a black-box type, just as if it did not have these
+  // traits. Specifically, T's constructor arguments will always be only those
+  // passed to Create<T>() -- no additional arena pointer is implicitly added.
+  // Furthermore, the destructor will always be called at arena destruction time
+  // (unless the destructor is trivial). Hence, from T's point of view, it is as
+  // if the object were allocated on the heap (except that the underlying memory
+  // is obtained from the arena).
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* Create(Arena* arena, Args&&... args) {
+    return CreateNoMessage<T>(arena, is_arena_constructable<T>(),
+                              std::forward<Args>(args)...);
+  }
+
+  // Create an array of object type T on the arena *without* invoking the
+  // constructor of T. If `arena` is null, then the return value should be freed
+  // with `delete[] x;` (or `::operator delete[](x);`).
+  // To ensure safe uses, this function checks at compile time
+  // (via static_assert) that T is a POD type and is trivially
+  // destructible.
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE static T* CreateArray(Arena* arena,
+                                               size_t num_elements) {
+    static_assert(std::is_pod<T>::value,
+                  "CreateArray requires a trivially constructible type");
+    static_assert(std::is_trivially_destructible<T>::value,
+                  "CreateArray requires a trivially destructible type");
+    GOOGLE_CHECK_LE(num_elements, std::numeric_limits<size_t>::max() / sizeof(T))
+        << "Requested size is too large to fit into size_t.";
+    if (arena == NULL) {
+      return static_cast<T*>(::operator new[](num_elements * sizeof(T)));
+    } else {
+      return arena->CreateInternalRawArray<T>(num_elements);
+    }
+  }
+
+  // Returns the total space allocated by the arena, which is the sum of the
+  // sizes of the underlying blocks. This method is relatively fast; a counter
+  // is kept as blocks are allocated.
+  uint64 SpaceAllocated() const { return impl_.SpaceAllocated(); }
+  // Returns the total space used by the arena. Similar to SpaceAllocated but
+  // does not include free space and block overhead. The total space returned
+  // may not include space used by other threads executing concurrently with
+  // the call to this method.
+  uint64 SpaceUsed() const { return impl_.SpaceUsed(); }
+
+  // Frees all storage allocated by this arena after calling destructors
+  // registered with OwnDestructor() and freeing objects registered with Own().
+  // Any objects allocated on this arena are unusable after this call. It also
+  // returns the total space used by the arena, which is the sum of the sizes
+  // of the allocated blocks. This method is not thread-safe.
+  PROTOBUF_NOINLINE uint64 Reset() {
+    // Call the reset hook with SpaceAllocated(); gated on the hook pointer only.
+    if (on_arena_reset_ != NULL) {
+      on_arena_reset_(this, hooks_cookie_, impl_.SpaceAllocated());
+    }
+    return impl_.Reset();
+  }
+
+  // Adds |object| to a list of heap-allocated objects to be freed with |delete|
+  // when the arena is destroyed or reset.
+  template <typename T>
+  PROTOBUF_NOINLINE void Own(T* object) {
+    OwnInternal(object, std::is_convertible<T*, Message*>());
+  }
+
+  // Adds |object| to a list of objects whose destructors will be manually
+  // called when the arena is destroyed or reset. This differs from Own() in
+  // that it does not free the underlying memory with |delete|; hence, it is
+  // normally only used for objects that are placement-newed into
+  // arena-allocated memory.
+  template <typename T>
+  PROTOBUF_NOINLINE void OwnDestructor(T* object) {
+    if (object != NULL) {
+      impl_.AddCleanup(object, &internal::arena_destruct_object<T>);
+    }
+  }
+
+  // Adds a custom member function on an object to the list of destructors that
+  // will be manually called when the arena is destroyed or reset. This differs
+  // from OwnDestructor() in that any member function may be specified, not only
+  // the class destructor.
+  PROTOBUF_NOINLINE void OwnCustomDestructor(void* object,
+                                             void (*destruct)(void*)) {
+    impl_.AddCleanup(object, destruct);
+  }
+
+  // Retrieves the arena associated with |value| if |value| is an arena-capable
+  // message, or NULL otherwise. If possible, the call resolves at compile time.
+  // Note that we can often devirtualize calls to `value->GetArena()` so usually
+  // calling this method is unnecessary.
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE static Arena* GetArena(const T* value) {
+    return GetArenaInternal(value);
+  }
+
+  template <typename T>
+  class InternalHelper {
+    template <typename U>
+    static char DestructorSkippable(const typename U::DestructorSkippable_*);
+    template <typename U>
+    static double DestructorSkippable(...);
+    // True if T declares DestructorSkippable_ or is trivially destructible.
+    typedef std::integral_constant<
+        bool, sizeof(DestructorSkippable<T>(static_cast<const T*>(0))) ==
+                      sizeof(char) ||
+                  std::is_trivially_destructible<T>::value>
+        is_destructor_skippable;
+    // Overload pair probing for a nested U::InternalArenaConstructable_ type.
+    template <typename U>
+    static char ArenaConstructable(
+        const typename U::InternalArenaConstructable_*);
+    template <typename U>
+    static double ArenaConstructable(...);
+    // True if the char-returning ArenaConstructable overload was selected for T.
+    typedef std::integral_constant<bool, sizeof(ArenaConstructable<T>(
+                                             static_cast<const T*>(0))) ==
+                                             sizeof(char)>
+        is_arena_constructable;
+    // Enabled only when calling GetArena() on a `const U` yields exactly Arena*.
+    template <typename U,
+              typename std::enable_if<
+                  std::is_same<Arena*, decltype(std::declval<const U>()
+                                                    .GetArena())>::value,
+                  int>::type = 0>
+    static char HasGetArena(decltype(&U::GetArena));
+    template <typename U>
+    static double HasGetArena(...);
+    // True if the char-returning HasGetArena overload was selected for T.
+    typedef std::integral_constant<bool, sizeof(HasGetArena<T>(nullptr)) ==
+                                             sizeof(char)>
+        has_get_arena;
+    // Placement-constructs a T at `ptr`, forwarding `args` to T's constructor.
+    template <typename... Args>
+    static T* Construct(void* ptr, Args&&... args) {
+      return new (ptr) T(std::forward<Args>(args)...);
+    }
+
+    static Arena* GetArena(const T* p) { return p->GetArena(); }
+
+    friend class Arena;
+  };
+
+  // Helper typetraits that indicates support for arenas in a type T at compile
+  // time. This is public only to allow construction of higher-level templated
+  // utilities.
+  //
+  // is_arena_constructable<T>::value is true if the message type T has arena
+  // support enabled, and false otherwise.
+  //
+  // is_destructor_skippable<T>::value is true if the message type T has told
+  // the arena that it is safe to skip the destructor, and false otherwise.
+  //
+  // This is inside Arena because only Arena has the friend relationships
+  // necessary to see the underlying generated code traits.
+  template <typename T>
+  struct is_arena_constructable : InternalHelper<T>::is_arena_constructable {};
+  template <typename T>
+  struct is_destructor_skippable : InternalHelper<T>::is_destructor_skippable {
+  };
+
+ private:
+  template <typename T>
+  struct has_get_arena : InternalHelper<T>::has_get_arena {};
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateMessageInternal(Arena* arena,
+                                                         Args&&... args) {
+    static_assert(
+        InternalHelper<T>::is_arena_constructable::value,
+        "CreateMessage can only construct types that are ArenaConstructable");
+    if (arena == NULL) {
+      return new T(nullptr, std::forward<Args>(args)...);  // heap path: nullptr passed as the first ctor argument
+    } else {
+      return arena->DoCreateMessage<T>(std::forward<Args>(args)...);
+    }
+  }
+
+  // This specialization for no arguments is necessary, because its behavior is
+  // slightly different.  When the arena pointer is nullptr, it calls T()
+  // instead of T(nullptr).
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE static T* CreateMessageInternal(Arena* arena) {
+    static_assert(
+        InternalHelper<T>::is_arena_constructable::value,
+        "CreateMessage can only construct types that are ArenaConstructable");
+    if (arena == NULL) {
+      return new T();
+    } else {
+      return arena->DoCreateMessage<T>();
+    }
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateInternal(Arena* arena,
+                                                  Args&&... args) {
+    if (arena == NULL) {
+      return new T(std::forward<Args>(args)...);
+    } else {
+      return arena->DoCreate<T>(std::is_trivially_destructible<T>::value,
+                                std::forward<Args>(args)...);
+    }
+  }
+
+  void CallDestructorHooks();
+  void OnArenaAllocation(const std::type_info* allocated_type, size_t n) const;
+  inline void AllocHook(const std::type_info* allocated_type, size_t n) const {
+    if (PROTOBUF_PREDICT_FALSE(hooks_cookie_ != NULL)) {
+      OnArenaAllocation(allocated_type, n);
+    }
+  }
+
+  // Allocate and also optionally call on_arena_allocation callback with the
+  // allocated type info when the hooks are in place in ArenaOptions and
+  // the cookie is not null.
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE void* AllocateInternal(bool skip_explicit_ownership) {
+    static_assert(alignof(T) <= 8, "T is overaligned, see b/151247138");
+    const size_t n = internal::AlignUpTo8(sizeof(T));
+    AllocHook(RTTI_TYPE_ID(T), n);  // Monitor allocation if needed.
+    // Register arena_destruct_object<T> as cleanup unless the caller opted out.
+    if (skip_explicit_ownership) {
+      return AllocateAlignedNoHook(n);
+    } else {
+      return impl_.AllocateAlignedAndAddCleanup(
+          n, &internal::arena_destruct_object<T>);
+    }
+  }
+
+  // CreateMessage<T> requires that T supports arenas, but this private method
+  // works whether or not T supports arenas. These are not exposed to user code
+  // as it can cause confusing API usages, and end up having double free in
+  // user code. These are used only internally from LazyField and Repeated
+  // fields, since they are designed to work in all mode combinations.
+  template <typename Msg, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static Msg* DoCreateMaybeMessage(Arena* arena,
+                                                          std::true_type,
+                                                          Args&&... args) {
+    return CreateMessageInternal<Msg>(arena, std::forward<Args>(args)...);
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* DoCreateMaybeMessage(Arena* arena,
+                                                        std::false_type,
+                                                        Args&&... args) {
+    return CreateInternal<T>(arena, std::forward<Args>(args)...);
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateMaybeMessage(Arena* arena,
+                                                      Args&&... args) {
+    return DoCreateMaybeMessage<T>(arena, is_arena_constructable<T>(),
+                                   std::forward<Args>(args)...);
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateNoMessage(Arena* arena, std::true_type,
+                                                   Args&&... args) {
+    // User is constructing with Create() despite the fact that T supports arena
+    // construction.  In this case we have to delegate to CreateInternal(), and
+    // we can't use any CreateMaybeMessage() specialization that may be defined.
+    return CreateInternal<T>(arena, std::forward<Args>(args)...);
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE static T* CreateNoMessage(Arena* arena,
+                                                   std::false_type,
+                                                   Args&&... args) {
+    // User is constructing with Create() and the type does not support arena
+    // construction.  In this case we can delegate to CreateMaybeMessage() and
+    // use any specialization that may be available for that.
+    return CreateMaybeMessage<T>(arena, std::forward<Args>(args)...);
+  }
+
+  // Just allocate the required size for the given type assuming the
+  // type has a trivial constructor.
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE T* CreateInternalRawArray(size_t num_elements) {
+    GOOGLE_CHECK_LE(num_elements, std::numeric_limits<size_t>::max() / sizeof(T))
+        << "Requested size is too large to fit into size_t.";
+    const size_t n = internal::AlignUpTo8(sizeof(T) * num_elements);
+    // Monitor allocation if needed.
+    AllocHook(RTTI_TYPE_ID(T), n);
+    return static_cast<T*>(AllocateAlignedNoHook(n));
+  }
+
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE T* DoCreate(bool skip_explicit_ownership,
+                                     Args&&... args) {
+    return new (AllocateInternal<T>(skip_explicit_ownership))
+        T(std::forward<Args>(args)...);
+  }
+  template <typename T, typename... Args>
+  PROTOBUF_ALWAYS_INLINE T* DoCreateMessage(Args&&... args) {
+    return InternalHelper<T>::Construct(
+        AllocateInternal<T>(InternalHelper<T>::is_destructor_skippable::value),
+        this, std::forward<Args>(args)...);
+  }
+
+  // CreateInArenaStorage is used to implement map field. Without it,
+  // Map need to call generated message's protected arena constructor,
+  // which needs to declare Map as friend of generated message.
+  template <typename T, typename... Args>
+  static void CreateInArenaStorage(T* ptr, Arena* arena, Args&&... args) {
+    CreateInArenaStorageInternal(ptr, arena,
+                                 typename is_arena_constructable<T>::type(),
+                                 std::forward<Args>(args)...);
+    RegisterDestructorInternal(
+        ptr, arena,
+        typename InternalHelper<T>::is_destructor_skippable::type());
+  }
+
+  template <typename T, typename... Args>
+  static void CreateInArenaStorageInternal(T* ptr, Arena* arena,
+                                           std::true_type, Args&&... args) {
+    InternalHelper<T>::Construct(ptr, arena, std::forward<Args>(args)...);
+  }
+  template <typename T, typename... Args>
+  static void CreateInArenaStorageInternal(T* ptr, Arena* /* arena */,
+                                           std::false_type, Args&&... args) {
+    new (ptr) T(std::forward<Args>(args)...);
+  }
+
+  template <typename T>
+  static void RegisterDestructorInternal(T* /* ptr */, Arena* /* arena */,
+                                         std::true_type) {}
+  template <typename T>
+  static void RegisterDestructorInternal(T* ptr, Arena* arena,
+                                         std::false_type) {
+    arena->OwnDestructor(ptr);  // NOTE(review): dereferences `arena` unconditionally — confirm callers never pass NULL
+  }
+
+  // These implement Own(), which registers an object for deletion (destructor
+  // call and operator delete()). The second parameter has type 'true_type' if T
+  // is a subtype of Message and 'false_type' otherwise. Collapsing
+  // all template instantiations to one for generic Message reduces code size,
+  // using the virtual destructor instead.
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE void OwnInternal(T* object, std::true_type) {
+    if (object != NULL) {
+      impl_.AddCleanup(object, &internal::arena_delete_object<Message>);
+    }
+  }
+  template <typename T>
+  PROTOBUF_ALWAYS_INLINE void OwnInternal(T* object, std::false_type) {
+    if (object != NULL) {
+      impl_.AddCleanup(object, &internal::arena_delete_object<T>);
+    }
+  }
+
+  // Implementation for GetArena(). Only message objects with
+  // InternalArenaConstructable_ tags can be associated with an arena, and such
+  // objects must implement a GetArena() method.
+  template <typename T, typename std::enable_if<
+                            is_arena_constructable<T>::value, int>::type = 0>
+  PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) {
+    return InternalHelper<T>::GetArena(value);
+  }
+  template <typename T,
+            typename std::enable_if<!is_arena_constructable<T>::value &&
+                                        has_get_arena<T>::value,
+                                    int>::type = 0>
+  PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) {
+    return value->GetArena();
+  }
+  template <typename T,
+            typename std::enable_if<!is_arena_constructable<T>::value &&
+                                        !has_get_arena<T>::value,
+                                    int>::type = 0>
+  PROTOBUF_ALWAYS_INLINE static Arena* GetArenaInternal(const T* value) {
+    (void)value;
+    return nullptr;
+  }
+
+  // For friends of arena.
+  void* AllocateAligned(size_t n) {
+    AllocHook(NULL, n);  // no type info; the hook sees the requested size, before AlignUpTo8
+    return AllocateAlignedNoHook(internal::AlignUpTo8(n));
+  }
+  template<size_t Align>
+  void* AllocateAlignedTo(size_t n) {
+    static_assert(Align > 0, "Alignment must be greater than 0");
+    static_assert((Align & (Align - 1)) == 0, "Alignment must be power of two");
+    if (Align <= 8) return AllocateAligned(n);  // NOTE(review): assumes AllocateAligned() results are 8-byte aligned — confirm
+    // TODO(b/151247138): if the pointer would have been aligned already,
+    // this is wasting space. We should pass the alignment down.
+    uintptr_t ptr = reinterpret_cast<uintptr_t>(AllocateAligned(n + Align - 8));  // extra Align - 8 bytes leave room to round up
+    ptr = (ptr + Align - 1) & -Align;  // round up to a multiple of Align (a power of two, asserted above)
+    return reinterpret_cast<void*>(ptr);
+  }
+
+  void* AllocateAlignedNoHook(size_t n);
+
+  internal::ArenaImpl impl_;
+
+  void (*on_arena_allocation_)(const std::type_info* allocated_type,
+                               uint64 alloc_size, void* cookie);
+  void (*on_arena_reset_)(Arena* arena, void* cookie, uint64 space_used);
+  void (*on_arena_destruction_)(Arena* arena, void* cookie, uint64 space_used);
+
+  // The arena may save a cookie it receives from the external on_init hook
+  // and then use it when calling the on_reset and on_destruction hooks.
+  void* hooks_cookie_;
+
+  template <typename Type>
+  friend class internal::GenericTypeHandler;
+  friend struct internal::ArenaStringPtr;  // For AllocateAligned.
+  friend class internal::LazyField;        // For CreateMaybeMessage.
+  friend class internal::EpsCopyInputStream;  // For parser performance
+  friend class MessageLite;
+  template <typename Key, typename T>
+  friend class Map;
+};
+
+// Defined above for supporting environments without RTTI.
+#undef RTTI_TYPE_ID
+
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_ARENA_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..9eb2b2e55df09165755d5977cef55d75725ab2d2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/descriptor.pb.h
@@ -0,0 +1,12958 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/descriptor.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/generated_enum_reflection.h>
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fdescriptor_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fdescriptor_2eproto {  // static tables for descriptor.proto; every member is only declared here
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);  // NOTE(review): macro comes from port_def.inc; presumably a section placement attribute -- confirm there
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[27]  // 27 equals the number of message classes forward-declared in this header (presumably one table each)
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];  // unsized: the bound is supplied by the out-of-line definition
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fdescriptor_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class DescriptorProto;
+class DescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern DescriptorProtoDefaultTypeInternal _DescriptorProto_default_instance_;
+class DescriptorProto_ExtensionRange;
+class DescriptorProto_ExtensionRangeDefaultTypeInternal;
+PROTOBUF_EXPORT extern DescriptorProto_ExtensionRangeDefaultTypeInternal _DescriptorProto_ExtensionRange_default_instance_;
+class DescriptorProto_ReservedRange;
+class DescriptorProto_ReservedRangeDefaultTypeInternal;
+PROTOBUF_EXPORT extern DescriptorProto_ReservedRangeDefaultTypeInternal _DescriptorProto_ReservedRange_default_instance_;
+class EnumDescriptorProto;
+class EnumDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern EnumDescriptorProtoDefaultTypeInternal _EnumDescriptorProto_default_instance_;
+class EnumDescriptorProto_EnumReservedRange;
+class EnumDescriptorProto_EnumReservedRangeDefaultTypeInternal;
+PROTOBUF_EXPORT extern EnumDescriptorProto_EnumReservedRangeDefaultTypeInternal _EnumDescriptorProto_EnumReservedRange_default_instance_;
+class EnumOptions;
+class EnumOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern EnumOptionsDefaultTypeInternal _EnumOptions_default_instance_;
+class EnumValueDescriptorProto;
+class EnumValueDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern EnumValueDescriptorProtoDefaultTypeInternal _EnumValueDescriptorProto_default_instance_;
+class EnumValueOptions;
+class EnumValueOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern EnumValueOptionsDefaultTypeInternal _EnumValueOptions_default_instance_;
+class ExtensionRangeOptions;
+class ExtensionRangeOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern ExtensionRangeOptionsDefaultTypeInternal _ExtensionRangeOptions_default_instance_;
+class FieldDescriptorProto;
+class FieldDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern FieldDescriptorProtoDefaultTypeInternal _FieldDescriptorProto_default_instance_;
+class FieldOptions;
+class FieldOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern FieldOptionsDefaultTypeInternal _FieldOptions_default_instance_;
+class FileDescriptorProto;
+class FileDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern FileDescriptorProtoDefaultTypeInternal _FileDescriptorProto_default_instance_;
+class FileDescriptorSet;
+class FileDescriptorSetDefaultTypeInternal;
+PROTOBUF_EXPORT extern FileDescriptorSetDefaultTypeInternal _FileDescriptorSet_default_instance_;
+class FileOptions;
+class FileOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern FileOptionsDefaultTypeInternal _FileOptions_default_instance_;
+class GeneratedCodeInfo;
+class GeneratedCodeInfoDefaultTypeInternal;
+PROTOBUF_EXPORT extern GeneratedCodeInfoDefaultTypeInternal _GeneratedCodeInfo_default_instance_;
+class GeneratedCodeInfo_Annotation;
+class GeneratedCodeInfo_AnnotationDefaultTypeInternal;
+PROTOBUF_EXPORT extern GeneratedCodeInfo_AnnotationDefaultTypeInternal _GeneratedCodeInfo_Annotation_default_instance_;
+class MessageOptions;
+class MessageOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern MessageOptionsDefaultTypeInternal _MessageOptions_default_instance_;
+class MethodDescriptorProto;
+class MethodDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern MethodDescriptorProtoDefaultTypeInternal _MethodDescriptorProto_default_instance_;
+class MethodOptions;
+class MethodOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern MethodOptionsDefaultTypeInternal _MethodOptions_default_instance_;
+class OneofDescriptorProto;
+class OneofDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern OneofDescriptorProtoDefaultTypeInternal _OneofDescriptorProto_default_instance_;
+class OneofOptions;
+class OneofOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern OneofOptionsDefaultTypeInternal _OneofOptions_default_instance_;
+class ServiceDescriptorProto;
+class ServiceDescriptorProtoDefaultTypeInternal;
+PROTOBUF_EXPORT extern ServiceDescriptorProtoDefaultTypeInternal _ServiceDescriptorProto_default_instance_;
+class ServiceOptions;
+class ServiceOptionsDefaultTypeInternal;
+PROTOBUF_EXPORT extern ServiceOptionsDefaultTypeInternal _ServiceOptions_default_instance_;
+class SourceCodeInfo;
+class SourceCodeInfoDefaultTypeInternal;
+PROTOBUF_EXPORT extern SourceCodeInfoDefaultTypeInternal _SourceCodeInfo_default_instance_;
+class SourceCodeInfo_Location;
+class SourceCodeInfo_LocationDefaultTypeInternal;
+PROTOBUF_EXPORT extern SourceCodeInfo_LocationDefaultTypeInternal _SourceCodeInfo_Location_default_instance_;
+class UninterpretedOption;
+class UninterpretedOptionDefaultTypeInternal;
+PROTOBUF_EXPORT extern UninterpretedOptionDefaultTypeInternal _UninterpretedOption_default_instance_;
+class UninterpretedOption_NamePart;
+class UninterpretedOption_NamePartDefaultTypeInternal;
+PROTOBUF_EXPORT extern UninterpretedOption_NamePartDefaultTypeInternal _UninterpretedOption_NamePart_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::DescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::EnumValueOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumValueOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FieldDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FieldOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FileDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileDescriptorSet* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FileDescriptorSet>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FileOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FileOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MessageOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::MessageOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::MethodDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::MethodOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::MethodOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::OneofDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::OneofOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::OneofOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ServiceOptions* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ServiceOptions>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceCodeInfo* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::SourceCodeInfo>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::UninterpretedOption* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::UninterpretedOption>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+enum FieldDescriptorProto_Type : int {  // generated for the Type enum of FieldDescriptorProto (descriptor.proto); enumerators are numbered 1..18
+  FieldDescriptorProto_Type_TYPE_DOUBLE = 1,
+  FieldDescriptorProto_Type_TYPE_FLOAT = 2,
+  FieldDescriptorProto_Type_TYPE_INT64 = 3,
+  FieldDescriptorProto_Type_TYPE_UINT64 = 4,
+  FieldDescriptorProto_Type_TYPE_INT32 = 5,
+  FieldDescriptorProto_Type_TYPE_FIXED64 = 6,
+  FieldDescriptorProto_Type_TYPE_FIXED32 = 7,
+  FieldDescriptorProto_Type_TYPE_BOOL = 8,
+  FieldDescriptorProto_Type_TYPE_STRING = 9,
+  FieldDescriptorProto_Type_TYPE_GROUP = 10,
+  FieldDescriptorProto_Type_TYPE_MESSAGE = 11,
+  FieldDescriptorProto_Type_TYPE_BYTES = 12,
+  FieldDescriptorProto_Type_TYPE_UINT32 = 13,
+  FieldDescriptorProto_Type_TYPE_ENUM = 14,
+  FieldDescriptorProto_Type_TYPE_SFIXED32 = 15,
+  FieldDescriptorProto_Type_TYPE_SFIXED64 = 16,
+  FieldDescriptorProto_Type_TYPE_SINT32 = 17,
+  FieldDescriptorProto_Type_TYPE_SINT64 = 18
+};
+PROTOBUF_EXPORT bool FieldDescriptorProto_Type_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr FieldDescriptorProto_Type FieldDescriptorProto_Type_Type_MIN = FieldDescriptorProto_Type_TYPE_DOUBLE;  // smallest enumerator (1)
+constexpr FieldDescriptorProto_Type FieldDescriptorProto_Type_Type_MAX = FieldDescriptorProto_Type_TYPE_SINT64;  // largest enumerator (18)
+constexpr int FieldDescriptorProto_Type_Type_ARRAYSIZE = FieldDescriptorProto_Type_Type_MAX + 1;  // evaluates to 19
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldDescriptorProto_Type_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be FieldDescriptorProto_Type or an integral type (enforced by the static_assert)
+inline const std::string& FieldDescriptorProto_Type_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, FieldDescriptorProto_Type>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function FieldDescriptorProto_Type_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    FieldDescriptorProto_Type_descriptor(), enum_t_value);
+}
+inline bool FieldDescriptorProto_Type_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldDescriptorProto_Type* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<FieldDescriptorProto_Type>(
+    FieldDescriptorProto_Type_descriptor(), name, value);
+}
+enum FieldDescriptorProto_Label : int {  // generated for the Label enum of FieldDescriptorProto (descriptor.proto); enumerators are numbered 1..3
+  FieldDescriptorProto_Label_LABEL_OPTIONAL = 1,
+  FieldDescriptorProto_Label_LABEL_REQUIRED = 2,
+  FieldDescriptorProto_Label_LABEL_REPEATED = 3
+};
+PROTOBUF_EXPORT bool FieldDescriptorProto_Label_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr FieldDescriptorProto_Label FieldDescriptorProto_Label_Label_MIN = FieldDescriptorProto_Label_LABEL_OPTIONAL;  // smallest enumerator (1)
+constexpr FieldDescriptorProto_Label FieldDescriptorProto_Label_Label_MAX = FieldDescriptorProto_Label_LABEL_REPEATED;  // largest enumerator (3)
+constexpr int FieldDescriptorProto_Label_Label_ARRAYSIZE = FieldDescriptorProto_Label_Label_MAX + 1;  // evaluates to 4
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldDescriptorProto_Label_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be FieldDescriptorProto_Label or an integral type (enforced by the static_assert)
+inline const std::string& FieldDescriptorProto_Label_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, FieldDescriptorProto_Label>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function FieldDescriptorProto_Label_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    FieldDescriptorProto_Label_descriptor(), enum_t_value);
+}
+inline bool FieldDescriptorProto_Label_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldDescriptorProto_Label* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<FieldDescriptorProto_Label>(
+    FieldDescriptorProto_Label_descriptor(), name, value);
+}
+enum FileOptions_OptimizeMode : int {  // generated for the OptimizeMode enum of FileOptions (descriptor.proto); enumerators are numbered 1..3
+  FileOptions_OptimizeMode_SPEED = 1,
+  FileOptions_OptimizeMode_CODE_SIZE = 2,
+  FileOptions_OptimizeMode_LITE_RUNTIME = 3
+};
+PROTOBUF_EXPORT bool FileOptions_OptimizeMode_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr FileOptions_OptimizeMode FileOptions_OptimizeMode_OptimizeMode_MIN = FileOptions_OptimizeMode_SPEED;  // smallest enumerator (1)
+constexpr FileOptions_OptimizeMode FileOptions_OptimizeMode_OptimizeMode_MAX = FileOptions_OptimizeMode_LITE_RUNTIME;  // largest enumerator (3)
+constexpr int FileOptions_OptimizeMode_OptimizeMode_ARRAYSIZE = FileOptions_OptimizeMode_OptimizeMode_MAX + 1;  // evaluates to 4
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FileOptions_OptimizeMode_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be FileOptions_OptimizeMode or an integral type (enforced by the static_assert)
+inline const std::string& FileOptions_OptimizeMode_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, FileOptions_OptimizeMode>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function FileOptions_OptimizeMode_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    FileOptions_OptimizeMode_descriptor(), enum_t_value);
+}
+inline bool FileOptions_OptimizeMode_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FileOptions_OptimizeMode* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<FileOptions_OptimizeMode>(
+    FileOptions_OptimizeMode_descriptor(), name, value);
+}
+enum FieldOptions_CType : int {  // generated for the CType enum of FieldOptions (descriptor.proto); enumerators are numbered 0..2
+  FieldOptions_CType_STRING = 0,
+  FieldOptions_CType_CORD = 1,
+  FieldOptions_CType_STRING_PIECE = 2
+};
+PROTOBUF_EXPORT bool FieldOptions_CType_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr FieldOptions_CType FieldOptions_CType_CType_MIN = FieldOptions_CType_STRING;  // smallest enumerator (0)
+constexpr FieldOptions_CType FieldOptions_CType_CType_MAX = FieldOptions_CType_STRING_PIECE;  // largest enumerator (2)
+constexpr int FieldOptions_CType_CType_ARRAYSIZE = FieldOptions_CType_CType_MAX + 1;  // evaluates to 3
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldOptions_CType_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be FieldOptions_CType or an integral type (enforced by the static_assert)
+inline const std::string& FieldOptions_CType_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, FieldOptions_CType>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function FieldOptions_CType_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    FieldOptions_CType_descriptor(), enum_t_value);
+}
+inline bool FieldOptions_CType_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldOptions_CType* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<FieldOptions_CType>(
+    FieldOptions_CType_descriptor(), name, value);
+}
+enum FieldOptions_JSType : int {  // generated for the JSType enum of FieldOptions (descriptor.proto); enumerators are numbered 0..2
+  FieldOptions_JSType_JS_NORMAL = 0,
+  FieldOptions_JSType_JS_STRING = 1,
+  FieldOptions_JSType_JS_NUMBER = 2
+};
+PROTOBUF_EXPORT bool FieldOptions_JSType_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr FieldOptions_JSType FieldOptions_JSType_JSType_MIN = FieldOptions_JSType_JS_NORMAL;  // smallest enumerator (0)
+constexpr FieldOptions_JSType FieldOptions_JSType_JSType_MAX = FieldOptions_JSType_JS_NUMBER;  // largest enumerator (2)
+constexpr int FieldOptions_JSType_JSType_ARRAYSIZE = FieldOptions_JSType_JSType_MAX + 1;  // evaluates to 3
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* FieldOptions_JSType_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be FieldOptions_JSType or an integral type (enforced by the static_assert)
+inline const std::string& FieldOptions_JSType_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, FieldOptions_JSType>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function FieldOptions_JSType_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    FieldOptions_JSType_descriptor(), enum_t_value);
+}
+inline bool FieldOptions_JSType_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, FieldOptions_JSType* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<FieldOptions_JSType>(
+    FieldOptions_JSType_descriptor(), name, value);
+}
+enum MethodOptions_IdempotencyLevel : int {  // generated for the IdempotencyLevel enum of MethodOptions (descriptor.proto); enumerators are numbered 0..2
+  MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN = 0,
+  MethodOptions_IdempotencyLevel_NO_SIDE_EFFECTS = 1,
+  MethodOptions_IdempotencyLevel_IDEMPOTENT = 2
+};
+PROTOBUF_EXPORT bool MethodOptions_IdempotencyLevel_IsValid(int value);  // declaration only; the definition lives outside this header
+constexpr MethodOptions_IdempotencyLevel MethodOptions_IdempotencyLevel_IdempotencyLevel_MIN = MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN;  // smallest enumerator (0)
+constexpr MethodOptions_IdempotencyLevel MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX = MethodOptions_IdempotencyLevel_IDEMPOTENT;  // largest enumerator (2)
+constexpr int MethodOptions_IdempotencyLevel_IdempotencyLevel_ARRAYSIZE = MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX + 1;  // evaluates to 3
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* MethodOptions_IdempotencyLevel_descriptor();  // declaration only; result is passed to NameOfEnum/ParseNamedEnum below
+template<typename T>  // T must be MethodOptions_IdempotencyLevel or an integral type (enforced by the static_assert)
+inline const std::string& MethodOptions_IdempotencyLevel_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, MethodOptions_IdempotencyLevel>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function MethodOptions_IdempotencyLevel_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(  // lookup is delegated to the runtime, keyed by this enum's descriptor
+    MethodOptions_IdempotencyLevel_descriptor(), enum_t_value);
+}
+inline bool MethodOptions_IdempotencyLevel_Parse(  // forwards name and the value pointer to ParseNamedEnum and returns its result
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, MethodOptions_IdempotencyLevel* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<MethodOptions_IdempotencyLevel>(
+    MethodOptions_IdempotencyLevel_descriptor(), name, value);
+}
+// ===================================================================
+
+class PROTOBUF_EXPORT FileDescriptorSet PROTOBUF_FINAL :  // generated message class for google.protobuf.FileDescriptorSet; one repeated field, "file"
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileDescriptorSet) */ {
+ public:
+  inline FileDescriptorSet() : FileDescriptorSet(nullptr) {}  // delegates to the protected Arena* constructor with a null arena
+  virtual ~FileDescriptorSet();
+
+  FileDescriptorSet(const FileDescriptorSet& from);
+  FileDescriptorSet(FileDescriptorSet&& from) noexcept
+    : FileDescriptorSet() {  // default-construct first, then reuse move assignment
+    *this = ::std::move(from);
+  }
+
+  inline FileDescriptorSet& operator=(const FileDescriptorSet& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline FileDescriptorSet& operator=(FileDescriptorSet&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena: exchange internals instead of copying
+      if (this != &from) InternalSwap(&from);  // self-move is a no-op
+    } else {
+      CopyFrom(from);  // different arenas: fall back to a copy
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {  // alias for GetDescriptor()
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FileDescriptorSet& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FileDescriptorSet* internal_default_instance() {  // views the global default-instance holder as a FileDescriptorSet
+    return reinterpret_cast<const FileDescriptorSet*>(
+               &_FileDescriptorSet_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // index into file_level_metadata used by GetMetadataStatic()
+    0;
+
+  friend void swap(FileDescriptorSet& a, FileDescriptorSet& b) {  // ADL-visible swap; forwards to Swap()
+    a.Swap(&b);
+  }
+  inline void Swap(FileDescriptorSet* other) {
+    if (other == this) return;  // self-swap is a no-op
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: use the generic helper
+    }
+  }
+  void UnsafeArenaSwap(FileDescriptorSet* other) {  // like Swap(), but arena equality is only checked via GOOGLE_DCHECK
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FileDescriptorSet* New() const final {  // creates a new message with a null arena
+    return CreateMaybeMessage<FileDescriptorSet>(nullptr);
+  }
+
+  FileDescriptorSet* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // creates a new message, passing the given arena through
+    return CreateMaybeMessage<FileDescriptorSet>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FileDescriptorSet& from);
+  void MergeFrom(const FileDescriptorSet& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // returns the value currently held in _cached_size_
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FileDescriptorSet* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // fully-qualified proto message name as a literal
+    return "google.protobuf.FileDescriptorSet";
+  }
+  protected:
+  explicit FileDescriptorSet(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the delegating default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // calls AssignDescriptors on the file's descriptor table, then reads this message's entry
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kFileFieldNumber = 1,  // field number of "file" (matches "file = 1" below)
+  };
+  // repeated .google.protobuf.FileDescriptorProto file = 1;
+  int file_size() const;
+  private:
+  int _internal_file_size() const;
+  public:
+  void clear_file();
+  PROTOBUF_NAMESPACE_ID::FileDescriptorProto* mutable_file(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >*
+      mutable_file();
+  private:
+  const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& _internal_file(int index) const;
+  PROTOBUF_NAMESPACE_ID::FileDescriptorProto* _internal_add_file();
+  public:
+  const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& file(int index) const;
+  PROTOBUF_NAMESPACE_ID::FileDescriptorProto* add_file();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >&
+      file() const;
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.FileDescriptorSet)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto > file_;  // presumably the storage behind the "file" accessors (their bodies are not in this class body)
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by the const GetCachedSize()
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT FileDescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileDescriptorProto) */ {
+ public:
+  inline FileDescriptorProto() : FileDescriptorProto(nullptr) {}
+  virtual ~FileDescriptorProto();
+
+  FileDescriptorProto(const FileDescriptorProto& from);
+  FileDescriptorProto(FileDescriptorProto&& from) noexcept
+    : FileDescriptorProto() {
+    *this = ::std::move(from);
+  }
+
+  inline FileDescriptorProto& operator=(const FileDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline FileDescriptorProto& operator=(FileDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FileDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FileDescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const FileDescriptorProto*>(
+               &_FileDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    1;
+
+  friend void swap(FileDescriptorProto& a, FileDescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(FileDescriptorProto* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(FileDescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FileDescriptorProto* New() const final {
+    return CreateMaybeMessage<FileDescriptorProto>(nullptr);
+  }
+
+  FileDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<FileDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FileDescriptorProto& from);
+  void MergeFrom(const FileDescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FileDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.FileDescriptorProto";
+  }
+  protected:
+  explicit FileDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kDependencyFieldNumber = 3,
+    kMessageTypeFieldNumber = 4,
+    kEnumTypeFieldNumber = 5,
+    kServiceFieldNumber = 6,
+    kExtensionFieldNumber = 7,
+    kPublicDependencyFieldNumber = 10,
+    kWeakDependencyFieldNumber = 11,
+    kNameFieldNumber = 1,
+    kPackageFieldNumber = 2,
+    kSyntaxFieldNumber = 12,
+    kOptionsFieldNumber = 8,
+    kSourceCodeInfoFieldNumber = 9,
+  };
+  // repeated string dependency = 3;
+  int dependency_size() const;
+  private:
+  int _internal_dependency_size() const;
+  public:
+  void clear_dependency();
+  const std::string& dependency(int index) const;
+  std::string* mutable_dependency(int index);
+  void set_dependency(int index, const std::string& value);
+  void set_dependency(int index, std::string&& value);
+  void set_dependency(int index, const char* value);
+  void set_dependency(int index, const char* value, size_t size);
+  std::string* add_dependency();
+  void add_dependency(const std::string& value);
+  void add_dependency(std::string&& value);
+  void add_dependency(const char* value);
+  void add_dependency(const char* value, size_t size);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>& dependency() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>* mutable_dependency();
+  private:
+  const std::string& _internal_dependency(int index) const;
+  std::string* _internal_add_dependency();
+  public:
+
+  // repeated .google.protobuf.DescriptorProto message_type = 4;
+  int message_type_size() const;
+  private:
+  int _internal_message_type_size() const;
+  public:
+  void clear_message_type();
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* mutable_message_type(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >*
+      mutable_message_type();
+  private:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto& _internal_message_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* _internal_add_message_type();
+  public:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto& message_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* add_message_type();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >&
+      message_type() const;
+
+  // repeated .google.protobuf.EnumDescriptorProto enum_type = 5;
+  int enum_type_size() const;
+  private:
+  int _internal_enum_type_size() const;
+  public:
+  void clear_enum_type();
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* mutable_enum_type(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >*
+      mutable_enum_type();
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& _internal_enum_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* _internal_add_enum_type();
+  public:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& enum_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* add_enum_type();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >&
+      enum_type() const;
+
+  // repeated .google.protobuf.ServiceDescriptorProto service = 6;
+  int service_size() const;
+  private:
+  int _internal_service_size() const;
+  public:
+  void clear_service();
+  PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* mutable_service(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >*
+      mutable_service();
+  private:
+  const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& _internal_service(int index) const;
+  PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* _internal_add_service();
+  public:
+  const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& service(int index) const;
+  PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* add_service();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >&
+      service() const;
+
+  // repeated .google.protobuf.FieldDescriptorProto extension = 7;
+  int extension_size() const;
+  private:
+  int _internal_extension_size() const;
+  public:
+  void clear_extension();
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_extension(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+      mutable_extension();
+  private:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_extension(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_extension();
+  public:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& extension(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_extension();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+      extension() const;
+
+  // repeated int32 public_dependency = 10;
+  int public_dependency_size() const;
+  private:
+  int _internal_public_dependency_size() const;
+  public:
+  void clear_public_dependency();
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_public_dependency(int index) const;
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      _internal_public_dependency() const;
+  void _internal_add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      _internal_mutable_public_dependency();
+  public:
+  ::PROTOBUF_NAMESPACE_ID::int32 public_dependency(int index) const;
+  void set_public_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value);
+  void add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      public_dependency() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      mutable_public_dependency();
+
+  // repeated int32 weak_dependency = 11;
+  int weak_dependency_size() const;
+  private:
+  int _internal_weak_dependency_size() const;
+  public:
+  void clear_weak_dependency();
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_weak_dependency(int index) const;
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      _internal_weak_dependency() const;
+  void _internal_add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      _internal_mutable_weak_dependency();
+  public:
+  ::PROTOBUF_NAMESPACE_ID::int32 weak_dependency(int index) const;
+  void set_weak_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value);
+  void add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      weak_dependency() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      mutable_weak_dependency();
+
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional string package = 2;
+  bool has_package() const;
+  private:
+  bool _internal_has_package() const;
+  public:
+  void clear_package();
+  const std::string& package() const;
+  void set_package(const std::string& value);
+  void set_package(std::string&& value);
+  void set_package(const char* value);
+  void set_package(const char* value, size_t size);
+  std::string* mutable_package();
+  std::string* release_package();
+  void set_allocated_package(std::string* package);
+  private:
+  const std::string& _internal_package() const;
+  void _internal_set_package(const std::string& value);
+  std::string* _internal_mutable_package();
+  public:
+
+  // optional string syntax = 12;
+  bool has_syntax() const;
+  private:
+  bool _internal_has_syntax() const;
+  public:
+  void clear_syntax();
+  const std::string& syntax() const;
+  void set_syntax(const std::string& value);
+  void set_syntax(std::string&& value);
+  void set_syntax(const char* value);
+  void set_syntax(const char* value, size_t size);
+  std::string* mutable_syntax();
+  std::string* release_syntax();
+  void set_allocated_syntax(std::string* syntax);
+  private:
+  const std::string& _internal_syntax() const;
+  void _internal_set_syntax(const std::string& value);
+  std::string* _internal_mutable_syntax();
+  public:
+
+  // optional .google.protobuf.FileOptions options = 8;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::FileOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::FileOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::FileOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::FileOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::FileOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::FileOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::FileOptions* options);
+  PROTOBUF_NAMESPACE_ID::FileOptions* unsafe_arena_release_options();
+
+  // optional .google.protobuf.SourceCodeInfo source_code_info = 9;
+  bool has_source_code_info() const;
+  private:
+  bool _internal_has_source_code_info() const;
+  public:
+  void clear_source_code_info();
+  const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& source_code_info() const;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* release_source_code_info();
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* mutable_source_code_info();
+  void set_allocated_source_code_info(PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info);
+  private:
+  const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& _internal_source_code_info() const;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* _internal_mutable_source_code_info();
+  public:
+  void unsafe_arena_set_allocated_source_code_info(
+      PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info);
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* unsafe_arena_release_source_code_info();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.FileDescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string> dependency_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto > message_type_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto > enum_type_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto > service_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > extension_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > public_dependency_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > weak_dependency_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr package_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr syntax_;
+  PROTOBUF_NAMESPACE_ID::FileOptions* options_;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT DescriptorProto_ExtensionRange PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto.ExtensionRange) */ {
+ public:  // Message class for google.protobuf.DescriptorProto.ExtensionRange (fields: start = 1, end = 2, options = 3).
+  inline DescriptorProto_ExtensionRange() : DescriptorProto_ExtensionRange(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~DescriptorProto_ExtensionRange();
+  // Copy constructor is defined out of line; the move constructor default-constructs and then move-assigns.
+  DescriptorProto_ExtensionRange(const DescriptorProto_ExtensionRange& from);
+  DescriptorProto_ExtensionRange(DescriptorProto_ExtensionRange&& from) noexcept
+    : DescriptorProto_ExtensionRange() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom().
+  inline DescriptorProto_ExtensionRange& operator=(const DescriptorProto_ExtensionRange& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline DescriptorProto_ExtensionRange& operator=(DescriptorProto_ExtensionRange&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena: exchange contents instead of copying
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);  // arenas differ: fall back to a copy
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the unknown-field set kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // descriptor() forwards to GetDescriptor(); descriptor and reflection both come from GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const DescriptorProto_ExtensionRange& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const DescriptorProto_ExtensionRange* internal_default_instance() {
+    return reinterpret_cast<const DescriptorProto_ExtensionRange*>(
+               &_DescriptorProto_ExtensionRange_default_instance_);  // object declared outside this class
+  }
+  static constexpr int kIndexInFileMessages =
+    2;  // index GetMetadataStatic() uses into file_level_metadata
+  // Swap helpers: the friend swap() simply calls a.Swap(&b).
+  friend void swap(DescriptorProto_ExtensionRange& a, DescriptorProto_ExtensionRange& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(DescriptorProto_ExtensionRange* other) {
+    if (other == this) return;  // self-swap is a no-op
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: take the generic swap path
+    }
+  }
+  void UnsafeArenaSwap(DescriptorProto_ExtensionRange* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // both messages are expected to share an arena
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+  // New() creates an instance through CreateMaybeMessage with a null arena; New(arena) passes the given arena.
+  inline DescriptorProto_ExtensionRange* New() const final {
+    return CreateMaybeMessage<DescriptorProto_ExtensionRange>(nullptr);
+  }
+
+  DescriptorProto_ExtensionRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<DescriptorProto_ExtensionRange>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const DescriptorProto_ExtensionRange& from);
+  void MergeFrom(const DescriptorProto_ExtensionRange& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size / parse / serialize overrides are only declared here; their definitions are not in this header section.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // returns the value held by _cached_size_
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(DescriptorProto_ExtensionRange* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.DescriptorProto.ExtensionRange";
+  }
+  protected:
+  explicit DescriptorProto_ExtensionRange(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the public default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:  // GetMetadataStatic(): calls AssignDescriptors on the descriptor table, then returns entry kIndexInFileMessages.
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // field numbers; they match the "= N" in the per-field comments below
+    kOptionsFieldNumber = 3,
+    kStartFieldNumber = 1,
+    kEndFieldNumber = 2,
+  };
+  // optional .google.protobuf.ExtensionRangeOptions options = 3;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options);
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* unsafe_arena_release_options();
+
+  // optional int32 start = 1;
+  bool has_start() const;
+  private:
+  bool _internal_has_start() const;
+  public:
+  void clear_start();
+  ::PROTOBUF_NAMESPACE_ID::int32 start() const;
+  void set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const;
+  void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional int32 end = 2;
+  bool has_end() const;
+  private:
+  bool _internal_has_end() const;
+  public:
+  void clear_end();
+  ::PROTOBUF_NAMESPACE_ID::int32 end() const;
+  void set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const;
+  void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto.ExtensionRange)
+ private:
+  class _Internal;
+  // Data members. NOTE(review): options_/start_/end_ presumably back the accessors above; their definitions are not visible here.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by GetCachedSize() const
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options_;
+  ::PROTOBUF_NAMESPACE_ID::int32 start_;
+  ::PROTOBUF_NAMESPACE_ID::int32 end_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT DescriptorProto_ReservedRange PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto.ReservedRange) */ {
+ public:  // Message class for google.protobuf.DescriptorProto.ReservedRange (fields: start = 1, end = 2).
+  inline DescriptorProto_ReservedRange() : DescriptorProto_ReservedRange(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~DescriptorProto_ReservedRange();
+  // Copy constructor is defined out of line; the move constructor default-constructs and then move-assigns.
+  DescriptorProto_ReservedRange(const DescriptorProto_ReservedRange& from);
+  DescriptorProto_ReservedRange(DescriptorProto_ReservedRange&& from) noexcept
+    : DescriptorProto_ReservedRange() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom().
+  inline DescriptorProto_ReservedRange& operator=(const DescriptorProto_ReservedRange& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline DescriptorProto_ReservedRange& operator=(DescriptorProto_ReservedRange&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena: exchange contents instead of copying
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);  // arenas differ: fall back to a copy
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the unknown-field set kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // descriptor() forwards to GetDescriptor(); descriptor and reflection both come from GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const DescriptorProto_ReservedRange& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const DescriptorProto_ReservedRange* internal_default_instance() {
+    return reinterpret_cast<const DescriptorProto_ReservedRange*>(
+               &_DescriptorProto_ReservedRange_default_instance_);  // object declared outside this class
+  }
+  static constexpr int kIndexInFileMessages =
+    3;  // index GetMetadataStatic() uses into file_level_metadata
+  // Swap helpers: the friend swap() simply calls a.Swap(&b).
+  friend void swap(DescriptorProto_ReservedRange& a, DescriptorProto_ReservedRange& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(DescriptorProto_ReservedRange* other) {
+    if (other == this) return;  // self-swap is a no-op
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: take the generic swap path
+    }
+  }
+  void UnsafeArenaSwap(DescriptorProto_ReservedRange* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // both messages are expected to share an arena
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+  // New() creates an instance through CreateMaybeMessage with a null arena; New(arena) passes the given arena.
+  inline DescriptorProto_ReservedRange* New() const final {
+    return CreateMaybeMessage<DescriptorProto_ReservedRange>(nullptr);
+  }
+
+  DescriptorProto_ReservedRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<DescriptorProto_ReservedRange>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const DescriptorProto_ReservedRange& from);
+  void MergeFrom(const DescriptorProto_ReservedRange& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size / parse / serialize overrides are only declared here; their definitions are not in this header section.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // returns the value held by _cached_size_
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(DescriptorProto_ReservedRange* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.DescriptorProto.ReservedRange";
+  }
+  protected:
+  explicit DescriptorProto_ReservedRange(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the public default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:  // GetMetadataStatic(): calls AssignDescriptors on the descriptor table, then returns entry kIndexInFileMessages.
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // field numbers; they match the "= N" in the per-field comments below
+    kStartFieldNumber = 1,
+    kEndFieldNumber = 2,
+  };
+  // optional int32 start = 1;
+  bool has_start() const;
+  private:
+  bool _internal_has_start() const;
+  public:
+  void clear_start();
+  ::PROTOBUF_NAMESPACE_ID::int32 start() const;
+  void set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const;
+  void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional int32 end = 2;
+  bool has_end() const;
+  private:
+  bool _internal_has_end() const;
+  public:
+  void clear_end();
+  ::PROTOBUF_NAMESPACE_ID::int32 end() const;
+  void set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const;
+  void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto.ReservedRange)
+ private:
+  class _Internal;
+  // Data members. NOTE(review): start_/end_ presumably back the accessors above; their definitions are not visible here.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by GetCachedSize() const
+  ::PROTOBUF_NAMESPACE_ID::int32 start_;
+  ::PROTOBUF_NAMESPACE_ID::int32 end_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT DescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.DescriptorProto) */ {
+ public:
+  inline DescriptorProto() : DescriptorProto(nullptr) {}
+  virtual ~DescriptorProto();
+
+  DescriptorProto(const DescriptorProto& from);
+  DescriptorProto(DescriptorProto&& from) noexcept
+    : DescriptorProto() {
+    *this = ::std::move(from);
+  }
+
+  inline DescriptorProto& operator=(const DescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline DescriptorProto& operator=(DescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const DescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const DescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const DescriptorProto*>(
+               &_DescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    4;
+
+  friend void swap(DescriptorProto& a, DescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(DescriptorProto* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(DescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline DescriptorProto* New() const final {
+    return CreateMaybeMessage<DescriptorProto>(nullptr);
+  }
+
+  DescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<DescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const DescriptorProto& from);
+  void MergeFrom(const DescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(DescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.DescriptorProto";
+  }
+  protected:
+  explicit DescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef DescriptorProto_ExtensionRange ExtensionRange;
+  typedef DescriptorProto_ReservedRange ReservedRange;
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kFieldFieldNumber = 2,
+    kNestedTypeFieldNumber = 3,
+    kEnumTypeFieldNumber = 4,
+    kExtensionRangeFieldNumber = 5,
+    kExtensionFieldNumber = 6,
+    kOneofDeclFieldNumber = 8,
+    kReservedRangeFieldNumber = 9,
+    kReservedNameFieldNumber = 10,
+    kNameFieldNumber = 1,
+    kOptionsFieldNumber = 7,
+  };
+  // repeated .google.protobuf.FieldDescriptorProto field = 2;
+  int field_size() const;
+  private:
+  int _internal_field_size() const;
+  public:
+  void clear_field();
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_field(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+      mutable_field();
+  private:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_field(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_field();
+  public:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& field(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_field();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+      field() const;
+
+  // repeated .google.protobuf.DescriptorProto nested_type = 3;
+  int nested_type_size() const;
+  private:
+  int _internal_nested_type_size() const;
+  public:
+  void clear_nested_type();
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* mutable_nested_type(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >*
+      mutable_nested_type();
+  private:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto& _internal_nested_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* _internal_add_nested_type();
+  public:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto& nested_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto* add_nested_type();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >&
+      nested_type() const;
+
+  // repeated .google.protobuf.EnumDescriptorProto enum_type = 4;
+  int enum_type_size() const;
+  private:
+  int _internal_enum_type_size() const;
+  public:
+  void clear_enum_type();
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* mutable_enum_type(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >*
+      mutable_enum_type();
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& _internal_enum_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* _internal_add_enum_type();
+  public:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& enum_type(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* add_enum_type();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >&
+      enum_type() const;
+
+  // repeated .google.protobuf.DescriptorProto.ExtensionRange extension_range = 5;
+  int extension_range_size() const;
+  private:
+  int _internal_extension_range_size() const;
+  public:
+  void clear_extension_range();
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* mutable_extension_range(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >*
+      mutable_extension_range();
+  private:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& _internal_extension_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* _internal_add_extension_range();
+  public:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& extension_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* add_extension_range();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >&
+      extension_range() const;
+
+  // repeated .google.protobuf.FieldDescriptorProto extension = 6;
+  int extension_size() const;
+  private:
+  int _internal_extension_size() const;
+  public:
+  void clear_extension();
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* mutable_extension(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+      mutable_extension();
+  private:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& _internal_extension(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* _internal_add_extension();
+  public:
+  const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& extension(int index) const;
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* add_extension();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+      extension() const;
+
+  // repeated .google.protobuf.OneofDescriptorProto oneof_decl = 8;
+  int oneof_decl_size() const;
+  private:
+  int _internal_oneof_decl_size() const;
+  public:
+  void clear_oneof_decl();
+  PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* mutable_oneof_decl(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >*
+      mutable_oneof_decl();
+  private:
+  const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& _internal_oneof_decl(int index) const;
+  PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* _internal_add_oneof_decl();
+  public:
+  const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& oneof_decl(int index) const;
+  PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* add_oneof_decl();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >&
+      oneof_decl() const;
+
+  // repeated .google.protobuf.DescriptorProto.ReservedRange reserved_range = 9;
+  int reserved_range_size() const;
+  private:
+  int _internal_reserved_range_size() const;
+  public:
+  void clear_reserved_range();
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* mutable_reserved_range(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >*
+      mutable_reserved_range();
+  private:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& _internal_reserved_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* _internal_add_reserved_range();
+  public:
+  const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& reserved_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* add_reserved_range();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >&
+      reserved_range() const;
+
+  // repeated string reserved_name = 10;
+  int reserved_name_size() const;
+  private:
+  int _internal_reserved_name_size() const;
+  public:
+  void clear_reserved_name();
+  const std::string& reserved_name(int index) const;
+  std::string* mutable_reserved_name(int index);
+  void set_reserved_name(int index, const std::string& value);
+  void set_reserved_name(int index, std::string&& value);
+  void set_reserved_name(int index, const char* value);
+  void set_reserved_name(int index, const char* value, size_t size);
+  std::string* add_reserved_name();
+  void add_reserved_name(const std::string& value);
+  void add_reserved_name(std::string&& value);
+  void add_reserved_name(const char* value);
+  void add_reserved_name(const char* value, size_t size);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>& reserved_name() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>* mutable_reserved_name();
+  private:
+  const std::string& _internal_reserved_name(int index) const;
+  std::string* _internal_add_reserved_name();
+  public:
+
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional .google.protobuf.MessageOptions options = 7;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::MessageOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::MessageOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::MessageOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::MessageOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::MessageOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::MessageOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::MessageOptions* options);
+  PROTOBUF_NAMESPACE_ID::MessageOptions* unsafe_arena_release_options();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.DescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > field_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto > nested_type_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto > enum_type_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange > extension_range_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto > extension_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto > oneof_decl_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange > reserved_range_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string> reserved_name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  PROTOBUF_NAMESPACE_ID::MessageOptions* options_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+// ExtensionRangeOptions: protoc-emitted message class for google.protobuf.ExtensionRangeOptions; holds repeated uninterpreted_option (field 999) and an extension set.
+class PROTOBUF_EXPORT ExtensionRangeOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ExtensionRangeOptions) */ {
+ public:
+  inline ExtensionRangeOptions() : ExtensionRangeOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~ExtensionRangeOptions();
+
+  ExtensionRangeOptions(const ExtensionRangeOptions& from);
+  ExtensionRangeOptions(ExtensionRangeOptions&& from) noexcept
+    : ExtensionRangeOptions() {
+    *this = ::std::move(from);  // default-construct, then reuse move assignment below
+  }
+
+  inline ExtensionRangeOptions& operator=(const ExtensionRangeOptions& from) {
+    CopyFrom(from);  // copy assignment is expressed as CopyFrom
+    return *this;
+  }
+  inline ExtensionRangeOptions& operator=(ExtensionRangeOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena: exchange contents instead of copying
+      if (this != &from) InternalSwap(&from);  // self-move does nothing
+    } else {
+      CopyFrom(from);  // arenas differ: fall back to a copy
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();  // plain alias for GetDescriptor()
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const ExtensionRangeOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const ExtensionRangeOptions* internal_default_instance() {
+    return reinterpret_cast<const ExtensionRangeOptions*>(
+               &_ExtensionRangeOptions_default_instance_);  // object is declared outside this class; viewed here as the message type
+  }
+  static constexpr int kIndexInFileMessages =
+    5;  // index used by GetMetadataStatic() into file_level_metadata
+
+  friend void swap(ExtensionRangeOptions& a, ExtensionRangeOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(ExtensionRangeOptions* other) {
+    if (other == this) return;  // swapping with self is a no-op
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);  // same arena: direct member swap
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // different arenas: use the generic helper
+    }
+  }
+  void UnsafeArenaSwap(ExtensionRangeOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // caller must pass a message on the same arena; only checked via GOOGLE_DCHECK
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline ExtensionRangeOptions* New() const final {
+    return CreateMaybeMessage<ExtensionRangeOptions>(nullptr);  // no arena supplied
+  }
+
+  ExtensionRangeOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<ExtensionRangeOptions>(arena);  // created via the supplied arena pointer
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const ExtensionRangeOptions& from);
+  void MergeFrom(const ExtensionRangeOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // reads the mutable _cached_size_ member
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(ExtensionRangeOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.ExtensionRangeOptions";
+  }
+  protected:
+  explicit ExtensionRangeOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // the public default constructor forwards here with nullptr
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);  // called before the table is indexed
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,  // matches "uninterpreted_option = 999" below
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(ExtensionRangeOptions)  // macro defined elsewhere; presumably expands to the extension accessors backed by _extensions_ (confirm)
+  // @@protoc_insertion_point(class_scope:google.protobuf.ExtensionRangeOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;  // storage for field 999
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable so const methods (SetCachedSize) can update it
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+// FieldDescriptorProto: protoc-emitted message class for google.protobuf.FieldDescriptorProto; declares the nested Type/Label enum aliases and accessors for its eleven optional fields.
+class PROTOBUF_EXPORT FieldDescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldDescriptorProto) */ {
+ public:
+  inline FieldDescriptorProto() : FieldDescriptorProto(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~FieldDescriptorProto();
+
+  FieldDescriptorProto(const FieldDescriptorProto& from);
+  FieldDescriptorProto(FieldDescriptorProto&& from) noexcept
+    : FieldDescriptorProto() {
+    *this = ::std::move(from);  // default-construct, then reuse move assignment below
+  }
+
+  inline FieldDescriptorProto& operator=(const FieldDescriptorProto& from) {
+    CopyFrom(from);  // copy assignment is expressed as CopyFrom
+    return *this;
+  }
+  inline FieldDescriptorProto& operator=(FieldDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena: exchange contents instead of copying
+      if (this != &from) InternalSwap(&from);  // self-move does nothing
+    } else {
+      CopyFrom(from);  // arenas differ: fall back to a copy
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();  // plain alias for GetDescriptor()
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FieldDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FieldDescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const FieldDescriptorProto*>(
+               &_FieldDescriptorProto_default_instance_);  // object is declared outside this class; viewed here as the message type
+  }
+  static constexpr int kIndexInFileMessages =
+    6;  // index used by GetMetadataStatic() into file_level_metadata
+
+  friend void swap(FieldDescriptorProto& a, FieldDescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(FieldDescriptorProto* other) {
+    if (other == this) return;  // swapping with self is a no-op
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);  // same arena: direct member swap
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // different arenas: use the generic helper
+    }
+  }
+  void UnsafeArenaSwap(FieldDescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // caller must pass a message on the same arena; only checked via GOOGLE_DCHECK
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FieldDescriptorProto* New() const final {
+    return CreateMaybeMessage<FieldDescriptorProto>(nullptr);  // no arena supplied
+  }
+
+  FieldDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<FieldDescriptorProto>(arena);  // created via the supplied arena pointer
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FieldDescriptorProto& from);
+  void MergeFrom(const FieldDescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // reads the mutable _cached_size_ member
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FieldDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.FieldDescriptorProto";
+  }
+  protected:
+  explicit FieldDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // the public default constructor forwards here with nullptr
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);  // called before the table is indexed
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+  // Class-scoped aliases: each constant and helper below forwards to the FieldDescriptorProto_Type / FieldDescriptorProto_Label entity of the same name.
+  typedef FieldDescriptorProto_Type Type;
+  static constexpr Type TYPE_DOUBLE =
+    FieldDescriptorProto_Type_TYPE_DOUBLE;
+  static constexpr Type TYPE_FLOAT =
+    FieldDescriptorProto_Type_TYPE_FLOAT;
+  static constexpr Type TYPE_INT64 =
+    FieldDescriptorProto_Type_TYPE_INT64;
+  static constexpr Type TYPE_UINT64 =
+    FieldDescriptorProto_Type_TYPE_UINT64;
+  static constexpr Type TYPE_INT32 =
+    FieldDescriptorProto_Type_TYPE_INT32;
+  static constexpr Type TYPE_FIXED64 =
+    FieldDescriptorProto_Type_TYPE_FIXED64;
+  static constexpr Type TYPE_FIXED32 =
+    FieldDescriptorProto_Type_TYPE_FIXED32;
+  static constexpr Type TYPE_BOOL =
+    FieldDescriptorProto_Type_TYPE_BOOL;
+  static constexpr Type TYPE_STRING =
+    FieldDescriptorProto_Type_TYPE_STRING;
+  static constexpr Type TYPE_GROUP =
+    FieldDescriptorProto_Type_TYPE_GROUP;
+  static constexpr Type TYPE_MESSAGE =
+    FieldDescriptorProto_Type_TYPE_MESSAGE;
+  static constexpr Type TYPE_BYTES =
+    FieldDescriptorProto_Type_TYPE_BYTES;
+  static constexpr Type TYPE_UINT32 =
+    FieldDescriptorProto_Type_TYPE_UINT32;
+  static constexpr Type TYPE_ENUM =
+    FieldDescriptorProto_Type_TYPE_ENUM;
+  static constexpr Type TYPE_SFIXED32 =
+    FieldDescriptorProto_Type_TYPE_SFIXED32;
+  static constexpr Type TYPE_SFIXED64 =
+    FieldDescriptorProto_Type_TYPE_SFIXED64;
+  static constexpr Type TYPE_SINT32 =
+    FieldDescriptorProto_Type_TYPE_SINT32;
+  static constexpr Type TYPE_SINT64 =
+    FieldDescriptorProto_Type_TYPE_SINT64;
+  static inline bool Type_IsValid(int value) {
+    return FieldDescriptorProto_Type_IsValid(value);
+  }
+  static constexpr Type Type_MIN =
+    FieldDescriptorProto_Type_Type_MIN;
+  static constexpr Type Type_MAX =
+    FieldDescriptorProto_Type_Type_MAX;
+  static constexpr int Type_ARRAYSIZE =
+    FieldDescriptorProto_Type_Type_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  Type_descriptor() {
+    return FieldDescriptorProto_Type_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& Type_Name(T enum_t_value) {
+    static_assert(::std::is_same<T, Type>::value ||
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function Type_Name.");  // compile-time guard: only Type or an integral type is accepted
+    return FieldDescriptorProto_Type_Name(enum_t_value);
+  }
+  static inline bool Type_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      Type* value) {
+    return FieldDescriptorProto_Type_Parse(name, value);
+  }
+
+  typedef FieldDescriptorProto_Label Label;
+  static constexpr Label LABEL_OPTIONAL =
+    FieldDescriptorProto_Label_LABEL_OPTIONAL;
+  static constexpr Label LABEL_REQUIRED =
+    FieldDescriptorProto_Label_LABEL_REQUIRED;
+  static constexpr Label LABEL_REPEATED =
+    FieldDescriptorProto_Label_LABEL_REPEATED;
+  static inline bool Label_IsValid(int value) {
+    return FieldDescriptorProto_Label_IsValid(value);
+  }
+  static constexpr Label Label_MIN =
+    FieldDescriptorProto_Label_Label_MIN;
+  static constexpr Label Label_MAX =
+    FieldDescriptorProto_Label_Label_MAX;
+  static constexpr int Label_ARRAYSIZE =
+    FieldDescriptorProto_Label_Label_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  Label_descriptor() {
+    return FieldDescriptorProto_Label_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& Label_Name(T enum_t_value) {
+    static_assert(::std::is_same<T, Label>::value ||
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function Label_Name.");  // compile-time guard: only Label or an integral type is accepted
+    return FieldDescriptorProto_Label_Name(enum_t_value);
+  }
+  static inline bool Label_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      Label* value) {
+    return FieldDescriptorProto_Label_Parse(name, value);
+  }
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; listed in the same order as the data members at the bottom of the class, not in numeric order.
+  enum : int {
+    kNameFieldNumber = 1,
+    kExtendeeFieldNumber = 2,
+    kTypeNameFieldNumber = 6,
+    kDefaultValueFieldNumber = 7,
+    kJsonNameFieldNumber = 10,
+    kOptionsFieldNumber = 8,
+    kNumberFieldNumber = 3,
+    kOneofIndexFieldNumber = 9,
+    kProto3OptionalFieldNumber = 17,
+    kLabelFieldNumber = 4,
+    kTypeFieldNumber = 5,
+  };
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional string extendee = 2;
+  bool has_extendee() const;
+  private:
+  bool _internal_has_extendee() const;
+  public:
+  void clear_extendee();
+  const std::string& extendee() const;
+  void set_extendee(const std::string& value);
+  void set_extendee(std::string&& value);
+  void set_extendee(const char* value);
+  void set_extendee(const char* value, size_t size);
+  std::string* mutable_extendee();
+  std::string* release_extendee();
+  void set_allocated_extendee(std::string* extendee);
+  private:
+  const std::string& _internal_extendee() const;
+  void _internal_set_extendee(const std::string& value);
+  std::string* _internal_mutable_extendee();
+  public:
+
+  // optional string type_name = 6;
+  bool has_type_name() const;
+  private:
+  bool _internal_has_type_name() const;
+  public:
+  void clear_type_name();
+  const std::string& type_name() const;
+  void set_type_name(const std::string& value);
+  void set_type_name(std::string&& value);
+  void set_type_name(const char* value);
+  void set_type_name(const char* value, size_t size);
+  std::string* mutable_type_name();
+  std::string* release_type_name();
+  void set_allocated_type_name(std::string* type_name);
+  private:
+  const std::string& _internal_type_name() const;
+  void _internal_set_type_name(const std::string& value);
+  std::string* _internal_mutable_type_name();
+  public:
+
+  // optional string default_value = 7;
+  bool has_default_value() const;
+  private:
+  bool _internal_has_default_value() const;
+  public:
+  void clear_default_value();
+  const std::string& default_value() const;
+  void set_default_value(const std::string& value);
+  void set_default_value(std::string&& value);
+  void set_default_value(const char* value);
+  void set_default_value(const char* value, size_t size);
+  std::string* mutable_default_value();
+  std::string* release_default_value();
+  void set_allocated_default_value(std::string* default_value);
+  private:
+  const std::string& _internal_default_value() const;
+  void _internal_set_default_value(const std::string& value);
+  std::string* _internal_mutable_default_value();
+  public:
+
+  // optional string json_name = 10;
+  bool has_json_name() const;
+  private:
+  bool _internal_has_json_name() const;
+  public:
+  void clear_json_name();
+  const std::string& json_name() const;
+  void set_json_name(const std::string& value);
+  void set_json_name(std::string&& value);
+  void set_json_name(const char* value);
+  void set_json_name(const char* value, size_t size);
+  std::string* mutable_json_name();
+  std::string* release_json_name();
+  void set_allocated_json_name(std::string* json_name);
+  private:
+  const std::string& _internal_json_name() const;
+  void _internal_set_json_name(const std::string& value);
+  std::string* _internal_mutable_json_name();
+  public:
+
+  // optional .google.protobuf.FieldOptions options = 8;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::FieldOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::FieldOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::FieldOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::FieldOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::FieldOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::FieldOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::FieldOptions* options);
+  PROTOBUF_NAMESPACE_ID::FieldOptions* unsafe_arena_release_options();
+
+  // optional int32 number = 3;
+  bool has_number() const;
+  private:
+  bool _internal_has_number() const;
+  public:
+  void clear_number();
+  ::PROTOBUF_NAMESPACE_ID::int32 number() const;
+  void set_number(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_number() const;
+  void _internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional int32 oneof_index = 9;
+  bool has_oneof_index() const;
+  private:
+  bool _internal_has_oneof_index() const;
+  public:
+  void clear_oneof_index();
+  ::PROTOBUF_NAMESPACE_ID::int32 oneof_index() const;
+  void set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_oneof_index() const;
+  void _internal_set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional bool proto3_optional = 17;
+  bool has_proto3_optional() const;
+  private:
+  bool _internal_has_proto3_optional() const;
+  public:
+  void clear_proto3_optional();
+  bool proto3_optional() const;
+  void set_proto3_optional(bool value);
+  private:
+  bool _internal_proto3_optional() const;
+  void _internal_set_proto3_optional(bool value);
+  public:
+
+  // optional .google.protobuf.FieldDescriptorProto.Label label = 4;
+  bool has_label() const;
+  private:
+  bool _internal_has_label() const;
+  public:
+  void clear_label();
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label label() const;
+  void set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value);
+  private:
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label _internal_label() const;
+  void _internal_set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value);
+  public:
+
+  // optional .google.protobuf.FieldDescriptorProto.Type type = 5;
+  bool has_type() const;
+  private:
+  bool _internal_has_type() const;
+  public:
+  void clear_type();
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type type() const;
+  void set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value);
+  private:
+  PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type _internal_type() const;
+  void _internal_set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.FieldDescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // presumably backs the has_*() presence accessors; their bodies are not in this class (confirm)
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable so const methods (SetCachedSize) can update it
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr extendee_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr type_name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr default_value_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr json_name_;
+  PROTOBUF_NAMESPACE_ID::FieldOptions* options_;
+  ::PROTOBUF_NAMESPACE_ID::int32 number_;
+  ::PROTOBUF_NAMESPACE_ID::int32 oneof_index_;
+  bool proto3_optional_;
+  int label_;  // stored as plain int although label() is declared to return FieldDescriptorProto_Label
+  int type_;  // stored as plain int although type() is declared to return FieldDescriptorProto_Type
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT OneofDescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.OneofDescriptorProto) */ {
+ public:
+  inline OneofDescriptorProto() : OneofDescriptorProto(nullptr) {}
+  virtual ~OneofDescriptorProto();
+
+  OneofDescriptorProto(const OneofDescriptorProto& from);
+  OneofDescriptorProto(OneofDescriptorProto&& from) noexcept
+    : OneofDescriptorProto() {
+    *this = ::std::move(from);
+  }
+
+  inline OneofDescriptorProto& operator=(const OneofDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline OneofDescriptorProto& operator=(OneofDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const OneofDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const OneofDescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const OneofDescriptorProto*>(
+               &_OneofDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    7;
+
+  friend void swap(OneofDescriptorProto& a, OneofDescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(OneofDescriptorProto* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(OneofDescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline OneofDescriptorProto* New() const final {
+    return CreateMaybeMessage<OneofDescriptorProto>(nullptr);
+  }
+
+  OneofDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<OneofDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const OneofDescriptorProto& from);
+  void MergeFrom(const OneofDescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(OneofDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.OneofDescriptorProto";
+  }
+  protected:
+  explicit OneofDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kNameFieldNumber = 1,
+    kOptionsFieldNumber = 2,
+  };
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional .google.protobuf.OneofOptions options = 2;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::OneofOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::OneofOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::OneofOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::OneofOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::OneofOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::OneofOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::OneofOptions* options);
+  PROTOBUF_NAMESPACE_ID::OneofOptions* unsafe_arena_release_options();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.OneofDescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  PROTOBUF_NAMESPACE_ID::OneofOptions* options_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT EnumDescriptorProto_EnumReservedRange PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumDescriptorProto.EnumReservedRange) */ {
+ public:
+  inline EnumDescriptorProto_EnumReservedRange() : EnumDescriptorProto_EnumReservedRange(nullptr) {}
+  virtual ~EnumDescriptorProto_EnumReservedRange();
+
+  EnumDescriptorProto_EnumReservedRange(const EnumDescriptorProto_EnumReservedRange& from);
+  EnumDescriptorProto_EnumReservedRange(EnumDescriptorProto_EnumReservedRange&& from) noexcept
+    : EnumDescriptorProto_EnumReservedRange() {
+    *this = ::std::move(from);
+  }
+
+  inline EnumDescriptorProto_EnumReservedRange& operator=(const EnumDescriptorProto_EnumReservedRange& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline EnumDescriptorProto_EnumReservedRange& operator=(EnumDescriptorProto_EnumReservedRange&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const EnumDescriptorProto_EnumReservedRange& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const EnumDescriptorProto_EnumReservedRange* internal_default_instance() {
+    return reinterpret_cast<const EnumDescriptorProto_EnumReservedRange*>(
+               &_EnumDescriptorProto_EnumReservedRange_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    8;
+
+  friend void swap(EnumDescriptorProto_EnumReservedRange& a, EnumDescriptorProto_EnumReservedRange& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(EnumDescriptorProto_EnumReservedRange* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(EnumDescriptorProto_EnumReservedRange* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline EnumDescriptorProto_EnumReservedRange* New() const final {
+    return CreateMaybeMessage<EnumDescriptorProto_EnumReservedRange>(nullptr);
+  }
+
+  EnumDescriptorProto_EnumReservedRange* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<EnumDescriptorProto_EnumReservedRange>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const EnumDescriptorProto_EnumReservedRange& from);
+  void MergeFrom(const EnumDescriptorProto_EnumReservedRange& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(EnumDescriptorProto_EnumReservedRange* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.EnumDescriptorProto.EnumReservedRange";
+  }
+  protected:
+  explicit EnumDescriptorProto_EnumReservedRange(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kStartFieldNumber = 1,
+    kEndFieldNumber = 2,
+  };
+  // optional int32 start = 1;
+  bool has_start() const;
+  private:
+  bool _internal_has_start() const;
+  public:
+  void clear_start();
+  ::PROTOBUF_NAMESPACE_ID::int32 start() const;
+  void set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_start() const;
+  void _internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional int32 end = 2;
+  bool has_end() const;
+  private:
+  bool _internal_has_end() const;
+  public:
+  void clear_end();
+  ::PROTOBUF_NAMESPACE_ID::int32 end() const;
+  void set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const;
+  void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.EnumDescriptorProto.EnumReservedRange)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::int32 start_;
+  ::PROTOBUF_NAMESPACE_ID::int32 end_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT EnumDescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumDescriptorProto) */ {
+ public:
+  inline EnumDescriptorProto() : EnumDescriptorProto(nullptr) {}
+  virtual ~EnumDescriptorProto();
+
+  EnumDescriptorProto(const EnumDescriptorProto& from);
+  EnumDescriptorProto(EnumDescriptorProto&& from) noexcept
+    : EnumDescriptorProto() {
+    *this = ::std::move(from);
+  }
+
+  inline EnumDescriptorProto& operator=(const EnumDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline EnumDescriptorProto& operator=(EnumDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const EnumDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const EnumDescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const EnumDescriptorProto*>(
+               &_EnumDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    9;
+
+  friend void swap(EnumDescriptorProto& a, EnumDescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(EnumDescriptorProto* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(EnumDescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline EnumDescriptorProto* New() const final {
+    return CreateMaybeMessage<EnumDescriptorProto>(nullptr);
+  }
+
+  EnumDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<EnumDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const EnumDescriptorProto& from);
+  void MergeFrom(const EnumDescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(EnumDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.EnumDescriptorProto";
+  }
+  protected:
+  explicit EnumDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef EnumDescriptorProto_EnumReservedRange EnumReservedRange;
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kValueFieldNumber = 2,
+    kReservedRangeFieldNumber = 4,
+    kReservedNameFieldNumber = 5,
+    kNameFieldNumber = 1,
+    kOptionsFieldNumber = 3,
+  };
+  // repeated .google.protobuf.EnumValueDescriptorProto value = 2;
+  int value_size() const;
+  private:
+  int _internal_value_size() const;
+  public:
+  void clear_value();
+  PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* mutable_value(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >*
+      mutable_value();
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& _internal_value(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* _internal_add_value();
+  public:
+  const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& value(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* add_value();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >&
+      value() const;
+
+  // repeated .google.protobuf.EnumDescriptorProto.EnumReservedRange reserved_range = 4;
+  int reserved_range_size() const;
+  private:
+  int _internal_reserved_range_size() const;
+  public:
+  void clear_reserved_range();
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* mutable_reserved_range(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >*
+      mutable_reserved_range();
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& _internal_reserved_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* _internal_add_reserved_range();
+  public:
+  const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange& reserved_range(int index) const;
+  PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange* add_reserved_range();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >&
+      reserved_range() const;
+
+  // repeated string reserved_name = 5;
+  int reserved_name_size() const;
+  private:
+  int _internal_reserved_name_size() const;
+  public:
+  void clear_reserved_name();
+  const std::string& reserved_name(int index) const;
+  std::string* mutable_reserved_name(int index);
+  void set_reserved_name(int index, const std::string& value);
+  void set_reserved_name(int index, std::string&& value);
+  void set_reserved_name(int index, const char* value);
+  void set_reserved_name(int index, const char* value, size_t size);
+  std::string* add_reserved_name();
+  void add_reserved_name(const std::string& value);
+  void add_reserved_name(std::string&& value);
+  void add_reserved_name(const char* value);
+  void add_reserved_name(const char* value, size_t size);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>& reserved_name() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>* mutable_reserved_name();
+  private:
+  const std::string& _internal_reserved_name(int index) const;
+  std::string* _internal_add_reserved_name();
+  public:
+
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional .google.protobuf.EnumOptions options = 3;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::EnumOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::EnumOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::EnumOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::EnumOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::EnumOptions* options);
+  PROTOBUF_NAMESPACE_ID::EnumOptions* unsafe_arena_release_options();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.EnumDescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto > value_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange > reserved_range_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string> reserved_name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  PROTOBUF_NAMESPACE_ID::EnumOptions* options_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT EnumValueDescriptorProto PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumValueDescriptorProto) */ {
+ public:
+  inline EnumValueDescriptorProto() : EnumValueDescriptorProto(nullptr) {}
+  virtual ~EnumValueDescriptorProto();
+
+  EnumValueDescriptorProto(const EnumValueDescriptorProto& from);
+  EnumValueDescriptorProto(EnumValueDescriptorProto&& from) noexcept
+    : EnumValueDescriptorProto() {
+    *this = ::std::move(from);
+  }
+
+  inline EnumValueDescriptorProto& operator=(const EnumValueDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline EnumValueDescriptorProto& operator=(EnumValueDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const EnumValueDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const EnumValueDescriptorProto* internal_default_instance() {
+    return reinterpret_cast<const EnumValueDescriptorProto*>(
+               &_EnumValueDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    10;
+
+  friend void swap(EnumValueDescriptorProto& a, EnumValueDescriptorProto& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(EnumValueDescriptorProto* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(EnumValueDescriptorProto* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline EnumValueDescriptorProto* New() const final {
+    return CreateMaybeMessage<EnumValueDescriptorProto>(nullptr);
+  }
+
+  EnumValueDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<EnumValueDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const EnumValueDescriptorProto& from);
+  void MergeFrom(const EnumValueDescriptorProto& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(EnumValueDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.EnumValueDescriptorProto";
+  }
+  protected:
+  explicit EnumValueDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kNameFieldNumber = 1,
+    kOptionsFieldNumber = 3,
+    kNumberFieldNumber = 2,
+  };
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();
+  void set_allocated_name(std::string* name);
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional .google.protobuf.EnumValueOptions options = 3;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::EnumValueOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumValueOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::EnumValueOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(
+      PROTOBUF_NAMESPACE_ID::EnumValueOptions* options);
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* unsafe_arena_release_options();
+
+  // optional int32 number = 2;
+  bool has_number() const;
+  private:
+  bool _internal_has_number() const;
+  public:
+  void clear_number();
+  ::PROTOBUF_NAMESPACE_ID::int32 number() const;
+  void set_number(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_number() const;
+  void _internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.EnumValueDescriptorProto)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* options_;
+  ::PROTOBUF_NAMESPACE_ID::int32 number_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT ServiceDescriptorProto PROTOBUF_FINAL :  // Message class whose FullMessageName() is "google.protobuf.ServiceDescriptorProto".
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ServiceDescriptorProto) */ {
+ public:
+  inline ServiceDescriptorProto() : ServiceDescriptorProto(nullptr) {}  // Delegates to the arena constructor with a null arena.
+  virtual ~ServiceDescriptorProto();
+
+  ServiceDescriptorProto(const ServiceDescriptorProto& from);
+  ServiceDescriptorProto(ServiceDescriptorProto&& from) noexcept  // Move construction: default-construct, then move-assign.
+    : ServiceDescriptorProto() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment is a thin wrapper around CopyFrom().
+  inline ServiceDescriptorProto& operator=(const ServiceDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline ServiceDescriptorProto& operator=(ServiceDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Same arena on both sides: exchange internals instead of copying.
+      if (this != &from) InternalSwap(&from);  // Self-move-assignment does nothing.
+    } else {
+      CopyFrom(from);  // Arenas differ: fall back to a copy.
+    }
+    return *this;
+  }
+  // Accessors for the unknown-field set stored in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // descriptor() is an alias for GetDescriptor(); both static lookups below read from GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const ServiceDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const ServiceDescriptorProto* internal_default_instance() {  // Reinterprets the address of _ServiceDescriptorProto_default_instance_ as this type.
+    return reinterpret_cast<const ServiceDescriptorProto*>(
+               &_ServiceDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // Index used by GetMetadataStatic() into file_level_metadata.
+    11;
+
+  friend void swap(ServiceDescriptorProto& a, ServiceDescriptorProto& b) {  // Free-function swap; forwards to Swap().
+    a.Swap(&b);
+  }
+  inline void Swap(ServiceDescriptorProto* other) {
+    if (other == this) return;  // Swapping with self is a no-op.
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // Arenas differ: defer to the library's GenericSwap helper.
+    }
+  }
+  void UnsafeArenaSwap(ServiceDescriptorProto* other) {  // Like Swap(), but only debug-checks that both arenas match.
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline ServiceDescriptorProto* New() const final {  // Forwards to CreateMaybeMessage with a null arena.
+    return CreateMaybeMessage<ServiceDescriptorProto>(nullptr);
+  }
+
+  ServiceDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // Forwards to CreateMaybeMessage with the caller's arena.
+    return CreateMaybeMessage<ServiceDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const ServiceDescriptorProto& from);  // Same-type overload of the Message-typed CopyFrom above.
+  void MergeFrom(const ServiceDescriptorProto& from);  // Same-type overload of the Message-typed MergeFrom above.
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // Returns whatever _cached_size_ currently holds.
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(ServiceDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // Fully-qualified message name as a string literal.
+    return "google.protobuf.ServiceDescriptorProto";
+  }
+  protected:
+  explicit ServiceDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Arena-taking constructor; protected, and the target of the public default constructor.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the descriptor table, then returns this message's metadata entry.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // Field-number constants; values match the "= N" tags in the per-field comments below.
+    kMethodFieldNumber = 2,
+    kNameFieldNumber = 1,
+    kOptionsFieldNumber = 3,
+  };
+  // repeated .google.protobuf.MethodDescriptorProto method = 2;
+  int method_size() const;
+  private:  // The _internal_* variants throughout this section are kept private.
+  int _internal_method_size() const;
+  public:
+  void clear_method();
+  PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* mutable_method(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >*
+      mutable_method();
+  private:
+  const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& _internal_method(int index) const;
+  PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* _internal_add_method();
+  public:
+  const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto& method(int index) const;
+  PROTOBUF_NAMESPACE_ID::MethodDescriptorProto* add_method();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >&
+      method() const;
+
+  // optional string name = 1;
+  bool has_name() const;
+  private:
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();  // NOTE(review): presumably hands ownership of the string to the caller - confirm in the definition.
+  void set_allocated_name(std::string* name);  // NOTE(review): presumably takes ownership of `name` - confirm in the definition.
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional .google.protobuf.ServiceOptions options = 3;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::ServiceOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::ServiceOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::ServiceOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(  // unsafe_arena_* counterparts of set_allocated_options()/release_options(); semantics are defined out of view.
+      PROTOBUF_NAMESPACE_ID::ServiceOptions* options);
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* unsafe_arena_release_options();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.ServiceDescriptorProto)
+ private:
+  class _Internal;
+  // Data members. Keep the declaration order as-is: it determines the object layout.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // NOTE(review): presumably the presence bits behind has_name()/has_options() - confirm.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // Read by GetCachedSize(); mutable, presumably so the const SetCachedSize() can write it.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::MethodDescriptorProto > method_;  // Same type that method()/mutable_method() expose.
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* options_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT MethodDescriptorProto PROTOBUF_FINAL :  // Message class whose FullMessageName() is "google.protobuf.MethodDescriptorProto".
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MethodDescriptorProto) */ {
+ public:
+  inline MethodDescriptorProto() : MethodDescriptorProto(nullptr) {}  // Delegates to the arena constructor with a null arena.
+  virtual ~MethodDescriptorProto();
+
+  MethodDescriptorProto(const MethodDescriptorProto& from);
+  MethodDescriptorProto(MethodDescriptorProto&& from) noexcept  // Move construction: default-construct, then move-assign.
+    : MethodDescriptorProto() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment is a thin wrapper around CopyFrom().
+  inline MethodDescriptorProto& operator=(const MethodDescriptorProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline MethodDescriptorProto& operator=(MethodDescriptorProto&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Same arena on both sides: exchange internals instead of copying.
+      if (this != &from) InternalSwap(&from);  // Self-move-assignment does nothing.
+    } else {
+      CopyFrom(from);  // Arenas differ: fall back to a copy.
+    }
+    return *this;
+  }
+  // Accessors for the unknown-field set stored in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // descriptor() is an alias for GetDescriptor(); both static lookups below read from GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const MethodDescriptorProto& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const MethodDescriptorProto* internal_default_instance() {  // Reinterprets the address of _MethodDescriptorProto_default_instance_ as this type.
+    return reinterpret_cast<const MethodDescriptorProto*>(
+               &_MethodDescriptorProto_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // Index used by GetMetadataStatic() into file_level_metadata.
+    12;
+
+  friend void swap(MethodDescriptorProto& a, MethodDescriptorProto& b) {  // Free-function swap; forwards to Swap().
+    a.Swap(&b);
+  }
+  inline void Swap(MethodDescriptorProto* other) {
+    if (other == this) return;  // Swapping with self is a no-op.
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // Arenas differ: defer to the library's GenericSwap helper.
+    }
+  }
+  void UnsafeArenaSwap(MethodDescriptorProto* other) {  // Like Swap(), but only debug-checks that both arenas match.
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline MethodDescriptorProto* New() const final {  // Forwards to CreateMaybeMessage with a null arena.
+    return CreateMaybeMessage<MethodDescriptorProto>(nullptr);
+  }
+
+  MethodDescriptorProto* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // Forwards to CreateMaybeMessage with the caller's arena.
+    return CreateMaybeMessage<MethodDescriptorProto>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const MethodDescriptorProto& from);  // Same-type overload of the Message-typed CopyFrom above.
+  void MergeFrom(const MethodDescriptorProto& from);  // Same-type overload of the Message-typed MergeFrom above.
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // Returns whatever _cached_size_ currently holds.
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(MethodDescriptorProto* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // Fully-qualified message name as a string literal.
+    return "google.protobuf.MethodDescriptorProto";
+  }
+  protected:
+  explicit MethodDescriptorProto(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Arena-taking constructor; protected, and the target of the public default constructor.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the descriptor table, then returns this message's metadata entry.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // Field-number constants; values match the "= N" tags in the per-field comments below.
+    kNameFieldNumber = 1,
+    kInputTypeFieldNumber = 2,
+    kOutputTypeFieldNumber = 3,
+    kOptionsFieldNumber = 4,
+    kClientStreamingFieldNumber = 5,
+    kServerStreamingFieldNumber = 6,
+  };
+  // optional string name = 1;
+  bool has_name() const;
+  private:  // The _internal_* variants throughout this section are kept private.
+  bool _internal_has_name() const;
+  public:
+  void clear_name();
+  const std::string& name() const;
+  void set_name(const std::string& value);
+  void set_name(std::string&& value);
+  void set_name(const char* value);
+  void set_name(const char* value, size_t size);
+  std::string* mutable_name();
+  std::string* release_name();  // NOTE(review): presumably hands ownership of the string to the caller - confirm in the definition.
+  void set_allocated_name(std::string* name);  // NOTE(review): presumably takes ownership of `name` - confirm in the definition.
+  private:
+  const std::string& _internal_name() const;
+  void _internal_set_name(const std::string& value);
+  std::string* _internal_mutable_name();
+  public:
+
+  // optional string input_type = 2;
+  bool has_input_type() const;
+  private:
+  bool _internal_has_input_type() const;
+  public:
+  void clear_input_type();
+  const std::string& input_type() const;
+  void set_input_type(const std::string& value);
+  void set_input_type(std::string&& value);
+  void set_input_type(const char* value);
+  void set_input_type(const char* value, size_t size);
+  std::string* mutable_input_type();
+  std::string* release_input_type();
+  void set_allocated_input_type(std::string* input_type);
+  private:
+  const std::string& _internal_input_type() const;
+  void _internal_set_input_type(const std::string& value);
+  std::string* _internal_mutable_input_type();
+  public:
+
+  // optional string output_type = 3;
+  bool has_output_type() const;
+  private:
+  bool _internal_has_output_type() const;
+  public:
+  void clear_output_type();
+  const std::string& output_type() const;
+  void set_output_type(const std::string& value);
+  void set_output_type(std::string&& value);
+  void set_output_type(const char* value);
+  void set_output_type(const char* value, size_t size);
+  std::string* mutable_output_type();
+  std::string* release_output_type();
+  void set_allocated_output_type(std::string* output_type);
+  private:
+  const std::string& _internal_output_type() const;
+  void _internal_set_output_type(const std::string& value);
+  std::string* _internal_mutable_output_type();
+  public:
+
+  // optional .google.protobuf.MethodOptions options = 4;
+  bool has_options() const;
+  private:
+  bool _internal_has_options() const;
+  public:
+  void clear_options();
+  const PROTOBUF_NAMESPACE_ID::MethodOptions& options() const;
+  PROTOBUF_NAMESPACE_ID::MethodOptions* release_options();
+  PROTOBUF_NAMESPACE_ID::MethodOptions* mutable_options();
+  void set_allocated_options(PROTOBUF_NAMESPACE_ID::MethodOptions* options);
+  private:
+  const PROTOBUF_NAMESPACE_ID::MethodOptions& _internal_options() const;
+  PROTOBUF_NAMESPACE_ID::MethodOptions* _internal_mutable_options();
+  public:
+  void unsafe_arena_set_allocated_options(  // unsafe_arena_* counterparts of set_allocated_options()/release_options(); semantics are defined out of view.
+      PROTOBUF_NAMESPACE_ID::MethodOptions* options);
+  PROTOBUF_NAMESPACE_ID::MethodOptions* unsafe_arena_release_options();
+
+  // optional bool client_streaming = 5 [default = false];
+  bool has_client_streaming() const;
+  private:
+  bool _internal_has_client_streaming() const;
+  public:
+  void clear_client_streaming();
+  bool client_streaming() const;
+  void set_client_streaming(bool value);
+  private:
+  bool _internal_client_streaming() const;
+  void _internal_set_client_streaming(bool value);
+  public:
+
+  // optional bool server_streaming = 6 [default = false];
+  bool has_server_streaming() const;
+  private:
+  bool _internal_has_server_streaming() const;
+  public:
+  void clear_server_streaming();
+  bool server_streaming() const;
+  void set_server_streaming(bool value);
+  private:
+  bool _internal_server_streaming() const;
+  void _internal_set_server_streaming(bool value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.MethodDescriptorProto)
+ private:
+  class _Internal;
+  // Data members. Keep the declaration order as-is: it determines the object layout.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // NOTE(review): presumably the presence bits behind the has_*() accessors - confirm.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // Read by GetCachedSize(); mutable, presumably so the const SetCachedSize() can write it.
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr input_type_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr output_type_;
+  PROTOBUF_NAMESPACE_ID::MethodOptions* options_;
+  bool client_streaming_;
+  bool server_streaming_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT FileOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FileOptions) */ {
+ public:
+  inline FileOptions() : FileOptions(nullptr) {}
+  virtual ~FileOptions();
+
+  FileOptions(const FileOptions& from);
+  FileOptions(FileOptions&& from) noexcept
+    : FileOptions() {
+    *this = ::std::move(from);
+  }
+
+  inline FileOptions& operator=(const FileOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline FileOptions& operator=(FileOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FileOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FileOptions* internal_default_instance() {
+    return reinterpret_cast<const FileOptions*>(
+               &_FileOptions_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    13;
+
+  friend void swap(FileOptions& a, FileOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(FileOptions* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(FileOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FileOptions* New() const final {
+    return CreateMaybeMessage<FileOptions>(nullptr);
+  }
+
+  FileOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<FileOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FileOptions& from);
+  void MergeFrom(const FileOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FileOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.FileOptions";
+  }
+  protected:
+  explicit FileOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef FileOptions_OptimizeMode OptimizeMode;
+  static constexpr OptimizeMode SPEED =
+    FileOptions_OptimizeMode_SPEED;
+  static constexpr OptimizeMode CODE_SIZE =
+    FileOptions_OptimizeMode_CODE_SIZE;
+  static constexpr OptimizeMode LITE_RUNTIME =
+    FileOptions_OptimizeMode_LITE_RUNTIME;
+  static inline bool OptimizeMode_IsValid(int value) {
+    return FileOptions_OptimizeMode_IsValid(value);
+  }
+  static constexpr OptimizeMode OptimizeMode_MIN =
+    FileOptions_OptimizeMode_OptimizeMode_MIN;
+  static constexpr OptimizeMode OptimizeMode_MAX =
+    FileOptions_OptimizeMode_OptimizeMode_MAX;
+  static constexpr int OptimizeMode_ARRAYSIZE =
+    FileOptions_OptimizeMode_OptimizeMode_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  OptimizeMode_descriptor() {
+    return FileOptions_OptimizeMode_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& OptimizeMode_Name(T enum_t_value) {
+    static_assert(::std::is_same<T, OptimizeMode>::value ||
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function OptimizeMode_Name.");
+    return FileOptions_OptimizeMode_Name(enum_t_value);
+  }
+  static inline bool OptimizeMode_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      OptimizeMode* value) {
+    return FileOptions_OptimizeMode_Parse(name, value);
+  }
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,
+    kJavaPackageFieldNumber = 1,
+    kJavaOuterClassnameFieldNumber = 8,
+    kGoPackageFieldNumber = 11,
+    kObjcClassPrefixFieldNumber = 36,
+    kCsharpNamespaceFieldNumber = 37,
+    kSwiftPrefixFieldNumber = 39,
+    kPhpClassPrefixFieldNumber = 40,
+    kPhpNamespaceFieldNumber = 41,
+    kPhpMetadataNamespaceFieldNumber = 44,
+    kRubyPackageFieldNumber = 45,
+    kJavaMultipleFilesFieldNumber = 10,
+    kJavaGenerateEqualsAndHashFieldNumber = 20,
+    kJavaStringCheckUtf8FieldNumber = 27,
+    kCcGenericServicesFieldNumber = 16,
+    kJavaGenericServicesFieldNumber = 17,
+    kPyGenericServicesFieldNumber = 18,
+    kPhpGenericServicesFieldNumber = 42,
+    kDeprecatedFieldNumber = 23,
+    kOptimizeForFieldNumber = 9,
+    kCcEnableArenasFieldNumber = 31,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional string java_package = 1;
+  bool has_java_package() const;
+  private:
+  bool _internal_has_java_package() const;
+  public:
+  void clear_java_package();
+  const std::string& java_package() const;
+  void set_java_package(const std::string& value);
+  void set_java_package(std::string&& value);
+  void set_java_package(const char* value);
+  void set_java_package(const char* value, size_t size);
+  std::string* mutable_java_package();
+  std::string* release_java_package();
+  void set_allocated_java_package(std::string* java_package);
+  private:
+  const std::string& _internal_java_package() const;
+  void _internal_set_java_package(const std::string& value);
+  std::string* _internal_mutable_java_package();
+  public:
+
+  // optional string java_outer_classname = 8;
+  bool has_java_outer_classname() const;
+  private:
+  bool _internal_has_java_outer_classname() const;
+  public:
+  void clear_java_outer_classname();
+  const std::string& java_outer_classname() const;
+  void set_java_outer_classname(const std::string& value);
+  void set_java_outer_classname(std::string&& value);
+  void set_java_outer_classname(const char* value);
+  void set_java_outer_classname(const char* value, size_t size);
+  std::string* mutable_java_outer_classname();
+  std::string* release_java_outer_classname();
+  void set_allocated_java_outer_classname(std::string* java_outer_classname);
+  private:
+  const std::string& _internal_java_outer_classname() const;
+  void _internal_set_java_outer_classname(const std::string& value);
+  std::string* _internal_mutable_java_outer_classname();
+  public:
+
+  // optional string go_package = 11;
+  bool has_go_package() const;
+  private:
+  bool _internal_has_go_package() const;
+  public:
+  void clear_go_package();
+  const std::string& go_package() const;
+  void set_go_package(const std::string& value);
+  void set_go_package(std::string&& value);
+  void set_go_package(const char* value);
+  void set_go_package(const char* value, size_t size);
+  std::string* mutable_go_package();
+  std::string* release_go_package();
+  void set_allocated_go_package(std::string* go_package);
+  private:
+  const std::string& _internal_go_package() const;
+  void _internal_set_go_package(const std::string& value);
+  std::string* _internal_mutable_go_package();
+  public:
+
+  // optional string objc_class_prefix = 36;
+  bool has_objc_class_prefix() const;
+  private:
+  bool _internal_has_objc_class_prefix() const;
+  public:
+  void clear_objc_class_prefix();
+  const std::string& objc_class_prefix() const;
+  void set_objc_class_prefix(const std::string& value);
+  void set_objc_class_prefix(std::string&& value);
+  void set_objc_class_prefix(const char* value);
+  void set_objc_class_prefix(const char* value, size_t size);
+  std::string* mutable_objc_class_prefix();
+  std::string* release_objc_class_prefix();
+  void set_allocated_objc_class_prefix(std::string* objc_class_prefix);
+  private:
+  const std::string& _internal_objc_class_prefix() const;
+  void _internal_set_objc_class_prefix(const std::string& value);
+  std::string* _internal_mutable_objc_class_prefix();
+  public:
+
+  // optional string csharp_namespace = 37;
+  bool has_csharp_namespace() const;
+  private:
+  bool _internal_has_csharp_namespace() const;
+  public:
+  void clear_csharp_namespace();
+  const std::string& csharp_namespace() const;
+  void set_csharp_namespace(const std::string& value);
+  void set_csharp_namespace(std::string&& value);
+  void set_csharp_namespace(const char* value);
+  void set_csharp_namespace(const char* value, size_t size);
+  std::string* mutable_csharp_namespace();
+  std::string* release_csharp_namespace();
+  void set_allocated_csharp_namespace(std::string* csharp_namespace);
+  private:
+  const std::string& _internal_csharp_namespace() const;
+  void _internal_set_csharp_namespace(const std::string& value);
+  std::string* _internal_mutable_csharp_namespace();
+  public:
+
+  // optional string swift_prefix = 39;
+  bool has_swift_prefix() const;
+  private:
+  bool _internal_has_swift_prefix() const;
+  public:
+  void clear_swift_prefix();
+  const std::string& swift_prefix() const;
+  void set_swift_prefix(const std::string& value);
+  void set_swift_prefix(std::string&& value);
+  void set_swift_prefix(const char* value);
+  void set_swift_prefix(const char* value, size_t size);
+  std::string* mutable_swift_prefix();
+  std::string* release_swift_prefix();
+  void set_allocated_swift_prefix(std::string* swift_prefix);
+  private:
+  const std::string& _internal_swift_prefix() const;
+  void _internal_set_swift_prefix(const std::string& value);
+  std::string* _internal_mutable_swift_prefix();
+  public:
+
+  // optional string php_class_prefix = 40;
+  bool has_php_class_prefix() const;
+  private:
+  bool _internal_has_php_class_prefix() const;
+  public:
+  void clear_php_class_prefix();
+  const std::string& php_class_prefix() const;
+  void set_php_class_prefix(const std::string& value);
+  void set_php_class_prefix(std::string&& value);
+  void set_php_class_prefix(const char* value);
+  void set_php_class_prefix(const char* value, size_t size);
+  std::string* mutable_php_class_prefix();
+  std::string* release_php_class_prefix();
+  void set_allocated_php_class_prefix(std::string* php_class_prefix);
+  private:
+  const std::string& _internal_php_class_prefix() const;
+  void _internal_set_php_class_prefix(const std::string& value);
+  std::string* _internal_mutable_php_class_prefix();
+  public:
+
+  // optional string php_namespace = 41;
+  bool has_php_namespace() const;
+  private:
+  bool _internal_has_php_namespace() const;
+  public:
+  void clear_php_namespace();
+  const std::string& php_namespace() const;
+  void set_php_namespace(const std::string& value);
+  void set_php_namespace(std::string&& value);
+  void set_php_namespace(const char* value);
+  void set_php_namespace(const char* value, size_t size);
+  std::string* mutable_php_namespace();
+  std::string* release_php_namespace();
+  void set_allocated_php_namespace(std::string* php_namespace);
+  private:
+  const std::string& _internal_php_namespace() const;
+  void _internal_set_php_namespace(const std::string& value);
+  std::string* _internal_mutable_php_namespace();
+  public:
+
+  // optional string php_metadata_namespace = 44;
+  bool has_php_metadata_namespace() const;
+  private:
+  bool _internal_has_php_metadata_namespace() const;
+  public:
+  void clear_php_metadata_namespace();
+  const std::string& php_metadata_namespace() const;
+  void set_php_metadata_namespace(const std::string& value);
+  void set_php_metadata_namespace(std::string&& value);
+  void set_php_metadata_namespace(const char* value);
+  void set_php_metadata_namespace(const char* value, size_t size);
+  std::string* mutable_php_metadata_namespace();
+  std::string* release_php_metadata_namespace();
+  void set_allocated_php_metadata_namespace(std::string* php_metadata_namespace);
+  private:
+  const std::string& _internal_php_metadata_namespace() const;
+  void _internal_set_php_metadata_namespace(const std::string& value);
+  std::string* _internal_mutable_php_metadata_namespace();
+  public:
+
+  // optional string ruby_package = 45;
+  bool has_ruby_package() const;
+  private:
+  bool _internal_has_ruby_package() const;
+  public:
+  void clear_ruby_package();
+  const std::string& ruby_package() const;
+  void set_ruby_package(const std::string& value);
+  void set_ruby_package(std::string&& value);
+  void set_ruby_package(const char* value);
+  void set_ruby_package(const char* value, size_t size);
+  std::string* mutable_ruby_package();
+  std::string* release_ruby_package();
+  void set_allocated_ruby_package(std::string* ruby_package);
+  private:
+  const std::string& _internal_ruby_package() const;
+  void _internal_set_ruby_package(const std::string& value);
+  std::string* _internal_mutable_ruby_package();
+  public:
+
+  // optional bool java_multiple_files = 10 [default = false];
+  bool has_java_multiple_files() const;
+  private:
+  bool _internal_has_java_multiple_files() const;
+  public:
+  void clear_java_multiple_files();
+  bool java_multiple_files() const;
+  void set_java_multiple_files(bool value);
+  private:
+  bool _internal_java_multiple_files() const;
+  void _internal_set_java_multiple_files(bool value);
+  public:
+
+  // optional bool java_generate_equals_and_hash = 20 [deprecated = true];
+  PROTOBUF_DEPRECATED bool has_java_generate_equals_and_hash() const;
+  private:
+  bool _internal_has_java_generate_equals_and_hash() const;
+  public:
+  PROTOBUF_DEPRECATED void clear_java_generate_equals_and_hash();
+  PROTOBUF_DEPRECATED bool java_generate_equals_and_hash() const;
+  PROTOBUF_DEPRECATED void set_java_generate_equals_and_hash(bool value);
+  private:
+  bool _internal_java_generate_equals_and_hash() const;
+  void _internal_set_java_generate_equals_and_hash(bool value);
+  public:
+
+  // optional bool java_string_check_utf8 = 27 [default = false];
+  bool has_java_string_check_utf8() const;
+  private:
+  bool _internal_has_java_string_check_utf8() const;
+  public:
+  void clear_java_string_check_utf8();
+  bool java_string_check_utf8() const;
+  void set_java_string_check_utf8(bool value);
+  private:
+  bool _internal_java_string_check_utf8() const;
+  void _internal_set_java_string_check_utf8(bool value);
+  public:
+
+  // optional bool cc_generic_services = 16 [default = false];
+  bool has_cc_generic_services() const;
+  private:
+  bool _internal_has_cc_generic_services() const;
+  public:
+  void clear_cc_generic_services();
+  bool cc_generic_services() const;
+  void set_cc_generic_services(bool value);
+  private:
+  bool _internal_cc_generic_services() const;
+  void _internal_set_cc_generic_services(bool value);
+  public:
+
+  // optional bool java_generic_services = 17 [default = false];
+  bool has_java_generic_services() const;
+  private:
+  bool _internal_has_java_generic_services() const;
+  public:
+  void clear_java_generic_services();
+  bool java_generic_services() const;
+  void set_java_generic_services(bool value);
+  private:
+  bool _internal_java_generic_services() const;
+  void _internal_set_java_generic_services(bool value);
+  public:
+
+  // optional bool py_generic_services = 18 [default = false];
+  bool has_py_generic_services() const;
+  private:
+  bool _internal_has_py_generic_services() const;
+  public:
+  void clear_py_generic_services();
+  bool py_generic_services() const;
+  void set_py_generic_services(bool value);
+  private:
+  bool _internal_py_generic_services() const;
+  void _internal_set_py_generic_services(bool value);
+  public:
+
+  // optional bool php_generic_services = 42 [default = false];
+  bool has_php_generic_services() const;
+  private:
+  bool _internal_has_php_generic_services() const;
+  public:
+  void clear_php_generic_services();
+  bool php_generic_services() const;
+  void set_php_generic_services(bool value);
+  private:
+  bool _internal_php_generic_services() const;
+  void _internal_set_php_generic_services(bool value);
+  public:
+
+  // optional bool deprecated = 23 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  // optional .google.protobuf.FileOptions.OptimizeMode optimize_for = 9 [default = SPEED];
+  bool has_optimize_for() const;
+  private:
+  bool _internal_has_optimize_for() const;
+  public:
+  void clear_optimize_for();
+  PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode optimize_for() const;
+  void set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value);
+  private:
+  PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode _internal_optimize_for() const;
+  void _internal_set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value);
+  public:
+
+  // optional bool cc_enable_arenas = 31 [default = true];
+  bool has_cc_enable_arenas() const;
+  private:
+  bool _internal_has_cc_enable_arenas() const;
+  public:
+  void clear_cc_enable_arenas();
+  bool cc_enable_arenas() const;
+  void set_cc_enable_arenas(bool value);
+  private:
+  bool _internal_cc_enable_arenas() const;
+  void _internal_set_cc_enable_arenas(bool value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(FileOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.FileOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr java_package_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr java_outer_classname_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr go_package_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr objc_class_prefix_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr csharp_namespace_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr swift_prefix_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_class_prefix_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_namespace_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr php_metadata_namespace_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr ruby_package_;
+  bool java_multiple_files_;
+  bool java_generate_equals_and_hash_;
+  bool java_string_check_utf8_;
+  bool cc_generic_services_;
+  bool java_generic_services_;
+  bool py_generic_services_;
+  bool php_generic_services_;
+  bool deprecated_;
+  int optimize_for_;
+  bool cc_enable_arenas_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT MessageOptions PROTOBUF_FINAL :  // message class for google.protobuf.MessageOptions (see FullMessageName below)
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MessageOptions) */ {
+ public:
+  inline MessageOptions() : MessageOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~MessageOptions();
+
+  MessageOptions(const MessageOptions& from);
+  MessageOptions(MessageOptions&& from) noexcept
+    : MessageOptions() {
+    *this = ::std::move(from);  // reuses move-assignment, which swaps or copies depending on the arenas
+  }
+
+  inline MessageOptions& operator=(const MessageOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline MessageOptions& operator=(MessageOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena on both sides: swap contents with the source
+      if (this != &from) InternalSwap(&from);  // self-assignment does nothing
+    } else {
+      CopyFrom(from);  // arenas differ: copy instead of swapping
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();  // plain forwarder to GetDescriptor()
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const MessageOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const MessageOptions* internal_default_instance() {
+    return reinterpret_cast<const MessageOptions*>(
+               &_MessageOptions_default_instance_);  // object declared outside this class; reinterpreted as MessageOptions
+  }
+  static constexpr int kIndexInFileMessages =
+    14;  // index into file_level_metadata, read by GetMetadataStatic()
+
+  friend void swap(MessageOptions& a, MessageOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(MessageOptions* other) {
+    if (other == this) return;  // swapping with itself does nothing
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: use the generic swap helper
+    }
+  }
+  void UnsafeArenaSwap(MessageOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // asserts via GOOGLE_DCHECK that both messages share an arena
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline MessageOptions* New() const final {
+    return CreateMaybeMessage<MessageOptions>(nullptr);  // null arena
+  }
+
+  MessageOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<MessageOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const MessageOptions& from);
+  void MergeFrom(const MessageOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // reads the mutable _cached_size_ member
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(MessageOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.MessageOptions";
+  }
+  protected:
+  explicit MessageOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the delegating default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);  // called before the metadata table is read
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // field numbers; each equals the "= N" in the matching accessor comment below
+    kUninterpretedOptionFieldNumber = 999,
+    kMessageSetWireFormatFieldNumber = 1,
+    kNoStandardDescriptorAccessorFieldNumber = 2,
+    kDeprecatedFieldNumber = 3,
+    kMapEntryFieldNumber = 7,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional bool message_set_wire_format = 1 [default = false];
+  bool has_message_set_wire_format() const;
+  private:
+  bool _internal_has_message_set_wire_format() const;
+  public:
+  void clear_message_set_wire_format();
+  bool message_set_wire_format() const;
+  void set_message_set_wire_format(bool value);
+  private:
+  bool _internal_message_set_wire_format() const;
+  void _internal_set_message_set_wire_format(bool value);
+  public:
+
+  // optional bool no_standard_descriptor_accessor = 2 [default = false];
+  bool has_no_standard_descriptor_accessor() const;
+  private:
+  bool _internal_has_no_standard_descriptor_accessor() const;
+  public:
+  void clear_no_standard_descriptor_accessor();
+  bool no_standard_descriptor_accessor() const;
+  void set_no_standard_descriptor_accessor(bool value);
+  private:
+  bool _internal_no_standard_descriptor_accessor() const;
+  void _internal_set_no_standard_descriptor_accessor(bool value);
+  public:
+
+  // optional bool deprecated = 3 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  // optional bool map_entry = 7;
+  bool has_map_entry() const;
+  private:
+  bool _internal_has_map_entry() const;
+  public:
+  void clear_map_entry();
+  bool map_entry() const;
+  void set_map_entry(bool value);
+  private:
+  bool _internal_map_entry() const;
+  void _internal_set_map_entry(bool value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(MessageOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.MessageOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // presumably presence bits for the optional fields; confirm in the generated .cc
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable so the const GetCachedSize()/SetCachedSize() can use it
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  bool message_set_wire_format_;  // this and the following bools are named after the proto fields declared above
+  bool no_standard_descriptor_accessor_;
+  bool deprecated_;
+  bool map_entry_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT FieldOptions PROTOBUF_FINAL :  // message class for google.protobuf.FieldOptions (see FullMessageName below)
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldOptions) */ {
+ public:
+  inline FieldOptions() : FieldOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~FieldOptions();
+
+  FieldOptions(const FieldOptions& from);
+  FieldOptions(FieldOptions&& from) noexcept
+    : FieldOptions() {
+    *this = ::std::move(from);  // reuses move-assignment, which swaps or copies depending on the arenas
+  }
+
+  inline FieldOptions& operator=(const FieldOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline FieldOptions& operator=(FieldOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena on both sides: swap contents with the source
+      if (this != &from) InternalSwap(&from);  // self-assignment does nothing
+    } else {
+      CopyFrom(from);  // arenas differ: copy instead of swapping
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();  // plain forwarder to GetDescriptor()
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FieldOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FieldOptions* internal_default_instance() {
+    return reinterpret_cast<const FieldOptions*>(
+               &_FieldOptions_default_instance_);  // object declared outside this class; reinterpreted as FieldOptions
+  }
+  static constexpr int kIndexInFileMessages =
+    15;  // index into file_level_metadata, read by GetMetadataStatic()
+
+  friend void swap(FieldOptions& a, FieldOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(FieldOptions* other) {
+    if (other == this) return;  // swapping with itself does nothing
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: use the generic swap helper
+    }
+  }
+  void UnsafeArenaSwap(FieldOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // asserts via GOOGLE_DCHECK that both messages share an arena
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FieldOptions* New() const final {
+    return CreateMaybeMessage<FieldOptions>(nullptr);  // null arena
+  }
+
+  FieldOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<FieldOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FieldOptions& from);
+  void MergeFrom(const FieldOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // reads the mutable _cached_size_ member
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FieldOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.FieldOptions";
+  }
+  protected:
+  explicit FieldOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the delegating default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);  // called before the metadata table is read
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef FieldOptions_CType CType;  // class-scoped alias; the constants and helpers below forward to FieldOptions_CType_* names
+  static constexpr CType STRING =
+    FieldOptions_CType_STRING;
+  static constexpr CType CORD =
+    FieldOptions_CType_CORD;
+  static constexpr CType STRING_PIECE =
+    FieldOptions_CType_STRING_PIECE;
+  static inline bool CType_IsValid(int value) {
+    return FieldOptions_CType_IsValid(value);
+  }
+  static constexpr CType CType_MIN =
+    FieldOptions_CType_CType_MIN;
+  static constexpr CType CType_MAX =
+    FieldOptions_CType_CType_MAX;
+  static constexpr int CType_ARRAYSIZE =
+    FieldOptions_CType_CType_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  CType_descriptor() {
+    return FieldOptions_CType_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& CType_Name(T enum_t_value) {  // T must be CType or an integral type (enforced by the static_assert)
+    static_assert(::std::is_same<T, CType>::value ||
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function CType_Name.");
+    return FieldOptions_CType_Name(enum_t_value);
+  }
+  static inline bool CType_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      CType* value) {
+    return FieldOptions_CType_Parse(name, value);  // forwards both arguments unchanged
+  }
+
+  typedef FieldOptions_JSType JSType;  // class-scoped alias; the constants and helpers below forward to FieldOptions_JSType_* names
+  static constexpr JSType JS_NORMAL =
+    FieldOptions_JSType_JS_NORMAL;
+  static constexpr JSType JS_STRING =
+    FieldOptions_JSType_JS_STRING;
+  static constexpr JSType JS_NUMBER =
+    FieldOptions_JSType_JS_NUMBER;
+  static inline bool JSType_IsValid(int value) {
+    return FieldOptions_JSType_IsValid(value);
+  }
+  static constexpr JSType JSType_MIN =
+    FieldOptions_JSType_JSType_MIN;
+  static constexpr JSType JSType_MAX =
+    FieldOptions_JSType_JSType_MAX;
+  static constexpr int JSType_ARRAYSIZE =
+    FieldOptions_JSType_JSType_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  JSType_descriptor() {
+    return FieldOptions_JSType_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& JSType_Name(T enum_t_value) {  // T must be JSType or an integral type (enforced by the static_assert)
+    static_assert(::std::is_same<T, JSType>::value ||
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function JSType_Name.");
+    return FieldOptions_JSType_Name(enum_t_value);
+  }
+  static inline bool JSType_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      JSType* value) {
+    return FieldOptions_JSType_Parse(name, value);  // forwards both arguments unchanged
+  }
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // field numbers; each equals the "= N" in the matching accessor comment below
+    kUninterpretedOptionFieldNumber = 999,
+    kCtypeFieldNumber = 1,
+    kPackedFieldNumber = 2,
+    kLazyFieldNumber = 5,
+    kDeprecatedFieldNumber = 3,
+    kWeakFieldNumber = 10,
+    kJstypeFieldNumber = 6,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional .google.protobuf.FieldOptions.CType ctype = 1 [default = STRING];
+  bool has_ctype() const;
+  private:
+  bool _internal_has_ctype() const;
+  public:
+  void clear_ctype();
+  PROTOBUF_NAMESPACE_ID::FieldOptions_CType ctype() const;
+  void set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value);
+  private:
+  PROTOBUF_NAMESPACE_ID::FieldOptions_CType _internal_ctype() const;
+  void _internal_set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value);
+  public:
+
+  // optional bool packed = 2;
+  bool has_packed() const;
+  private:
+  bool _internal_has_packed() const;
+  public:
+  void clear_packed();
+  bool packed() const;
+  void set_packed(bool value);
+  private:
+  bool _internal_packed() const;
+  void _internal_set_packed(bool value);
+  public:
+
+  // optional bool lazy = 5 [default = false];
+  bool has_lazy() const;
+  private:
+  bool _internal_has_lazy() const;
+  public:
+  void clear_lazy();
+  bool lazy() const;
+  void set_lazy(bool value);
+  private:
+  bool _internal_lazy() const;
+  void _internal_set_lazy(bool value);
+  public:
+
+  // optional bool deprecated = 3 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  // optional bool weak = 10 [default = false];
+  bool has_weak() const;
+  private:
+  bool _internal_has_weak() const;
+  public:
+  void clear_weak();
+  bool weak() const;
+  void set_weak(bool value);
+  private:
+  bool _internal_weak() const;
+  void _internal_set_weak(bool value);
+  public:
+
+  // optional .google.protobuf.FieldOptions.JSType jstype = 6 [default = JS_NORMAL];
+  bool has_jstype() const;
+  private:
+  bool _internal_has_jstype() const;
+  public:
+  void clear_jstype();
+  PROTOBUF_NAMESPACE_ID::FieldOptions_JSType jstype() const;
+  void set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value);
+  private:
+  PROTOBUF_NAMESPACE_ID::FieldOptions_JSType _internal_jstype() const;
+  void _internal_set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(FieldOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.FieldOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // presumably presence bits for the optional fields; confirm in the generated .cc
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable so the const GetCachedSize()/SetCachedSize() can use it
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  int ctype_;  // declared as int, while ctype()/set_ctype() are typed as FieldOptions_CType
+  bool packed_;
+  bool lazy_;
+  bool deprecated_;
+  bool weak_;
+  int jstype_;  // declared as int, while jstype()/set_jstype() are typed as FieldOptions_JSType
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT OneofOptions PROTOBUF_FINAL :  // message class for google.protobuf.OneofOptions (see FullMessageName below)
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.OneofOptions) */ {
+ public:
+  inline OneofOptions() : OneofOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~OneofOptions();
+
+  OneofOptions(const OneofOptions& from);
+  OneofOptions(OneofOptions&& from) noexcept
+    : OneofOptions() {
+    *this = ::std::move(from);  // reuses move-assignment, which swaps or copies depending on the arenas
+  }
+
+  inline OneofOptions& operator=(const OneofOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline OneofOptions& operator=(OneofOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // same arena on both sides: swap contents with the source
+      if (this != &from) InternalSwap(&from);  // self-assignment does nothing
+    } else {
+      CopyFrom(from);  // arenas differ: copy instead of swapping
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();  // plain forwarder to GetDescriptor()
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const OneofOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const OneofOptions* internal_default_instance() {
+    return reinterpret_cast<const OneofOptions*>(
+               &_OneofOptions_default_instance_);  // object declared outside this class; reinterpreted as OneofOptions
+  }
+  static constexpr int kIndexInFileMessages =
+    16;  // index into file_level_metadata, read by GetMetadataStatic()
+
+  friend void swap(OneofOptions& a, OneofOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(OneofOptions* other) {
+    if (other == this) return;  // swapping with itself does nothing
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // arenas differ: use the generic swap helper
+    }
+  }
+  void UnsafeArenaSwap(OneofOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // asserts via GOOGLE_DCHECK that both messages share an arena
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline OneofOptions* New() const final {
+    return CreateMaybeMessage<OneofOptions>(nullptr);  // null arena
+  }
+
+  OneofOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<OneofOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const OneofOptions& from);
+  void MergeFrom(const OneofOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // reads the mutable _cached_size_ member
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(OneofOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.OneofOptions";
+  }
+  protected:
+  explicit OneofOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // target of the delegating default constructor
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);  // called before the metadata table is read
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // field numbers; this class declares a single one
+    kUninterpretedOptionFieldNumber = 999,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(OneofOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.OneofOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;  // no _has_bits_ member is declared in this class
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable so the const GetCachedSize()/SetCachedSize() can use it
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT EnumOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumOptions) */ {
+ public:  // protoc-generated message class for "google.protobuf.EnumOptions" (index 17 into the file-level metadata table).
+  inline EnumOptions() : EnumOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~EnumOptions();
+  // Copy construction is defined out of line; move construction default-constructs and then move-assigns.
+  EnumOptions(const EnumOptions& from);
+  EnumOptions(EnumOptions&& from) noexcept
+    : EnumOptions() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom(). Move assignment swaps internals when both arenas compare equal, else copies.
+  inline EnumOptions& operator=(const EnumOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline EnumOptions& operator=(EnumOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the UnknownFieldSet kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // Static descriptor/reflection lookups; each one reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const EnumOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const EnumOptions* internal_default_instance() {
+    return reinterpret_cast<const EnumOptions*>(
+               &_EnumOptions_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    17;
+  // Swap() is a no-op on self, swaps internals when arenas match, else uses GenericSwap(); UnsafeArenaSwap() only DCHECKs the arenas.
+  friend void swap(EnumOptions& a, EnumOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(EnumOptions* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(EnumOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline EnumOptions* New() const final {
+    return CreateMaybeMessage<EnumOptions>(nullptr);
+  }
+
+  EnumOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<EnumOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const EnumOptions& from);
+  void MergeFrom(const EnumOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size/parse/serialize overrides are only declared in this header; GetCachedSize() reads _cached_size_.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(EnumOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.EnumOptions";
+  }
+  protected:
+  explicit EnumOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // calls AssignDescriptors() first, then indexes the table by kIndexInFileMessages
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the tags quoted in the per-field comments below.
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,
+    kAllowAliasFieldNumber = 2,
+    kDeprecatedFieldNumber = 3,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional bool allow_alias = 2;
+  bool has_allow_alias() const;
+  private:
+  bool _internal_has_allow_alias() const;
+  public:
+  void clear_allow_alias();
+  bool allow_alias() const;
+  void set_allow_alias(bool value);
+  private:
+  bool _internal_allow_alias() const;
+  void _internal_set_allow_alias(bool value);
+  public:
+
+  // optional bool deprecated = 3 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(EnumOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.EnumOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+  // NOTE(review): generated data members; presumably their order must match the out-of-line implementation -- confirm before reordering.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  bool allow_alias_;
+  bool deprecated_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT EnumValueOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.EnumValueOptions) */ {
+ public:  // protoc-generated message class for "google.protobuf.EnumValueOptions" (index 18 into the file-level metadata table).
+  inline EnumValueOptions() : EnumValueOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~EnumValueOptions();
+  // Copy construction is defined out of line; move construction default-constructs and then move-assigns.
+  EnumValueOptions(const EnumValueOptions& from);
+  EnumValueOptions(EnumValueOptions&& from) noexcept
+    : EnumValueOptions() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom(). Move assignment swaps internals when both arenas compare equal, else copies.
+  inline EnumValueOptions& operator=(const EnumValueOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline EnumValueOptions& operator=(EnumValueOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the UnknownFieldSet kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // Static descriptor/reflection lookups; each one reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const EnumValueOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const EnumValueOptions* internal_default_instance() {
+    return reinterpret_cast<const EnumValueOptions*>(
+               &_EnumValueOptions_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    18;
+  // Swap() is a no-op on self, swaps internals when arenas match, else uses GenericSwap(); UnsafeArenaSwap() only DCHECKs the arenas.
+  friend void swap(EnumValueOptions& a, EnumValueOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(EnumValueOptions* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(EnumValueOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline EnumValueOptions* New() const final {
+    return CreateMaybeMessage<EnumValueOptions>(nullptr);
+  }
+
+  EnumValueOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<EnumValueOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const EnumValueOptions& from);
+  void MergeFrom(const EnumValueOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size/parse/serialize overrides are only declared in this header; GetCachedSize() reads _cached_size_.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(EnumValueOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.EnumValueOptions";
+  }
+  protected:
+  explicit EnumValueOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // calls AssignDescriptors() first, then indexes the table by kIndexInFileMessages
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the tags quoted in the per-field comments below.
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,
+    kDeprecatedFieldNumber = 1,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional bool deprecated = 1 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(EnumValueOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.EnumValueOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+  // NOTE(review): generated data members; presumably their order must match the out-of-line implementation -- confirm before reordering.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  bool deprecated_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT ServiceOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ServiceOptions) */ {
+ public:  // protoc-generated message class for "google.protobuf.ServiceOptions" (index 19 into the file-level metadata table).
+  inline ServiceOptions() : ServiceOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~ServiceOptions();
+  // Copy construction is defined out of line; move construction default-constructs and then move-assigns.
+  ServiceOptions(const ServiceOptions& from);
+  ServiceOptions(ServiceOptions&& from) noexcept
+    : ServiceOptions() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom(). Move assignment swaps internals when both arenas compare equal, else copies.
+  inline ServiceOptions& operator=(const ServiceOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline ServiceOptions& operator=(ServiceOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the UnknownFieldSet kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // Static descriptor/reflection lookups; each one reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const ServiceOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const ServiceOptions* internal_default_instance() {
+    return reinterpret_cast<const ServiceOptions*>(
+               &_ServiceOptions_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    19;
+  // Swap() is a no-op on self, swaps internals when arenas match, else uses GenericSwap(); UnsafeArenaSwap() only DCHECKs the arenas.
+  friend void swap(ServiceOptions& a, ServiceOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(ServiceOptions* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(ServiceOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline ServiceOptions* New() const final {
+    return CreateMaybeMessage<ServiceOptions>(nullptr);
+  }
+
+  ServiceOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<ServiceOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const ServiceOptions& from);
+  void MergeFrom(const ServiceOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size/parse/serialize overrides are only declared in this header; GetCachedSize() reads _cached_size_.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(ServiceOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.ServiceOptions";
+  }
+  protected:
+  explicit ServiceOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // calls AssignDescriptors() first, then indexes the table by kIndexInFileMessages
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the tags quoted in the per-field comments below.
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,
+    kDeprecatedFieldNumber = 33,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional bool deprecated = 33 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(ServiceOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.ServiceOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+  // NOTE(review): generated data members; presumably their order must match the out-of-line implementation -- confirm before reordering.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  bool deprecated_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT MethodOptions PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.MethodOptions) */ {
+ public:  // protoc-generated message class for "google.protobuf.MethodOptions" (index 20 into the file-level metadata table).
+  inline MethodOptions() : MethodOptions(nullptr) {}  // delegates to the arena constructor with a null arena
+  virtual ~MethodOptions();
+  // Copy construction is defined out of line; move construction default-constructs and then move-assigns.
+  MethodOptions(const MethodOptions& from);
+  MethodOptions(MethodOptions&& from) noexcept
+    : MethodOptions() {
+    *this = ::std::move(from);
+  }
+  // Copy assignment forwards to CopyFrom(). Move assignment swaps internals when both arenas compare equal, else copies.
+  inline MethodOptions& operator=(const MethodOptions& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline MethodOptions& operator=(MethodOptions&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+  // Read-only and mutable access to the UnknownFieldSet kept in _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+  // Static descriptor/reflection lookups; each one reads a member of GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const MethodOptions& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const MethodOptions* internal_default_instance() {
+    return reinterpret_cast<const MethodOptions*>(
+               &_MethodOptions_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    20;
+  // Swap() is a no-op on self, swaps internals when arenas match, else uses GenericSwap(); UnsafeArenaSwap() only DCHECKs the arenas.
+  friend void swap(MethodOptions& a, MethodOptions& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(MethodOptions* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(MethodOptions* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline MethodOptions* New() const final {
+    return CreateMaybeMessage<MethodOptions>(nullptr);
+  }
+
+  MethodOptions* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<MethodOptions>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const MethodOptions& from);
+  void MergeFrom(const MethodOptions& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+  // Size/parse/serialize overrides are only declared in this header; GetCachedSize() reads _cached_size_.
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(MethodOptions* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.MethodOptions";
+  }
+  protected:
+  explicit MethodOptions(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // calls AssignDescriptors() first, then indexes the table by kIndexInFileMessages
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+  // Class-scoped aliases that forward to the file-level MethodOptions_IdempotencyLevel enum, its constants, and its helper functions.
+  typedef MethodOptions_IdempotencyLevel IdempotencyLevel;
+  static constexpr IdempotencyLevel IDEMPOTENCY_UNKNOWN =
+    MethodOptions_IdempotencyLevel_IDEMPOTENCY_UNKNOWN;
+  static constexpr IdempotencyLevel NO_SIDE_EFFECTS =
+    MethodOptions_IdempotencyLevel_NO_SIDE_EFFECTS;
+  static constexpr IdempotencyLevel IDEMPOTENT =
+    MethodOptions_IdempotencyLevel_IDEMPOTENT;
+  static inline bool IdempotencyLevel_IsValid(int value) {
+    return MethodOptions_IdempotencyLevel_IsValid(value);
+  }
+  static constexpr IdempotencyLevel IdempotencyLevel_MIN =
+    MethodOptions_IdempotencyLevel_IdempotencyLevel_MIN;
+  static constexpr IdempotencyLevel IdempotencyLevel_MAX =
+    MethodOptions_IdempotencyLevel_IdempotencyLevel_MAX;
+  static constexpr int IdempotencyLevel_ARRAYSIZE =
+    MethodOptions_IdempotencyLevel_IdempotencyLevel_ARRAYSIZE;
+  static inline const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor*
+  IdempotencyLevel_descriptor() {
+    return MethodOptions_IdempotencyLevel_descriptor();
+  }
+  template<typename T>
+  static inline const std::string& IdempotencyLevel_Name(T enum_t_value) {
+    static_assert(::std::is_same<T, IdempotencyLevel>::value ||  // compile-time guard: T must be the enum type or an integral type
+      ::std::is_integral<T>::value,
+      "Incorrect type passed to function IdempotencyLevel_Name.");
+    return MethodOptions_IdempotencyLevel_Name(enum_t_value);
+  }
+  static inline bool IdempotencyLevel_Parse(::PROTOBUF_NAMESPACE_ID::ConstStringParam name,
+      IdempotencyLevel* value) {
+    return MethodOptions_IdempotencyLevel_Parse(name, value);
+  }
+
+  // accessors -------------------------------------------------------
+  // Field-number constants; the values match the tags quoted in the per-field comments below.
+  enum : int {
+    kUninterpretedOptionFieldNumber = 999,
+    kDeprecatedFieldNumber = 33,
+    kIdempotencyLevelFieldNumber = 34,
+  };
+  // repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+  int uninterpreted_option_size() const;
+  private:
+  int _internal_uninterpreted_option_size() const;
+  public:
+  void clear_uninterpreted_option();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* mutable_uninterpreted_option(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+      mutable_uninterpreted_option();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& _internal_uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* _internal_add_uninterpreted_option();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption& uninterpreted_option(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* add_uninterpreted_option();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+      uninterpreted_option() const;
+
+  // optional bool deprecated = 33 [default = false];
+  bool has_deprecated() const;
+  private:
+  bool _internal_has_deprecated() const;
+  public:
+  void clear_deprecated();
+  bool deprecated() const;
+  void set_deprecated(bool value);
+  private:
+  bool _internal_deprecated() const;
+  void _internal_set_deprecated(bool value);
+  public:
+
+  // optional .google.protobuf.MethodOptions.IdempotencyLevel idempotency_level = 34 [default = IDEMPOTENCY_UNKNOWN];
+  bool has_idempotency_level() const;
+  private:
+  bool _internal_has_idempotency_level() const;
+  public:
+  void clear_idempotency_level();
+  PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel idempotency_level() const;
+  void set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value);
+  private:
+  PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel _internal_idempotency_level() const;
+  void _internal_set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value);
+  public:
+
+  GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(MethodOptions)
+  // @@protoc_insertion_point(class_scope:google.protobuf.MethodOptions)
+ private:
+  class _Internal;
+
+  ::PROTOBUF_NAMESPACE_ID::internal::ExtensionSet _extensions_;
+  // NOTE(review): generated data members; presumably their order must match the out-of-line implementation -- confirm before reordering.
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption > uninterpreted_option_;
+  bool deprecated_;
+  int idempotency_level_;  // held as a plain int; the public accessors above use MethodOptions_IdempotencyLevel
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT UninterpretedOption_NamePart PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.UninterpretedOption.NamePart) */ {
+ public:
+  inline UninterpretedOption_NamePart() : UninterpretedOption_NamePart(nullptr) {}
+  virtual ~UninterpretedOption_NamePart();
+
+  UninterpretedOption_NamePart(const UninterpretedOption_NamePart& from);
+  UninterpretedOption_NamePart(UninterpretedOption_NamePart&& from) noexcept
+    : UninterpretedOption_NamePart() {
+    *this = ::std::move(from);
+  }
+
+  inline UninterpretedOption_NamePart& operator=(const UninterpretedOption_NamePart& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline UninterpretedOption_NamePart& operator=(UninterpretedOption_NamePart&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const UninterpretedOption_NamePart& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const UninterpretedOption_NamePart* internal_default_instance() {
+    return reinterpret_cast<const UninterpretedOption_NamePart*>(
+               &_UninterpretedOption_NamePart_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    21;
+
+  friend void swap(UninterpretedOption_NamePart& a, UninterpretedOption_NamePart& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(UninterpretedOption_NamePart* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(UninterpretedOption_NamePart* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline UninterpretedOption_NamePart* New() const final {
+    return CreateMaybeMessage<UninterpretedOption_NamePart>(nullptr);
+  }
+
+  UninterpretedOption_NamePart* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<UninterpretedOption_NamePart>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const UninterpretedOption_NamePart& from);
+  void MergeFrom(const UninterpretedOption_NamePart& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(UninterpretedOption_NamePart* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.UninterpretedOption.NamePart";
+  }
+  protected:
+  explicit UninterpretedOption_NamePart(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kNamePartFieldNumber = 1,
+    kIsExtensionFieldNumber = 2,
+  };
+  // required string name_part = 1;
+  bool has_name_part() const;
+  private:
+  bool _internal_has_name_part() const;
+  public:
+  void clear_name_part();
+  const std::string& name_part() const;
+  void set_name_part(const std::string& value);
+  void set_name_part(std::string&& value);
+  void set_name_part(const char* value);
+  void set_name_part(const char* value, size_t size);
+  std::string* mutable_name_part();
+  std::string* release_name_part();
+  void set_allocated_name_part(std::string* name_part);
+  private:
+  const std::string& _internal_name_part() const;
+  void _internal_set_name_part(const std::string& value);
+  std::string* _internal_mutable_name_part();
+  public:
+
+  // required bool is_extension = 2;
+  bool has_is_extension() const;
+  private:
+  bool _internal_has_is_extension() const;
+  public:
+  void clear_is_extension();
+  bool is_extension() const;
+  void set_is_extension(bool value);
+  private:
+  bool _internal_is_extension() const;
+  void _internal_set_is_extension(bool value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.UninterpretedOption.NamePart)
+ private:
+  class _Internal;
+
+  // helper for ByteSizeLong()
+  size_t RequiredFieldsByteSizeFallback() const;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr name_part_;
+  bool is_extension_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT UninterpretedOption PROTOBUF_FINAL :  // Message class; FullMessageName() below reports "google.protobuf.UninterpretedOption".
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.UninterpretedOption) */ {
+ public:
+  inline UninterpretedOption() : UninterpretedOption(nullptr) {}  // Delegates to the arena constructor with a null arena.
+  virtual ~UninterpretedOption();
+
+  UninterpretedOption(const UninterpretedOption& from);
+  UninterpretedOption(UninterpretedOption&& from) noexcept  // Default-constructs, then reuses the move-assignment operator.
+    : UninterpretedOption() {
+    *this = ::std::move(from);
+  }
+
+  inline UninterpretedOption& operator=(const UninterpretedOption& from) {
+    CopyFrom(from);  // Copy assignment is implemented entirely by CopyFrom().
+    return *this;
+  }
+  inline UninterpretedOption& operator=(UninterpretedOption&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Arenas match: use InternalSwap rather than copying.
+      if (this != &from) InternalSwap(&from);  // Self-move is a no-op.
+    } else {
+      CopyFrom(from);  // Arenas differ: fall back to CopyFrom().
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {  // Read-only unknown-field set obtained from _internal_metadata_.
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {  // Mutable counterpart, also served by _internal_metadata_.
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {  // Forwards to GetDescriptor().
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {  // Returns the descriptor member of GetMetadataStatic().
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {  // Returns the reflection member of GetMetadataStatic().
+    return GetMetadataStatic().reflection;
+  }
+  static const UninterpretedOption& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const UninterpretedOption* internal_default_instance() {  // Reinterprets the address of _UninterpretedOption_default_instance_ as this type.
+    return reinterpret_cast<const UninterpretedOption*>(
+               &_UninterpretedOption_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // Index that GetMetadataStatic() uses into file_level_metadata.
+    22;
+
+  friend void swap(UninterpretedOption& a, UninterpretedOption& b) {  // Free-function swap; forwards to a.Swap(&b).
+    a.Swap(&b);
+  }
+  inline void Swap(UninterpretedOption* other) {
+    if (other == this) return;  // Swapping with self is a no-op.
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // Arenas differ: defer to the library's GenericSwap.
+    }
+  }
+  void UnsafeArenaSwap(UninterpretedOption* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // Same-arena precondition; checked only through GOOGLE_DCHECK.
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline UninterpretedOption* New() const final {  // Creates a new message through CreateMaybeMessage with a null arena.
+    return CreateMaybeMessage<UninterpretedOption>(nullptr);
+  }
+
+  UninterpretedOption* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // Same, but passes the supplied arena.
+    return CreateMaybeMessage<UninterpretedOption>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const UninterpretedOption& from);  // Typed overload; used by the copy/move assignment operators above.
+  void MergeFrom(const UninterpretedOption& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // Reads the value stored in _cached_size_.
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(UninterpretedOption* other);  // Called by Swap(), UnsafeArenaSwap() and move assignment.
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // Fully qualified proto message name as a string constant.
+    return "google.protobuf.UninterpretedOption";
+  }
+  protected:
+  explicit UninterpretedOption(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Arena constructor; the default constructor delegates here.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the file's descriptor table, then indexes it.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef UninterpretedOption_NamePart NamePart;  // Lets callers spell the element type as UninterpretedOption::NamePart.
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // Field-number constants; values match the "= N" in the per-field comments below.
+    kNameFieldNumber = 2,
+    kIdentifierValueFieldNumber = 3,
+    kStringValueFieldNumber = 7,
+    kAggregateValueFieldNumber = 8,
+    kPositiveIntValueFieldNumber = 4,
+    kNegativeIntValueFieldNumber = 5,
+    kDoubleValueFieldNumber = 6,
+  };
+  // repeated .google.protobuf.UninterpretedOption.NamePart name = 2;
+  int name_size() const;
+  private:
+  int _internal_name_size() const;
+  public:
+  void clear_name();
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* mutable_name(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >*
+      mutable_name();
+  private:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& _internal_name(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* _internal_add_name();
+  public:
+  const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& name(int index) const;
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* add_name();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >&
+      name() const;
+
+  // optional string identifier_value = 3;
+  bool has_identifier_value() const;
+  private:
+  bool _internal_has_identifier_value() const;
+  public:
+  void clear_identifier_value();
+  const std::string& identifier_value() const;
+  void set_identifier_value(const std::string& value);
+  void set_identifier_value(std::string&& value);
+  void set_identifier_value(const char* value);
+  void set_identifier_value(const char* value, size_t size);
+  std::string* mutable_identifier_value();
+  std::string* release_identifier_value();
+  void set_allocated_identifier_value(std::string* identifier_value);
+  private:
+  const std::string& _internal_identifier_value() const;
+  void _internal_set_identifier_value(const std::string& value);
+  std::string* _internal_mutable_identifier_value();
+  public:
+
+  // optional bytes string_value = 7;
+  bool has_string_value() const;
+  private:
+  bool _internal_has_string_value() const;
+  public:
+  void clear_string_value();
+  const std::string& string_value() const;
+  void set_string_value(const std::string& value);
+  void set_string_value(std::string&& value);
+  void set_string_value(const char* value);
+  void set_string_value(const void* value, size_t size);  // bytes field: the pointer+size overload takes const void*, unlike the string fields.
+  std::string* mutable_string_value();
+  std::string* release_string_value();
+  void set_allocated_string_value(std::string* string_value);
+  private:
+  const std::string& _internal_string_value() const;
+  void _internal_set_string_value(const std::string& value);
+  std::string* _internal_mutable_string_value();
+  public:
+
+  // optional string aggregate_value = 8;
+  bool has_aggregate_value() const;
+  private:
+  bool _internal_has_aggregate_value() const;
+  public:
+  void clear_aggregate_value();
+  const std::string& aggregate_value() const;
+  void set_aggregate_value(const std::string& value);
+  void set_aggregate_value(std::string&& value);
+  void set_aggregate_value(const char* value);
+  void set_aggregate_value(const char* value, size_t size);
+  std::string* mutable_aggregate_value();
+  std::string* release_aggregate_value();
+  void set_allocated_aggregate_value(std::string* aggregate_value);
+  private:
+  const std::string& _internal_aggregate_value() const;
+  void _internal_set_aggregate_value(const std::string& value);
+  std::string* _internal_mutable_aggregate_value();
+  public:
+
+  // optional uint64 positive_int_value = 4;
+  bool has_positive_int_value() const;
+  private:
+  bool _internal_has_positive_int_value() const;
+  public:
+  void clear_positive_int_value();
+  ::PROTOBUF_NAMESPACE_ID::uint64 positive_int_value() const;
+  void set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::uint64 _internal_positive_int_value() const;
+  void _internal_set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value);
+  public:
+
+  // optional int64 negative_int_value = 5;
+  bool has_negative_int_value() const;
+  private:
+  bool _internal_has_negative_int_value() const;
+  public:
+  void clear_negative_int_value();
+  ::PROTOBUF_NAMESPACE_ID::int64 negative_int_value() const;
+  void set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int64 _internal_negative_int_value() const;
+  void _internal_set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value);
+  public:
+
+  // optional double double_value = 6;
+  bool has_double_value() const;
+  private:
+  bool _internal_has_double_value() const;
+  public:
+  void clear_double_value();
+  double double_value() const;
+  void set_double_value(double value);
+  private:
+  double _internal_double_value() const;
+  void _internal_set_double_value(double value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.UninterpretedOption)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // NOTE(review): presumably backs the has_*() accessors; their definitions are not in this section - confirm.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by the const GetCachedSize(); SetCachedSize() is const as well.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart > name_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr identifier_value_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr string_value_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr aggregate_value_;
+  ::PROTOBUF_NAMESPACE_ID::uint64 positive_int_value_;
+  ::PROTOBUF_NAMESPACE_ID::int64 negative_int_value_;
+  double double_value_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT SourceCodeInfo_Location PROTOBUF_FINAL :  // Message class; FullMessageName() below reports "google.protobuf.SourceCodeInfo.Location".
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.SourceCodeInfo.Location) */ {
+ public:
+  inline SourceCodeInfo_Location() : SourceCodeInfo_Location(nullptr) {}  // Delegates to the arena constructor with a null arena.
+  virtual ~SourceCodeInfo_Location();
+
+  SourceCodeInfo_Location(const SourceCodeInfo_Location& from);
+  SourceCodeInfo_Location(SourceCodeInfo_Location&& from) noexcept  // Default-constructs, then reuses the move-assignment operator.
+    : SourceCodeInfo_Location() {
+    *this = ::std::move(from);
+  }
+
+  inline SourceCodeInfo_Location& operator=(const SourceCodeInfo_Location& from) {
+    CopyFrom(from);  // Copy assignment is implemented entirely by CopyFrom().
+    return *this;
+  }
+  inline SourceCodeInfo_Location& operator=(SourceCodeInfo_Location&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Arenas match: use InternalSwap rather than copying.
+      if (this != &from) InternalSwap(&from);  // Self-move is a no-op.
+    } else {
+      CopyFrom(from);  // Arenas differ: fall back to CopyFrom().
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {  // Read-only unknown-field set obtained from _internal_metadata_.
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {  // Mutable counterpart, also served by _internal_metadata_.
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {  // Forwards to GetDescriptor().
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {  // Returns the descriptor member of GetMetadataStatic().
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {  // Returns the reflection member of GetMetadataStatic().
+    return GetMetadataStatic().reflection;
+  }
+  static const SourceCodeInfo_Location& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const SourceCodeInfo_Location* internal_default_instance() {  // Reinterprets the address of _SourceCodeInfo_Location_default_instance_ as this type.
+    return reinterpret_cast<const SourceCodeInfo_Location*>(
+               &_SourceCodeInfo_Location_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // Index that GetMetadataStatic() uses into file_level_metadata.
+    23;
+
+  friend void swap(SourceCodeInfo_Location& a, SourceCodeInfo_Location& b) {  // Free-function swap; forwards to a.Swap(&b).
+    a.Swap(&b);
+  }
+  inline void Swap(SourceCodeInfo_Location* other) {
+    if (other == this) return;  // Swapping with self is a no-op.
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // Arenas differ: defer to the library's GenericSwap.
+    }
+  }
+  void UnsafeArenaSwap(SourceCodeInfo_Location* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // Same-arena precondition; checked only through GOOGLE_DCHECK.
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline SourceCodeInfo_Location* New() const final {  // Creates a new message through CreateMaybeMessage with a null arena.
+    return CreateMaybeMessage<SourceCodeInfo_Location>(nullptr);
+  }
+
+  SourceCodeInfo_Location* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // Same, but passes the supplied arena.
+    return CreateMaybeMessage<SourceCodeInfo_Location>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const SourceCodeInfo_Location& from);  // Typed overload; used by the copy/move assignment operators above.
+  void MergeFrom(const SourceCodeInfo_Location& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // Reads the value stored in _cached_size_.
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(SourceCodeInfo_Location* other);  // Called by Swap(), UnsafeArenaSwap() and move assignment.
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // Fully qualified proto message name as a string constant.
+    return "google.protobuf.SourceCodeInfo.Location";
+  }
+  protected:
+  explicit SourceCodeInfo_Location(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Arena constructor; the default constructor delegates here.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the file's descriptor table, then indexes it.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // Field-number constants; values match the "= N" in the per-field comments below.
+    kPathFieldNumber = 1,
+    kSpanFieldNumber = 2,
+    kLeadingDetachedCommentsFieldNumber = 6,
+    kLeadingCommentsFieldNumber = 3,
+    kTrailingCommentsFieldNumber = 4,
+  };
+  // repeated int32 path = 1 [packed = true];
+  int path_size() const;
+  private:
+  int _internal_path_size() const;
+  public:
+  void clear_path();
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_path(int index) const;
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      _internal_path() const;
+  void _internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      _internal_mutable_path();
+  public:
+  ::PROTOBUF_NAMESPACE_ID::int32 path(int index) const;
+  void set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value);
+  void add_path(::PROTOBUF_NAMESPACE_ID::int32 value);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      path() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      mutable_path();
+
+  // repeated int32 span = 2 [packed = true];
+  int span_size() const;
+  private:
+  int _internal_span_size() const;
+  public:
+  void clear_span();
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_span(int index) const;
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      _internal_span() const;
+  void _internal_add_span(::PROTOBUF_NAMESPACE_ID::int32 value);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      _internal_mutable_span();
+  public:
+  ::PROTOBUF_NAMESPACE_ID::int32 span(int index) const;
+  void set_span(int index, ::PROTOBUF_NAMESPACE_ID::int32 value);
+  void add_span(::PROTOBUF_NAMESPACE_ID::int32 value);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      span() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      mutable_span();
+
+  // repeated string leading_detached_comments = 6;
+  int leading_detached_comments_size() const;
+  private:
+  int _internal_leading_detached_comments_size() const;
+  public:
+  void clear_leading_detached_comments();
+  const std::string& leading_detached_comments(int index) const;
+  std::string* mutable_leading_detached_comments(int index);
+  void set_leading_detached_comments(int index, const std::string& value);
+  void set_leading_detached_comments(int index, std::string&& value);
+  void set_leading_detached_comments(int index, const char* value);
+  void set_leading_detached_comments(int index, const char* value, size_t size);
+  std::string* add_leading_detached_comments();
+  void add_leading_detached_comments(const std::string& value);
+  void add_leading_detached_comments(std::string&& value);
+  void add_leading_detached_comments(const char* value);
+  void add_leading_detached_comments(const char* value, size_t size);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>& leading_detached_comments() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>* mutable_leading_detached_comments();
+  private:
+  const std::string& _internal_leading_detached_comments(int index) const;
+  std::string* _internal_add_leading_detached_comments();
+  public:
+
+  // optional string leading_comments = 3;
+  bool has_leading_comments() const;
+  private:
+  bool _internal_has_leading_comments() const;
+  public:
+  void clear_leading_comments();
+  const std::string& leading_comments() const;
+  void set_leading_comments(const std::string& value);
+  void set_leading_comments(std::string&& value);
+  void set_leading_comments(const char* value);
+  void set_leading_comments(const char* value, size_t size);
+  std::string* mutable_leading_comments();
+  std::string* release_leading_comments();
+  void set_allocated_leading_comments(std::string* leading_comments);
+  private:
+  const std::string& _internal_leading_comments() const;
+  void _internal_set_leading_comments(const std::string& value);
+  std::string* _internal_mutable_leading_comments();
+  public:
+
+  // optional string trailing_comments = 4;
+  bool has_trailing_comments() const;
+  private:
+  bool _internal_has_trailing_comments() const;
+  public:
+  void clear_trailing_comments();
+  const std::string& trailing_comments() const;
+  void set_trailing_comments(const std::string& value);
+  void set_trailing_comments(std::string&& value);
+  void set_trailing_comments(const char* value);
+  void set_trailing_comments(const char* value, size_t size);
+  std::string* mutable_trailing_comments();
+  std::string* release_trailing_comments();
+  void set_allocated_trailing_comments(std::string* trailing_comments);
+  private:
+  const std::string& _internal_trailing_comments() const;
+  void _internal_set_trailing_comments(const std::string& value);
+  std::string* _internal_mutable_trailing_comments();
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.SourceCodeInfo.Location)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;  // NOTE(review): presumably backs the has_*() accessors; their definitions are not in this section - confirm.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by the const GetCachedSize(); SetCachedSize() is const as well.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > path_;
+  mutable std::atomic<int> _path_cached_byte_size_;  // NOTE(review): presumably the cached encoded size of packed path_ - confirm against the .cc file.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > span_;
+  mutable std::atomic<int> _span_cached_byte_size_;  // NOTE(review): presumably the cached encoded size of packed span_ - confirm against the .cc file.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string> leading_detached_comments_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr leading_comments_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr trailing_comments_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT SourceCodeInfo PROTOBUF_FINAL :  // Message class; FullMessageName() below reports "google.protobuf.SourceCodeInfo".
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.SourceCodeInfo) */ {
+ public:
+  inline SourceCodeInfo() : SourceCodeInfo(nullptr) {}  // Delegates to the arena constructor with a null arena.
+  virtual ~SourceCodeInfo();
+
+  SourceCodeInfo(const SourceCodeInfo& from);
+  SourceCodeInfo(SourceCodeInfo&& from) noexcept  // Default-constructs, then reuses the move-assignment operator.
+    : SourceCodeInfo() {
+    *this = ::std::move(from);
+  }
+
+  inline SourceCodeInfo& operator=(const SourceCodeInfo& from) {
+    CopyFrom(from);  // Copy assignment is implemented entirely by CopyFrom().
+    return *this;
+  }
+  inline SourceCodeInfo& operator=(SourceCodeInfo&& from) noexcept {
+    if (GetArena() == from.GetArena()) {  // Arenas match: use InternalSwap rather than copying.
+      if (this != &from) InternalSwap(&from);  // Self-move is a no-op.
+    } else {
+      CopyFrom(from);  // Arenas differ: fall back to CopyFrom().
+    }
+    return *this;
+  }
+
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {  // Read-only unknown-field set obtained from _internal_metadata_.
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {  // Mutable counterpart, also served by _internal_metadata_.
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {  // Forwards to GetDescriptor().
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {  // Returns the descriptor member of GetMetadataStatic().
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {  // Returns the reflection member of GetMetadataStatic().
+    return GetMetadataStatic().reflection;
+  }
+  static const SourceCodeInfo& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const SourceCodeInfo* internal_default_instance() {  // Reinterprets the address of _SourceCodeInfo_default_instance_ as this type.
+    return reinterpret_cast<const SourceCodeInfo*>(
+               &_SourceCodeInfo_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =  // Index that GetMetadataStatic() uses into file_level_metadata.
+    24;
+
+  friend void swap(SourceCodeInfo& a, SourceCodeInfo& b) {  // Free-function swap; forwards to a.Swap(&b).
+    a.Swap(&b);
+  }
+  inline void Swap(SourceCodeInfo* other) {
+    if (other == this) return;  // Swapping with self is a no-op.
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);  // Arenas differ: defer to the library's GenericSwap.
+    }
+  }
+  void UnsafeArenaSwap(SourceCodeInfo* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());  // Same-arena precondition; checked only through GOOGLE_DCHECK.
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline SourceCodeInfo* New() const final {  // Creates a new message through CreateMaybeMessage with a null arena.
+    return CreateMaybeMessage<SourceCodeInfo>(nullptr);
+  }
+
+  SourceCodeInfo* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {  // Same, but passes the supplied arena.
+    return CreateMaybeMessage<SourceCodeInfo>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const SourceCodeInfo& from);  // Typed overload; used by the copy/move assignment operators above.
+  void MergeFrom(const SourceCodeInfo& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }  // Reads the value stored in _cached_size_.
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(SourceCodeInfo* other);  // Called by Swap(), UnsafeArenaSwap() and move assignment.
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {  // Fully qualified proto message name as a string constant.
+    return "google.protobuf.SourceCodeInfo";
+  }
+  protected:
+  explicit SourceCodeInfo(::PROTOBUF_NAMESPACE_ID::Arena* arena);  // Arena constructor; the default constructor delegates here.
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {  // Calls AssignDescriptors on the file's descriptor table, then indexes it.
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  typedef SourceCodeInfo_Location Location;  // Lets callers spell the element type as SourceCodeInfo::Location.
+
+  // accessors -------------------------------------------------------
+
+  enum : int {  // Field-number constant; value matches the "= 1" in the field comment below.
+    kLocationFieldNumber = 1,
+  };
+  // repeated .google.protobuf.SourceCodeInfo.Location location = 1;
+  int location_size() const;
+  private:
+  int _internal_location_size() const;
+  public:
+  void clear_location();
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* mutable_location(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >*
+      mutable_location();
+  private:
+  const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& _internal_location(int index) const;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* _internal_add_location();
+  public:
+  const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& location(int index) const;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* add_location();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >&
+      location() const;
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.SourceCodeInfo)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location > location_;  // Only data field; this class declares no _has_bits_ member.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;  // mutable: read by the const GetCachedSize(); SetCachedSize() is const as well.
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+// Message class for google.protobuf.GeneratedCodeInfo.Annotation (the name
+// returned by FullMessageName() below). It declares accessors for four fields:
+//   repeated int32 path = 1 [packed = true];
+//   optional string source_file = 2;
+//   optional int32 begin = 3;
+//   optional int32 end = 4;
+// NOTE(review): the @@protoc_insertion_point markers suggest this is protoc
+// output; hand edits would presumably be lost on regeneration -- confirm.
+class PROTOBUF_EXPORT GeneratedCodeInfo_Annotation PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.GeneratedCodeInfo.Annotation) */ {
+ public:
+  // Default construction delegates to the arena constructor with a null arena.
+  inline GeneratedCodeInfo_Annotation() : GeneratedCodeInfo_Annotation(nullptr) {}
+  virtual ~GeneratedCodeInfo_Annotation();
+
+  GeneratedCodeInfo_Annotation(const GeneratedCodeInfo_Annotation& from);
+  // Move construction: default-construct first, then reuse move assignment.
+  GeneratedCodeInfo_Annotation(GeneratedCodeInfo_Annotation&& from) noexcept
+    : GeneratedCodeInfo_Annotation() {
+    *this = ::std::move(from);
+  }
+
+  // Copy assignment is implemented in terms of CopyFrom().
+  inline GeneratedCodeInfo_Annotation& operator=(const GeneratedCodeInfo_Annotation& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Move assignment: when both objects report the same arena the internals are
+  // swapped (skipped for self-assignment); otherwise it falls back to a copy.
+  inline GeneratedCodeInfo_Annotation& operator=(GeneratedCodeInfo_Annotation&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  // Read-only / mutable access to the unknown-field set held by
+  // _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  // Descriptor and reflection lookups; all of them read the metadata entry
+  // returned by GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const GeneratedCodeInfo_Annotation& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  // Reinterprets the address of the global
+  // _GeneratedCodeInfo_Annotation_default_instance_ object as a pointer to this
+  // message type.
+  static inline const GeneratedCodeInfo_Annotation* internal_default_instance() {
+    return reinterpret_cast<const GeneratedCodeInfo_Annotation*>(
+               &_GeneratedCodeInfo_Annotation_default_instance_);
+  }
+  // Index used by GetMetadataStatic() to pick this message's entry out of
+  // file_level_metadata.
+  static constexpr int kIndexInFileMessages =
+    25;
+
+  // ADL-friendly swap; forwards to the member Swap().
+  friend void swap(GeneratedCodeInfo_Annotation& a, GeneratedCodeInfo_Annotation& b) {
+    a.Swap(&b);
+  }
+  // No-op when swapping with itself; uses InternalSwap() when both objects
+  // report the same arena and GenericSwap() otherwise.
+  inline void Swap(GeneratedCodeInfo_Annotation* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  // Always uses InternalSwap(); the matching-arena requirement is only
+  // asserted through GOOGLE_DCHECK rather than handled with a fallback.
+  void UnsafeArenaSwap(GeneratedCodeInfo_Annotation* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  // Creates a new message of this type via CreateMaybeMessage with a null
+  // arena.
+  inline GeneratedCodeInfo_Annotation* New() const final {
+    return CreateMaybeMessage<GeneratedCodeInfo_Annotation>(nullptr);
+  }
+
+  // Same as New(), but passes the supplied arena to CreateMaybeMessage.
+  GeneratedCodeInfo_Annotation* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<GeneratedCodeInfo_Annotation>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const GeneratedCodeInfo_Annotation& from);
+  void MergeFrom(const GeneratedCodeInfo_Annotation& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  // Returns the value currently held by _cached_size_.
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(GeneratedCodeInfo_Annotation* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  // Fully-qualified proto name of this message.
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.GeneratedCodeInfo.Annotation";
+  }
+  protected:
+  // Arena-aware constructor; the public default constructor delegates here.
+  explicit GeneratedCodeInfo_Annotation(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  // Calls AssignDescriptors() on the descriptor.proto table and then returns
+  // the metadata entry at kIndexInFileMessages.
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // Proto field numbers of the four fields declared below.
+  enum : int {
+    kPathFieldNumber = 1,
+    kSourceFileFieldNumber = 2,
+    kBeginFieldNumber = 3,
+    kEndFieldNumber = 4,
+  };
+  // repeated int32 path = 1 [packed = true];
+  int path_size() const;
+  private:
+  int _internal_path_size() const;
+  public:
+  void clear_path();
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_path(int index) const;
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      _internal_path() const;
+  void _internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      _internal_mutable_path();
+  public:
+  ::PROTOBUF_NAMESPACE_ID::int32 path(int index) const;
+  void set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value);
+  void add_path(::PROTOBUF_NAMESPACE_ID::int32 value);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+      path() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+      mutable_path();
+
+  // optional string source_file = 2;
+  bool has_source_file() const;
+  private:
+  bool _internal_has_source_file() const;
+  public:
+  void clear_source_file();
+  const std::string& source_file() const;
+  void set_source_file(const std::string& value);
+  void set_source_file(std::string&& value);
+  void set_source_file(const char* value);
+  void set_source_file(const char* value, size_t size);
+  std::string* mutable_source_file();
+  std::string* release_source_file();
+  void set_allocated_source_file(std::string* source_file);
+  private:
+  const std::string& _internal_source_file() const;
+  void _internal_set_source_file(const std::string& value);
+  std::string* _internal_mutable_source_file();
+  public:
+
+  // optional int32 begin = 3;
+  bool has_begin() const;
+  private:
+  bool _internal_has_begin() const;
+  public:
+  void clear_begin();
+  ::PROTOBUF_NAMESPACE_ID::int32 begin() const;
+  void set_begin(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_begin() const;
+  void _internal_set_begin(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // optional int32 end = 4;
+  bool has_end() const;
+  private:
+  bool _internal_has_end() const;
+  public:
+  void clear_end();
+  ::PROTOBUF_NAMESPACE_ID::int32 end() const;
+  void set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_end() const;
+  void _internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.GeneratedCodeInfo.Annotation)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  // Data members. NOTE(review): their order determines the object layout, which
+  // presumably has to match the separately compiled implementation -- do not
+  // reorder without confirming.
+  // Presence bits for the optional fields.
+  ::PROTOBUF_NAMESPACE_ID::internal::HasBits<1> _has_bits_;
+  // Size cache read by GetCachedSize(); mutable so const methods can update it.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  // Storage for the repeated `path` field.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > path_;
+  // Presumably the cached serialized byte size of the packed `path` field --
+  // TODO confirm against the implementation file.
+  mutable std::atomic<int> _path_cached_byte_size_;
+  // Storage for `source_file`, `begin` and `end`.
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr source_file_;
+  ::PROTOBUF_NAMESPACE_ID::int32 begin_;
+  ::PROTOBUF_NAMESPACE_ID::int32 end_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// -------------------------------------------------------------------
+
+// Message class for google.protobuf.GeneratedCodeInfo (the name returned by
+// FullMessageName() below). Its only declared field is
+//   repeated .google.protobuf.GeneratedCodeInfo.Annotation annotation = 1;
+// NOTE(review): the @@protoc_insertion_point markers suggest this is protoc
+// output; hand edits would presumably be lost on regeneration -- confirm.
+class PROTOBUF_EXPORT GeneratedCodeInfo PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.GeneratedCodeInfo) */ {
+ public:
+  // Default construction delegates to the arena constructor with a null arena.
+  inline GeneratedCodeInfo() : GeneratedCodeInfo(nullptr) {}
+  virtual ~GeneratedCodeInfo();
+
+  GeneratedCodeInfo(const GeneratedCodeInfo& from);
+  // Move construction: default-construct first, then reuse move assignment.
+  GeneratedCodeInfo(GeneratedCodeInfo&& from) noexcept
+    : GeneratedCodeInfo() {
+    *this = ::std::move(from);
+  }
+
+  // Copy assignment is implemented in terms of CopyFrom().
+  inline GeneratedCodeInfo& operator=(const GeneratedCodeInfo& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Move assignment: when both objects report the same arena the internals are
+  // swapped (skipped for self-assignment); otherwise it falls back to a copy.
+  inline GeneratedCodeInfo& operator=(GeneratedCodeInfo&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  // Read-only / mutable access to the unknown-field set held by
+  // _internal_metadata_.
+  inline const ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>(::PROTOBUF_NAMESPACE_ID::UnknownFieldSet::default_instance);
+  }
+  inline ::PROTOBUF_NAMESPACE_ID::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields<::PROTOBUF_NAMESPACE_ID::UnknownFieldSet>();
+  }
+
+  // Descriptor and reflection lookups; all of them read the metadata entry
+  // returned by GetMetadataStatic().
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const GeneratedCodeInfo& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  // Reinterprets the address of the global _GeneratedCodeInfo_default_instance_
+  // object as a pointer to this message type.
+  static inline const GeneratedCodeInfo* internal_default_instance() {
+    return reinterpret_cast<const GeneratedCodeInfo*>(
+               &_GeneratedCodeInfo_default_instance_);
+  }
+  // Index used by GetMetadataStatic() to pick this message's entry out of
+  // file_level_metadata.
+  static constexpr int kIndexInFileMessages =
+    26;
+
+  // ADL-friendly swap; forwards to the member Swap().
+  friend void swap(GeneratedCodeInfo& a, GeneratedCodeInfo& b) {
+    a.Swap(&b);
+  }
+  // No-op when swapping with itself; uses InternalSwap() when both objects
+  // report the same arena and GenericSwap() otherwise.
+  inline void Swap(GeneratedCodeInfo* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  // Always uses InternalSwap(); the matching-arena requirement is only
+  // asserted through GOOGLE_DCHECK rather than handled with a fallback.
+  void UnsafeArenaSwap(GeneratedCodeInfo* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  // Creates a new message of this type via CreateMaybeMessage with a null
+  // arena.
+  inline GeneratedCodeInfo* New() const final {
+    return CreateMaybeMessage<GeneratedCodeInfo>(nullptr);
+  }
+
+  // Same as New(), but passes the supplied arena to CreateMaybeMessage.
+  GeneratedCodeInfo* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<GeneratedCodeInfo>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const GeneratedCodeInfo& from);
+  void MergeFrom(const GeneratedCodeInfo& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  // Returns the value currently held by _cached_size_.
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(GeneratedCodeInfo* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  // Fully-qualified proto name of this message.
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.GeneratedCodeInfo";
+  }
+  protected:
+  // Arena-aware constructor; the public default constructor delegates here.
+  explicit GeneratedCodeInfo(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  // Calls AssignDescriptors() on the descriptor.proto table and then returns
+  // the metadata entry at kIndexInFileMessages.
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // Lets callers spell the element type as GeneratedCodeInfo::Annotation.
+  typedef GeneratedCodeInfo_Annotation Annotation;
+
+  // accessors -------------------------------------------------------
+
+  // Proto field number of the `annotation` field.
+  enum : int {
+    kAnnotationFieldNumber = 1,
+  };
+  // repeated .google.protobuf.GeneratedCodeInfo.Annotation annotation = 1;
+  int annotation_size() const;
+  private:
+  int _internal_annotation_size() const;
+  public:
+  void clear_annotation();
+  PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* mutable_annotation(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >*
+      mutable_annotation();
+  private:
+  const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& _internal_annotation(int index) const;
+  PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* _internal_add_annotation();
+  public:
+  const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& annotation(int index) const;
+  PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* add_annotation();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >&
+      annotation() const;
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.GeneratedCodeInfo)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  // Data members. NOTE(review): their order determines the object layout, which
+  // presumably has to match the separately compiled implementation -- do not
+  // reorder without confirming.
+  // Storage for the repeated `annotation` field.
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation > annotation_;
+  // Size cache read by GetCachedSize(); mutable so const methods can update it.
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fdescriptor_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// FileDescriptorSet
+
+// repeated .google.protobuf.FileDescriptorProto file = 1;
+// Internal element count; reads file_ directly.
+inline int FileDescriptorSet::_internal_file_size() const {
+  return file_.size();
+}
+// Public element count for the repeated `file` field.
+inline int FileDescriptorSet::file_size() const {
+  return file_.size();
+}
+// Calls Clear() on the underlying repeated field.
+inline void FileDescriptorSet::clear_file() {
+  file_.Clear();
+}
+// Mutable pointer to the element at `index`, obtained from file_.Mutable().
+inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::mutable_file(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorSet.file)
+  return file_.Mutable(index);
+}
+// Mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >*
+FileDescriptorSet::mutable_file() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorSet.file)
+  return &file_;
+}
+// Internal read access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& FileDescriptorSet::_internal_file(int index) const {
+  return file_.Get(index);
+}
+// Public read access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::FileDescriptorProto& FileDescriptorSet::file(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorSet.file)
+  return file_.Get(index);
+}
+// Internal append; returns the pointer produced by file_.Add().
+inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::_internal_add_file() {
+  return file_.Add();
+}
+// Public append; returns the pointer produced by file_.Add().
+inline PROTOBUF_NAMESPACE_ID::FileDescriptorProto* FileDescriptorSet::add_file() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorSet.file)
+  return file_.Add();
+}
+// Read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FileDescriptorProto >&
+FileDescriptorSet::file() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorSet.file)
+  return file_;
+}
+
+// -------------------------------------------------------------------
+
+// FileDescriptorProto
+
+// optional string name = 1;
+// Reports whether bit 0x00000001 of _has_bits_[0] (the `name` has-bit) is set.
+inline bool FileDescriptorProto::_internal_has_name() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence check for `name`.
+inline bool FileDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+// Calls ClearToEmpty() on the stored string, then drops the has-bit.
+inline void FileDescriptorProto::clear_name() {
+  name_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+// Public read accessor for `name`.
+inline const std::string& FileDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.name)
+  return name_.Get();
+}
+// Copying setter; the work is done by _internal_set_name().
+inline void FileDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.name)
+}
+// Returns a mutable string pointer; marks the field as present as a side
+// effect (see _internal_mutable_name()).
+inline std::string* FileDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.name)
+  return _internal_mutable_name();
+}
+// Internal read accessor for `name`.
+inline const std::string& FileDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+// Sets the has-bit, then stores a copy of `value`.
+inline void FileDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value,
+      GetArena());
+}
+// Rvalue setter: sets the has-bit, then hands `value` over via std::move.
+inline void FileDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.name)
+}
+// C-string setter; `value` must be non-null (asserted via GOOGLE_DCHECK).
+inline void FileDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.name)
+}
+// Pointer-plus-length setter; builds a std::string from `size` bytes at
+// `value`.
+inline void FileDescriptorProto::set_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size),
+      GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.name)
+}
+// Sets the has-bit and returns the pointer produced by name_.Mutable().
+inline std::string* FileDescriptorProto::_internal_mutable_name() {
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+// Returns nullptr when `name` is not set; otherwise clears the has-bit and
+// returns the result of name_.ReleaseNonDefault().
+inline std::string* FileDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.name)
+  if (_internal_has_name()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return name_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  return nullptr;
+}
+// Updates the has-bit to mirror whether `name` is non-null, then passes the
+// pointer to name_.SetAllocated().
+inline void FileDescriptorProto::set_allocated_name(std::string* name) {
+  if (name == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  name_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.name)
+}
+
+// optional string package = 2;
+// Reports whether bit 0x00000002 of _has_bits_[0] (the `package` has-bit) is
+// set.
+inline bool FileDescriptorProto::_internal_has_package() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+// Public presence check for `package`.
+inline bool FileDescriptorProto::has_package() const {
+  return _internal_has_package();
+}
+// Calls ClearToEmpty() on the stored string, then drops the has-bit.
+inline void FileDescriptorProto::clear_package() {
+  package_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000002u;
+}
+// Public read accessor for `package`.
+inline const std::string& FileDescriptorProto::package() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.package)
+  return package_.Get();
+}
+// Copying setter; the work is done by _internal_set_package().
+inline void FileDescriptorProto::set_package(const std::string& value) {
+  _internal_set_package(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.package)
+}
+// Returns a mutable string pointer; marks the field as present as a side
+// effect (see _internal_mutable_package()).
+inline std::string* FileDescriptorProto::mutable_package() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.package)
+  return _internal_mutable_package();
+}
+// Internal read accessor for `package`.
+inline const std::string& FileDescriptorProto::_internal_package() const {
+  return package_.Get();
+}
+// Sets the has-bit, then stores a copy of `value`.
+inline void FileDescriptorProto::_internal_set_package(const std::string& value) {
+  _has_bits_[0] |= 0x00000002u;
+  package_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value,
+      GetArena());
+}
+// Rvalue setter: sets the has-bit, then hands `value` over via std::move.
+inline void FileDescriptorProto::set_package(std::string&& value) {
+  _has_bits_[0] |= 0x00000002u;
+  package_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.package)
+}
+// C-string setter; `value` must be non-null (asserted via GOOGLE_DCHECK).
+inline void FileDescriptorProto::set_package(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000002u;
+  package_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.package)
+}
+// Pointer-plus-length setter; builds a std::string from `size` bytes at
+// `value`.
+inline void FileDescriptorProto::set_package(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000002u;
+  package_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size),
+      GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.package)
+}
+// Sets the has-bit and returns the pointer produced by package_.Mutable().
+inline std::string* FileDescriptorProto::_internal_mutable_package() {
+  _has_bits_[0] |= 0x00000002u;
+  return package_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+// Returns nullptr when `package` is not set; otherwise clears the has-bit and
+// returns the result of package_.ReleaseNonDefault().
+inline std::string* FileDescriptorProto::release_package() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.package)
+  if (_internal_has_package()) {
+    _has_bits_[0] &= ~0x00000002u;
+    return package_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  return nullptr;
+}
+// Updates the has-bit to mirror whether `package` is non-null, then passes the
+// pointer to package_.SetAllocated().
+inline void FileDescriptorProto::set_allocated_package(std::string* package) {
+  if (package == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    _has_bits_[0] |= 0x00000002u;
+  }
+  package_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      package,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.package)
+}
+
+// repeated string dependency = 3;
+// Internal element count; reads dependency_ directly.
+inline int FileDescriptorProto::_internal_dependency_size() const {
+  return dependency_.size();
+}
+// Public element count for the repeated `dependency` field.
+inline int FileDescriptorProto::dependency_size() const {
+  return dependency_.size();
+}
+// Calls Clear() on the underlying repeated field.
+inline void FileDescriptorProto::clear_dependency() {
+  dependency_.Clear();
+}
+// Appends an element and returns the pointer produced by dependency_.Add().
+inline std::string* FileDescriptorProto::add_dependency() {
+  // @@protoc_insertion_point(field_add_mutable:google.protobuf.FileDescriptorProto.dependency)
+  return dependency_.Add();
+}
+// Internal read access to the element at `index`.
+inline const std::string& FileDescriptorProto::_internal_dependency(int index) const {
+  return dependency_.Get(index);
+}
+// Public read access to the element at `index`.
+inline const std::string& FileDescriptorProto::dependency(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.dependency)
+  return dependency_.Get(index);
+}
+// Mutable pointer to the element at `index`.
+inline std::string* FileDescriptorProto::mutable_dependency(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.dependency)
+  return dependency_.Mutable(index);
+}
+// Overwrites the element at `index` with a copy of `value`.
+inline void FileDescriptorProto::set_dependency(int index, const std::string& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.dependency)
+  std::string* slot = dependency_.Mutable(index);
+  slot->assign(value);
+}
+// Overwrites the element at `index`, moving from `value`.
+inline void FileDescriptorProto::set_dependency(int index, std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.dependency)
+  std::string* slot = dependency_.Mutable(index);
+  slot->assign(std::move(value));
+}
+// Overwrites the element at `index` from a C string; `value` must be non-null
+// (asserted via GOOGLE_DCHECK).
+inline void FileDescriptorProto::set_dependency(int index, const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  std::string* slot = dependency_.Mutable(index);
+  slot->assign(value);
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.dependency)
+}
+// Overwrites the element at `index` with `size` bytes starting at `value`.
+inline void FileDescriptorProto::set_dependency(int index, const char* value, size_t size) {
+  std::string* slot = dependency_.Mutable(index);
+  slot->assign(value, size);
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.dependency)
+}
+// Internal append; returns the pointer produced by dependency_.Add().
+inline std::string* FileDescriptorProto::_internal_add_dependency() {
+  return dependency_.Add();
+}
+// Appends a new element and assigns a copy of `value` to it.
+inline void FileDescriptorProto::add_dependency(const std::string& value) {
+  std::string* slot = dependency_.Add();
+  slot->assign(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.dependency)
+}
+// Appends by passing the moved `value` straight to dependency_.Add().
+inline void FileDescriptorProto::add_dependency(std::string&& value) {
+  dependency_.Add(std::move(value));
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.dependency)
+}
+// Appends a new element from a C string; `value` must be non-null (asserted
+// via GOOGLE_DCHECK).
+inline void FileDescriptorProto::add_dependency(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  std::string* slot = dependency_.Add();
+  slot->assign(value);
+  // @@protoc_insertion_point(field_add_char:google.protobuf.FileDescriptorProto.dependency)
+}
+// Appends a new element holding `size` bytes starting at `value`.
+inline void FileDescriptorProto::add_dependency(const char* value, size_t size) {
+  std::string* slot = dependency_.Add();
+  slot->assign(value, size);
+  // @@protoc_insertion_point(field_add_pointer:google.protobuf.FileDescriptorProto.dependency)
+}
+// Read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>&
+FileDescriptorProto::dependency() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.dependency)
+  return dependency_;
+}
+// Mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>*
+FileDescriptorProto::mutable_dependency() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.dependency)
+  return &dependency_;
+}
+
+// repeated int32 public_dependency = 10;
+// Internal element count; reads public_dependency_ directly.
+inline int FileDescriptorProto::_internal_public_dependency_size() const {
+  return public_dependency_.size();
+}
+// Public element count for the repeated `public_dependency` field.
+inline int FileDescriptorProto::public_dependency_size() const {
+  return public_dependency_.size();
+}
+// Calls Clear() on the underlying repeated field.
+inline void FileDescriptorProto::clear_public_dependency() {
+  public_dependency_.Clear();
+}
+// Internal read access to the value at `index`.
+inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::_internal_public_dependency(int index) const {
+  return public_dependency_.Get(index);
+}
+// Public read access to the value at `index`.
+inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::public_dependency(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.public_dependency)
+  return public_dependency_.Get(index);
+}
+// Overwrites the value at `index`.
+inline void FileDescriptorProto::set_public_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  public_dependency_.Set(index, value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.public_dependency)
+}
+// Internal append of `value`.
+inline void FileDescriptorProto::_internal_add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  public_dependency_.Add(value);
+}
+// Public append of `value`.
+inline void FileDescriptorProto::add_public_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  public_dependency_.Add(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.public_dependency)
+}
+// Internal read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+FileDescriptorProto::_internal_public_dependency() const {
+  return public_dependency_;
+}
+// Public read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+FileDescriptorProto::public_dependency() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.public_dependency)
+  return public_dependency_;
+}
+// Internal mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+FileDescriptorProto::_internal_mutable_public_dependency() {
+  return &public_dependency_;
+}
+// Public mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+FileDescriptorProto::mutable_public_dependency() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.public_dependency)
+  return &public_dependency_;
+}
+
+// repeated int32 weak_dependency = 11;
+// Internal element count; reads weak_dependency_ directly.
+inline int FileDescriptorProto::_internal_weak_dependency_size() const {
+  return weak_dependency_.size();
+}
+// Public element count for the repeated `weak_dependency` field.
+inline int FileDescriptorProto::weak_dependency_size() const {
+  return weak_dependency_.size();
+}
+// Calls Clear() on the underlying repeated field.
+inline void FileDescriptorProto::clear_weak_dependency() {
+  weak_dependency_.Clear();
+}
+// Internal read access to the value at `index`.
+inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::_internal_weak_dependency(int index) const {
+  return weak_dependency_.Get(index);
+}
+// Public read access to the value at `index`.
+inline ::PROTOBUF_NAMESPACE_ID::int32 FileDescriptorProto::weak_dependency(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.weak_dependency)
+  return weak_dependency_.Get(index);
+}
+// Overwrites the value at `index`.
+inline void FileDescriptorProto::set_weak_dependency(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  weak_dependency_.Set(index, value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.weak_dependency)
+}
+// Internal append of `value`.
+inline void FileDescriptorProto::_internal_add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  weak_dependency_.Add(value);
+}
+// Public append of `value`.
+inline void FileDescriptorProto::add_weak_dependency(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  weak_dependency_.Add(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.weak_dependency)
+}
+// Internal read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+FileDescriptorProto::_internal_weak_dependency() const {
+  return weak_dependency_;
+}
+// Public read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+FileDescriptorProto::weak_dependency() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.weak_dependency)
+  return weak_dependency_;
+}
+// Internal mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+FileDescriptorProto::_internal_mutable_weak_dependency() {
+  return &weak_dependency_;
+}
+// Public mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+FileDescriptorProto::mutable_weak_dependency() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.weak_dependency)
+  return &weak_dependency_;
+}
+
+// repeated .google.protobuf.DescriptorProto message_type = 4;
+// Internal element count; reads message_type_ directly.
+inline int FileDescriptorProto::_internal_message_type_size() const {
+  return message_type_.size();
+}
+// Public element count for the repeated `message_type` field.
+inline int FileDescriptorProto::message_type_size() const {
+  return message_type_.size();
+}
+// Calls Clear() on the underlying repeated field.
+inline void FileDescriptorProto::clear_message_type() {
+  message_type_.Clear();
+}
+// Mutable pointer to the element at `index`.
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::mutable_message_type(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.message_type)
+  return message_type_.Mutable(index);
+}
+// Mutable pointer to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >*
+FileDescriptorProto::mutable_message_type() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.message_type)
+  return &message_type_;
+}
+// Internal read access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& FileDescriptorProto::_internal_message_type(int index) const {
+  return message_type_.Get(index);
+}
+// Public read access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& FileDescriptorProto::message_type(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.message_type)
+  return message_type_.Get(index);
+}
+// Internal append; returns the pointer produced by message_type_.Add().
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::_internal_add_message_type() {
+  return message_type_.Add();
+}
+// Public append; returns the pointer produced by message_type_.Add().
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* FileDescriptorProto::add_message_type() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.message_type)
+  return message_type_.Add();
+}
+// Read-only reference to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >&
+FileDescriptorProto::message_type() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.message_type)
+  return message_type_;
+}
+
+// repeated .google.protobuf.EnumDescriptorProto enum_type = 5;
+inline int FileDescriptorProto::_internal_enum_type_size() const {
+  return enum_type_.size();
+}
+inline int FileDescriptorProto::enum_type_size() const {
+  return _internal_enum_type_size();
+}
+inline void FileDescriptorProto::clear_enum_type() {
+  enum_type_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::mutable_enum_type(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.enum_type)
+  return enum_type_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >*
+FileDescriptorProto::mutable_enum_type() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.enum_type)
+  return &enum_type_;
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& FileDescriptorProto::_internal_enum_type(int index) const {
+  return enum_type_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& FileDescriptorProto::enum_type(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.enum_type)
+  return _internal_enum_type(index);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::_internal_add_enum_type() {
+  return enum_type_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* FileDescriptorProto::add_enum_type() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.enum_type)
+  return _internal_add_enum_type();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >&
+FileDescriptorProto::enum_type() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.enum_type)
+  return enum_type_;
+}
+
+// repeated .google.protobuf.ServiceDescriptorProto service = 6;
+inline int FileDescriptorProto::_internal_service_size() const {
+  return service_.size();
+}
+inline int FileDescriptorProto::service_size() const {
+  return _internal_service_size();
+}
+inline void FileDescriptorProto::clear_service() {
+  service_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::mutable_service(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.service)
+  return service_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >*
+FileDescriptorProto::mutable_service() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.service)
+  return &service_;
+}
+inline const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& FileDescriptorProto::_internal_service(int index) const {
+  return service_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto& FileDescriptorProto::service(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.service)
+  return _internal_service(index);
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::_internal_add_service() {
+  return service_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto* FileDescriptorProto::add_service() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.service)
+  return _internal_add_service();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::ServiceDescriptorProto >&
+FileDescriptorProto::service() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.service)
+  return service_;
+}
+
+// repeated .google.protobuf.FieldDescriptorProto extension = 7;
+inline int FileDescriptorProto::_internal_extension_size() const {
+  return extension_.size();
+}
+inline int FileDescriptorProto::extension_size() const {
+  return _internal_extension_size();
+}
+inline void FileDescriptorProto::clear_extension() {
+  extension_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::mutable_extension(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.extension)
+  return extension_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+FileDescriptorProto::mutable_extension() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileDescriptorProto.extension)
+  return &extension_;
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& FileDescriptorProto::_internal_extension(int index) const {
+  return extension_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& FileDescriptorProto::extension(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.extension)
+  return _internal_extension(index);
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::_internal_add_extension() {
+  return extension_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* FileDescriptorProto::add_extension() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileDescriptorProto.extension)
+  return _internal_add_extension();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+FileDescriptorProto::extension() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileDescriptorProto.extension)
+  return extension_;
+}
+
+// optional .google.protobuf.FileOptions options = 8;
+inline bool FileDescriptorProto::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);
+  return value;
+}
+inline bool FileDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void FileDescriptorProto::clear_options() {
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline const PROTOBUF_NAMESPACE_ID::FileOptions& FileDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::FileOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::FileOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_FileOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::FileOptions& FileDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.options)
+  return _internal_options();
+}
+inline void FileDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::FileOptions* options) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FileDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::release_options() {
+  _has_bits_[0] &= ~0x00000008u;
+  PROTOBUF_NAMESPACE_ID::FileOptions* temp = options_;
+  options_ = nullptr;
+  if (GetArena() != nullptr) {
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.options)
+  _has_bits_[0] &= ~0x00000008u;
+  PROTOBUF_NAMESPACE_ID::FileOptions* temp = options_;
+  options_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::_internal_mutable_options() {
+  _has_bits_[0] |= 0x00000008u;
+  if (options_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FileOptions>(GetArena());
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions* FileDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void FileDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::FileOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.options)
+}
+
+// optional .google.protobuf.SourceCodeInfo source_code_info = 9;
+inline bool FileDescriptorProto::_internal_has_source_code_info() const {
+  bool value = (_has_bits_[0] & 0x00000010u) != 0;
+  PROTOBUF_ASSUME(!value || source_code_info_ != nullptr);
+  return value;
+}
+inline bool FileDescriptorProto::has_source_code_info() const {
+  return _internal_has_source_code_info();
+}
+inline void FileDescriptorProto::clear_source_code_info() {
+  if (source_code_info_ != nullptr) source_code_info_->Clear();
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& FileDescriptorProto::_internal_source_code_info() const {
+  const PROTOBUF_NAMESPACE_ID::SourceCodeInfo* p = source_code_info_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::SourceCodeInfo*>(
+      &PROTOBUF_NAMESPACE_ID::_SourceCodeInfo_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo& FileDescriptorProto::source_code_info() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.source_code_info)
+  return _internal_source_code_info();
+}
+inline void FileDescriptorProto::unsafe_arena_set_allocated_source_code_info(
+    PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(source_code_info_);
+  }
+  source_code_info_ = source_code_info;
+  if (source_code_info) {
+    _has_bits_[0] |= 0x00000010u;
+  } else {
+    _has_bits_[0] &= ~0x00000010u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FileDescriptorProto.source_code_info)
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::release_source_code_info() {
+  _has_bits_[0] &= ~0x00000010u;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* temp = source_code_info_;
+  source_code_info_ = nullptr;
+  if (GetArena() != nullptr) {
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::unsafe_arena_release_source_code_info() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.source_code_info)
+  _has_bits_[0] &= ~0x00000010u;
+  PROTOBUF_NAMESPACE_ID::SourceCodeInfo* temp = source_code_info_;
+  source_code_info_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::_internal_mutable_source_code_info() {
+  _has_bits_[0] |= 0x00000010u;
+  if (source_code_info_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::SourceCodeInfo>(GetArena());
+    source_code_info_ = p;
+  }
+  return source_code_info_;
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo* FileDescriptorProto::mutable_source_code_info() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.source_code_info)
+  return _internal_mutable_source_code_info();
+}
+inline void FileDescriptorProto::set_allocated_source_code_info(PROTOBUF_NAMESPACE_ID::SourceCodeInfo* source_code_info) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {
+    delete source_code_info_;
+  }
+  if (source_code_info) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(source_code_info);
+    if (message_arena != submessage_arena) {
+      source_code_info = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, source_code_info, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000010u;
+  } else {
+    _has_bits_[0] &= ~0x00000010u;
+  }
+  source_code_info_ = source_code_info;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.source_code_info)
+}
+
+// optional string syntax = 12;
+inline bool FileDescriptorProto::_internal_has_syntax() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool FileDescriptorProto::has_syntax() const {
+  return _internal_has_syntax();
+}
+inline void FileDescriptorProto::clear_syntax() {
+  syntax_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline const std::string& FileDescriptorProto::syntax() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileDescriptorProto.syntax)
+  return _internal_syntax();
+}
+inline void FileDescriptorProto::set_syntax(const std::string& value) {
+  _internal_set_syntax(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileDescriptorProto.syntax)
+}
+inline std::string* FileDescriptorProto::mutable_syntax() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileDescriptorProto.syntax)
+  return _internal_mutable_syntax();
+}
+inline const std::string& FileDescriptorProto::_internal_syntax() const {
+  return syntax_.Get();
+}
+inline void FileDescriptorProto::_internal_set_syntax(const std::string& value) {
+  _has_bits_[0] |= 0x00000004u;
+  syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileDescriptorProto::set_syntax(std::string&& value) {
+  _has_bits_[0] |= 0x00000004u;
+  syntax_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileDescriptorProto.syntax)
+}
+inline void FileDescriptorProto::set_syntax(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000004u;
+  syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileDescriptorProto.syntax)
+}
+inline void FileDescriptorProto::set_syntax(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000004u;
+  syntax_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileDescriptorProto.syntax)
+}
+inline std::string* FileDescriptorProto::_internal_mutable_syntax() {
+  _has_bits_[0] |= 0x00000004u;
+  return syntax_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileDescriptorProto::release_syntax() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileDescriptorProto.syntax)
+  if (!_internal_has_syntax()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000004u;
+  return syntax_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileDescriptorProto::set_allocated_syntax(std::string* syntax) {
+  if (syntax != nullptr) {
+    _has_bits_[0] |= 0x00000004u;
+  } else {
+    _has_bits_[0] &= ~0x00000004u;
+  }
+  syntax_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), syntax,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileDescriptorProto.syntax)
+}
+
+// -------------------------------------------------------------------
+
+// DescriptorProto_ExtensionRange
+
+// optional int32 start = 1;
+inline bool DescriptorProto_ExtensionRange::_internal_has_start() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool DescriptorProto_ExtensionRange::has_start() const {
+  return _internal_has_start();
+}
+inline void DescriptorProto_ExtensionRange::clear_start() {
+  start_ = 0;
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::_internal_start() const {
+  return start_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::start() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.start)
+  return _internal_start();
+}
+inline void DescriptorProto_ExtensionRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000002u;
+  start_ = value;
+}
+inline void DescriptorProto_ExtensionRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_start(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ExtensionRange.start)
+}
+
+// optional int32 end = 2;
+inline bool DescriptorProto_ExtensionRange::_internal_has_end() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool DescriptorProto_ExtensionRange::has_end() const {
+  return _internal_has_end();
+}
+inline void DescriptorProto_ExtensionRange::clear_end() {
+  end_ = 0;
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::_internal_end() const {
+  return end_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ExtensionRange::end() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.end)
+  return _internal_end();
+}
+inline void DescriptorProto_ExtensionRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000004u;
+  end_ = value;
+}
+inline void DescriptorProto_ExtensionRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_end(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ExtensionRange.end)
+}
+
+// optional .google.protobuf.ExtensionRangeOptions options = 3;
+inline bool DescriptorProto_ExtensionRange::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);
+  return value;
+}
+inline bool DescriptorProto_ExtensionRange::has_options() const {
+  return _internal_has_options();
+}
+inline void DescriptorProto_ExtensionRange::clear_options() {
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& DescriptorProto_ExtensionRange::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_ExtensionRangeOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions& DescriptorProto_ExtensionRange::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ExtensionRange.options)
+  return _internal_options();
+}
+inline void DescriptorProto_ExtensionRange::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.DescriptorProto.ExtensionRange.options)
+}
+inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::release_options() {
+  _has_bits_[0] &= ~0x00000001u;
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* temp = options_;
+  options_ = nullptr;
+  if (GetArena() != nullptr) {
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.ExtensionRange.options)
+  _has_bits_[0] &= ~0x00000001u;
+  PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* temp = options_;
+  options_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::_internal_mutable_options() {
+  _has_bits_[0] |= 0x00000001u;
+  if (options_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions>(GetArena());
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* DescriptorProto_ExtensionRange::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.ExtensionRange.options)
+  return _internal_mutable_options();
+}
+inline void DescriptorProto_ExtensionRange::set_allocated_options(PROTOBUF_NAMESPACE_ID::ExtensionRangeOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.ExtensionRange.options)
+}
+
+// -------------------------------------------------------------------
+
+// DescriptorProto_ReservedRange
+
+// optional int32 start = 1;
+inline bool DescriptorProto_ReservedRange::_internal_has_start() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool DescriptorProto_ReservedRange::has_start() const {
+  return _internal_has_start();
+}
+inline void DescriptorProto_ReservedRange::clear_start() {
+  start_ = 0;
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::_internal_start() const {
+  return start_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::start() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ReservedRange.start)
+  return _internal_start();
+}
+inline void DescriptorProto_ReservedRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000001u;
+  start_ = value;
+}
+inline void DescriptorProto_ReservedRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_start(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ReservedRange.start)
+}
+
+// optional int32 end = 2;
+inline bool DescriptorProto_ReservedRange::_internal_has_end() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool DescriptorProto_ReservedRange::has_end() const {
+  return _internal_has_end();
+}
+inline void DescriptorProto_ReservedRange::clear_end() {
+  end_ = 0;
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::_internal_end() const {
+  return end_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 DescriptorProto_ReservedRange::end() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.ReservedRange.end)
+  return _internal_end();
+}
+inline void DescriptorProto_ReservedRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000002u;
+  end_ = value;
+}
+inline void DescriptorProto_ReservedRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_end(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.ReservedRange.end)
+}
+
+// -------------------------------------------------------------------
+
+// DescriptorProto
+
+// optional string name = 1;
+inline bool DescriptorProto::_internal_has_name() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool DescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void DescriptorProto::clear_name() {
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& DescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.name)
+  return _internal_name();
+}
+inline void DescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.name)
+}
+inline std::string* DescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& DescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+inline void DescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void DescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.DescriptorProto.name)
+}
+inline void DescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.DescriptorProto.name)
+}
+inline void DescriptorProto::set_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.DescriptorProto.name)
+}
+inline std::string* DescriptorProto::_internal_mutable_name() {
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* DescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.name)
+  if (!_internal_has_name()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void DescriptorProto::set_allocated_name(std::string* name) {
+  if (name != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.name)
+}
+
+// repeated .google.protobuf.FieldDescriptorProto field = 2;
+inline int DescriptorProto::_internal_field_size() const {
+  return field_.size();
+}
+inline int DescriptorProto::field_size() const {
+  return _internal_field_size();
+}
+inline void DescriptorProto::clear_field() {
+  field_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::mutable_field(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.field)
+  return field_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+DescriptorProto::mutable_field() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.field)
+  return &field_;
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::_internal_field(int index) const {
+  return field_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::field(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.field)
+  return _internal_field(index);
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::_internal_add_field() {
+  return field_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::add_field() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.field)
+  return _internal_add_field();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+DescriptorProto::field() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.field)
+  return field_;
+}
+
+// repeated .google.protobuf.FieldDescriptorProto extension = 6; (inline accessors over the extension_ member)
+inline int DescriptorProto::_internal_extension_size() const {
+  return extension_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::extension_size() const {
+  return _internal_extension_size();
+}
+inline void DescriptorProto::clear_extension() {
+  extension_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::mutable_extension(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.extension)
+  return extension_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >*
+DescriptorProto::mutable_extension() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.extension)
+  return &extension_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::_internal_extension(int index) const {
+  return extension_.Get(index);  // const element lookup, delegated to extension_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldDescriptorProto& DescriptorProto::extension(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.extension)
+  return _internal_extension(index);
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::_internal_add_extension() {
+  return extension_.Add();  // returns the mutable element pointer produced by extension_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto* DescriptorProto::add_extension() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.extension)
+  return _internal_add_extension();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto >&
+DescriptorProto::extension() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.extension)
+  return extension_;  // read-only reference to the whole container
+}
+
+// repeated .google.protobuf.DescriptorProto nested_type = 3; (inline accessors over the nested_type_ member; elements are DescriptorProto themselves)
+inline int DescriptorProto::_internal_nested_type_size() const {
+  return nested_type_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::nested_type_size() const {
+  return _internal_nested_type_size();
+}
+inline void DescriptorProto::clear_nested_type() {
+  nested_type_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::mutable_nested_type(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.nested_type)
+  return nested_type_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >*
+DescriptorProto::mutable_nested_type() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.nested_type)
+  return &nested_type_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& DescriptorProto::_internal_nested_type(int index) const {
+  return nested_type_.Get(index);  // const element lookup, delegated to nested_type_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto& DescriptorProto::nested_type(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.nested_type)
+  return _internal_nested_type(index);
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::_internal_add_nested_type() {
+  return nested_type_.Add();  // returns the mutable element pointer produced by nested_type_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto* DescriptorProto::add_nested_type() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.nested_type)
+  return _internal_add_nested_type();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto >&
+DescriptorProto::nested_type() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.nested_type)
+  return nested_type_;  // read-only reference to the whole container
+}
+
+// repeated .google.protobuf.EnumDescriptorProto enum_type = 4; (inline accessors over the enum_type_ member)
+inline int DescriptorProto::_internal_enum_type_size() const {
+  return enum_type_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::enum_type_size() const {
+  return _internal_enum_type_size();
+}
+inline void DescriptorProto::clear_enum_type() {
+  enum_type_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::mutable_enum_type(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.enum_type)
+  return enum_type_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >*
+DescriptorProto::mutable_enum_type() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.enum_type)
+  return &enum_type_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& DescriptorProto::_internal_enum_type(int index) const {
+  return enum_type_.Get(index);  // const element lookup, delegated to enum_type_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto& DescriptorProto::enum_type(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.enum_type)
+  return _internal_enum_type(index);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::_internal_add_enum_type() {
+  return enum_type_.Add();  // returns the mutable element pointer produced by enum_type_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto* DescriptorProto::add_enum_type() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.enum_type)
+  return _internal_add_enum_type();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumDescriptorProto >&
+DescriptorProto::enum_type() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.enum_type)
+  return enum_type_;  // read-only reference to the whole container
+}
+
+// repeated .google.protobuf.DescriptorProto.ExtensionRange extension_range = 5; (inline accessors over the extension_range_ member)
+inline int DescriptorProto::_internal_extension_range_size() const {
+  return extension_range_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::extension_range_size() const {
+  return _internal_extension_range_size();
+}
+inline void DescriptorProto::clear_extension_range() {
+  extension_range_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::mutable_extension_range(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.extension_range)
+  return extension_range_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >*
+DescriptorProto::mutable_extension_range() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.extension_range)
+  return &extension_range_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& DescriptorProto::_internal_extension_range(int index) const {
+  return extension_range_.Get(index);  // const element lookup, delegated to extension_range_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange& DescriptorProto::extension_range(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.extension_range)
+  return _internal_extension_range(index);
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::_internal_add_extension_range() {
+  return extension_range_.Add();  // returns the mutable element pointer produced by extension_range_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange* DescriptorProto::add_extension_range() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.extension_range)
+  return _internal_add_extension_range();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ExtensionRange >&
+DescriptorProto::extension_range() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.extension_range)
+  return extension_range_;  // read-only reference to the whole container
+}
+
+// repeated .google.protobuf.OneofDescriptorProto oneof_decl = 8; (inline accessors over the oneof_decl_ member)
+inline int DescriptorProto::_internal_oneof_decl_size() const {
+  return oneof_decl_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::oneof_decl_size() const {
+  return _internal_oneof_decl_size();
+}
+inline void DescriptorProto::clear_oneof_decl() {
+  oneof_decl_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::mutable_oneof_decl(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.oneof_decl)
+  return oneof_decl_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >*
+DescriptorProto::mutable_oneof_decl() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.oneof_decl)
+  return &oneof_decl_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& DescriptorProto::_internal_oneof_decl(int index) const {
+  return oneof_decl_.Get(index);  // const element lookup, delegated to oneof_decl_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::OneofDescriptorProto& DescriptorProto::oneof_decl(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.oneof_decl)
+  return _internal_oneof_decl(index);
+}
+inline PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::_internal_add_oneof_decl() {
+  return oneof_decl_.Add();  // returns the mutable element pointer produced by oneof_decl_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::OneofDescriptorProto* DescriptorProto::add_oneof_decl() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.oneof_decl)
+  return _internal_add_oneof_decl();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::OneofDescriptorProto >&
+DescriptorProto::oneof_decl() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.oneof_decl)
+  return oneof_decl_;  // read-only reference to the whole container
+}
+
+// optional .google.protobuf.MessageOptions options = 7; (presence is tracked by bit 0x00000002 of _has_bits_[0]; storage is the options_ pointer)
+inline bool DescriptorProto::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;  // presence is read from the has-bit, not from the pointer
+  PROTOBUF_ASSUME(!value || options_ != nullptr);  // states the invariant "bit set => options_ non-null"; presumably an optimizer hint -- confirm in port_def.inc
+  return value;
+}
+inline bool DescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void DescriptorProto::clear_options() {
+  if (options_ != nullptr) options_->Clear();  // the sub-message is cleared in place; it is not deleted here
+  _has_bits_[0] &= ~0x00000002u;  // drop the presence bit even though options_ may stay non-null
+}
+inline const PROTOBUF_NAMESPACE_ID::MessageOptions& DescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::MessageOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::MessageOptions*>(  // null pointer falls back to the shared default instance
+      &PROTOBUF_NAMESPACE_ID::_MessageOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::MessageOptions& DescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.options)
+  return _internal_options();  // never returns a null reference: see the default-instance fallback above
+}
+inline void DescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::MessageOptions* options) {
+  if (GetArena() == nullptr) {  // the previous value is deleted only when this message has no arena
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;  // pointer is stored as-is; no arena comparison or copy is done in this variant
+  if (options) {
+    _has_bits_[0] |= 0x00000002u;  // non-null argument => field present
+  } else {
+    _has_bits_[0] &= ~0x00000002u;  // null argument => field absent
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.DescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::release_options() {
+  _has_bits_[0] &= ~0x00000002u;  // field becomes absent
+  PROTOBUF_NAMESPACE_ID::MessageOptions* temp = options_;
+  options_ = nullptr;  // this message gives up its pointer
+  if (GetArena() != nullptr) {  // on an arena the pointer is routed through DuplicateIfNonNull before being returned
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);  // presumably a heap copy the caller can own -- confirm against the helper's definition
+  }
+  return temp;  // may be nullptr when the field was never allocated
+}
+inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.DescriptorProto.options)
+  _has_bits_[0] &= ~0x00000002u;  // field becomes absent
+  PROTOBUF_NAMESPACE_ID::MessageOptions* temp = options_;
+  options_ = nullptr;
+  return temp;  // raw pointer returned without the arena duplication step used by release_options()
+}
+inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::_internal_mutable_options() {
+  _has_bits_[0] |= 0x00000002u;  // requesting a mutable pointer marks the field present
+  if (options_ == nullptr) {  // sub-message is created lazily on first mutable access
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::MessageOptions>(GetArena());  // created with this message's arena (possibly nullptr)
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::MessageOptions* DescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void DescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::MessageOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {  // the previous value is deleted only when this message has no arena
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {  // arenas differ: replace the pointer with whatever GetOwnedMessage returns
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(  // presumably reconciles ownership between the two arenas -- confirm against the helper's definition
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000002u;  // non-null argument => field present
+  } else {
+    _has_bits_[0] &= ~0x00000002u;  // null argument => field absent
+  }
+  options_ = options;  // store the (possibly replaced) pointer, or nullptr
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.DescriptorProto.options)
+}
+
+// repeated .google.protobuf.DescriptorProto.ReservedRange reserved_range = 9; (inline accessors over the reserved_range_ member)
+inline int DescriptorProto::_internal_reserved_range_size() const {
+  return reserved_range_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::reserved_range_size() const {
+  return _internal_reserved_range_size();
+}
+inline void DescriptorProto::clear_reserved_range() {
+  reserved_range_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::mutable_reserved_range(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.reserved_range)
+  return reserved_range_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >*
+DescriptorProto::mutable_reserved_range() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.reserved_range)
+  return &reserved_range_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& DescriptorProto::_internal_reserved_range(int index) const {
+  return reserved_range_.Get(index);  // const element lookup, delegated to reserved_range_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange& DescriptorProto::reserved_range(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.reserved_range)
+  return _internal_reserved_range(index);
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::_internal_add_reserved_range() {
+  return reserved_range_.Add();  // returns the mutable element pointer produced by reserved_range_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange* DescriptorProto::add_reserved_range() {
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_range)
+  return _internal_add_reserved_range();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::DescriptorProto_ReservedRange >&
+DescriptorProto::reserved_range() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.reserved_range)
+  return reserved_range_;  // read-only reference to the whole container
+}
+
+// repeated string reserved_name = 10; (inline accessors over the reserved_name_ member, a RepeatedPtrField<std::string>)
+inline int DescriptorProto::_internal_reserved_name_size() const {
+  return reserved_name_.size();  // element count as reported by the container
+}
+inline int DescriptorProto::reserved_name_size() const {
+  return _internal_reserved_name_size();
+}
+inline void DescriptorProto::clear_reserved_name() {
+  reserved_name_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline std::string* DescriptorProto::add_reserved_name() {
+  // @@protoc_insertion_point(field_add_mutable:google.protobuf.DescriptorProto.reserved_name)
+  return _internal_add_reserved_name();  // caller fills in the string through the returned pointer
+}
+inline const std::string& DescriptorProto::_internal_reserved_name(int index) const {
+  return reserved_name_.Get(index);  // const element lookup, delegated to reserved_name_.Get()
+}
+inline const std::string& DescriptorProto::reserved_name(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.DescriptorProto.reserved_name)
+  return _internal_reserved_name(index);
+}
+inline std::string* DescriptorProto::mutable_reserved_name(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.DescriptorProto.reserved_name)
+  return reserved_name_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline void DescriptorProto::set_reserved_name(int index, const std::string& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.reserved_name)
+  reserved_name_.Mutable(index)->assign(value);  // overwrites the existing element at index with a copy of value
+}
+inline void DescriptorProto::set_reserved_name(int index, std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.DescriptorProto.reserved_name)
+  reserved_name_.Mutable(index)->assign(std::move(value));  // move-assigns into the existing element
+}
+inline void DescriptorProto::set_reserved_name(int index, const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // null C strings are rejected by a DCHECK (presumably debug-only -- confirm macro definition)
+  reserved_name_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:google.protobuf.DescriptorProto.reserved_name)
+}
+inline void DescriptorProto::set_reserved_name(int index, const char* value, size_t size) {
+  reserved_name_.Mutable(index)->assign(  // unlike the const char* overload, no null DCHECK is performed here
+    reinterpret_cast<const char*>(value), size);  // the cast is a no-op: value is already const char*
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.DescriptorProto.reserved_name)
+}
+inline std::string* DescriptorProto::_internal_add_reserved_name() {
+  return reserved_name_.Add();  // returns the mutable string pointer produced by reserved_name_.Add()
+}
+inline void DescriptorProto::add_reserved_name(const std::string& value) {
+  reserved_name_.Add()->assign(value);  // add an element, then copy value into it
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_name)
+}
+inline void DescriptorProto::add_reserved_name(std::string&& value) {
+  reserved_name_.Add(std::move(value));  // rvalue overload passes the string straight to Add() instead of Add()->assign()
+  // @@protoc_insertion_point(field_add:google.protobuf.DescriptorProto.reserved_name)
+}
+inline void DescriptorProto::add_reserved_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // null C strings are rejected by a DCHECK (presumably debug-only -- confirm macro definition)
+  reserved_name_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:google.protobuf.DescriptorProto.reserved_name)
+}
+inline void DescriptorProto::add_reserved_name(const char* value, size_t size) {
+  reserved_name_.Add()->assign(reinterpret_cast<const char*>(value), size);  // explicit length, so no NUL terminator is needed; the cast is a no-op
+  // @@protoc_insertion_point(field_add_pointer:google.protobuf.DescriptorProto.reserved_name)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>&
+DescriptorProto::reserved_name() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.DescriptorProto.reserved_name)
+  return reserved_name_;  // read-only reference to the whole container
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>*
+DescriptorProto::mutable_reserved_name() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.DescriptorProto.reserved_name)
+  return &reserved_name_;  // mutable pointer to the whole container
+}
+
+// -------------------------------------------------------------------
+
+// ExtensionRangeOptions
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999; (inline accessors over the uninterpreted_option_ member)
+inline int ExtensionRangeOptions::_internal_uninterpreted_option_size() const {
+  return uninterpreted_option_.size();  // element count as reported by the container
+}
+inline int ExtensionRangeOptions::uninterpreted_option_size() const {
+  return _internal_uninterpreted_option_size();
+}
+inline void ExtensionRangeOptions::clear_uninterpreted_option() {
+  uninterpreted_option_.Clear();  // only the container is cleared; no _has_bits_ update happens here
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ExtensionRangeOptions.uninterpreted_option)
+  return uninterpreted_option_.Mutable(index);  // index is passed through without validation in this wrapper
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+ExtensionRangeOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.ExtensionRangeOptions.uninterpreted_option)
+  return &uninterpreted_option_;  // mutable pointer to the whole container
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ExtensionRangeOptions::_internal_uninterpreted_option(int index) const {
+  return uninterpreted_option_.Get(index);  // const element lookup, delegated to uninterpreted_option_.Get()
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ExtensionRangeOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ExtensionRangeOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::_internal_add_uninterpreted_option() {
+  return uninterpreted_option_.Add();  // returns the mutable element pointer produced by uninterpreted_option_.Add()
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ExtensionRangeOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.ExtensionRangeOptions.uninterpreted_option)
+  return _internal_add_uninterpreted_option();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+ExtensionRangeOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.ExtensionRangeOptions.uninterpreted_option)
+  return uninterpreted_option_;  // read-only reference to the whole container
+}
+
+// -------------------------------------------------------------------
+
+// FieldDescriptorProto
+
+// optional string name = 1; (presence is tracked by bit 0x00000001 of _has_bits_[0]; storage is the name_ member)
+inline bool FieldDescriptorProto::_internal_has_name() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;  // presence comes from the has-bit, not from the string contents
+  return value;
+}
+inline bool FieldDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void FieldDescriptorProto::clear_name() {
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // the shared empty string is passed as the default value
+  _has_bits_[0] &= ~0x00000001u;  // mark the field absent
+}
+inline const std::string& FieldDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.name)
+  return _internal_name();  // no has-bit check here: the stored value is returned whether or not the field is set
+}
+inline void FieldDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.name)
+}
+inline std::string* FieldDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& FieldDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+inline void FieldDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;  // mark the field present before storing
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FieldDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;  // rvalue overload sets the bit itself instead of going through _internal_set_name
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.name)
+}
+inline void FieldDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // null C strings are rejected by a DCHECK (presumably debug-only -- confirm macro definition)
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),  // a temporary std::string is built from the C string
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.name)
+}
+inline void FieldDescriptorProto::set_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;  // unlike the const char* overload, no null DCHECK is performed here
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // the cast is a no-op: value is already const char*
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.name)
+}
+inline std::string* FieldDescriptorProto::_internal_mutable_name() {
+  _has_bits_[0] |= 0x00000001u;  // requesting a mutable pointer marks the field present
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FieldDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.name)
+  if (!_internal_has_name()) {
+    return nullptr;  // nothing to release when the field is not set
+  }
+  _has_bits_[0] &= ~0x00000001u;  // field becomes absent once released
+  return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FieldDescriptorProto::set_allocated_name(std::string* name) {
+  if (name != nullptr) {
+    _has_bits_[0] |= 0x00000001u;  // non-null argument => field present
+  } else {
+    _has_bits_[0] &= ~0x00000001u;  // null argument => field absent
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,  // called for both null and non-null arguments
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.name)
+}
+
+// optional int32 number = 3; (presence is tracked by bit 0x00000040 of _has_bits_[0]; storage is the number_ member)
+inline bool FieldDescriptorProto::_internal_has_number() const {
+  bool value = (_has_bits_[0] & 0x00000040u) != 0;  // presence comes from the has-bit, so an explicit 0 still counts as set
+  return value;
+}
+inline bool FieldDescriptorProto::has_number() const {
+  return _internal_has_number();
+}
+inline void FieldDescriptorProto::clear_number() {
+  number_ = 0;  // reset the stored value to 0
+  _has_bits_[0] &= ~0x00000040u;  // mark the field absent
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::_internal_number() const {
+  return number_;  // no has-bit check: returns the stored value whether or not the field is set
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::number() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.number)
+  return _internal_number();
+}
+inline void FieldDescriptorProto::_internal_set_number(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000040u;  // mark the field present
+  number_ = value;  // stored as given; no range validation is done here
+}
+inline void FieldDescriptorProto::set_number(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_number(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.number)
+}
+
+// optional .google.protobuf.FieldDescriptorProto.Label label = 4; (presence is tracked by bit 0x00000200 of _has_bits_[0]; storage is the label_ member)
+inline bool FieldDescriptorProto::_internal_has_label() const {
+  bool value = (_has_bits_[0] & 0x00000200u) != 0;  // presence comes from the has-bit, not from the stored value
+  return value;
+}
+inline bool FieldDescriptorProto::has_label() const {
+  return _internal_has_label();
+}
+inline void FieldDescriptorProto::clear_label() {
+  label_ = 1;  // reset value is 1, not 0; presumably the enum's first declared value (LABEL_OPTIONAL) -- confirm against descriptor.proto
+  _has_bits_[0] &= ~0x00000200u;  // mark the field absent
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label FieldDescriptorProto::_internal_label() const {
+  return static_cast< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label >(label_);  // label_ is cast back to the enum type on read
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label FieldDescriptorProto::label() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.label)
+  return _internal_label();
+}
+inline void FieldDescriptorProto::_internal_set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value) {
+  assert(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label_IsValid(value));  // validity is checked only via assert(), which is compiled out under NDEBUG
+  _has_bits_[0] |= 0x00000200u;  // mark the field present
+  label_ = value;
+}
+inline void FieldDescriptorProto::set_label(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label value) {
+  _internal_set_label(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.label)
+}
+
+// optional .google.protobuf.FieldDescriptorProto.Type type = 5; (presence is tracked by bit 0x00000400 of _has_bits_[0]; storage is the type_ member)
+inline bool FieldDescriptorProto::_internal_has_type() const {
+  bool value = (_has_bits_[0] & 0x00000400u) != 0;  // presence comes from the has-bit, not from the stored value
+  return value;
+}
+inline bool FieldDescriptorProto::has_type() const {
+  return _internal_has_type();
+}
+inline void FieldDescriptorProto::clear_type() {
+  type_ = 1;  // reset value is 1, not 0; presumably the enum's first declared value (TYPE_DOUBLE) -- confirm against descriptor.proto
+  _has_bits_[0] &= ~0x00000400u;  // mark the field absent
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type FieldDescriptorProto::_internal_type() const {
+  return static_cast< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type >(type_);  // type_ is cast back to the enum type on read
+}
+inline PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type FieldDescriptorProto::type() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.type)
+  return _internal_type();
+}
+inline void FieldDescriptorProto::_internal_set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value) {
+  assert(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type_IsValid(value));  // validity is checked only via assert(), which is compiled out under NDEBUG
+  _has_bits_[0] |= 0x00000400u;  // mark the field present
+  type_ = value;
+}
+inline void FieldDescriptorProto::set_type(PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type value) {
+  _internal_set_type(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.type)
+}
+
+// optional string type_name = 6; (presence is tracked by bit 0x00000004 of _has_bits_[0]; storage is the type_name_ member)
+inline bool FieldDescriptorProto::_internal_has_type_name() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;  // presence comes from the has-bit, not from the string contents
+  return value;
+}
+inline bool FieldDescriptorProto::has_type_name() const {
+  return _internal_has_type_name();
+}
+inline void FieldDescriptorProto::clear_type_name() {
+  type_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // the shared empty string is passed as the default value
+  _has_bits_[0] &= ~0x00000004u;  // mark the field absent
+}
+inline const std::string& FieldDescriptorProto::type_name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.type_name)
+  return _internal_type_name();  // no has-bit check here: the stored value is returned whether or not the field is set
+}
+inline void FieldDescriptorProto::set_type_name(const std::string& value) {
+  _internal_set_type_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.type_name)
+}
+inline std::string* FieldDescriptorProto::mutable_type_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.type_name)
+  return _internal_mutable_type_name();
+}
+inline const std::string& FieldDescriptorProto::_internal_type_name() const {
+  return type_name_.Get();
+}
+inline void FieldDescriptorProto::_internal_set_type_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000004u;  // mark the field present before storing
+  type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FieldDescriptorProto::set_type_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000004u;  // rvalue overload sets the bit itself instead of going through _internal_set_type_name
+  type_name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.type_name)
+}
+inline void FieldDescriptorProto::set_type_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // null C strings are rejected by a DCHECK (presumably debug-only -- confirm macro definition)
+  _has_bits_[0] |= 0x00000004u;
+  type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),  // a temporary std::string is built from the C string
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.type_name)
+}
+inline void FieldDescriptorProto::set_type_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000004u;  // unlike the const char* overload, no null DCHECK is performed here
+  type_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // the cast is a no-op: value is already const char*
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.type_name)
+}
+inline std::string* FieldDescriptorProto::_internal_mutable_type_name() {
+  _has_bits_[0] |= 0x00000004u;  // requesting a mutable pointer marks the field present
+  return type_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FieldDescriptorProto::release_type_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.type_name)
+  if (!_internal_has_type_name()) {
+    return nullptr;  // nothing to release when the field is not set
+  }
+  _has_bits_[0] &= ~0x00000004u;  // field becomes absent once released
+  return type_name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FieldDescriptorProto::set_allocated_type_name(std::string* type_name) {
+  if (type_name != nullptr) {
+    _has_bits_[0] |= 0x00000004u;  // non-null argument => field present
+  } else {
+    _has_bits_[0] &= ~0x00000004u;  // null argument => field absent
+  }
+  type_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), type_name,  // called for both null and non-null arguments
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.type_name)
+}
+
+// optional string extendee = 2; (presence is tracked by bit 0x00000002 of _has_bits_[0]; storage is the extendee_ member)
+inline bool FieldDescriptorProto::_internal_has_extendee() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;  // presence comes from the has-bit, not from the string contents
+  return value;
+}
+inline bool FieldDescriptorProto::has_extendee() const {
+  return _internal_has_extendee();
+}
+inline void FieldDescriptorProto::clear_extendee() {
+  extendee_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // the shared empty string is passed as the default value
+  _has_bits_[0] &= ~0x00000002u;  // mark the field absent
+}
+inline const std::string& FieldDescriptorProto::extendee() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.extendee)
+  return _internal_extendee();  // no has-bit check here: the stored value is returned whether or not the field is set
+}
+inline void FieldDescriptorProto::set_extendee(const std::string& value) {
+  _internal_set_extendee(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.extendee)
+}
+inline std::string* FieldDescriptorProto::mutable_extendee() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.extendee)
+  return _internal_mutable_extendee();
+}
+inline const std::string& FieldDescriptorProto::_internal_extendee() const {
+  return extendee_.Get();
+}
+inline void FieldDescriptorProto::_internal_set_extendee(const std::string& value) {
+  _has_bits_[0] |= 0x00000002u;  // mark the field present before storing
+  extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FieldDescriptorProto::set_extendee(std::string&& value) {
+  _has_bits_[0] |= 0x00000002u;  // rvalue overload sets the bit itself instead of going through _internal_set_extendee
+  extendee_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.extendee)
+}
+inline void FieldDescriptorProto::set_extendee(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);  // null C strings are rejected by a DCHECK (presumably debug-only -- confirm macro definition)
+  _has_bits_[0] |= 0x00000002u;
+  extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),  // a temporary std::string is built from the C string
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.extendee)
+}
+inline void FieldDescriptorProto::set_extendee(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000002u;  // unlike the const char* overload, no null DCHECK is performed here
+  extendee_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // the cast is a no-op: value is already const char*
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.extendee)
+}
+inline std::string* FieldDescriptorProto::_internal_mutable_extendee() {
+  _has_bits_[0] |= 0x00000002u;  // requesting a mutable pointer marks the field present
+  return extendee_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FieldDescriptorProto::release_extendee() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.extendee)
+  if (!_internal_has_extendee()) {
+    return nullptr;  // nothing to release when the field is not set
+  }
+  _has_bits_[0] &= ~0x00000002u;  // field becomes absent once released
+  return extendee_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FieldDescriptorProto::set_allocated_extendee(std::string* extendee) {
+  if (extendee != nullptr) {
+    _has_bits_[0] |= 0x00000002u;  // non-null argument => field present
+  } else {
+    _has_bits_[0] &= ~0x00000002u;  // null argument => field absent
+  }
+  extendee_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), extendee,  // called for both null and non-null arguments
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.extendee)
+}
+
+// optional string default_value = 7;
+inline bool FieldDescriptorProto::_internal_has_default_value() const {
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  return value;
+}
+inline bool FieldDescriptorProto::has_default_value() const {
+  return _internal_has_default_value();
+}
+inline void FieldDescriptorProto::clear_default_value() {
+  default_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline const std::string& FieldDescriptorProto::default_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.default_value)
+  return _internal_default_value();
+}
+inline void FieldDescriptorProto::set_default_value(const std::string& value) {
+  _internal_set_default_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.default_value)
+}
+inline std::string* FieldDescriptorProto::mutable_default_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.default_value)
+  return _internal_mutable_default_value();
+}
+inline const std::string& FieldDescriptorProto::_internal_default_value() const {
+  return default_value_.Get();
+}
+inline void FieldDescriptorProto::_internal_set_default_value(const std::string& value) {
+  _has_bits_[0] |= 0x00000008u;
+  default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FieldDescriptorProto::set_default_value(std::string&& value) {
+  _has_bits_[0] |= 0x00000008u;
+  default_value_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.default_value)
+}
+inline void FieldDescriptorProto::set_default_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000008u;
+  default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.default_value)
+}
+inline void FieldDescriptorProto::set_default_value(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000008u;
+  default_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.default_value)
+}
+inline std::string* FieldDescriptorProto::_internal_mutable_default_value() {
+  _has_bits_[0] |= 0x00000008u;
+  return default_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FieldDescriptorProto::release_default_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.default_value)
+  if (!_internal_has_default_value()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000008u;
+  return default_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FieldDescriptorProto::set_allocated_default_value(std::string* default_value) {
+  if (default_value != nullptr) {
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  default_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), default_value,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.default_value)
+}
+
+// optional int32 oneof_index = 9;
+inline bool FieldDescriptorProto::_internal_has_oneof_index() const {
+  bool value = (_has_bits_[0] & 0x00000080u) != 0;
+  return value;
+}
+inline bool FieldDescriptorProto::has_oneof_index() const {
+  return _internal_has_oneof_index();
+}
+inline void FieldDescriptorProto::clear_oneof_index() {
+  oneof_index_ = 0;
+  _has_bits_[0] &= ~0x00000080u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::_internal_oneof_index() const {
+  return oneof_index_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 FieldDescriptorProto::oneof_index() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.oneof_index)
+  return _internal_oneof_index();
+}
+inline void FieldDescriptorProto::_internal_set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000080u;
+  oneof_index_ = value;
+}
+inline void FieldDescriptorProto::set_oneof_index(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_oneof_index(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.oneof_index)
+}
+
+// optional string json_name = 10;
+inline bool FieldDescriptorProto::_internal_has_json_name() const {
+  bool value = (_has_bits_[0] & 0x00000010u) != 0;
+  return value;
+}
+inline bool FieldDescriptorProto::has_json_name() const {
+  return _internal_has_json_name();
+}
+inline void FieldDescriptorProto::clear_json_name() {
+  json_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline const std::string& FieldDescriptorProto::json_name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.json_name)
+  return _internal_json_name();
+}
+inline void FieldDescriptorProto::set_json_name(const std::string& value) {
+  _internal_set_json_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.json_name)
+}
+inline std::string* FieldDescriptorProto::mutable_json_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.json_name)
+  return _internal_mutable_json_name();
+}
+inline const std::string& FieldDescriptorProto::_internal_json_name() const {
+  return json_name_.Get();
+}
+inline void FieldDescriptorProto::_internal_set_json_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000010u;
+  json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FieldDescriptorProto::set_json_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000010u;
+  json_name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FieldDescriptorProto.json_name)
+}
+inline void FieldDescriptorProto::set_json_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000010u;
+  json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldDescriptorProto.json_name)
+}
+inline void FieldDescriptorProto::set_json_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000010u;
+  json_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldDescriptorProto.json_name)
+}
+inline std::string* FieldDescriptorProto::_internal_mutable_json_name() {
+  _has_bits_[0] |= 0x00000010u;
+  return json_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FieldDescriptorProto::release_json_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.json_name)
+  if (!_internal_has_json_name()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000010u;
+  return json_name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FieldDescriptorProto::set_allocated_json_name(std::string* json_name) {
+  if (json_name != nullptr) {
+    _has_bits_[0] |= 0x00000010u;
+  } else {
+    _has_bits_[0] &= ~0x00000010u;
+  }
+  json_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), json_name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.json_name)
+}
+
+// optional .google.protobuf.FieldOptions options = 8;
+inline bool FieldDescriptorProto::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000020u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);
+  return value;
+}
+inline bool FieldDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void FieldDescriptorProto::clear_options() {
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000020u;
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldOptions& FieldDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::FieldOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::FieldOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_FieldOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::FieldOptions& FieldDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.options)
+  return _internal_options();
+}
+inline void FieldDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::FieldOptions* options) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {
+    _has_bits_[0] |= 0x00000020u;
+  } else {
+    _has_bits_[0] &= ~0x00000020u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.FieldDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::release_options() {
+  _has_bits_[0] &= ~0x00000020u;
+  PROTOBUF_NAMESPACE_ID::FieldOptions* temp = options_;
+  options_ = nullptr;
+  if (GetArena() != nullptr) {
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FieldDescriptorProto.options)
+  _has_bits_[0] &= ~0x00000020u;
+  PROTOBUF_NAMESPACE_ID::FieldOptions* temp = options_;
+  options_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::_internal_mutable_options() {
+  _has_bits_[0] |= 0x00000020u;
+  if (options_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FieldOptions>(GetArena());
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions* FieldDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void FieldDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::FieldOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000020u;
+  } else {
+    _has_bits_[0] &= ~0x00000020u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FieldDescriptorProto.options)
+}
+
+// optional bool proto3_optional = 17;
+inline bool FieldDescriptorProto::_internal_has_proto3_optional() const {
+  bool value = (_has_bits_[0] & 0x00000100u) != 0;
+  return value;
+}
+inline bool FieldDescriptorProto::has_proto3_optional() const {
+  return _internal_has_proto3_optional();
+}
+inline void FieldDescriptorProto::clear_proto3_optional() {
+  proto3_optional_ = false;
+  _has_bits_[0] &= ~0x00000100u;
+}
+inline bool FieldDescriptorProto::_internal_proto3_optional() const {
+  return proto3_optional_;
+}
+inline bool FieldDescriptorProto::proto3_optional() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldDescriptorProto.proto3_optional)
+  return _internal_proto3_optional();
+}
+inline void FieldDescriptorProto::_internal_set_proto3_optional(bool value) {
+  _has_bits_[0] |= 0x00000100u;
+  proto3_optional_ = value;
+}
+inline void FieldDescriptorProto::set_proto3_optional(bool value) {
+  _internal_set_proto3_optional(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldDescriptorProto.proto3_optional)
+}
+
+// -------------------------------------------------------------------
+
+// OneofDescriptorProto
+
+// optional string name = 1;
+inline bool OneofDescriptorProto::_internal_has_name() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool OneofDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void OneofDescriptorProto::clear_name() {
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& OneofDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.OneofDescriptorProto.name)
+  return _internal_name();
+}
+inline void OneofDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.OneofDescriptorProto.name)
+}
+inline std::string* OneofDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.OneofDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& OneofDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+inline void OneofDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void OneofDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.OneofDescriptorProto.name)
+}
+inline void OneofDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.OneofDescriptorProto.name)
+}
+inline void OneofDescriptorProto::set_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.OneofDescriptorProto.name)
+}
+inline std::string* OneofDescriptorProto::_internal_mutable_name() {
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* OneofDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.OneofDescriptorProto.name)
+  if (!_internal_has_name()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void OneofDescriptorProto::set_allocated_name(std::string* name) {
+  if (name != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.OneofDescriptorProto.name)
+}
+
+// optional .google.protobuf.OneofOptions options = 2;
+inline bool OneofDescriptorProto::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);
+  return value;
+}
+inline bool OneofDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void OneofDescriptorProto::clear_options() {
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const PROTOBUF_NAMESPACE_ID::OneofOptions& OneofDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::OneofOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::OneofOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_OneofOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::OneofOptions& OneofDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.OneofDescriptorProto.options)
+  return _internal_options();
+}
+inline void OneofDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::OneofOptions* options) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.OneofDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::release_options() {
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::OneofOptions* temp = options_;
+  options_ = nullptr;
+  if (GetArena() != nullptr) {
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.OneofDescriptorProto.options)
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::OneofOptions* temp = options_;
+  options_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::_internal_mutable_options() {
+  _has_bits_[0] |= 0x00000002u;
+  if (options_ == nullptr) {
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::OneofOptions>(GetArena());
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::OneofOptions* OneofDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.OneofDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void OneofDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::OneofOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.OneofDescriptorProto.options)
+}
+
+// -------------------------------------------------------------------
+
+// EnumDescriptorProto_EnumReservedRange
+
+// optional int32 start = 1;
+inline bool EnumDescriptorProto_EnumReservedRange::_internal_has_start() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool EnumDescriptorProto_EnumReservedRange::has_start() const {
+  return _internal_has_start();
+}
+inline void EnumDescriptorProto_EnumReservedRange::clear_start() {
+  start_ = 0;
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::_internal_start() const {
+  return start_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::start() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.EnumReservedRange.start)
+  return _internal_start();
+}
+inline void EnumDescriptorProto_EnumReservedRange::_internal_set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000001u;
+  start_ = value;
+}
+inline void EnumDescriptorProto_EnumReservedRange::set_start(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_start(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.EnumReservedRange.start)
+}
+
+// optional int32 end = 2;
+inline bool EnumDescriptorProto_EnumReservedRange::_internal_has_end() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool EnumDescriptorProto_EnumReservedRange::has_end() const {
+  return _internal_has_end();
+}
+inline void EnumDescriptorProto_EnumReservedRange::clear_end() {
+  end_ = 0;
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::_internal_end() const {
+  return end_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumDescriptorProto_EnumReservedRange::end() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.EnumReservedRange.end)
+  return _internal_end();
+}
+inline void EnumDescriptorProto_EnumReservedRange::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000002u;
+  end_ = value;
+}
+inline void EnumDescriptorProto_EnumReservedRange::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_end(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.EnumReservedRange.end)
+}
+
+// -------------------------------------------------------------------
+
+// EnumDescriptorProto
+
+// optional string name = 1;
+inline bool EnumDescriptorProto::_internal_has_name() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool EnumDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void EnumDescriptorProto::clear_name() {
+  name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& EnumDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.name)
+  return _internal_name();
+}
+inline void EnumDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.name)
+}
+inline std::string* EnumDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& EnumDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+inline void EnumDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void EnumDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.EnumDescriptorProto.name)
+}
+inline void EnumDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.EnumDescriptorProto.name)
+}
+inline void EnumDescriptorProto::set_name(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumDescriptorProto.name)
+}
+inline std::string* EnumDescriptorProto::_internal_mutable_name() {
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* EnumDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.EnumDescriptorProto.name)
+  if (!_internal_has_name()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return name_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void EnumDescriptorProto::set_allocated_name(std::string* name) {
+  if (name != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumDescriptorProto.name)
+}
+
+// repeated .google.protobuf.EnumValueDescriptorProto value = 2;
+inline int EnumDescriptorProto::_internal_value_size() const {
+  return value_.size();
+}
+inline int EnumDescriptorProto::value_size() const {
+  return _internal_value_size();
+}
+inline void EnumDescriptorProto::clear_value() {
+  value_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::mutable_value(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.value)
+  return value_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >*
+EnumDescriptorProto::mutable_value() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.value)
+  return &value_;
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& EnumDescriptorProto::_internal_value(int index) const {
+  return value_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto& EnumDescriptorProto::value(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.value)
+  return _internal_value(index);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::_internal_add_value() {
+  return value_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto* EnumDescriptorProto::add_value() {
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.value)
+  return _internal_add_value();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::EnumValueDescriptorProto >&
+EnumDescriptorProto::value() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.value)
+  return value_;
+}
+
+// optional .google.protobuf.EnumOptions options = 3;
+inline bool EnumDescriptorProto::_internal_has_options() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);
+  return value;
+}
+inline bool EnumDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void EnumDescriptorProto::clear_options() {
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumOptions& EnumDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::EnumOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::EnumOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_EnumOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumOptions& EnumDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.options)
+  return _internal_options();
+}
+inline void EnumDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::EnumOptions* options) {
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.EnumDescriptorProto.options)
+}
+// Ownership-transferring accessors for the optional `options` sub-message.
+// Presence is tracked by bit 0x00000002 of _has_bits_[0]; storage is options_.
+inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::release_options() {
+  // Detach the sub-message: drop the presence bit and null the member before
+  // handing the pointer out.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::EnumOptions* released = options_;
+  options_ = nullptr;
+  if (GetArena() == nullptr) {
+    return released;
+  }
+  // Arena-backed parent: the pointer is routed through DuplicateIfNonNull
+  // rather than being returned directly.
+  return ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(released);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.EnumDescriptorProto.options)
+  // Same detach as release_options(), but the raw pointer is returned as-is
+  // regardless of the arena.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::EnumOptions* released = options_;
+  options_ = nullptr;
+  return released;
+}
+inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::_internal_mutable_options() {
+  // Mark the field present and allocate the sub-message on first use.
+  _has_bits_[0] |= 0x00000002u;
+  if (options_ == nullptr) {
+    options_ = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumOptions>(GetArena());
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::EnumOptions* EnumDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void EnumDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* const owner_arena = GetArena();
+  // Without an arena this message owns the previous sub-message on the heap.
+  if (owner_arena == nullptr) {
+    delete options_;
+  }
+  if (options == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    ::PROTOBUF_NAMESPACE_ID::Arena* const incoming_arena =
+        ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    // When the arenas differ, GetOwnedMessage supplies the pointer that is
+    // actually stored.
+    if (owner_arena != incoming_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          owner_arena, options, incoming_arena);
+    }
+    _has_bits_[0] |= 0x00000002u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumDescriptorProto.options)
+}
+
+// repeated .google.protobuf.EnumDescriptorProto.EnumReservedRange reserved_range = 4;
+// Every accessor below forwards to the reserved_range_ RepeatedPtrField.
+inline int EnumDescriptorProto::_internal_reserved_range_size() const {
+  return reserved_range_.size();
+}
+inline int EnumDescriptorProto::reserved_range_size() const {
+  return reserved_range_.size();
+}
+inline void EnumDescriptorProto::clear_reserved_range() {
+  reserved_range_.Clear();
+}
+// Mutable access to the element at `index`.
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange*
+EnumDescriptorProto::mutable_reserved_range(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.reserved_range)
+  return reserved_range_.Mutable(index);
+}
+// Mutable access to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<
+    PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >*
+EnumDescriptorProto::mutable_reserved_range() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.reserved_range)
+  return &reserved_range_;
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange&
+EnumDescriptorProto::_internal_reserved_range(int index) const {
+  return reserved_range_.Get(index);
+}
+// Read-only access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange&
+EnumDescriptorProto::reserved_range(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.reserved_range)
+  return reserved_range_.Get(index);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange*
+EnumDescriptorProto::_internal_add_reserved_range() {
+  return reserved_range_.Add();
+}
+// Appends a new element and returns a pointer to it.
+inline PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange*
+EnumDescriptorProto::add_reserved_range() {
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_range)
+  return reserved_range_.Add();
+}
+// Read-only access to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<
+    PROTOBUF_NAMESPACE_ID::EnumDescriptorProto_EnumReservedRange >&
+EnumDescriptorProto::reserved_range() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.reserved_range)
+  return reserved_range_;
+}
+
+// repeated string reserved_name = 5;
+// Elements are stored in the reserved_name_ RepeatedPtrField<std::string>.
+inline int EnumDescriptorProto::_internal_reserved_name_size() const {
+  return reserved_name_.size();
+}
+inline int EnumDescriptorProto::reserved_name_size() const {
+  return reserved_name_.size();
+}
+inline void EnumDescriptorProto::clear_reserved_name() {
+  reserved_name_.Clear();
+}
+// Appends an element and returns it for in-place editing.
+inline std::string* EnumDescriptorProto::add_reserved_name() {
+  // @@protoc_insertion_point(field_add_mutable:google.protobuf.EnumDescriptorProto.reserved_name)
+  return _internal_add_reserved_name();
+}
+inline const std::string& EnumDescriptorProto::_internal_reserved_name(int index) const {
+  return reserved_name_.Get(index);
+}
+inline const std::string& EnumDescriptorProto::reserved_name(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumDescriptorProto.reserved_name)
+  return reserved_name_.Get(index);
+}
+inline std::string* EnumDescriptorProto::mutable_reserved_name(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumDescriptorProto.reserved_name)
+  return reserved_name_.Mutable(index);
+}
+// Overwrites the element at `index` with a copy of `value`.
+inline void EnumDescriptorProto::set_reserved_name(int index, const std::string& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.reserved_name)
+  std::string* slot = reserved_name_.Mutable(index);
+  slot->assign(value);
+}
+// Overwrites the element at `index` by moving from `value`.
+inline void EnumDescriptorProto::set_reserved_name(int index, std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumDescriptorProto.reserved_name)
+  std::string* slot = reserved_name_.Mutable(index);
+  slot->assign(std::move(value));
+}
+// Overwrites the element at `index` from a C string; GOOGLE_DCHECK rejects null.
+inline void EnumDescriptorProto::set_reserved_name(int index, const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  std::string* slot = reserved_name_.Mutable(index);
+  slot->assign(value);
+  // @@protoc_insertion_point(field_set_char:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+// Overwrites the element at `index` with `size` bytes starting at `value`.
+inline void EnumDescriptorProto::set_reserved_name(int index, const char* value, size_t size) {
+  std::string* slot = reserved_name_.Mutable(index);
+  slot->assign(value, size);
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+inline std::string* EnumDescriptorProto::_internal_add_reserved_name() {
+  return reserved_name_.Add();
+}
+// Appends a copy of `value`.
+inline void EnumDescriptorProto::add_reserved_name(const std::string& value) {
+  std::string* appended = reserved_name_.Add();
+  appended->assign(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+// Appends `value` by move.
+inline void EnumDescriptorProto::add_reserved_name(std::string&& value) {
+  reserved_name_.Add(std::move(value));
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+// Appends a copy of a C string; GOOGLE_DCHECK rejects null.
+inline void EnumDescriptorProto::add_reserved_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  std::string* appended = reserved_name_.Add();
+  appended->assign(value);
+  // @@protoc_insertion_point(field_add_char:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+// Appends `size` bytes starting at `value`.
+inline void EnumDescriptorProto::add_reserved_name(const char* value, size_t size) {
+  std::string* appended = reserved_name_.Add();
+  appended->assign(value, size);
+  // @@protoc_insertion_point(field_add_pointer:google.protobuf.EnumDescriptorProto.reserved_name)
+}
+// Read-only access to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>&
+EnumDescriptorProto::reserved_name() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.EnumDescriptorProto.reserved_name)
+  return reserved_name_;
+}
+// Mutable access to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>*
+EnumDescriptorProto::mutable_reserved_name() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumDescriptorProto.reserved_name)
+  return &reserved_name_;
+}
+
+// -------------------------------------------------------------------
+
+// EnumValueDescriptorProto
+
+// optional string name = 1;
+// Presence is tracked by bit 0x00000001 of _has_bits_[0]. The value lives in
+// name_, whose operations all take the shared empty-string default and the
+// owning arena.
+inline bool EnumValueDescriptorProto::_internal_has_name() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+inline bool EnumValueDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void EnumValueDescriptorProto::clear_name() {
+  // Empty the stored string, then mark the field as unset.
+  name_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& EnumValueDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.name)
+  return _internal_name();
+}
+inline void EnumValueDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumValueDescriptorProto.name)
+}
+inline std::string* EnumValueDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& EnumValueDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+// Copying setter: flags presence, then stores `value`.
+inline void EnumValueDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value, GetArena());
+}
+// Moving setter.
+inline void EnumValueDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.EnumValueDescriptorProto.name)
+}
+// C-string setter; GOOGLE_DCHECK rejects a null pointer.
+inline void EnumValueDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.EnumValueDescriptorProto.name)
+}
+// Length-delimited setter: stores `size` bytes starting at `value`.
+inline void EnumValueDescriptorProto::set_name(const char* value, size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.EnumValueDescriptorProto.name)
+}
+inline std::string* EnumValueDescriptorProto::_internal_mutable_name() {
+  // Handing out a mutable pointer counts as setting the field.
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+inline std::string* EnumValueDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.EnumValueDescriptorProto.name)
+  if (_internal_has_name()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return name_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  // Nothing to release when the field is unset.
+  return nullptr;
+}
+inline void EnumValueDescriptorProto::set_allocated_name(std::string* name) {
+  // A null pointer clears presence; any other pointer marks the field as set.
+  if (name == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  name_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumValueDescriptorProto.name)
+}
+
+// optional int32 number = 2;
+// Presence is tracked by bit 0x00000004 of _has_bits_[0]; storage is number_.
+inline bool EnumValueDescriptorProto::_internal_has_number() const {
+  return (_has_bits_[0] & 0x00000004u) != 0;
+}
+inline bool EnumValueDescriptorProto::has_number() const {
+  return _internal_has_number();
+}
+inline void EnumValueDescriptorProto::clear_number() {
+  // Reset the value to zero and mark the field as unset.
+  number_ = 0;
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumValueDescriptorProto::_internal_number() const {
+  return number_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 EnumValueDescriptorProto::number() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.number)
+  return _internal_number();
+}
+inline void EnumValueDescriptorProto::_internal_set_number(
+    ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  // Flag presence, then store the new value.
+  _has_bits_[0] |= 0x00000004u;
+  number_ = value;
+}
+inline void EnumValueDescriptorProto::set_number(
+    ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_number(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumValueDescriptorProto.number)
+}
+
+// optional .google.protobuf.EnumValueOptions options = 3;
+// Presence is tracked by bit 0x00000002 of _has_bits_[0]; storage is options_.
+inline bool EnumValueDescriptorProto::_internal_has_options() const {
+  const bool present = (_has_bits_[0] & 0x00000002u) != 0;
+  // Invariant: a set presence bit implies a non-null sub-message pointer.
+  PROTOBUF_ASSUME(!present || options_ != nullptr);
+  return present;
+}
+inline bool EnumValueDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void EnumValueDescriptorProto::clear_options() {
+  // The sub-message object is kept and only cleared; presence is then dropped.
+  if (options_ != nullptr) {
+    options_->Clear();
+  }
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumValueOptions& EnumValueDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::EnumValueOptions* current = options_;
+  if (current != nullptr) {
+    return *current;
+  }
+  // No sub-message allocated: fall back to the default instance.
+  return *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::EnumValueOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_EnumValueOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::EnumValueOptions& EnumValueDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumValueDescriptorProto.options)
+  return _internal_options();
+}
+inline void EnumValueDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::EnumValueOptions* options) {
+  // Without an arena the previous sub-message is heap-owned and deleted here.
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    _has_bits_[0] |= 0x00000002u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.EnumValueDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::release_options() {
+  // Detach the sub-message: drop the presence bit and null the member.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* released = options_;
+  options_ = nullptr;
+  if (GetArena() == nullptr) {
+    return released;
+  }
+  // Arena-backed parent: the pointer is routed through DuplicateIfNonNull
+  // rather than being returned directly.
+  return ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(released);
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.EnumValueDescriptorProto.options)
+  // Same detach as release_options(), but the raw pointer is returned as-is.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::EnumValueOptions* released = options_;
+  options_ = nullptr;
+  return released;
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::_internal_mutable_options() {
+  // Mark the field present and allocate the sub-message on first use.
+  _has_bits_[0] |= 0x00000002u;
+  if (options_ == nullptr) {
+    options_ = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::EnumValueOptions>(GetArena());
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::EnumValueOptions* EnumValueDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void EnumValueDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::EnumValueOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* const owner_arena = GetArena();
+  // Without an arena this message owns the previous sub-message on the heap.
+  if (owner_arena == nullptr) {
+    delete options_;
+  }
+  if (options == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    ::PROTOBUF_NAMESPACE_ID::Arena* const incoming_arena =
+        ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    // When the arenas differ, GetOwnedMessage supplies the pointer that is
+    // actually stored.
+    if (owner_arena != incoming_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          owner_arena, options, incoming_arena);
+    }
+    _has_bits_[0] |= 0x00000002u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.EnumValueDescriptorProto.options)
+}
+
+// -------------------------------------------------------------------
+
+// ServiceDescriptorProto
+
+// optional string name = 1;
+// Presence is tracked by bit 0x00000001 of _has_bits_[0]. The value lives in
+// name_, whose operations all take the shared empty-string default and the
+// owning arena.
+inline bool ServiceDescriptorProto::_internal_has_name() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+inline bool ServiceDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void ServiceDescriptorProto::clear_name() {
+  // Empty the stored string, then mark the field as unset.
+  name_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& ServiceDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.name)
+  return _internal_name();
+}
+inline void ServiceDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.ServiceDescriptorProto.name)
+}
+inline std::string* ServiceDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& ServiceDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+// Copying setter: flags presence, then stores `value`.
+inline void ServiceDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value, GetArena());
+}
+// Moving setter.
+inline void ServiceDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.ServiceDescriptorProto.name)
+}
+// C-string setter; GOOGLE_DCHECK rejects a null pointer.
+inline void ServiceDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.ServiceDescriptorProto.name)
+}
+// Length-delimited setter: stores `size` bytes starting at `value`.
+inline void ServiceDescriptorProto::set_name(const char* value, size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.ServiceDescriptorProto.name)
+}
+inline std::string* ServiceDescriptorProto::_internal_mutable_name() {
+  // Handing out a mutable pointer counts as setting the field.
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+inline std::string* ServiceDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.ServiceDescriptorProto.name)
+  if (_internal_has_name()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return name_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  // Nothing to release when the field is unset.
+  return nullptr;
+}
+inline void ServiceDescriptorProto::set_allocated_name(std::string* name) {
+  // A null pointer clears presence; any other pointer marks the field as set.
+  if (name == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  name_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.ServiceDescriptorProto.name)
+}
+
+// repeated .google.protobuf.MethodDescriptorProto method = 2;
+// Every accessor below forwards to the method_ RepeatedPtrField.
+inline int ServiceDescriptorProto::_internal_method_size() const {
+  return method_.size();
+}
+inline int ServiceDescriptorProto::method_size() const {
+  return method_.size();
+}
+inline void ServiceDescriptorProto::clear_method() {
+  method_.Clear();
+}
+// Mutable access to the element at `index`.
+inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto*
+ServiceDescriptorProto::mutable_method(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.method)
+  return method_.Mutable(index);
+}
+// Mutable access to the whole repeated field.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<
+    PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >*
+ServiceDescriptorProto::mutable_method() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.ServiceDescriptorProto.method)
+  return &method_;
+}
+inline const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto&
+ServiceDescriptorProto::_internal_method(int index) const {
+  return method_.Get(index);
+}
+// Read-only access to the element at `index`.
+inline const PROTOBUF_NAMESPACE_ID::MethodDescriptorProto&
+ServiceDescriptorProto::method(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.method)
+  return method_.Get(index);
+}
+inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto*
+ServiceDescriptorProto::_internal_add_method() {
+  return method_.Add();
+}
+// Appends a new element and returns a pointer to it.
+inline PROTOBUF_NAMESPACE_ID::MethodDescriptorProto*
+ServiceDescriptorProto::add_method() {
+  // @@protoc_insertion_point(field_add:google.protobuf.ServiceDescriptorProto.method)
+  return method_.Add();
+}
+// Read-only access to the whole repeated field.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<
+    PROTOBUF_NAMESPACE_ID::MethodDescriptorProto >&
+ServiceDescriptorProto::method() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.ServiceDescriptorProto.method)
+  return method_;
+}
+
+// optional .google.protobuf.ServiceOptions options = 3;
+// Presence is tracked by bit 0x00000002 of _has_bits_[0]; storage is options_.
+inline bool ServiceDescriptorProto::_internal_has_options() const {
+  const bool present = (_has_bits_[0] & 0x00000002u) != 0;
+  // Invariant: a set presence bit implies a non-null sub-message pointer.
+  PROTOBUF_ASSUME(!present || options_ != nullptr);
+  return present;
+}
+inline bool ServiceDescriptorProto::has_options() const {
+  return _internal_has_options();
+}
+inline void ServiceDescriptorProto::clear_options() {
+  // The sub-message object is kept and only cleared; presence is then dropped.
+  if (options_ != nullptr) {
+    options_->Clear();
+  }
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const PROTOBUF_NAMESPACE_ID::ServiceOptions& ServiceDescriptorProto::_internal_options() const {
+  const PROTOBUF_NAMESPACE_ID::ServiceOptions* current = options_;
+  if (current != nullptr) {
+    return *current;
+  }
+  // No sub-message allocated: fall back to the default instance.
+  return *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::ServiceOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_ServiceOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::ServiceOptions& ServiceDescriptorProto::options() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ServiceDescriptorProto.options)
+  return _internal_options();
+}
+inline void ServiceDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::ServiceOptions* options) {
+  // Without an arena the previous sub-message is heap-owned and deleted here.
+  if (GetArena() == nullptr) {
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    _has_bits_[0] |= 0x00000002u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.ServiceDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::release_options() {
+  // Detach the sub-message: drop the presence bit and null the member.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* released = options_;
+  options_ = nullptr;
+  if (GetArena() == nullptr) {
+    return released;
+  }
+  // Arena-backed parent: the pointer is routed through DuplicateIfNonNull
+  // rather than being returned directly.
+  return ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(released);
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::unsafe_arena_release_options() {
+  // @@protoc_insertion_point(field_release:google.protobuf.ServiceDescriptorProto.options)
+  // Same detach as release_options(), but the raw pointer is returned as-is.
+  _has_bits_[0] &= ~0x00000002u;
+  PROTOBUF_NAMESPACE_ID::ServiceOptions* released = options_;
+  options_ = nullptr;
+  return released;
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::_internal_mutable_options() {
+  // Mark the field present and allocate the sub-message on first use.
+  _has_bits_[0] |= 0x00000002u;
+  if (options_ == nullptr) {
+    options_ = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ServiceOptions>(GetArena());
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::ServiceOptions* ServiceDescriptorProto::mutable_options() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void ServiceDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::ServiceOptions* options) {
+  ::PROTOBUF_NAMESPACE_ID::Arena* const owner_arena = GetArena();
+  // Without an arena this message owns the previous sub-message on the heap.
+  if (owner_arena == nullptr) {
+    delete options_;
+  }
+  if (options == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    ::PROTOBUF_NAMESPACE_ID::Arena* const incoming_arena =
+        ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    // When the arenas differ, GetOwnedMessage supplies the pointer that is
+    // actually stored.
+    if (owner_arena != incoming_arena) {
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          owner_arena, options, incoming_arena);
+    }
+    _has_bits_[0] |= 0x00000002u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.ServiceDescriptorProto.options)
+}
+
+// -------------------------------------------------------------------
+
+// MethodDescriptorProto
+
+// optional string name = 1;
+// Presence is tracked by bit 0x00000001 of _has_bits_[0]. The value lives in
+// name_, whose operations all take the shared empty-string default and the
+// owning arena.
+inline bool MethodDescriptorProto::_internal_has_name() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+inline bool MethodDescriptorProto::has_name() const {
+  return _internal_has_name();
+}
+inline void MethodDescriptorProto::clear_name() {
+  // Empty the stored string, then mark the field as unset.
+  name_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& MethodDescriptorProto::name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.name)
+  return _internal_name();
+}
+inline void MethodDescriptorProto::set_name(const std::string& value) {
+  _internal_set_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.name)
+}
+inline std::string* MethodDescriptorProto::mutable_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.name)
+  return _internal_mutable_name();
+}
+inline const std::string& MethodDescriptorProto::_internal_name() const {
+  return name_.Get();
+}
+// Copying setter: flags presence, then stores `value`.
+inline void MethodDescriptorProto::_internal_set_name(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value, GetArena());
+}
+// Moving setter.
+inline void MethodDescriptorProto::set_name(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.name)
+}
+// C-string setter; GOOGLE_DCHECK rejects a null pointer.
+inline void MethodDescriptorProto::set_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.name)
+}
+// Length-delimited setter: stores `size` bytes starting at `value`.
+inline void MethodDescriptorProto::set_name(const char* value, size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  name_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.name)
+}
+inline std::string* MethodDescriptorProto::_internal_mutable_name() {
+  // Handing out a mutable pointer counts as setting the field.
+  _has_bits_[0] |= 0x00000001u;
+  return name_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+inline std::string* MethodDescriptorProto::release_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.name)
+  if (_internal_has_name()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return name_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  // Nothing to release when the field is unset.
+  return nullptr;
+}
+inline void MethodDescriptorProto::set_allocated_name(std::string* name) {
+  // A null pointer clears presence; any other pointer marks the field as set.
+  if (name == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  name_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.name)
+}
+
+// optional string input_type = 2;
+// Presence is tracked by bit 0x00000002 of _has_bits_[0]. The value lives in
+// input_type_, whose operations all take the shared empty-string default and
+// the owning arena.
+inline bool MethodDescriptorProto::_internal_has_input_type() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+inline bool MethodDescriptorProto::has_input_type() const {
+  return _internal_has_input_type();
+}
+inline void MethodDescriptorProto::clear_input_type() {
+  // Empty the stored string, then mark the field as unset.
+  input_type_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const std::string& MethodDescriptorProto::input_type() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.input_type)
+  return _internal_input_type();
+}
+inline void MethodDescriptorProto::set_input_type(const std::string& value) {
+  _internal_set_input_type(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.input_type)
+}
+inline std::string* MethodDescriptorProto::mutable_input_type() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.input_type)
+  return _internal_mutable_input_type();
+}
+inline const std::string& MethodDescriptorProto::_internal_input_type() const {
+  return input_type_.Get();
+}
+// Copying setter: flags presence, then stores `value`.
+inline void MethodDescriptorProto::_internal_set_input_type(const std::string& value) {
+  _has_bits_[0] |= 0x00000002u;
+  input_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value, GetArena());
+}
+// Moving setter.
+inline void MethodDescriptorProto::set_input_type(std::string&& value) {
+  _has_bits_[0] |= 0x00000002u;
+  input_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.input_type)
+}
+// C-string setter; GOOGLE_DCHECK rejects a null pointer.
+inline void MethodDescriptorProto::set_input_type(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000002u;
+  input_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.input_type)
+}
+// Length-delimited setter: stores `size` bytes starting at `value`.
+inline void MethodDescriptorProto::set_input_type(const char* value, size_t size) {
+  _has_bits_[0] |= 0x00000002u;
+  input_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.input_type)
+}
+inline std::string* MethodDescriptorProto::_internal_mutable_input_type() {
+  // Handing out a mutable pointer counts as setting the field.
+  _has_bits_[0] |= 0x00000002u;
+  return input_type_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+inline std::string* MethodDescriptorProto::release_input_type() {
+  // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.input_type)
+  if (_internal_has_input_type()) {
+    _has_bits_[0] &= ~0x00000002u;
+    return input_type_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  // Nothing to release when the field is unset.
+  return nullptr;
+}
+inline void MethodDescriptorProto::set_allocated_input_type(std::string* input_type) {
+  // A null pointer clears presence; any other pointer marks the field as set.
+  if (input_type == nullptr) {
+    _has_bits_[0] &= ~0x00000002u;
+  } else {
+    _has_bits_[0] |= 0x00000002u;
+  }
+  input_type_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      input_type, GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.input_type)
+}
+
+// optional string output_type = 3;
+// Presence is tracked by bit 0x00000004 of _has_bits_[0]. The value lives in
+// output_type_, whose operations all take the shared empty-string default and
+// the owning arena.
+inline bool MethodDescriptorProto::_internal_has_output_type() const {
+  return (_has_bits_[0] & 0x00000004u) != 0;
+}
+inline bool MethodDescriptorProto::has_output_type() const {
+  return _internal_has_output_type();
+}
+inline void MethodDescriptorProto::clear_output_type() {
+  // Empty the stored string, then mark the field as unset.
+  output_type_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline const std::string& MethodDescriptorProto::output_type() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.output_type)
+  return _internal_output_type();
+}
+inline void MethodDescriptorProto::set_output_type(const std::string& value) {
+  _internal_set_output_type(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.output_type)
+}
+inline std::string* MethodDescriptorProto::mutable_output_type() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.output_type)
+  return _internal_mutable_output_type();
+}
+inline const std::string& MethodDescriptorProto::_internal_output_type() const {
+  return output_type_.Get();
+}
+// Copying setter: flags presence, then stores `value`.
+inline void MethodDescriptorProto::_internal_set_output_type(const std::string& value) {
+  _has_bits_[0] |= 0x00000004u;
+  output_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      value, GetArena());
+}
+// Moving setter.
+inline void MethodDescriptorProto::set_output_type(std::string&& value) {
+  _has_bits_[0] |= 0x00000004u;
+  output_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.MethodDescriptorProto.output_type)
+}
+// C-string setter; GOOGLE_DCHECK rejects a null pointer.
+inline void MethodDescriptorProto::set_output_type(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000004u;
+  output_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.MethodDescriptorProto.output_type)
+}
+// Length-delimited setter: stores `size` bytes starting at `value`.
+inline void MethodDescriptorProto::set_output_type(const char* value, size_t size) {
+  _has_bits_[0] |= 0x00000004u;
+  output_type_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.MethodDescriptorProto.output_type)
+}
+inline std::string* MethodDescriptorProto::_internal_mutable_output_type() {
+  // Handing out a mutable pointer counts as setting the field.
+  _has_bits_[0] |= 0x00000004u;
+  return output_type_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      GetArena());
+}
+inline std::string* MethodDescriptorProto::release_output_type() {
+  // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.output_type)
+  if (_internal_has_output_type()) {
+    _has_bits_[0] &= ~0x00000004u;
+    return output_type_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+        GetArena());
+  }
+  // Nothing to release when the field is unset.
+  return nullptr;
+}
+inline void MethodDescriptorProto::set_allocated_output_type(std::string* output_type) {
+  // A null pointer clears presence; any other pointer marks the field as set.
+  if (output_type == nullptr) {
+    _has_bits_[0] &= ~0x00000004u;
+  } else {
+    _has_bits_[0] |= 0x00000004u;
+  }
+  output_type_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      output_type, GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.output_type)
+}
+
+// optional .google.protobuf.MethodOptions options = 4;
+inline bool MethodDescriptorProto::_internal_has_options() const {  // Tests has-bit 0x8 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  PROTOBUF_ASSUME(!value || options_ != nullptr);  // States the invariant: a set has-bit implies a non-null options_.
+  return value;
+}
+inline bool MethodDescriptorProto::has_options() const {  // Public presence check.
+  return _internal_has_options();
+}
+inline void MethodDescriptorProto::clear_options() {  // Clears the sub-message in place (pointer is kept) and drops the has-bit.
+  if (options_ != nullptr) options_->Clear();
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline const PROTOBUF_NAMESPACE_ID::MethodOptions& MethodDescriptorProto::_internal_options() const {  // Never dereferences null: falls back to the default instance.
+  const PROTOBUF_NAMESPACE_ID::MethodOptions* p = options_;
+  return p != nullptr ? *p : *reinterpret_cast<const PROTOBUF_NAMESPACE_ID::MethodOptions*>(
+      &PROTOBUF_NAMESPACE_ID::_MethodOptions_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::MethodOptions& MethodDescriptorProto::options() const {  // Public getter; delegates to _internal_options().
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.options)
+  return _internal_options();
+}
+inline void MethodDescriptorProto::unsafe_arena_set_allocated_options(
+    PROTOBUF_NAMESPACE_ID::MethodOptions* options) {  // Stores the pointer as-is; no arena comparison is performed here.
+  if (GetArena() == nullptr) {  // The previous sub-message is deleted only when this message has no arena.
+    delete reinterpret_cast<::PROTOBUF_NAMESPACE_ID::MessageLite*>(options_);
+  }
+  options_ = options;
+  if (options) {  // Presence mirrors the argument: non-null sets the bit, nullptr clears it.
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.MethodDescriptorProto.options)
+}
+inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::release_options() {  // Detaches the sub-message; may return nullptr if none was set.
+  _has_bits_[0] &= ~0x00000008u;
+  PROTOBUF_NAMESPACE_ID::MethodOptions* temp = options_;
+  options_ = nullptr;
+  if (GetArena() != nullptr) {  // On an arena the pointer is passed through DuplicateIfNonNull - presumably yielding a heap copy; confirm.
+    temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+  }
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::unsafe_arena_release_options() {  // Like release_options() but returns the raw pointer with no duplication.
+  // @@protoc_insertion_point(field_release:google.protobuf.MethodDescriptorProto.options)
+  _has_bits_[0] &= ~0x00000008u;
+  PROTOBUF_NAMESPACE_ID::MethodOptions* temp = options_;
+  options_ = nullptr;
+  return temp;
+}
+inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::_internal_mutable_options() {  // Marks the field present and creates the sub-message on first use.
+  _has_bits_[0] |= 0x00000008u;
+  if (options_ == nullptr) {  // Lazy creation via CreateMaybeMessage with this message's arena.
+    auto* p = CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::MethodOptions>(GetArena());
+    options_ = p;
+  }
+  return options_;
+}
+inline PROTOBUF_NAMESPACE_ID::MethodOptions* MethodDescriptorProto::mutable_options() {  // Public writable access; never returns nullptr (see lazy creation above).
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MethodDescriptorProto.options)
+  return _internal_mutable_options();
+}
+inline void MethodDescriptorProto::set_allocated_options(PROTOBUF_NAMESPACE_ID::MethodOptions* options) {  // Replaces the sub-message with the given pointer (nullptr unsets the field).
+  ::PROTOBUF_NAMESPACE_ID::Arena* message_arena = GetArena();
+  if (message_arena == nullptr) {  // Old sub-message is deleted only for non-arena messages.
+    delete options_;
+  }
+  if (options) {
+    ::PROTOBUF_NAMESPACE_ID::Arena* submessage_arena =
+      ::PROTOBUF_NAMESPACE_ID::Arena::GetArena(options);
+    if (message_arena != submessage_arena) {  // Arenas differ: GetOwnedMessage may substitute the pointer - presumably to reconcile ownership; confirm.
+      options = ::PROTOBUF_NAMESPACE_ID::internal::GetOwnedMessage(
+          message_arena, options, submessage_arena);
+    }
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  options_ = options;
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.MethodDescriptorProto.options)
+}
+
+// optional bool client_streaming = 5 [default = false];
+inline bool MethodDescriptorProto::_internal_has_client_streaming() const {  // Tests has-bit 0x10 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000010u) != 0;
+  return value;
+}
+inline bool MethodDescriptorProto::has_client_streaming() const {  // Public presence check.
+  return _internal_has_client_streaming();
+}
+inline void MethodDescriptorProto::clear_client_streaming() {  // Resets the value to false and clears the has-bit.
+  client_streaming_ = false;
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline bool MethodDescriptorProto::_internal_client_streaming() const {  // Raw member read; no presence check.
+  return client_streaming_;
+}
+inline bool MethodDescriptorProto::client_streaming() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.client_streaming)
+  return _internal_client_streaming();
+}
+inline void MethodDescriptorProto::_internal_set_client_streaming(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00000010u;
+  client_streaming_ = value;
+}
+inline void MethodDescriptorProto::set_client_streaming(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_client_streaming(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.client_streaming)
+}
+
+// optional bool server_streaming = 6 [default = false];
+inline bool MethodDescriptorProto::_internal_has_server_streaming() const {  // Tests has-bit 0x20 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000020u) != 0;
+  return value;
+}
+inline bool MethodDescriptorProto::has_server_streaming() const {  // Public presence check.
+  return _internal_has_server_streaming();
+}
+inline void MethodDescriptorProto::clear_server_streaming() {  // Resets the value to false and clears the has-bit.
+  server_streaming_ = false;
+  _has_bits_[0] &= ~0x00000020u;
+}
+inline bool MethodDescriptorProto::_internal_server_streaming() const {  // Raw member read; no presence check.
+  return server_streaming_;
+}
+inline bool MethodDescriptorProto::server_streaming() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodDescriptorProto.server_streaming)
+  return _internal_server_streaming();
+}
+inline void MethodDescriptorProto::_internal_set_server_streaming(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00000020u;
+  server_streaming_ = value;
+}
+inline void MethodDescriptorProto::set_server_streaming(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_server_streaming(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodDescriptorProto.server_streaming)
+}
+
+// -------------------------------------------------------------------
+
+// FileOptions
+
+// optional string java_package = 1;
+inline bool FileOptions::_internal_has_java_package() const {  // Tests has-bit 0x1 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_package() const {  // Public presence check.
+  return _internal_has_java_package();
+}
+inline void FileOptions::clear_java_package() {  // Empties the stored string and clears its presence bit.
+  java_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& FileOptions::java_package() const {  // Public getter; returns a reference without consulting the has-bit.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_package)
+  return _internal_java_package();
+}
+inline void FileOptions::set_java_package(const std::string& value) {  // Setter taking a const reference; the work happens in _internal_set_java_package.
+  _internal_set_java_package(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_package)
+}
+inline std::string* FileOptions::mutable_java_package() {  // Writable access; marks the field present as a side effect.
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.java_package)
+  return _internal_mutable_java_package();
+}
+inline const std::string& FileOptions::_internal_java_package() const {  // Raw read of java_package_; no presence check.
+  return java_package_.Get();
+}
+inline void FileOptions::_internal_set_java_package(const std::string& value) {  // Sets the has-bit, then stores value.
+  _has_bits_[0] |= 0x00000001u;
+  java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_java_package(std::string&& value) {  // Rvalue overload; passes value on with std::move.
+  _has_bits_[0] |= 0x00000001u;
+  java_package_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.java_package)
+}
+inline void FileOptions::set_java_package(const char* value) {  // C-string overload; value is wrapped in a std::string.
+  GOOGLE_DCHECK(value != nullptr);  // Null is rejected by the check macro before std::string(value) is built.
+  _has_bits_[0] |= 0x00000001u;
+  java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.java_package)
+}
+inline void FileOptions::set_java_package(const char* value,
+    size_t size) {  // (pointer, length) overload; builds the string from exactly `size` chars.
+  _has_bits_[0] |= 0x00000001u;
+  java_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is a no-op here: value is already const char*.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.java_package)
+}
+inline std::string* FileOptions::_internal_mutable_java_package() {  // Sets the has-bit before handing out the mutable pointer.
+  _has_bits_[0] |= 0x00000001u;
+  return java_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_java_package() {  // Detaches the string from the message, or yields nullptr if unset.
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.java_package)
+  if (!_internal_has_java_package()) {
+    return nullptr;  // Field not present: nothing to release.
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return java_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // NOTE(review): caller presumably owns the result - confirm against ArenaStringPtr.
+}
+inline void FileOptions::set_allocated_java_package(std::string* java_package) {  // Presence follows the argument: non-null sets the bit, nullptr clears it.
+  if (java_package != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  java_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), java_package,
+      GetArena());  // NOTE(review): the message presumably adopts the passed string - confirm against ArenaStringPtr::SetAllocated.
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.java_package)
+}
+
+// optional string java_outer_classname = 8;
+inline bool FileOptions::_internal_has_java_outer_classname() const {  // Tests has-bit 0x2 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_outer_classname() const {  // Public presence check.
+  return _internal_has_java_outer_classname();
+}
+inline void FileOptions::clear_java_outer_classname() {  // Empties the stored string and clears its presence bit.
+  java_outer_classname_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const std::string& FileOptions::java_outer_classname() const {  // Public getter; returns a reference without consulting the has-bit.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_outer_classname)
+  return _internal_java_outer_classname();
+}
+inline void FileOptions::set_java_outer_classname(const std::string& value) {  // Setter taking a const reference; the work happens in the _internal_ setter.
+  _internal_set_java_outer_classname(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_outer_classname)
+}
+inline std::string* FileOptions::mutable_java_outer_classname() {  // Writable access; marks the field present as a side effect.
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.java_outer_classname)
+  return _internal_mutable_java_outer_classname();
+}
+inline const std::string& FileOptions::_internal_java_outer_classname() const {  // Raw read of java_outer_classname_; no presence check.
+  return java_outer_classname_.Get();
+}
+inline void FileOptions::_internal_set_java_outer_classname(const std::string& value) {  // Sets the has-bit, then stores value.
+  _has_bits_[0] |= 0x00000002u;
+  java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_java_outer_classname(std::string&& value) {  // Rvalue overload; passes value on with std::move.
+  _has_bits_[0] |= 0x00000002u;
+  java_outer_classname_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.java_outer_classname)
+}
+inline void FileOptions::set_java_outer_classname(const char* value) {  // C-string overload; value is wrapped in a std::string.
+  GOOGLE_DCHECK(value != nullptr);  // Null is rejected by the check macro before std::string(value) is built.
+  _has_bits_[0] |= 0x00000002u;
+  java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.java_outer_classname)
+}
+inline void FileOptions::set_java_outer_classname(const char* value,
+    size_t size) {  // (pointer, length) overload; builds the string from exactly `size` chars.
+  _has_bits_[0] |= 0x00000002u;
+  java_outer_classname_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is a no-op here: value is already const char*.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.java_outer_classname)
+}
+inline std::string* FileOptions::_internal_mutable_java_outer_classname() {  // Sets the has-bit before handing out the mutable pointer.
+  _has_bits_[0] |= 0x00000002u;
+  return java_outer_classname_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_java_outer_classname() {  // Detaches the string from the message, or yields nullptr if unset.
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.java_outer_classname)
+  if (!_internal_has_java_outer_classname()) {
+    return nullptr;  // Field not present: nothing to release.
+  }
+  _has_bits_[0] &= ~0x00000002u;
+  return java_outer_classname_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // NOTE(review): caller presumably owns the result - confirm against ArenaStringPtr.
+}
+inline void FileOptions::set_allocated_java_outer_classname(std::string* java_outer_classname) {  // Presence follows the argument: non-null sets the bit, nullptr clears it.
+  if (java_outer_classname != nullptr) {
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  java_outer_classname_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), java_outer_classname,
+      GetArena());  // NOTE(review): the message presumably adopts the passed string - confirm against ArenaStringPtr::SetAllocated.
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.java_outer_classname)
+}
+
+// optional bool java_multiple_files = 10 [default = false];
+inline bool FileOptions::_internal_has_java_multiple_files() const {  // Tests has-bit 0x400 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000400u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_multiple_files() const {  // Public presence check.
+  return _internal_has_java_multiple_files();
+}
+inline void FileOptions::clear_java_multiple_files() {  // Resets the value to false and clears the has-bit.
+  java_multiple_files_ = false;
+  _has_bits_[0] &= ~0x00000400u;
+}
+inline bool FileOptions::_internal_java_multiple_files() const {  // Raw member read; no presence check.
+  return java_multiple_files_;
+}
+inline bool FileOptions::java_multiple_files() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_multiple_files)
+  return _internal_java_multiple_files();
+}
+inline void FileOptions::_internal_set_java_multiple_files(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00000400u;
+  java_multiple_files_ = value;
+}
+inline void FileOptions::set_java_multiple_files(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_java_multiple_files(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_multiple_files)
+}
+
+// optional bool java_generate_equals_and_hash = 20 [deprecated = true];
+inline bool FileOptions::_internal_has_java_generate_equals_and_hash() const {  // Tests has-bit 0x800 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000800u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_generate_equals_and_hash() const {  // Public presence check.
+  return _internal_has_java_generate_equals_and_hash();
+}
+inline void FileOptions::clear_java_generate_equals_and_hash() {  // Resets the value to false and clears the has-bit.
+  java_generate_equals_and_hash_ = false;
+  _has_bits_[0] &= ~0x00000800u;
+}
+inline bool FileOptions::_internal_java_generate_equals_and_hash() const {  // Raw member read; no presence check.
+  return java_generate_equals_and_hash_;
+}
+inline bool FileOptions::java_generate_equals_and_hash() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_generate_equals_and_hash)
+  return _internal_java_generate_equals_and_hash();
+}
+inline void FileOptions::_internal_set_java_generate_equals_and_hash(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00000800u;
+  java_generate_equals_and_hash_ = value;
+}
+inline void FileOptions::set_java_generate_equals_and_hash(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_java_generate_equals_and_hash(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_generate_equals_and_hash)
+}
+
+// optional bool java_string_check_utf8 = 27 [default = false];
+inline bool FileOptions::_internal_has_java_string_check_utf8() const {  // Tests has-bit 0x1000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00001000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_string_check_utf8() const {  // Public presence check.
+  return _internal_has_java_string_check_utf8();
+}
+inline void FileOptions::clear_java_string_check_utf8() {  // Resets the value to false and clears the has-bit.
+  java_string_check_utf8_ = false;
+  _has_bits_[0] &= ~0x00001000u;
+}
+inline bool FileOptions::_internal_java_string_check_utf8() const {  // Raw member read; no presence check.
+  return java_string_check_utf8_;
+}
+inline bool FileOptions::java_string_check_utf8() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_string_check_utf8)
+  return _internal_java_string_check_utf8();
+}
+inline void FileOptions::_internal_set_java_string_check_utf8(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00001000u;
+  java_string_check_utf8_ = value;
+}
+inline void FileOptions::set_java_string_check_utf8(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_java_string_check_utf8(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_string_check_utf8)
+}
+
+// optional .google.protobuf.FileOptions.OptimizeMode optimize_for = 9 [default = SPEED];
+inline bool FileOptions::_internal_has_optimize_for() const {  // Tests has-bit 0x40000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00040000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_optimize_for() const {  // Public presence check.
+  return _internal_has_optimize_for();
+}
+inline void FileOptions::clear_optimize_for() {  // Resets the stored value to 1 and clears the has-bit.
+  optimize_for_ = 1;  // NOTE(review): presumably the numeric value of the declared default SPEED - confirm against the enum.
+  _has_bits_[0] &= ~0x00040000u;
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode FileOptions::_internal_optimize_for() const {  // Converts the stored member back to the enum type.
+  return static_cast< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode >(optimize_for_);
+}
+inline PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode FileOptions::optimize_for() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.optimize_for)
+  return _internal_optimize_for();
+}
+inline void FileOptions::_internal_set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value) {  // Marks the field present and stores value.
+  assert(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode_IsValid(value));  // Validity is enforced only through assert(), which NDEBUG builds compile out.
+  _has_bits_[0] |= 0x00040000u;
+  optimize_for_ = value;
+}
+inline void FileOptions::set_optimize_for(PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_optimize_for(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.optimize_for)
+}
+
+// optional string go_package = 11;
+inline bool FileOptions::_internal_has_go_package() const {  // Tests has-bit 0x4 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool FileOptions::has_go_package() const {  // Public presence check.
+  return _internal_has_go_package();
+}
+inline void FileOptions::clear_go_package() {  // Empties the stored string and clears its presence bit.
+  go_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline const std::string& FileOptions::go_package() const {  // Public getter; returns a reference without consulting the has-bit.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.go_package)
+  return _internal_go_package();
+}
+inline void FileOptions::set_go_package(const std::string& value) {  // Setter taking a const reference; the work happens in _internal_set_go_package.
+  _internal_set_go_package(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.go_package)
+}
+inline std::string* FileOptions::mutable_go_package() {  // Writable access; marks the field present as a side effect.
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.go_package)
+  return _internal_mutable_go_package();
+}
+inline const std::string& FileOptions::_internal_go_package() const {  // Raw read of go_package_; no presence check.
+  return go_package_.Get();
+}
+inline void FileOptions::_internal_set_go_package(const std::string& value) {  // Sets the has-bit, then stores value.
+  _has_bits_[0] |= 0x00000004u;
+  go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_go_package(std::string&& value) {  // Rvalue overload; passes value on with std::move.
+  _has_bits_[0] |= 0x00000004u;
+  go_package_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.go_package)
+}
+inline void FileOptions::set_go_package(const char* value) {  // C-string overload; value is wrapped in a std::string.
+  GOOGLE_DCHECK(value != nullptr);  // Null is rejected by the check macro before std::string(value) is built.
+  _has_bits_[0] |= 0x00000004u;
+  go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.go_package)
+}
+inline void FileOptions::set_go_package(const char* value,
+    size_t size) {  // (pointer, length) overload; builds the string from exactly `size` chars.
+  _has_bits_[0] |= 0x00000004u;
+  go_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is a no-op here: value is already const char*.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.go_package)
+}
+inline std::string* FileOptions::_internal_mutable_go_package() {  // Sets the has-bit before handing out the mutable pointer.
+  _has_bits_[0] |= 0x00000004u;
+  return go_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_go_package() {  // Detaches the string from the message, or yields nullptr if unset.
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.go_package)
+  if (!_internal_has_go_package()) {
+    return nullptr;  // Field not present: nothing to release.
+  }
+  _has_bits_[0] &= ~0x00000004u;
+  return go_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // NOTE(review): caller presumably owns the result - confirm against ArenaStringPtr.
+}
+inline void FileOptions::set_allocated_go_package(std::string* go_package) {  // Presence follows the argument: non-null sets the bit, nullptr clears it.
+  if (go_package != nullptr) {
+    _has_bits_[0] |= 0x00000004u;
+  } else {
+    _has_bits_[0] &= ~0x00000004u;
+  }
+  go_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), go_package,
+      GetArena());  // NOTE(review): the message presumably adopts the passed string - confirm against ArenaStringPtr::SetAllocated.
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.go_package)
+}
+
+// optional bool cc_generic_services = 16 [default = false];
+inline bool FileOptions::_internal_has_cc_generic_services() const {  // Tests has-bit 0x2000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00002000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_cc_generic_services() const {  // Public presence check.
+  return _internal_has_cc_generic_services();
+}
+inline void FileOptions::clear_cc_generic_services() {  // Resets the value to false and clears the has-bit.
+  cc_generic_services_ = false;
+  _has_bits_[0] &= ~0x00002000u;
+}
+inline bool FileOptions::_internal_cc_generic_services() const {  // Raw member read; no presence check.
+  return cc_generic_services_;
+}
+inline bool FileOptions::cc_generic_services() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.cc_generic_services)
+  return _internal_cc_generic_services();
+}
+inline void FileOptions::_internal_set_cc_generic_services(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00002000u;
+  cc_generic_services_ = value;
+}
+inline void FileOptions::set_cc_generic_services(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_cc_generic_services(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.cc_generic_services)
+}
+
+// optional bool java_generic_services = 17 [default = false];
+inline bool FileOptions::_internal_has_java_generic_services() const {  // Tests has-bit 0x4000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00004000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_java_generic_services() const {  // Public presence check.
+  return _internal_has_java_generic_services();
+}
+inline void FileOptions::clear_java_generic_services() {  // Resets the value to false and clears the has-bit.
+  java_generic_services_ = false;
+  _has_bits_[0] &= ~0x00004000u;
+}
+inline bool FileOptions::_internal_java_generic_services() const {  // Raw member read; no presence check.
+  return java_generic_services_;
+}
+inline bool FileOptions::java_generic_services() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.java_generic_services)
+  return _internal_java_generic_services();
+}
+inline void FileOptions::_internal_set_java_generic_services(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00004000u;
+  java_generic_services_ = value;
+}
+inline void FileOptions::set_java_generic_services(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_java_generic_services(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.java_generic_services)
+}
+
+// optional bool py_generic_services = 18 [default = false];
+inline bool FileOptions::_internal_has_py_generic_services() const {  // Tests has-bit 0x8000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00008000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_py_generic_services() const {  // Public presence check.
+  return _internal_has_py_generic_services();
+}
+inline void FileOptions::clear_py_generic_services() {  // Resets the value to false and clears the has-bit.
+  py_generic_services_ = false;
+  _has_bits_[0] &= ~0x00008000u;
+}
+inline bool FileOptions::_internal_py_generic_services() const {  // Raw member read; no presence check.
+  return py_generic_services_;
+}
+inline bool FileOptions::py_generic_services() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.py_generic_services)
+  return _internal_py_generic_services();
+}
+inline void FileOptions::_internal_set_py_generic_services(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00008000u;
+  py_generic_services_ = value;
+}
+inline void FileOptions::set_py_generic_services(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_py_generic_services(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.py_generic_services)
+}
+
+// optional bool php_generic_services = 42 [default = false];
+inline bool FileOptions::_internal_has_php_generic_services() const {  // Tests has-bit 0x10000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00010000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_php_generic_services() const {  // Public presence check.
+  return _internal_has_php_generic_services();
+}
+inline void FileOptions::clear_php_generic_services() {  // Resets the value to false and clears the has-bit.
+  php_generic_services_ = false;
+  _has_bits_[0] &= ~0x00010000u;
+}
+inline bool FileOptions::_internal_php_generic_services() const {  // Raw member read; no presence check.
+  return php_generic_services_;
+}
+inline bool FileOptions::php_generic_services() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_generic_services)
+  return _internal_php_generic_services();
+}
+inline void FileOptions::_internal_set_php_generic_services(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00010000u;
+  php_generic_services_ = value;
+}
+inline void FileOptions::set_php_generic_services(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_php_generic_services(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_generic_services)
+}
+
+// optional bool deprecated = 23 [default = false];
+inline bool FileOptions::_internal_has_deprecated() const {  // Tests has-bit 0x20000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00020000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_deprecated() const {  // Public presence check.
+  return _internal_has_deprecated();
+}
+inline void FileOptions::clear_deprecated() {  // Resets the value to false and clears the has-bit.
+  deprecated_ = false;
+  _has_bits_[0] &= ~0x00020000u;
+}
+inline bool FileOptions::_internal_deprecated() const {  // Raw member read; no presence check.
+  return deprecated_;
+}
+inline bool FileOptions::deprecated() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.deprecated)
+  return _internal_deprecated();
+}
+inline void FileOptions::_internal_set_deprecated(bool value) {  // Marks the field present and stores value (even when value is false).
+  _has_bits_[0] |= 0x00020000u;
+  deprecated_ = value;
+}
+inline void FileOptions::set_deprecated(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.deprecated)
+}
+
+// optional bool cc_enable_arenas = 31 [default = true];
+inline bool FileOptions::_internal_has_cc_enable_arenas() const {  // Tests has-bit 0x80000 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00080000u) != 0;
+  return value;
+}
+inline bool FileOptions::has_cc_enable_arenas() const {  // Public presence check.
+  return _internal_has_cc_enable_arenas();
+}
+inline void FileOptions::clear_cc_enable_arenas() {  // Resets the value and clears the has-bit.
+  cc_enable_arenas_ = true;  // Unlike the other bool options in this class, this one resets to true, not false.
+  _has_bits_[0] &= ~0x00080000u;
+}
+inline bool FileOptions::_internal_cc_enable_arenas() const {  // Raw member read; no presence check.
+  return cc_enable_arenas_;
+}
+inline bool FileOptions::cc_enable_arenas() const {  // Public getter.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.cc_enable_arenas)
+  return _internal_cc_enable_arenas();
+}
+inline void FileOptions::_internal_set_cc_enable_arenas(bool value) {  // Marks the field present and stores value.
+  _has_bits_[0] |= 0x00080000u;
+  cc_enable_arenas_ = value;
+}
+inline void FileOptions::set_cc_enable_arenas(bool value) {  // Public setter; delegates to the _internal_ variant.
+  _internal_set_cc_enable_arenas(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.cc_enable_arenas)
+}
+
+// optional string objc_class_prefix = 36;
+inline bool FileOptions::_internal_has_objc_class_prefix() const {  // Tests has-bit 0x8 in _has_bits_[0].
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  return value;
+}
+inline bool FileOptions::has_objc_class_prefix() const {  // Public presence check.
+  return _internal_has_objc_class_prefix();
+}
+inline void FileOptions::clear_objc_class_prefix() {  // Empties the stored string and clears its presence bit.
+  objc_class_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline const std::string& FileOptions::objc_class_prefix() const {  // Public getter; returns a reference without consulting the has-bit.
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.objc_class_prefix)
+  return _internal_objc_class_prefix();
+}
+inline void FileOptions::set_objc_class_prefix(const std::string& value) {  // Setter taking a const reference; the work happens in the _internal_ setter.
+  _internal_set_objc_class_prefix(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.objc_class_prefix)
+}
+inline std::string* FileOptions::mutable_objc_class_prefix() {  // Writable access; marks the field present as a side effect.
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.objc_class_prefix)
+  return _internal_mutable_objc_class_prefix();
+}
+inline const std::string& FileOptions::_internal_objc_class_prefix() const {  // Raw read of objc_class_prefix_; no presence check.
+  return objc_class_prefix_.Get();
+}
+inline void FileOptions::_internal_set_objc_class_prefix(const std::string& value) {  // Sets the has-bit, then stores value.
+  _has_bits_[0] |= 0x00000008u;
+  objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_objc_class_prefix(std::string&& value) {  // Rvalue overload; passes value on with std::move.
+  _has_bits_[0] |= 0x00000008u;
+  objc_class_prefix_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.objc_class_prefix)
+}
+inline void FileOptions::set_objc_class_prefix(const char* value) {  // C-string overload; value is wrapped in a std::string.
+  GOOGLE_DCHECK(value != nullptr);  // Null is rejected by the check macro before std::string(value) is built.
+  _has_bits_[0] |= 0x00000008u;
+  objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.objc_class_prefix)
+}
+inline void FileOptions::set_objc_class_prefix(const char* value,
+    size_t size) {  // (pointer, length) overload; builds the string from exactly `size` chars.
+  _has_bits_[0] |= 0x00000008u;
+  objc_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());  // The cast is a no-op here: value is already const char*.
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.objc_class_prefix)
+}
+inline std::string* FileOptions::_internal_mutable_objc_class_prefix() {  // Sets the has-bit before handing out the mutable pointer.
+  _has_bits_[0] |= 0x00000008u;
+  return objc_class_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_objc_class_prefix() {  // Detaches the string from the message, or yields nullptr if unset.
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.objc_class_prefix)
+  if (!_internal_has_objc_class_prefix()) {
+    return nullptr;  // Field not present: nothing to release.
+  }
+  _has_bits_[0] &= ~0x00000008u;
+  return objc_class_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());  // NOTE(review): caller presumably owns the result - confirm against ArenaStringPtr.
+}
+inline void FileOptions::set_allocated_objc_class_prefix(std::string* objc_class_prefix) {  // Presence follows the argument: non-null sets the bit, nullptr clears it.
+  if (objc_class_prefix != nullptr) {
+    _has_bits_[0] |= 0x00000008u;
+  } else {
+    _has_bits_[0] &= ~0x00000008u;
+  }
+  objc_class_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), objc_class_prefix,
+      GetArena());  // NOTE(review): the message presumably adopts the passed string - confirm against ArenaStringPtr::SetAllocated.
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.objc_class_prefix)
+}
+
+// optional string csharp_namespace = 37;
+inline bool FileOptions::_internal_has_csharp_namespace() const {
+  bool value = (_has_bits_[0] & 0x00000010u) != 0;
+  return value;
+}
+inline bool FileOptions::has_csharp_namespace() const {
+  return _internal_has_csharp_namespace();
+}
+inline void FileOptions::clear_csharp_namespace() {
+  csharp_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline const std::string& FileOptions::csharp_namespace() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.csharp_namespace)
+  return _internal_csharp_namespace();
+}
+inline void FileOptions::set_csharp_namespace(const std::string& value) {
+  _internal_set_csharp_namespace(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.csharp_namespace)
+}
+inline std::string* FileOptions::mutable_csharp_namespace() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.csharp_namespace)
+  return _internal_mutable_csharp_namespace();
+}
+inline const std::string& FileOptions::_internal_csharp_namespace() const {
+  return csharp_namespace_.Get();
+}
+inline void FileOptions::_internal_set_csharp_namespace(const std::string& value) {
+  _has_bits_[0] |= 0x00000010u;
+  csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_csharp_namespace(std::string&& value) {
+  _has_bits_[0] |= 0x00000010u;
+  csharp_namespace_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.csharp_namespace)
+}
+inline void FileOptions::set_csharp_namespace(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000010u;
+  csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.csharp_namespace)
+}
+inline void FileOptions::set_csharp_namespace(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000010u;
+  csharp_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.csharp_namespace)
+}
+inline std::string* FileOptions::_internal_mutable_csharp_namespace() {
+  _has_bits_[0] |= 0x00000010u;
+  return csharp_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_csharp_namespace() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.csharp_namespace)
+  if (!_internal_has_csharp_namespace()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000010u;
+  return csharp_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_csharp_namespace(std::string* csharp_namespace) {
+  if (csharp_namespace != nullptr) {
+    _has_bits_[0] |= 0x00000010u;
+  } else {
+    _has_bits_[0] &= ~0x00000010u;
+  }
+  csharp_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), csharp_namespace,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.csharp_namespace)
+}
+
+// optional string swift_prefix = 39;
+inline bool FileOptions::_internal_has_swift_prefix() const {
+  bool value = (_has_bits_[0] & 0x00000020u) != 0;
+  return value;
+}
+inline bool FileOptions::has_swift_prefix() const {
+  return _internal_has_swift_prefix();
+}
+inline void FileOptions::clear_swift_prefix() {
+  swift_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000020u;
+}
+inline const std::string& FileOptions::swift_prefix() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.swift_prefix)
+  return _internal_swift_prefix();
+}
+inline void FileOptions::set_swift_prefix(const std::string& value) {
+  _internal_set_swift_prefix(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.swift_prefix)
+}
+inline std::string* FileOptions::mutable_swift_prefix() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.swift_prefix)
+  return _internal_mutable_swift_prefix();
+}
+inline const std::string& FileOptions::_internal_swift_prefix() const {
+  return swift_prefix_.Get();
+}
+inline void FileOptions::_internal_set_swift_prefix(const std::string& value) {
+  _has_bits_[0] |= 0x00000020u;
+  swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_swift_prefix(std::string&& value) {
+  _has_bits_[0] |= 0x00000020u;
+  swift_prefix_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.swift_prefix)
+}
+inline void FileOptions::set_swift_prefix(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000020u;
+  swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.swift_prefix)
+}
+inline void FileOptions::set_swift_prefix(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000020u;
+  swift_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.swift_prefix)
+}
+inline std::string* FileOptions::_internal_mutable_swift_prefix() {
+  _has_bits_[0] |= 0x00000020u;
+  return swift_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_swift_prefix() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.swift_prefix)
+  if (!_internal_has_swift_prefix()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000020u;
+  return swift_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_swift_prefix(std::string* swift_prefix) {
+  if (swift_prefix != nullptr) {
+    _has_bits_[0] |= 0x00000020u;
+  } else {
+    _has_bits_[0] &= ~0x00000020u;
+  }
+  swift_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), swift_prefix,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.swift_prefix)
+}
+
+// optional string php_class_prefix = 40;
+inline bool FileOptions::_internal_has_php_class_prefix() const {
+  bool value = (_has_bits_[0] & 0x00000040u) != 0;
+  return value;
+}
+inline bool FileOptions::has_php_class_prefix() const {
+  return _internal_has_php_class_prefix();
+}
+inline void FileOptions::clear_php_class_prefix() {
+  php_class_prefix_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000040u;
+}
+inline const std::string& FileOptions::php_class_prefix() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_class_prefix)
+  return _internal_php_class_prefix();
+}
+inline void FileOptions::set_php_class_prefix(const std::string& value) {
+  _internal_set_php_class_prefix(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_class_prefix)
+}
+inline std::string* FileOptions::mutable_php_class_prefix() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_class_prefix)
+  return _internal_mutable_php_class_prefix();
+}
+inline const std::string& FileOptions::_internal_php_class_prefix() const {
+  return php_class_prefix_.Get();
+}
+inline void FileOptions::_internal_set_php_class_prefix(const std::string& value) {
+  _has_bits_[0] |= 0x00000040u;
+  php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_php_class_prefix(std::string&& value) {
+  _has_bits_[0] |= 0x00000040u;
+  php_class_prefix_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_class_prefix)
+}
+inline void FileOptions::set_php_class_prefix(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000040u;
+  php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_class_prefix)
+}
+inline void FileOptions::set_php_class_prefix(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000040u;
+  php_class_prefix_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_class_prefix)
+}
+inline std::string* FileOptions::_internal_mutable_php_class_prefix() {
+  _has_bits_[0] |= 0x00000040u;
+  return php_class_prefix_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_php_class_prefix() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_class_prefix)
+  if (!_internal_has_php_class_prefix()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000040u;
+  return php_class_prefix_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_php_class_prefix(std::string* php_class_prefix) {
+  if (php_class_prefix != nullptr) {
+    _has_bits_[0] |= 0x00000040u;
+  } else {
+    _has_bits_[0] &= ~0x00000040u;
+  }
+  php_class_prefix_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_class_prefix,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_class_prefix)
+}
+
+// optional string php_namespace = 41;
+inline bool FileOptions::_internal_has_php_namespace() const {
+  bool value = (_has_bits_[0] & 0x00000080u) != 0;
+  return value;
+}
+inline bool FileOptions::has_php_namespace() const {
+  return _internal_has_php_namespace();
+}
+inline void FileOptions::clear_php_namespace() {
+  php_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000080u;
+}
+inline const std::string& FileOptions::php_namespace() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_namespace)
+  return _internal_php_namespace();
+}
+inline void FileOptions::set_php_namespace(const std::string& value) {
+  _internal_set_php_namespace(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_namespace)
+}
+inline std::string* FileOptions::mutable_php_namespace() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_namespace)
+  return _internal_mutable_php_namespace();
+}
+inline const std::string& FileOptions::_internal_php_namespace() const {
+  return php_namespace_.Get();
+}
+inline void FileOptions::_internal_set_php_namespace(const std::string& value) {
+  _has_bits_[0] |= 0x00000080u;
+  php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_php_namespace(std::string&& value) {
+  _has_bits_[0] |= 0x00000080u;
+  php_namespace_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_namespace)
+}
+inline void FileOptions::set_php_namespace(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000080u;
+  php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_namespace)
+}
+inline void FileOptions::set_php_namespace(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000080u;
+  php_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_namespace)
+}
+inline std::string* FileOptions::_internal_mutable_php_namespace() {
+  _has_bits_[0] |= 0x00000080u;
+  return php_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_php_namespace() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_namespace)
+  if (!_internal_has_php_namespace()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000080u;
+  return php_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_php_namespace(std::string* php_namespace) {
+  if (php_namespace != nullptr) {
+    _has_bits_[0] |= 0x00000080u;
+  } else {
+    _has_bits_[0] &= ~0x00000080u;
+  }
+  php_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_namespace,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_namespace)
+}
+
+// optional string php_metadata_namespace = 44;
+inline bool FileOptions::_internal_has_php_metadata_namespace() const {
+  bool value = (_has_bits_[0] & 0x00000100u) != 0;
+  return value;
+}
+inline bool FileOptions::has_php_metadata_namespace() const {
+  return _internal_has_php_metadata_namespace();
+}
+inline void FileOptions::clear_php_metadata_namespace() {
+  php_metadata_namespace_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000100u;
+}
+inline const std::string& FileOptions::php_metadata_namespace() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.php_metadata_namespace)
+  return _internal_php_metadata_namespace();
+}
+inline void FileOptions::set_php_metadata_namespace(const std::string& value) {
+  _internal_set_php_metadata_namespace(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.php_metadata_namespace)
+}
+inline std::string* FileOptions::mutable_php_metadata_namespace() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.php_metadata_namespace)
+  return _internal_mutable_php_metadata_namespace();
+}
+inline const std::string& FileOptions::_internal_php_metadata_namespace() const {
+  return php_metadata_namespace_.Get();
+}
+inline void FileOptions::_internal_set_php_metadata_namespace(const std::string& value) {
+  _has_bits_[0] |= 0x00000100u;
+  php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_php_metadata_namespace(std::string&& value) {
+  _has_bits_[0] |= 0x00000100u;
+  php_metadata_namespace_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.php_metadata_namespace)
+}
+inline void FileOptions::set_php_metadata_namespace(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000100u;
+  php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.php_metadata_namespace)
+}
+inline void FileOptions::set_php_metadata_namespace(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000100u;
+  php_metadata_namespace_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.php_metadata_namespace)
+}
+inline std::string* FileOptions::_internal_mutable_php_metadata_namespace() {
+  _has_bits_[0] |= 0x00000100u;
+  return php_metadata_namespace_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_php_metadata_namespace() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.php_metadata_namespace)
+  if (!_internal_has_php_metadata_namespace()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000100u;
+  return php_metadata_namespace_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_php_metadata_namespace(std::string* php_metadata_namespace) {
+  if (php_metadata_namespace != nullptr) {
+    _has_bits_[0] |= 0x00000100u;
+  } else {
+    _has_bits_[0] &= ~0x00000100u;
+  }
+  php_metadata_namespace_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), php_metadata_namespace,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.php_metadata_namespace)
+}
+
+// optional string ruby_package = 45;
+inline bool FileOptions::_internal_has_ruby_package() const {
+  bool value = (_has_bits_[0] & 0x00000200u) != 0;
+  return value;
+}
+inline bool FileOptions::has_ruby_package() const {
+  return _internal_has_ruby_package();
+}
+inline void FileOptions::clear_ruby_package() {
+  ruby_package_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000200u;
+}
+inline const std::string& FileOptions::ruby_package() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.ruby_package)
+  return _internal_ruby_package();
+}
+inline void FileOptions::set_ruby_package(const std::string& value) {
+  _internal_set_ruby_package(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FileOptions.ruby_package)
+}
+inline std::string* FileOptions::mutable_ruby_package() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.ruby_package)
+  return _internal_mutable_ruby_package();
+}
+inline const std::string& FileOptions::_internal_ruby_package() const {
+  return ruby_package_.Get();
+}
+inline void FileOptions::_internal_set_ruby_package(const std::string& value) {
+  _has_bits_[0] |= 0x00000200u;
+  ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void FileOptions::set_ruby_package(std::string&& value) {
+  _has_bits_[0] |= 0x00000200u;
+  ruby_package_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.FileOptions.ruby_package)
+}
+inline void FileOptions::set_ruby_package(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000200u;
+  ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FileOptions.ruby_package)
+}
+inline void FileOptions::set_ruby_package(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000200u;
+  ruby_package_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FileOptions.ruby_package)
+}
+inline std::string* FileOptions::_internal_mutable_ruby_package() {
+  _has_bits_[0] |= 0x00000200u;
+  return ruby_package_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* FileOptions::release_ruby_package() {
+  // @@protoc_insertion_point(field_release:google.protobuf.FileOptions.ruby_package)
+  if (!_internal_has_ruby_package()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000200u;
+  return ruby_package_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void FileOptions::set_allocated_ruby_package(std::string* ruby_package) {
+  if (ruby_package != nullptr) {
+    _has_bits_[0] |= 0x00000200u;
+  } else {
+    _has_bits_[0] &= ~0x00000200u;
+  }
+  ruby_package_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ruby_package,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.FileOptions.ruby_package)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+inline int FileOptions::_internal_uninterpreted_option_size() const {
+  return uninterpreted_option_.size();
+}
+inline int FileOptions::uninterpreted_option_size() const {
+  return _internal_uninterpreted_option_size();
+}
+inline void FileOptions::clear_uninterpreted_option() {
+  uninterpreted_option_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FileOptions.uninterpreted_option)
+  return uninterpreted_option_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+FileOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FileOptions.uninterpreted_option)
+  return &uninterpreted_option_;
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FileOptions::_internal_uninterpreted_option(int index) const {
+  return uninterpreted_option_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FileOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FileOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::_internal_add_uninterpreted_option() {
+  return uninterpreted_option_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FileOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FileOptions.uninterpreted_option)
+  return _internal_add_uninterpreted_option();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+FileOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FileOptions.uninterpreted_option)
+  return uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// MessageOptions
+
+// optional bool message_set_wire_format = 1 [default = false];
+inline bool MessageOptions::_internal_has_message_set_wire_format() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool MessageOptions::has_message_set_wire_format() const {
+  return _internal_has_message_set_wire_format();
+}
+inline void MessageOptions::clear_message_set_wire_format() {
+  message_set_wire_format_ = false;
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline bool MessageOptions::_internal_message_set_wire_format() const {
+  return message_set_wire_format_;
+}
+inline bool MessageOptions::message_set_wire_format() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.message_set_wire_format)
+  return _internal_message_set_wire_format();
+}
+inline void MessageOptions::_internal_set_message_set_wire_format(bool value) {
+  _has_bits_[0] |= 0x00000001u;
+  message_set_wire_format_ = value;
+}
+inline void MessageOptions::set_message_set_wire_format(bool value) {
+  _internal_set_message_set_wire_format(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.message_set_wire_format)
+}
+
+// optional bool no_standard_descriptor_accessor = 2 [default = false];
+inline bool MessageOptions::_internal_has_no_standard_descriptor_accessor() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool MessageOptions::has_no_standard_descriptor_accessor() const {
+  return _internal_has_no_standard_descriptor_accessor();
+}
+inline void MessageOptions::clear_no_standard_descriptor_accessor() {
+  no_standard_descriptor_accessor_ = false;
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline bool MessageOptions::_internal_no_standard_descriptor_accessor() const {
+  return no_standard_descriptor_accessor_;
+}
+inline bool MessageOptions::no_standard_descriptor_accessor() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.no_standard_descriptor_accessor)
+  return _internal_no_standard_descriptor_accessor();
+}
+inline void MessageOptions::_internal_set_no_standard_descriptor_accessor(bool value) {
+  _has_bits_[0] |= 0x00000002u;
+  no_standard_descriptor_accessor_ = value;
+}
+inline void MessageOptions::set_no_standard_descriptor_accessor(bool value) {
+  _internal_set_no_standard_descriptor_accessor(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.no_standard_descriptor_accessor)
+}
+
+// optional bool deprecated = 3 [default = false];
+inline bool MessageOptions::_internal_has_deprecated() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool MessageOptions::has_deprecated() const {
+  return _internal_has_deprecated();
+}
+inline void MessageOptions::clear_deprecated() {
+  deprecated_ = false;
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline bool MessageOptions::_internal_deprecated() const {
+  return deprecated_;
+}
+inline bool MessageOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.deprecated)
+  return _internal_deprecated();
+}
+inline void MessageOptions::_internal_set_deprecated(bool value) {
+  _has_bits_[0] |= 0x00000004u;
+  deprecated_ = value;
+}
+inline void MessageOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.deprecated)
+}
+
+// optional bool map_entry = 7;
+inline bool MessageOptions::_internal_has_map_entry() const {
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  return value;
+}
+inline bool MessageOptions::has_map_entry() const {
+  return _internal_has_map_entry();
+}
+inline void MessageOptions::clear_map_entry() {
+  map_entry_ = false;
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline bool MessageOptions::_internal_map_entry() const {
+  return map_entry_;
+}
+inline bool MessageOptions::map_entry() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.map_entry)
+  return _internal_map_entry();
+}
+inline void MessageOptions::_internal_set_map_entry(bool value) {
+  _has_bits_[0] |= 0x00000008u;
+  map_entry_ = value;
+}
+inline void MessageOptions::set_map_entry(bool value) {
+  _internal_set_map_entry(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MessageOptions.map_entry)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+inline int MessageOptions::_internal_uninterpreted_option_size() const {
+  return uninterpreted_option_.size();
+}
+inline int MessageOptions::uninterpreted_option_size() const {
+  return _internal_uninterpreted_option_size();
+}
+inline void MessageOptions::clear_uninterpreted_option() {
+  uninterpreted_option_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MessageOptions.uninterpreted_option)
+  return uninterpreted_option_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+MessageOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.MessageOptions.uninterpreted_option)
+  return &uninterpreted_option_;
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MessageOptions::_internal_uninterpreted_option(int index) const {
+  return uninterpreted_option_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MessageOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MessageOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::_internal_add_uninterpreted_option() {
+  return uninterpreted_option_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MessageOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.MessageOptions.uninterpreted_option)
+  return _internal_add_uninterpreted_option();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+MessageOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.MessageOptions.uninterpreted_option)
+  return uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// FieldOptions
+
+// optional .google.protobuf.FieldOptions.CType ctype = 1 [default = STRING];
+inline bool FieldOptions::_internal_has_ctype() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool FieldOptions::has_ctype() const {
+  return _internal_has_ctype();
+}
+inline void FieldOptions::clear_ctype() {
+  ctype_ = 0;
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions_CType FieldOptions::_internal_ctype() const {
+  return static_cast< PROTOBUF_NAMESPACE_ID::FieldOptions_CType >(ctype_);
+}
+inline PROTOBUF_NAMESPACE_ID::FieldOptions_CType FieldOptions::ctype() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.ctype)
+  return _internal_ctype();
+}
+inline void FieldOptions::_internal_set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value) {
+  assert(PROTOBUF_NAMESPACE_ID::FieldOptions_CType_IsValid(value));
+  _has_bits_[0] |= 0x00000001u;
+  ctype_ = value;
+}
+inline void FieldOptions::set_ctype(PROTOBUF_NAMESPACE_ID::FieldOptions_CType value) {
+  _internal_set_ctype(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.ctype)
+}
+
+// optional bool packed = 2;
+// Presence check for `packed`: tests bit 0x00000002 of _has_bits_[0].
+inline bool FieldOptions::_internal_has_packed() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_packed().
+inline bool FieldOptions::has_packed() const {
+  const bool present = _internal_has_packed();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void FieldOptions::clear_packed() {
+  _has_bits_[0] &= ~0x00000002u;
+  packed_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool FieldOptions::_internal_packed() const {
+  return this->packed_;
+}
+// Public getter; delegates to _internal_packed().
+inline bool FieldOptions::packed() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.packed)
+  return _internal_packed();
+}
+// Stores `value` and marks the field present.
+inline void FieldOptions::_internal_set_packed(bool value) {
+  packed_ = value;
+  _has_bits_[0] |= 0x00000002u;
+}
+// Public setter; delegates to _internal_set_packed().
+inline void FieldOptions::set_packed(bool value) {
+  _internal_set_packed(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.packed)
+}
+
+// optional .google.protobuf.FieldOptions.JSType jstype = 6 [default = JS_NORMAL];
+// Presence check for `jstype`: tests bit 0x00000020 of _has_bits_[0].
+inline bool FieldOptions::_internal_has_jstype() const {
+  return (_has_bits_[0] & 0x00000020u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_jstype().
+inline bool FieldOptions::has_jstype() const {
+  const bool present = _internal_has_jstype();
+  return present;
+}
+// Clears the presence bit and resets the stored value to 0.
+inline void FieldOptions::clear_jstype() {
+  _has_bits_[0] &= ~0x00000020u;
+  jstype_ = 0;
+}
+// Returns the stored value cast to the FieldOptions_JSType enum type.
+inline PROTOBUF_NAMESPACE_ID::FieldOptions_JSType FieldOptions::_internal_jstype() const {
+  const PROTOBUF_NAMESPACE_ID::FieldOptions_JSType stored =
+      static_cast< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType >(jstype_);
+  return stored;
+}
+// Public getter; delegates to _internal_jstype().
+inline PROTOBUF_NAMESPACE_ID::FieldOptions_JSType FieldOptions::jstype() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.jstype)
+  return _internal_jstype();
+}
+// Asserts that `value` passes FieldOptions_JSType_IsValid, then stores it
+// and marks the field present.
+inline void FieldOptions::_internal_set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value) {
+  assert(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType_IsValid(value));
+  jstype_ = value;
+  _has_bits_[0] |= 0x00000020u;
+}
+// Public setter; delegates to _internal_set_jstype().
+inline void FieldOptions::set_jstype(PROTOBUF_NAMESPACE_ID::FieldOptions_JSType value) {
+  _internal_set_jstype(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.jstype)
+}
+
+// optional bool lazy = 5 [default = false];
+// Presence check for `lazy`: tests bit 0x00000004 of _has_bits_[0].
+inline bool FieldOptions::_internal_has_lazy() const {
+  return (_has_bits_[0] & 0x00000004u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_lazy().
+inline bool FieldOptions::has_lazy() const {
+  const bool present = _internal_has_lazy();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void FieldOptions::clear_lazy() {
+  _has_bits_[0] &= ~0x00000004u;
+  lazy_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool FieldOptions::_internal_lazy() const {
+  return this->lazy_;
+}
+// Public getter; delegates to _internal_lazy().
+inline bool FieldOptions::lazy() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.lazy)
+  return _internal_lazy();
+}
+// Stores `value` and marks the field present.
+inline void FieldOptions::_internal_set_lazy(bool value) {
+  lazy_ = value;
+  _has_bits_[0] |= 0x00000004u;
+}
+// Public setter; delegates to _internal_set_lazy().
+inline void FieldOptions::set_lazy(bool value) {
+  _internal_set_lazy(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.lazy)
+}
+
+// optional bool deprecated = 3 [default = false];
+// Presence check for `deprecated`: tests bit 0x00000008 of _has_bits_[0].
+inline bool FieldOptions::_internal_has_deprecated() const {
+  return (_has_bits_[0] & 0x00000008u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_deprecated().
+inline bool FieldOptions::has_deprecated() const {
+  const bool present = _internal_has_deprecated();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void FieldOptions::clear_deprecated() {
+  _has_bits_[0] &= ~0x00000008u;
+  deprecated_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool FieldOptions::_internal_deprecated() const {
+  return this->deprecated_;
+}
+// Public getter; delegates to _internal_deprecated().
+inline bool FieldOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.deprecated)
+  return _internal_deprecated();
+}
+// Stores `value` and marks the field present.
+inline void FieldOptions::_internal_set_deprecated(bool value) {
+  deprecated_ = value;
+  _has_bits_[0] |= 0x00000008u;
+}
+// Public setter; delegates to _internal_set_deprecated().
+inline void FieldOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.deprecated)
+}
+
+// optional bool weak = 10 [default = false];
+// Presence check for `weak`: tests bit 0x00000010 of _has_bits_[0].
+inline bool FieldOptions::_internal_has_weak() const {
+  return (_has_bits_[0] & 0x00000010u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_weak().
+inline bool FieldOptions::has_weak() const {
+  const bool present = _internal_has_weak();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void FieldOptions::clear_weak() {
+  _has_bits_[0] &= ~0x00000010u;
+  weak_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool FieldOptions::_internal_weak() const {
+  return this->weak_;
+}
+// Public getter; delegates to _internal_weak().
+inline bool FieldOptions::weak() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.weak)
+  return _internal_weak();
+}
+// Stores `value` and marks the field present.
+inline void FieldOptions::_internal_set_weak(bool value) {
+  weak_ = value;
+  _has_bits_[0] |= 0x00000010u;
+}
+// Public setter; delegates to _internal_set_weak().
+inline void FieldOptions::set_weak(bool value) {
+  _internal_set_weak(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldOptions.weak)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int FieldOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int FieldOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void FieldOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+FieldOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FieldOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FieldOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& FieldOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* FieldOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.FieldOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+FieldOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FieldOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// OneofOptions
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int OneofOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int OneofOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void OneofOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.OneofOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+OneofOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.OneofOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& OneofOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& OneofOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.OneofOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* OneofOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.OneofOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+OneofOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.OneofOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// EnumOptions
+
+// optional bool allow_alias = 2;
+// Presence check for `allow_alias`: tests bit 0x00000001 of _has_bits_[0].
+inline bool EnumOptions::_internal_has_allow_alias() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_allow_alias().
+inline bool EnumOptions::has_allow_alias() const {
+  const bool present = _internal_has_allow_alias();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void EnumOptions::clear_allow_alias() {
+  _has_bits_[0] &= ~0x00000001u;
+  allow_alias_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool EnumOptions::_internal_allow_alias() const {
+  return this->allow_alias_;
+}
+// Public getter; delegates to _internal_allow_alias().
+inline bool EnumOptions::allow_alias() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.allow_alias)
+  return _internal_allow_alias();
+}
+// Stores `value` and marks the field present.
+inline void EnumOptions::_internal_set_allow_alias(bool value) {
+  allow_alias_ = value;
+  _has_bits_[0] |= 0x00000001u;
+}
+// Public setter; delegates to _internal_set_allow_alias().
+inline void EnumOptions::set_allow_alias(bool value) {
+  _internal_set_allow_alias(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumOptions.allow_alias)
+}
+
+// optional bool deprecated = 3 [default = false];
+// Presence check for `deprecated`: tests bit 0x00000002 of _has_bits_[0].
+inline bool EnumOptions::_internal_has_deprecated() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_deprecated().
+inline bool EnumOptions::has_deprecated() const {
+  const bool present = _internal_has_deprecated();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void EnumOptions::clear_deprecated() {
+  _has_bits_[0] &= ~0x00000002u;
+  deprecated_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool EnumOptions::_internal_deprecated() const {
+  return this->deprecated_;
+}
+// Public getter; delegates to _internal_deprecated().
+inline bool EnumOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.deprecated)
+  return _internal_deprecated();
+}
+// Stores `value` and marks the field present.
+inline void EnumOptions::_internal_set_deprecated(bool value) {
+  deprecated_ = value;
+  _has_bits_[0] |= 0x00000002u;
+}
+// Public setter; delegates to _internal_set_deprecated().
+inline void EnumOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumOptions.deprecated)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int EnumOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int EnumOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void EnumOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+EnumOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+EnumOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.EnumOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// EnumValueOptions
+
+// optional bool deprecated = 1 [default = false];
+// Presence check for `deprecated`: tests bit 0x00000001 of _has_bits_[0].
+inline bool EnumValueOptions::_internal_has_deprecated() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_deprecated().
+inline bool EnumValueOptions::has_deprecated() const {
+  const bool present = _internal_has_deprecated();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void EnumValueOptions::clear_deprecated() {
+  _has_bits_[0] &= ~0x00000001u;
+  deprecated_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool EnumValueOptions::_internal_deprecated() const {
+  return this->deprecated_;
+}
+// Public getter; delegates to _internal_deprecated().
+inline bool EnumValueOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumValueOptions.deprecated)
+  return _internal_deprecated();
+}
+// Stores `value` and marks the field present.
+inline void EnumValueOptions::_internal_set_deprecated(bool value) {
+  deprecated_ = value;
+  _has_bits_[0] |= 0x00000001u;
+}
+// Public setter; delegates to _internal_set_deprecated().
+inline void EnumValueOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.EnumValueOptions.deprecated)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int EnumValueOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int EnumValueOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void EnumValueOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.EnumValueOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+EnumValueOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.EnumValueOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumValueOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& EnumValueOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.EnumValueOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* EnumValueOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.EnumValueOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+EnumValueOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.EnumValueOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// ServiceOptions
+
+// optional bool deprecated = 33 [default = false];
+// Presence check for `deprecated`: tests bit 0x00000001 of _has_bits_[0].
+inline bool ServiceOptions::_internal_has_deprecated() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_deprecated().
+inline bool ServiceOptions::has_deprecated() const {
+  const bool present = _internal_has_deprecated();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void ServiceOptions::clear_deprecated() {
+  _has_bits_[0] &= ~0x00000001u;
+  deprecated_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool ServiceOptions::_internal_deprecated() const {
+  return this->deprecated_;
+}
+// Public getter; delegates to _internal_deprecated().
+inline bool ServiceOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ServiceOptions.deprecated)
+  return _internal_deprecated();
+}
+// Stores `value` and marks the field present.
+inline void ServiceOptions::_internal_set_deprecated(bool value) {
+  deprecated_ = value;
+  _has_bits_[0] |= 0x00000001u;
+}
+// Public setter; delegates to _internal_set_deprecated().
+inline void ServiceOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.ServiceOptions.deprecated)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int ServiceOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int ServiceOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void ServiceOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ServiceOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+ServiceOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.ServiceOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ServiceOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& ServiceOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ServiceOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* ServiceOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.ServiceOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+ServiceOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.ServiceOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// MethodOptions
+
+// optional bool deprecated = 33 [default = false];
+// Presence check for `deprecated`: tests bit 0x00000001 of _has_bits_[0].
+inline bool MethodOptions::_internal_has_deprecated() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_deprecated().
+inline bool MethodOptions::has_deprecated() const {
+  const bool present = _internal_has_deprecated();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void MethodOptions::clear_deprecated() {
+  _has_bits_[0] &= ~0x00000001u;
+  deprecated_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool MethodOptions::_internal_deprecated() const {
+  return this->deprecated_;
+}
+// Public getter; delegates to _internal_deprecated().
+inline bool MethodOptions::deprecated() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.deprecated)
+  return _internal_deprecated();
+}
+// Stores `value` and marks the field present.
+inline void MethodOptions::_internal_set_deprecated(bool value) {
+  deprecated_ = value;
+  _has_bits_[0] |= 0x00000001u;
+}
+// Public setter; delegates to _internal_set_deprecated().
+inline void MethodOptions::set_deprecated(bool value) {
+  _internal_set_deprecated(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodOptions.deprecated)
+}
+
+// optional .google.protobuf.MethodOptions.IdempotencyLevel idempotency_level = 34 [default = IDEMPOTENCY_UNKNOWN];
+// Presence check for `idempotency_level`: tests bit 0x00000002 of _has_bits_[0].
+inline bool MethodOptions::_internal_has_idempotency_level() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_idempotency_level().
+inline bool MethodOptions::has_idempotency_level() const {
+  const bool present = _internal_has_idempotency_level();
+  return present;
+}
+// Clears the presence bit and resets the stored value to 0.
+inline void MethodOptions::clear_idempotency_level() {
+  _has_bits_[0] &= ~0x00000002u;
+  idempotency_level_ = 0;
+}
+// Returns the stored value cast to the MethodOptions_IdempotencyLevel enum type.
+inline PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel MethodOptions::_internal_idempotency_level() const {
+  const PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel stored =
+      static_cast< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel >(idempotency_level_);
+  return stored;
+}
+// Public getter; delegates to _internal_idempotency_level().
+inline PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel MethodOptions::idempotency_level() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.idempotency_level)
+  return _internal_idempotency_level();
+}
+// Asserts that `value` passes MethodOptions_IdempotencyLevel_IsValid, then
+// stores it and marks the field present.
+inline void MethodOptions::_internal_set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value) {
+  assert(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel_IsValid(value));
+  idempotency_level_ = value;
+  _has_bits_[0] |= 0x00000002u;
+}
+// Public setter; delegates to _internal_set_idempotency_level().
+inline void MethodOptions::set_idempotency_level(PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel value) {
+  _internal_set_idempotency_level(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.MethodOptions.idempotency_level)
+}
+
+// repeated .google.protobuf.UninterpretedOption uninterpreted_option = 999;
+// Element count reported by uninterpreted_option_.size().
+inline int MethodOptions::_internal_uninterpreted_option_size() const {
+  return this->uninterpreted_option_.size();
+}
+// Public size accessor; delegates to the _internal_ variant.
+inline int MethodOptions::uninterpreted_option_size() const {
+  const int count = _internal_uninterpreted_option_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void MethodOptions::clear_uninterpreted_option() {
+  this->uninterpreted_option_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::mutable_uninterpreted_option(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.MethodOptions.uninterpreted_option)
+  return this->uninterpreted_option_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >*
+MethodOptions::mutable_uninterpreted_option() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.MethodOptions.uninterpreted_option)
+  return &this->uninterpreted_option_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MethodOptions::_internal_uninterpreted_option(int index) const {
+  return this->uninterpreted_option_.Get(index);
+}
+// Public indexed getter; delegates to the _internal_ variant.
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption& MethodOptions::uninterpreted_option(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.MethodOptions.uninterpreted_option)
+  return _internal_uninterpreted_option(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::_internal_add_uninterpreted_option() {
+  return this->uninterpreted_option_.Add();
+}
+// Public appender; delegates to the _internal_ variant.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption* MethodOptions::add_uninterpreted_option() {
+  // @@protoc_insertion_point(field_add:google.protobuf.MethodOptions.uninterpreted_option)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption* const appended = _internal_add_uninterpreted_option();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption >&
+MethodOptions::uninterpreted_option() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.MethodOptions.uninterpreted_option)
+  return this->uninterpreted_option_;
+}
+
+// -------------------------------------------------------------------
+
+// UninterpretedOption_NamePart
+
+// required string name_part = 1;
+// Presence check for `name_part`: tests bit 0x00000001 of _has_bits_[0].
+inline bool UninterpretedOption_NamePart::_internal_has_name_part() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_name_part().
+inline bool UninterpretedOption_NamePart::has_name_part() const {
+  const bool present = _internal_has_name_part();
+  return present;
+}
+// Calls ClearToEmpty() on the string holder, then clears the presence bit.
+inline void UninterpretedOption_NamePart::clear_name_part() {
+  name_part_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+// Public getter; delegates to _internal_name_part().
+inline const std::string& UninterpretedOption_NamePart::name_part() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.NamePart.name_part)
+  return _internal_name_part();
+}
+// Public copy-setter; delegates to _internal_set_name_part().
+inline void UninterpretedOption_NamePart::set_name_part(const std::string& value) {
+  _internal_set_name_part(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.NamePart.name_part)
+}
+// Public mutable accessor; delegates to _internal_mutable_name_part().
+inline std::string* UninterpretedOption_NamePart::mutable_name_part() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.NamePart.name_part)
+  return _internal_mutable_name_part();
+}
+// Returns whatever name_part_.Get() yields; performs no presence check.
+inline const std::string& UninterpretedOption_NamePart::_internal_name_part() const {
+  return this->name_part_.Get();
+}
+// Marks the field present, then stores `value` via name_part_.Set().
+inline void UninterpretedOption_NamePart::_internal_set_name_part(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_part_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+// Rvalue setter: marks the field present and passes std::move(value) to Set().
+inline void UninterpretedOption_NamePart::set_name_part(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  name_part_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.NamePart.name_part)
+}
+// C-string setter: GOOGLE_DCHECKs that `value` is non-null, then stores a
+// std::string built from it.
+inline void UninterpretedOption_NamePart::set_name_part(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  name_part_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.NamePart.name_part)
+}
+// Pointer+length setter: stores a std::string built from the first `size`
+// chars at `value`. Unlike the C-string overload, no null check is made here.
+inline void UninterpretedOption_NamePart::set_name_part(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  // `value` is already `const char*`, so it is passed straight to the
+  // std::string constructor.
+  name_part_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size),
+      GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.NamePart.name_part)
+}
+// Marks the field present and returns the pointer from name_part_.Mutable().
+inline std::string* UninterpretedOption_NamePart::_internal_mutable_name_part() {
+  _has_bits_[0] |= 0x00000001u;
+  return name_part_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+// Returns nullptr when the field is not present; otherwise clears the
+// presence bit and returns the result of ReleaseNonDefault().
+inline std::string* UninterpretedOption_NamePart::release_name_part() {
+  // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.NamePart.name_part)
+  if (_internal_has_name_part()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return name_part_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  }
+  return nullptr;
+}
+// Sets the presence bit iff `name_part` is non-null, then hands the pointer
+// to name_part_.SetAllocated().
+inline void UninterpretedOption_NamePart::set_allocated_name_part(std::string* name_part) {
+  if (name_part == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  name_part_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      name_part,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.NamePart.name_part)
+}
+
+// required bool is_extension = 2;
+// Presence check for `is_extension`: tests bit 0x00000002 of _has_bits_[0].
+inline bool UninterpretedOption_NamePart::_internal_has_is_extension() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_is_extension().
+inline bool UninterpretedOption_NamePart::has_is_extension() const {
+  const bool present = _internal_has_is_extension();
+  return present;
+}
+// Clears the presence bit and resets the stored value to false.
+inline void UninterpretedOption_NamePart::clear_is_extension() {
+  _has_bits_[0] &= ~0x00000002u;
+  is_extension_ = false;
+}
+// Raw read of the stored value; performs no presence check.
+inline bool UninterpretedOption_NamePart::_internal_is_extension() const {
+  return this->is_extension_;
+}
+// Public getter; delegates to _internal_is_extension().
+inline bool UninterpretedOption_NamePart::is_extension() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.NamePart.is_extension)
+  return _internal_is_extension();
+}
+// Stores `value` and marks the field present.
+inline void UninterpretedOption_NamePart::_internal_set_is_extension(bool value) {
+  is_extension_ = value;
+  _has_bits_[0] |= 0x00000002u;
+}
+// Public setter; delegates to _internal_set_is_extension().
+inline void UninterpretedOption_NamePart::set_is_extension(bool value) {
+  _internal_set_is_extension(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.NamePart.is_extension)
+}
+
+// -------------------------------------------------------------------
+
+// UninterpretedOption
+
+// repeated .google.protobuf.UninterpretedOption.NamePart name = 2;
+// Element count reported by name_.size().
+inline int UninterpretedOption::_internal_name_size() const {
+  return this->name_.size();
+}
+// Public size accessor; delegates to _internal_name_size().
+inline int UninterpretedOption::name_size() const {
+  const int count = _internal_name_size();
+  return count;
+}
+// Calls Clear() on the repeated container.
+inline void UninterpretedOption::clear_name() {
+  this->name_.Clear();
+}
+// Mutable pointer to the element at `index`, as returned by Mutable(index).
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::mutable_name(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.name)
+  return this->name_.Mutable(index);
+}
+// Mutable pointer to the repeated container itself.
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >*
+UninterpretedOption::mutable_name() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.UninterpretedOption.name)
+  return &this->name_;
+}
+// Const reference to the element at `index`, as returned by Get(index).
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& UninterpretedOption::_internal_name(int index) const {
+  return this->name_.Get(index);
+}
+// Public indexed getter; delegates to _internal_name().
+inline const PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart& UninterpretedOption::name(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.name)
+  return _internal_name(index);
+}
+// Calls Add() on the container and returns the element pointer it yields.
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::_internal_add_name() {
+  return this->name_.Add();
+}
+// Public appender; delegates to _internal_add_name().
+inline PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* UninterpretedOption::add_name() {
+  // @@protoc_insertion_point(field_add:google.protobuf.UninterpretedOption.name)
+  PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart* const appended = _internal_add_name();
+  return appended;
+}
+// Read-only view of the whole repeated container.
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::UninterpretedOption_NamePart >&
+UninterpretedOption::name() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.UninterpretedOption.name)
+  return this->name_;
+}
+
+// optional string identifier_value = 3;
+// Presence check for `identifier_value`: tests bit 0x00000001 of _has_bits_[0].
+inline bool UninterpretedOption::_internal_has_identifier_value() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+// Public presence accessor; delegates to _internal_has_identifier_value().
+inline bool UninterpretedOption::has_identifier_value() const {
+  const bool present = _internal_has_identifier_value();
+  return present;
+}
+// Calls ClearToEmpty() on the string holder, then clears the presence bit.
+inline void UninterpretedOption::clear_identifier_value() {
+  identifier_value_.ClearToEmpty(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+// Public getter; delegates to _internal_identifier_value().
+inline const std::string& UninterpretedOption::identifier_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.identifier_value)
+  return _internal_identifier_value();
+}
+// Public copy-setter; delegates to _internal_set_identifier_value().
+inline void UninterpretedOption::set_identifier_value(const std::string& value) {
+  _internal_set_identifier_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.identifier_value)
+}
+// Public mutable accessor; delegates to _internal_mutable_identifier_value().
+inline std::string* UninterpretedOption::mutable_identifier_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.identifier_value)
+  return _internal_mutable_identifier_value();
+}
+// Returns whatever identifier_value_.Get() yields; performs no presence check.
+inline const std::string& UninterpretedOption::_internal_identifier_value() const {
+  return this->identifier_value_.Get();
+}
+// Marks the field present, then stores `value` via identifier_value_.Set().
+inline void UninterpretedOption::_internal_set_identifier_value(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  identifier_value_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+// Rvalue setter: marks the field present and passes std::move(value) to Set().
+inline void UninterpretedOption::set_identifier_value(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  identifier_value_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::move(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.identifier_value)
+}
+// C-string setter: GOOGLE_DCHECKs that `value` is non-null, then stores a
+// std::string built from it.
+inline void UninterpretedOption::set_identifier_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  identifier_value_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value),
+      GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.identifier_value)
+}
+// Pointer+length setter: stores a std::string built from the first `size`
+// chars at `value`. Unlike the C-string overload, no null check is made here.
+inline void UninterpretedOption::set_identifier_value(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  // `value` is already `const char*`, so it is passed straight to the
+  // std::string constructor.
+  identifier_value_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value, size),
+      GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.identifier_value)
+}
+// Marks the field present and returns the pointer from identifier_value_.Mutable().
+inline std::string* UninterpretedOption::_internal_mutable_identifier_value() {
+  _has_bits_[0] |= 0x00000001u;
+  return identifier_value_.Mutable(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+// Returns nullptr when the field is not present; otherwise clears the
+// presence bit and returns the result of ReleaseNonDefault().
+inline std::string* UninterpretedOption::release_identifier_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.identifier_value)
+  if (_internal_has_identifier_value()) {
+    _has_bits_[0] &= ~0x00000001u;
+    return identifier_value_.ReleaseNonDefault(
+        &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  }
+  return nullptr;
+}
+// Sets the presence bit iff `identifier_value` is non-null, then hands the
+// pointer to identifier_value_.SetAllocated().
+inline void UninterpretedOption::set_allocated_identifier_value(std::string* identifier_value) {
+  if (identifier_value == nullptr) {
+    _has_bits_[0] &= ~0x00000001u;
+  } else {
+    _has_bits_[0] |= 0x00000001u;
+  }
+  identifier_value_.SetAllocated(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      identifier_value,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.identifier_value)
+}
+
+// optional uint64 positive_int_value = 4;
+inline bool UninterpretedOption::_internal_has_positive_int_value() const {
+  bool value = (_has_bits_[0] & 0x00000008u) != 0;
+  return value;
+}
+inline bool UninterpretedOption::has_positive_int_value() const {
+  return _internal_has_positive_int_value();
+}
+inline void UninterpretedOption::clear_positive_int_value() {
+  positive_int_value_ = PROTOBUF_ULONGLONG(0);
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::uint64 UninterpretedOption::_internal_positive_int_value() const {
+  return positive_int_value_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::uint64 UninterpretedOption::positive_int_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.positive_int_value)
+  return _internal_positive_int_value();
+}
+inline void UninterpretedOption::_internal_set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value) {
+  _has_bits_[0] |= 0x00000008u;
+  positive_int_value_ = value;
+}
+inline void UninterpretedOption::set_positive_int_value(::PROTOBUF_NAMESPACE_ID::uint64 value) {
+  _internal_set_positive_int_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.positive_int_value)
+}
+
+// optional int64 negative_int_value = 5;
+inline bool UninterpretedOption::_internal_has_negative_int_value() const {
+  bool value = (_has_bits_[0] & 0x00000010u) != 0;
+  return value;
+}
+inline bool UninterpretedOption::has_negative_int_value() const {
+  return _internal_has_negative_int_value();
+}
+inline void UninterpretedOption::clear_negative_int_value() {
+  negative_int_value_ = PROTOBUF_LONGLONG(0);
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 UninterpretedOption::_internal_negative_int_value() const {
+  return negative_int_value_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 UninterpretedOption::negative_int_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.negative_int_value)
+  return _internal_negative_int_value();
+}
+inline void UninterpretedOption::_internal_set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  _has_bits_[0] |= 0x00000010u;
+  negative_int_value_ = value;
+}
+inline void UninterpretedOption::set_negative_int_value(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  _internal_set_negative_int_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.negative_int_value)
+}
+
+// optional double double_value = 6;
+inline bool UninterpretedOption::_internal_has_double_value() const {
+  bool value = (_has_bits_[0] & 0x00000020u) != 0;
+  return value;
+}
+inline bool UninterpretedOption::has_double_value() const {
+  return _internal_has_double_value();
+}
+inline void UninterpretedOption::clear_double_value() {
+  double_value_ = 0;
+  _has_bits_[0] &= ~0x00000020u;
+}
+inline double UninterpretedOption::_internal_double_value() const {
+  return double_value_;
+}
+inline double UninterpretedOption::double_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.double_value)
+  return _internal_double_value();
+}
+inline void UninterpretedOption::_internal_set_double_value(double value) {
+  _has_bits_[0] |= 0x00000020u;
+  double_value_ = value;
+}
+inline void UninterpretedOption::set_double_value(double value) {
+  _internal_set_double_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.double_value)
+}
+
+// optional bytes string_value = 7;
+inline bool UninterpretedOption::_internal_has_string_value() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool UninterpretedOption::has_string_value() const {
+  return _internal_has_string_value();
+}
+inline void UninterpretedOption::clear_string_value() {
+  string_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const std::string& UninterpretedOption::string_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.string_value)
+  return _internal_string_value();
+}
+inline void UninterpretedOption::set_string_value(const std::string& value) {
+  _internal_set_string_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.string_value)
+}
+inline std::string* UninterpretedOption::mutable_string_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.string_value)
+  return _internal_mutable_string_value();
+}
+inline const std::string& UninterpretedOption::_internal_string_value() const {
+  return string_value_.Get();
+}
+inline void UninterpretedOption::_internal_set_string_value(const std::string& value) {
+  _has_bits_[0] |= 0x00000002u;
+  string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void UninterpretedOption::set_string_value(std::string&& value) {
+  _has_bits_[0] |= 0x00000002u;
+  string_value_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.string_value)
+}
+inline void UninterpretedOption::set_string_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000002u;
+  string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.string_value)
+}
+inline void UninterpretedOption::set_string_value(const void* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000002u;
+  string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.string_value)
+}
+inline std::string* UninterpretedOption::_internal_mutable_string_value() {
+  _has_bits_[0] |= 0x00000002u;
+  return string_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* UninterpretedOption::release_string_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.string_value)
+  if (!_internal_has_string_value()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000002u;
+  return string_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void UninterpretedOption::set_allocated_string_value(std::string* string_value) {
+  if (string_value != nullptr) {
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  string_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), string_value,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.string_value)
+}
+
+// optional string aggregate_value = 8;
+inline bool UninterpretedOption::_internal_has_aggregate_value() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool UninterpretedOption::has_aggregate_value() const {
+  return _internal_has_aggregate_value();
+}
+inline void UninterpretedOption::clear_aggregate_value() {
+  aggregate_value_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline const std::string& UninterpretedOption::aggregate_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.UninterpretedOption.aggregate_value)
+  return _internal_aggregate_value();
+}
+inline void UninterpretedOption::set_aggregate_value(const std::string& value) {
+  _internal_set_aggregate_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.UninterpretedOption.aggregate_value)
+}
+inline std::string* UninterpretedOption::mutable_aggregate_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.UninterpretedOption.aggregate_value)
+  return _internal_mutable_aggregate_value();
+}
+inline const std::string& UninterpretedOption::_internal_aggregate_value() const {
+  return aggregate_value_.Get();
+}
+inline void UninterpretedOption::_internal_set_aggregate_value(const std::string& value) {
+  _has_bits_[0] |= 0x00000004u;
+  aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void UninterpretedOption::set_aggregate_value(std::string&& value) {
+  _has_bits_[0] |= 0x00000004u;
+  aggregate_value_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.UninterpretedOption.aggregate_value)
+}
+inline void UninterpretedOption::set_aggregate_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000004u;
+  aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.UninterpretedOption.aggregate_value)
+}
+inline void UninterpretedOption::set_aggregate_value(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000004u;
+  aggregate_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.UninterpretedOption.aggregate_value)
+}
+inline std::string* UninterpretedOption::_internal_mutable_aggregate_value() {
+  _has_bits_[0] |= 0x00000004u;
+  return aggregate_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* UninterpretedOption::release_aggregate_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.UninterpretedOption.aggregate_value)
+  if (!_internal_has_aggregate_value()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000004u;
+  return aggregate_value_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void UninterpretedOption::set_allocated_aggregate_value(std::string* aggregate_value) {
+  if (aggregate_value != nullptr) {
+    _has_bits_[0] |= 0x00000004u;
+  } else {
+    _has_bits_[0] &= ~0x00000004u;
+  }
+  aggregate_value_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), aggregate_value,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.UninterpretedOption.aggregate_value)
+}
+
+// -------------------------------------------------------------------
+
+// SourceCodeInfo_Location
+
+// repeated int32 path = 1 [packed = true];
+inline int SourceCodeInfo_Location::_internal_path_size() const {
+  return path_.size();
+}
+inline int SourceCodeInfo_Location::path_size() const {
+  return _internal_path_size();
+}
+inline void SourceCodeInfo_Location::clear_path() {
+  path_.Clear();
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::_internal_path(int index) const {
+  return path_.Get(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::path(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.path)
+  return _internal_path(index);
+}
+inline void SourceCodeInfo_Location::set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  path_.Set(index, value);
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.path)
+}
+inline void SourceCodeInfo_Location::_internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  path_.Add(value);
+}
+inline void SourceCodeInfo_Location::add_path(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_add_path(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.path)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+SourceCodeInfo_Location::_internal_path() const {
+  return path_;
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+SourceCodeInfo_Location::path() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.path)
+  return _internal_path();
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+SourceCodeInfo_Location::_internal_mutable_path() {
+  return &path_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+SourceCodeInfo_Location::mutable_path() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.path)
+  return _internal_mutable_path();
+}
+
+// repeated int32 span = 2 [packed = true];
+inline int SourceCodeInfo_Location::_internal_span_size() const {
+  return span_.size();
+}
+inline int SourceCodeInfo_Location::span_size() const {
+  return _internal_span_size();
+}
+inline void SourceCodeInfo_Location::clear_span() {
+  span_.Clear();
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::_internal_span(int index) const {
+  return span_.Get(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 SourceCodeInfo_Location::span(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.span)
+  return _internal_span(index);
+}
+inline void SourceCodeInfo_Location::set_span(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  span_.Set(index, value);
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.span)
+}
+inline void SourceCodeInfo_Location::_internal_add_span(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  span_.Add(value);
+}
+inline void SourceCodeInfo_Location::add_span(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_add_span(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.span)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+SourceCodeInfo_Location::_internal_span() const {
+  return span_;
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+SourceCodeInfo_Location::span() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.span)
+  return _internal_span();
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+SourceCodeInfo_Location::_internal_mutable_span() {
+  return &span_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+SourceCodeInfo_Location::mutable_span() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.span)
+  return _internal_mutable_span();
+}
+
+// optional string leading_comments = 3;
+inline bool SourceCodeInfo_Location::_internal_has_leading_comments() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool SourceCodeInfo_Location::has_leading_comments() const {
+  return _internal_has_leading_comments();
+}
+inline void SourceCodeInfo_Location::clear_leading_comments() {
+  leading_comments_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& SourceCodeInfo_Location::leading_comments() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.leading_comments)
+  return _internal_leading_comments();
+}
+inline void SourceCodeInfo_Location::set_leading_comments(const std::string& value) {
+  _internal_set_leading_comments(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_comments)
+}
+inline std::string* SourceCodeInfo_Location::mutable_leading_comments() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.leading_comments)
+  return _internal_mutable_leading_comments();
+}
+inline const std::string& SourceCodeInfo_Location::_internal_leading_comments() const {
+  return leading_comments_.Get();
+}
+inline void SourceCodeInfo_Location::_internal_set_leading_comments(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void SourceCodeInfo_Location::set_leading_comments(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  leading_comments_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceCodeInfo.Location.leading_comments)
+}
+inline void SourceCodeInfo_Location::set_leading_comments(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.leading_comments)
+}
+inline void SourceCodeInfo_Location::set_leading_comments(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  leading_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.leading_comments)
+}
+inline std::string* SourceCodeInfo_Location::_internal_mutable_leading_comments() {
+  _has_bits_[0] |= 0x00000001u;
+  return leading_comments_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* SourceCodeInfo_Location::release_leading_comments() {
+  // @@protoc_insertion_point(field_release:google.protobuf.SourceCodeInfo.Location.leading_comments)
+  if (!_internal_has_leading_comments()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return leading_comments_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void SourceCodeInfo_Location::set_allocated_leading_comments(std::string* leading_comments) {
+  if (leading_comments != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  leading_comments_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), leading_comments,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceCodeInfo.Location.leading_comments)
+}
+
+// optional string trailing_comments = 4;
+inline bool SourceCodeInfo_Location::_internal_has_trailing_comments() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool SourceCodeInfo_Location::has_trailing_comments() const {
+  return _internal_has_trailing_comments();
+}
+inline void SourceCodeInfo_Location::clear_trailing_comments() {
+  trailing_comments_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline const std::string& SourceCodeInfo_Location::trailing_comments() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+  return _internal_trailing_comments();
+}
+inline void SourceCodeInfo_Location::set_trailing_comments(const std::string& value) {
+  _internal_set_trailing_comments(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+}
+inline std::string* SourceCodeInfo_Location::mutable_trailing_comments() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+  return _internal_mutable_trailing_comments();
+}
+inline const std::string& SourceCodeInfo_Location::_internal_trailing_comments() const {
+  return trailing_comments_.Get();
+}
+inline void SourceCodeInfo_Location::_internal_set_trailing_comments(const std::string& value) {
+  _has_bits_[0] |= 0x00000002u;
+  trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void SourceCodeInfo_Location::set_trailing_comments(std::string&& value) {
+  _has_bits_[0] |= 0x00000002u;
+  trailing_comments_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+}
+inline void SourceCodeInfo_Location::set_trailing_comments(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000002u;
+  trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+}
+inline void SourceCodeInfo_Location::set_trailing_comments(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000002u;
+  trailing_comments_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+}
+inline std::string* SourceCodeInfo_Location::_internal_mutable_trailing_comments() {
+  _has_bits_[0] |= 0x00000002u;
+  return trailing_comments_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* SourceCodeInfo_Location::release_trailing_comments() {
+  // @@protoc_insertion_point(field_release:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+  if (!_internal_has_trailing_comments()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000002u;
+  return trailing_comments_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void SourceCodeInfo_Location::set_allocated_trailing_comments(std::string* trailing_comments) {
+  if (trailing_comments != nullptr) {
+    _has_bits_[0] |= 0x00000002u;
+  } else {
+    _has_bits_[0] &= ~0x00000002u;
+  }
+  trailing_comments_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), trailing_comments,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceCodeInfo.Location.trailing_comments)
+}
+
+// repeated string leading_detached_comments = 6;
+inline int SourceCodeInfo_Location::_internal_leading_detached_comments_size() const {
+  return leading_detached_comments_.size();
+}
+inline int SourceCodeInfo_Location::leading_detached_comments_size() const {
+  return _internal_leading_detached_comments_size();
+}
+inline void SourceCodeInfo_Location::clear_leading_detached_comments() {
+  leading_detached_comments_.Clear();
+}
+inline std::string* SourceCodeInfo_Location::add_leading_detached_comments() {
+  // @@protoc_insertion_point(field_add_mutable:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  return _internal_add_leading_detached_comments();
+}
+inline const std::string& SourceCodeInfo_Location::_internal_leading_detached_comments(int index) const {
+  return leading_detached_comments_.Get(index);
+}
+inline const std::string& SourceCodeInfo_Location::leading_detached_comments(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  return _internal_leading_detached_comments(index);
+}
+inline std::string* SourceCodeInfo_Location::mutable_leading_detached_comments(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  return leading_detached_comments_.Mutable(index);
+}
+inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const std::string& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  leading_detached_comments_.Mutable(index)->assign(value);
+}
+inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  leading_detached_comments_.Mutable(index)->assign(std::move(value));
+}
+inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  leading_detached_comments_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline void SourceCodeInfo_Location::set_leading_detached_comments(int index, const char* value, size_t size) {
+  leading_detached_comments_.Mutable(index)->assign(
+    reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline std::string* SourceCodeInfo_Location::_internal_add_leading_detached_comments() {
+  return leading_detached_comments_.Add();
+}
+inline void SourceCodeInfo_Location::add_leading_detached_comments(const std::string& value) {
+  leading_detached_comments_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline void SourceCodeInfo_Location::add_leading_detached_comments(std::string&& value) {
+  leading_detached_comments_.Add(std::move(value));
+  // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline void SourceCodeInfo_Location::add_leading_detached_comments(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  leading_detached_comments_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline void SourceCodeInfo_Location::add_leading_detached_comments(const char* value, size_t size) {
+  leading_detached_comments_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>&
+SourceCodeInfo_Location::leading_detached_comments() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  return leading_detached_comments_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>*
+SourceCodeInfo_Location::mutable_leading_detached_comments() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.Location.leading_detached_comments)
+  return &leading_detached_comments_;
+}
+
+// -------------------------------------------------------------------
+
+// SourceCodeInfo
+
+// repeated .google.protobuf.SourceCodeInfo.Location location = 1;
+inline int SourceCodeInfo::_internal_location_size() const {
+  return location_.size();
+}
+inline int SourceCodeInfo::location_size() const {
+  return _internal_location_size();
+}
+inline void SourceCodeInfo::clear_location() {
+  location_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::mutable_location(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.SourceCodeInfo.location)
+  return location_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >*
+SourceCodeInfo::mutable_location() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.SourceCodeInfo.location)
+  return &location_;
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& SourceCodeInfo::_internal_location(int index) const {
+  return location_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location& SourceCodeInfo::location(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceCodeInfo.location)
+  return _internal_location(index);
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::_internal_add_location() {
+  return location_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location* SourceCodeInfo::add_location() {
+  // @@protoc_insertion_point(field_add:google.protobuf.SourceCodeInfo.location)
+  return _internal_add_location();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::SourceCodeInfo_Location >&
+SourceCodeInfo::location() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.SourceCodeInfo.location)
+  return location_;
+}
+
+// -------------------------------------------------------------------
+
+// GeneratedCodeInfo_Annotation
+
+// repeated int32 path = 1 [packed = true];
+inline int GeneratedCodeInfo_Annotation::_internal_path_size() const {
+  return path_.size();
+}
+inline int GeneratedCodeInfo_Annotation::path_size() const {
+  return _internal_path_size();
+}
+inline void GeneratedCodeInfo_Annotation::clear_path() {
+  path_.Clear();
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_path(int index) const {
+  return path_.Get(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::path(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.path)
+  return _internal_path(index);
+}
+inline void GeneratedCodeInfo_Annotation::set_path(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) {
+  path_.Set(index, value);
+  // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.path)
+}
+inline void GeneratedCodeInfo_Annotation::_internal_add_path(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  path_.Add(value);
+}
+inline void GeneratedCodeInfo_Annotation::add_path(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_add_path(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.GeneratedCodeInfo.Annotation.path)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+GeneratedCodeInfo_Annotation::_internal_path() const {
+  return path_;
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >&
+GeneratedCodeInfo_Annotation::path() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.GeneratedCodeInfo.Annotation.path)
+  return _internal_path();
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+GeneratedCodeInfo_Annotation::_internal_mutable_path() {
+  return &path_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >*
+GeneratedCodeInfo_Annotation::mutable_path() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.GeneratedCodeInfo.Annotation.path)
+  return _internal_mutable_path();
+}
+
+// optional string source_file = 2;
+inline bool GeneratedCodeInfo_Annotation::_internal_has_source_file() const {
+  bool value = (_has_bits_[0] & 0x00000001u) != 0;
+  return value;
+}
+inline bool GeneratedCodeInfo_Annotation::has_source_file() const {
+  return _internal_has_source_file();
+}
+inline void GeneratedCodeInfo_Annotation::clear_source_file() {
+  source_file_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline const std::string& GeneratedCodeInfo_Annotation::source_file() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+  return _internal_source_file();
+}
+inline void GeneratedCodeInfo_Annotation::set_source_file(const std::string& value) {
+  _internal_set_source_file(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+}
+inline std::string* GeneratedCodeInfo_Annotation::mutable_source_file() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+  return _internal_mutable_source_file();
+}
+inline const std::string& GeneratedCodeInfo_Annotation::_internal_source_file() const {
+  return source_file_.Get();
+}
+inline void GeneratedCodeInfo_Annotation::_internal_set_source_file(const std::string& value) {
+  _has_bits_[0] |= 0x00000001u;
+  source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void GeneratedCodeInfo_Annotation::set_source_file(std::string&& value) {
+  _has_bits_[0] |= 0x00000001u;
+  source_file_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+}
+inline void GeneratedCodeInfo_Annotation::set_source_file(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  _has_bits_[0] |= 0x00000001u;
+  source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+}
+inline void GeneratedCodeInfo_Annotation::set_source_file(const char* value,
+    size_t size) {
+  _has_bits_[0] |= 0x00000001u;
+  source_file_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+}
+inline std::string* GeneratedCodeInfo_Annotation::_internal_mutable_source_file() {
+  _has_bits_[0] |= 0x00000001u;
+  return source_file_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* GeneratedCodeInfo_Annotation::release_source_file() {
+  // @@protoc_insertion_point(field_release:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+  if (!_internal_has_source_file()) {
+    return nullptr;
+  }
+  _has_bits_[0] &= ~0x00000001u;
+  return source_file_.ReleaseNonDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void GeneratedCodeInfo_Annotation::set_allocated_source_file(std::string* source_file) {
+  if (source_file != nullptr) {
+    _has_bits_[0] |= 0x00000001u;
+  } else {
+    _has_bits_[0] &= ~0x00000001u;
+  }
+  source_file_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), source_file,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.GeneratedCodeInfo.Annotation.source_file)
+}
+
+// optional int32 begin = 3;
+inline bool GeneratedCodeInfo_Annotation::_internal_has_begin() const {
+  bool value = (_has_bits_[0] & 0x00000002u) != 0;
+  return value;
+}
+inline bool GeneratedCodeInfo_Annotation::has_begin() const {
+  return _internal_has_begin();
+}
+inline void GeneratedCodeInfo_Annotation::clear_begin() {
+  begin_ = 0;
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_begin() const {
+  return begin_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::begin() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.begin)
+  return _internal_begin();
+}
+inline void GeneratedCodeInfo_Annotation::_internal_set_begin(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000002u;
+  begin_ = value;
+}
+inline void GeneratedCodeInfo_Annotation::set_begin(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_begin(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.begin)
+}
+
+// optional int32 end = 4;
+inline bool GeneratedCodeInfo_Annotation::_internal_has_end() const {
+  bool value = (_has_bits_[0] & 0x00000004u) != 0;
+  return value;
+}
+inline bool GeneratedCodeInfo_Annotation::has_end() const {
+  return _internal_has_end();
+}
+inline void GeneratedCodeInfo_Annotation::clear_end() {
+  end_ = 0;
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::_internal_end() const {
+  return end_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 GeneratedCodeInfo_Annotation::end() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.Annotation.end)
+  return _internal_end();
+}
+inline void GeneratedCodeInfo_Annotation::_internal_set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _has_bits_[0] |= 0x00000004u;
+  end_ = value;
+}
+inline void GeneratedCodeInfo_Annotation::set_end(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_end(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.GeneratedCodeInfo.Annotation.end)
+}
+
+// -------------------------------------------------------------------
+
+// GeneratedCodeInfo
+
+// repeated .google.protobuf.GeneratedCodeInfo.Annotation annotation = 1;
+inline int GeneratedCodeInfo::_internal_annotation_size() const {
+  return annotation_.size();
+}
+inline int GeneratedCodeInfo::annotation_size() const {
+  return _internal_annotation_size();
+}
+inline void GeneratedCodeInfo::clear_annotation() {
+  annotation_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::mutable_annotation(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.GeneratedCodeInfo.annotation)
+  return annotation_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >*
+GeneratedCodeInfo::mutable_annotation() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.GeneratedCodeInfo.annotation)
+  return &annotation_;
+}
+inline const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& GeneratedCodeInfo::_internal_annotation(int index) const {
+  return annotation_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation& GeneratedCodeInfo::annotation(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.GeneratedCodeInfo.annotation)
+  return _internal_annotation(index);
+}
+inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::_internal_add_annotation() {
+  return annotation_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation* GeneratedCodeInfo::add_annotation() {
+  // @@protoc_insertion_point(field_add:google.protobuf.GeneratedCodeInfo.annotation)
+  return _internal_add_annotation();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::GeneratedCodeInfo_Annotation >&
+GeneratedCodeInfo::annotation() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.GeneratedCodeInfo.annotation)
+  return annotation_;
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+PROTOBUF_NAMESPACE_OPEN
+
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type>() {
+  return PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Type_descriptor();
+}
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label>() {
+  return PROTOBUF_NAMESPACE_ID::FieldDescriptorProto_Label_descriptor();
+}
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode>() {
+  return PROTOBUF_NAMESPACE_ID::FileOptions_OptimizeMode_descriptor();
+}
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldOptions_CType> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldOptions_CType>() {
+  return PROTOBUF_NAMESPACE_ID::FieldOptions_CType_descriptor();
+}
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::FieldOptions_JSType>() {
+  return PROTOBUF_NAMESPACE_ID::FieldOptions_JSType_descriptor();
+}
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel>() {
+  return PROTOBUF_NAMESPACE_ID::MethodOptions_IdempotencyLevel_descriptor();
+}
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fdescriptor_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..769fd2ba6f068eb8d94e18fdbc7f97452d9a9e06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/duration.pb.h
@@ -0,0 +1,282 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/duration.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fduration_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fduration_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fduration_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class Duration;
+class DurationDefaultTypeInternal;
+PROTOBUF_EXPORT extern DurationDefaultTypeInternal _Duration_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Duration* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Duration>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT Duration PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Duration) */ {
+ public:
+  inline Duration() : Duration(nullptr) {}
+  virtual ~Duration();
+
+  Duration(const Duration& from);
+  Duration(Duration&& from) noexcept
+    : Duration() {
+    *this = ::std::move(from);
+  }
+
+  inline Duration& operator=(const Duration& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Duration& operator=(Duration&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Duration& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Duration* internal_default_instance() {
+    return reinterpret_cast<const Duration*>(
+               &_Duration_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  friend void swap(Duration& a, Duration& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Duration* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Duration* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Duration* New() const final {
+    return CreateMaybeMessage<Duration>(nullptr);
+  }
+
+  Duration* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Duration>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Duration& from);
+  void MergeFrom(const Duration& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Duration* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Duration";
+  }
+  protected:
+  explicit Duration(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fduration_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fduration_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kSecondsFieldNumber = 1,
+    kNanosFieldNumber = 2,
+  };
+  // int64 seconds = 1;
+  void clear_seconds();
+  ::PROTOBUF_NAMESPACE_ID::int64 seconds() const;
+  void set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int64 _internal_seconds() const;
+  void _internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value);
+  public:
+
+  // int32 nanos = 2;
+  void clear_nanos();
+  ::PROTOBUF_NAMESPACE_ID::int32 nanos() const;
+  void set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_nanos() const;
+  void _internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Duration)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::int64 seconds_;
+  ::PROTOBUF_NAMESPACE_ID::int32 nanos_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fduration_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// Duration
+
+// int64 seconds = 1;
+inline void Duration::clear_seconds() {
+  seconds_ = PROTOBUF_LONGLONG(0);
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 Duration::_internal_seconds() const {
+  return seconds_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 Duration::seconds() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Duration.seconds)
+  return _internal_seconds();
+}
+inline void Duration::_internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  
+  seconds_ = value;
+}
+inline void Duration::set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  _internal_set_seconds(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Duration.seconds)
+}
+
+// int32 nanos = 2;
+inline void Duration::clear_nanos() {
+  nanos_ = 0;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 Duration::_internal_nanos() const {
+  return nanos_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 Duration::nanos() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Duration.nanos)
+  return _internal_nanos();
+}
+inline void Duration::_internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  
+  nanos_ = value;
+}
+inline void Duration::set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_nanos(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Duration.nanos)
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fduration_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ccde906f78b2181cd9aeaa3733bdb7fc3b3dc67
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/empty.pb.h
@@ -0,0 +1,218 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/empty.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fempty_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fempty_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fempty_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class Empty;
+class EmptyDefaultTypeInternal;
+PROTOBUF_EXPORT extern EmptyDefaultTypeInternal _Empty_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Empty* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Empty>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT Empty PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Empty) */ {
+ public:
+  inline Empty() : Empty(nullptr) {}
+  virtual ~Empty();
+
+  Empty(const Empty& from);
+  Empty(Empty&& from) noexcept
+    : Empty() {
+    *this = ::std::move(from);
+  }
+
+  inline Empty& operator=(const Empty& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Empty& operator=(Empty&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Empty& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Empty* internal_default_instance() {
+    return reinterpret_cast<const Empty*>(
+               &_Empty_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  friend void swap(Empty& a, Empty& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Empty* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Empty* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Empty* New() const final {
+    return CreateMaybeMessage<Empty>(nullptr);
+  }
+
+  Empty* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Empty>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Empty& from);
+  void MergeFrom(const Empty& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Empty* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Empty";
+  }
+  protected:
+  explicit Empty(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fempty_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fempty_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Empty)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fempty_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// Empty
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fempty_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h
new file mode 100644
index 0000000000000000000000000000000000000000..64257d58ffef9d1094a797b0ec9ce315315ee42f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_reflection.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: jasonh@google.com (Jason Hsueh)
+//
+// This header is logically internal, but is made public because it is used
+// from protocol-compiler-generated code, which may reside in other components.
+// It provides reflection support for generated enums, and is included in
+// generated .pb.h files and should have minimal dependencies. The methods are
+// implemented in generated_message_reflection.cc.
+
+#ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__
+#define GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__
+
+#include <string>
+
+#include <google/protobuf/generated_enum_util.h>
+#include <google/protobuf/port.h>
+#include <google/protobuf/stubs/strutil.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+class EnumDescriptor;
+}  // namespace protobuf
+}  // namespace google
+
+namespace google {
+namespace protobuf {
+
+// Returns the EnumDescriptor for enum type E, which must be a
+// proto-declared enum type.  Code generated by the protocol compiler
+// will include specializations of this template for each enum type declared.
+template <typename E>
+const EnumDescriptor* GetEnumDescriptor();
+
+namespace internal {
+
+// Helper for EnumType_Parse functions: try to parse the string 'name' as
+// an enum name of the given type, returning true and filling in value on
+// success, or returning false and leaving value unchanged on failure.
+PROTOBUF_EXPORT bool ParseNamedEnum(const EnumDescriptor* descriptor,
+                                    ConstStringParam name, int* value);
+
+template <typename EnumType>
+bool ParseNamedEnum(const EnumDescriptor* descriptor, ConstStringParam name,
+                    EnumType* value) {
+  int tmp;  // filled in by the int* overload declared above
+  if (!ParseNamedEnum(descriptor, name, &tmp)) return false;  // *value untouched
+  *value = static_cast<EnumType>(tmp);  // convert the parsed int to EnumType
+  return true;
+}
+
+// Just a wrapper around printing the name of a value. The main point of this
+// function is not to be inlined, so that you can do this without including
+// descriptor.h.
+PROTOBUF_EXPORT const std::string& NameOfEnum(const EnumDescriptor* descriptor,
+                                              int value);
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_GENERATED_ENUM_REFLECTION_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..45f5083336bebfda4e5dd65dcb1e68c9e6196daf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_enum_util.h
@@ -0,0 +1,88 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__
+#define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__
+
+#include <type_traits>
+
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/stubs/strutil.h>
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+
+// This type trait can be used to cause templates to only match proto2 enum
+// types.
+template <typename T>
+struct is_proto_enum : ::std::false_type {};  // primary template: false for any T
+
+namespace internal {
+
+// The table entry format for storing enum name-to-value mapping used with lite
+// protos. This struct and the following related functions should only be used
+// by protobuf generated code.
+struct EnumEntry {
+  StringPiece name;  // enum value name
+  int value;  // numeric value paired with `name`
+};
+
+// Looks up a numeric enum value given the string name.
+PROTOBUF_EXPORT bool LookUpEnumValue(const EnumEntry* enums, size_t size,
+                                     StringPiece name, int* value);
+
+// Looks up an enum name given the numeric value.
+PROTOBUF_EXPORT int LookUpEnumName(const EnumEntry* enums,
+                                   const int* sorted_indices, size_t size,
+                                   int value);
+
+// Initializes the list of enum names in std::string form.
+PROTOBUF_EXPORT bool InitializeEnumStrings(
+    const EnumEntry* enums, const int* sorted_indices, size_t size,
+    internal::ExplicitlyConstructed<std::string>* enum_strings);
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b68e93b9b782a74eefcb0cddf844ec0f6a4da8b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_util.h
@@ -0,0 +1,265 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This file contains miscellaneous helper code used by generated code --
+// including lite types -- but which should not be used directly by users.
+
+#ifndef GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
+#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
+
+#include <assert.h>
+
+#include <atomic>
+#include <climits>
+#include <string>
+#include <vector>
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/any.h>
+#include <google/protobuf/has_bits.h>
+#include <google/protobuf/implicit_weak_message.h>
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/stubs/once.h>  // Add direct dep on port for pb.cc
+#include <google/protobuf/port.h>
+#include <google/protobuf/repeated_field.h>
+#include <google/protobuf/wire_format_lite.h>
+#include <google/protobuf/stubs/strutil.h>
+#include <google/protobuf/stubs/casts.h>
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+
+class Arena;
+class Message;
+
+namespace io {
+class CodedInputStream;
+}
+
+namespace internal {
+
+template <typename To, typename From>
+inline To DownCast(From* f) {  // pointer overload
+  return PROTOBUF_NAMESPACE_ID::internal::down_cast<To>(f);  // forwards to down_cast
+}
+template <typename To, typename From>
+inline To DownCast(From& f) {  // reference overload
+  return PROTOBUF_NAMESPACE_ID::internal::down_cast<To>(f);  // forwards to down_cast
+}
+
+
+PROTOBUF_EXPORT void InitProtobufDefaults();
+
+// This is used by proto1
+PROTOBUF_EXPORT inline const std::string& GetEmptyString() {
+  InitProtobufDefaults();  // runs before the "AlreadyInited" accessor is used
+  return GetEmptyStringAlreadyInited();
+}
+
+
+// True if IsInitialized() is true for all elements of t.  Type is expected
+// to be a RepeatedPtrField<some message type>.  It's useful to have this
+// helper here to keep the protobuf compiler from ever having to emit loops in
+// IsInitialized() methods.  We want the C++ compiler to inline this or not
+// as it sees fit.
+template <typename Msg>
+bool AllAreInitialized(const RepeatedPtrField<Msg>& t) {
+  for (int i = t.size(); --i >= 0;) {  // visits indices size()-1 down to 0
+    if (!t.Get(i).IsInitialized()) return false;  // stop at the first failure
+  }
+  return true;  // also reached when t is empty
+}
+
+// "Weak" variant of AllAreInitialized, used to implement implicit weak fields.
+// This version operates on MessageLite to avoid introducing a dependency on the
+// concrete message type.
+template <class T>
+bool AllAreInitializedWeak(const RepeatedPtrField<T>& t) {
+  for (int i = t.size(); --i >= 0;) {  // visits indices size()-1 down to 0
+    if (!reinterpret_cast<const RepeatedPtrFieldBase&>(t)  // view as the base
+             .Get<ImplicitWeakTypeHandler<T> >(i)
+             .IsInitialized()) {
+      return false;  // stop at the first failure
+    }
+  }
+  return true;  // also reached when t is empty
+}
+
+inline bool IsPresent(const void* base, uint32 hasbit) {
+  const uint32* has_bits_array = static_cast<const uint32*>(base);  // read as uint32s
+  return (has_bits_array[hasbit / 32] & (1u << (hasbit & 31))) != 0;  // word, bit
+}
+
+inline bool IsOneofPresent(const void* base, uint32 offset, uint32 tag) {
+  const uint32* oneof =  // uint32 located `offset` bytes past `base`
+      reinterpret_cast<const uint32*>(static_cast<const uint8*>(base) + offset);
+  return *oneof == tag >> 3;  // parsed as *oneof == (tag >> 3)
+}
+
+typedef void (*SpecialSerializer)(const uint8* base, uint32 offset, uint32 tag,
+                                  uint32 has_offset,
+                                  io::CodedOutputStream* output);
+
+PROTOBUF_EXPORT void ExtensionSerializer(const uint8* base, uint32 offset,
+                                         uint32 tag, uint32 has_offset,
+                                         io::CodedOutputStream* output);
+PROTOBUF_EXPORT void UnknownFieldSerializerLite(const uint8* base,
+                                                uint32 offset, uint32 tag,
+                                                uint32 has_offset,
+                                                io::CodedOutputStream* output);
+
+PROTOBUF_EXPORT MessageLite* DuplicateIfNonNullInternal(MessageLite* message);
+PROTOBUF_EXPORT MessageLite* GetOwnedMessageInternal(Arena* message_arena,
+                                                     MessageLite* submessage,
+                                                     Arena* submessage_arena);
+PROTOBUF_EXPORT void GenericSwap(MessageLite* m1, MessageLite* m2);
+// We specialize GenericSwap for non-lite messages to benefit from reflection.
+PROTOBUF_EXPORT void GenericSwap(Message* m1, Message* m2);
+
+template <typename T>
+T* DuplicateIfNonNull(T* message) {  // typed wrapper over the MessageLite* version
+  // The casts must be reinterpret_cast<> because T might be a forward-declared
+  // type that the compiler doesn't know is related to MessageLite.
+  return reinterpret_cast<T*>(
+      DuplicateIfNonNullInternal(reinterpret_cast<MessageLite*>(message)));
+}
+
+template <typename T>
+T* GetOwnedMessage(Arena* message_arena, T* submessage,
+                   Arena* submessage_arena) {  // typed wrapper over the Internal version
+  // The casts must be reinterpret_cast<> because T might be a forward-declared
+  // type that the compiler doesn't know is related to MessageLite.
+  return reinterpret_cast<T*>(GetOwnedMessageInternal(
+      message_arena, reinterpret_cast<MessageLite*>(submessage),
+      submessage_arena));
+}
+
+// Hide atomic from the public header and allow easy change to regular int
+// on platforms where the atomic might have a perf impact.
+class PROTOBUF_EXPORT CachedSize {
+ public:
+  int Get() const { return size_.load(std::memory_order_relaxed); }  // relaxed load
+  void Set(int size) { size_.store(size, std::memory_order_relaxed); }  // relaxed store
+
+ private:
+  std::atomic<int> size_{0};  // initial cached size is 0
+};
+
+// SCCInfo represents information of a strongly connected component of
+// mutual dependent messages.
+struct PROTOBUF_EXPORT SCCInfoBase {
+  // We use 0 for the Initialized state, because test eax,eax, jnz is smaller
+  // and is subject to macro fusion.
+  enum {
+    kInitialized = 0,  // final state
+    kRunning = 1,
+    kUninitialized = -1,  // initial state
+  };
+#if defined(_MSC_VER) && !defined(__clang__)
+  // MSVC doesn't make std::atomic constant initialized. This union trick
+  // makes it so.
+  union {
+    int visit_status_to_make_linker_init;
+    std::atomic<int> visit_status;
+  };
+#else
+  std::atomic<int> visit_status;  // holds one of the enum states above
+#endif
+  int num_deps;
+  int num_implicit_weak_deps;
+  void (*init_func)();
+  // This is followed by an array of num_deps
+  // const SCCInfoBase* deps[];
+};
+
+// Zero-length arrays are a language extension available in GCC and Clang but
+// not MSVC.
+#ifdef __GNUC__
+#define PROTOBUF_ARRAY_SIZE(n) (n)
+#else
+#define PROTOBUF_ARRAY_SIZE(n) ((n) ? (n) : 1)
+#endif
+
+template <int N>
+struct SCCInfo {
+  SCCInfoBase base;
+  // Semantically this is const SCCInfo<T>* which is a templated type.
+  // The obvious inheriting from SCCInfoBase mucks with struct initialization.
+  // Attempts showed the compiler was generating dynamic initialization code.
+  // This deps array consists of base.num_deps pointers to SCCInfoBase followed
+  // by base.num_implicit_weak_deps pointers to SCCInfoBase*. We need the extra
+  // pointer indirection for implicit weak fields. We cannot use a union type
+  // here, since that would prevent the array from being linker-initialized.
+  void* deps[PROTOBUF_ARRAY_SIZE(N)];
+};
+
+#undef PROTOBUF_ARRAY_SIZE
+
+PROTOBUF_EXPORT void InitSCCImpl(SCCInfoBase* scc);
+
+inline void InitSCC(SCCInfoBase* scc) {
+  auto status = scc->visit_status.load(std::memory_order_acquire);  // acquire load
+  if (PROTOBUF_PREDICT_FALSE(status != SCCInfoBase::kInitialized))  // not yet final
+    InitSCCImpl(scc);  // every non-initialized state is handed to InitSCCImpl
+}
+
+PROTOBUF_EXPORT void DestroyMessage(const void* message);
+PROTOBUF_EXPORT void DestroyString(const void* s);
+// Destroy (not delete) the message; passes DestroyMessage and ptr to OnShutdownRun
+inline void OnShutdownDestroyMessage(const void* ptr) {
+  OnShutdownRun(DestroyMessage, ptr);
+}
+// Destroy the string (call std::string destructor); passed to OnShutdownRun
+inline void OnShutdownDestroyString(const std::string* ptr) {
+  OnShutdownRun(DestroyString, ptr);
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h
new file mode 100644
index 0000000000000000000000000000000000000000..14337107a154f25afb83039a633a3cfe3c8367e1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/inlined_string_field.h
@@ -0,0 +1,265 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__
+#define GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__
+
+#include <string>
+#include <utility>
+
+#include <google/protobuf/port.h>
+#include <google/protobuf/stubs/strutil.h>
+
+// Must be included last.
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+
+class Arena;
+
+namespace internal {
+
+// InlinedStringField wraps a std::string instance and exposes an API similar to
+// ArenaStringPtr's wrapping of a std::string* instance.  As std::string is
+// never allocated on the Arena, we expose only the *NoArena methods of
+// ArenaStringPtr.
+//
+// default_value parameters are taken for consistency with ArenaStringPtr, but
+// are not used for most methods.  With inlining, these should be removed from
+// the generated binary.
+class PROTOBUF_EXPORT InlinedStringField {
+ public:
+  InlinedStringField() PROTOBUF_ALWAYS_INLINE;
+  explicit InlinedStringField(const std::string& default_value);
+
+  void AssignWithDefault(const std::string* default_value,
+                         const InlinedStringField& from) PROTOBUF_ALWAYS_INLINE;
+
+  void ClearToEmpty(const std::string* default_value,
+                    Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    ClearToEmptyNoArena(default_value);  // arena argument is ignored
+  }
+  void ClearNonDefaultToEmpty() PROTOBUF_ALWAYS_INLINE {
+    ClearNonDefaultToEmptyNoArena();
+  }
+  void ClearToEmptyNoArena(const std::string* /*default_value*/)
+      PROTOBUF_ALWAYS_INLINE {
+    ClearNonDefaultToEmptyNoArena();  // default_value is ignored
+  }
+  void ClearNonDefaultToEmptyNoArena() PROTOBUF_ALWAYS_INLINE;
+
+  void ClearToDefault(const std::string* default_value,
+                      Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    ClearToDefaultNoArena(default_value);  // arena argument is ignored
+  }
+  void ClearToDefaultNoArena(const std::string* default_value)
+      PROTOBUF_ALWAYS_INLINE;
+
+  void Destroy(const std::string* default_value,
+               Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    DestroyNoArena(default_value);  // arena argument is ignored
+  }
+  void DestroyNoArena(const std::string* default_value) PROTOBUF_ALWAYS_INLINE;
+
+  const std::string& Get() const PROTOBUF_ALWAYS_INLINE { return GetNoArena(); }
+  const std::string& GetNoArena() const PROTOBUF_ALWAYS_INLINE;
+
+  std::string* Mutable(const std::string* default_value,
+                       Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    return MutableNoArena(default_value);  // arena argument is ignored
+  }
+  std::string* MutableNoArena(const std::string* default_value)
+      PROTOBUF_ALWAYS_INLINE;
+
+  std::string* Release(const std::string* default_value, Arena* /*arena*/) {
+    return ReleaseNoArena(default_value);  // arena argument is ignored
+  }
+  std::string* ReleaseNonDefault(const std::string* default_value,
+                                 Arena* /*arena*/) {
+    return ReleaseNonDefaultNoArena(default_value);  // arena argument is ignored
+  }
+  std::string* ReleaseNoArena(const std::string* default_value) {
+    return ReleaseNonDefaultNoArena(default_value);  // same as the NonDefault form
+  }
+  std::string* ReleaseNonDefaultNoArena(const std::string* default_value);
+
+  void Set(const std::string* default_value, StringPiece value,
+           Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    SetNoArena(default_value, value);  // arena argument is ignored
+  }
+  void SetLite(const std::string* default_value, StringPiece value,
+               Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    SetNoArena(default_value, value);  // arena argument is ignored
+  }
+  void SetNoArena(const std::string* default_value,
+                  StringPiece value) PROTOBUF_ALWAYS_INLINE;
+
+  void Set(const std::string* default_value, const std::string& value,
+           Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    SetNoArena(default_value, value);  // arena argument is ignored
+  }
+  void SetLite(const std::string* default_value, const std::string& value,
+               Arena* /*arena*/) PROTOBUF_ALWAYS_INLINE {
+    SetNoArena(default_value, value);  // arena argument is ignored
+  }
+  void SetNoArena(const std::string* default_value,
+                  const std::string& value) PROTOBUF_ALWAYS_INLINE;
+
+  void SetNoArena(const std::string* default_value,
+                  std::string&& value) PROTOBUF_ALWAYS_INLINE;
+  void SetAllocated(const std::string* default_value, std::string* value,
+                    Arena* /*arena*/) {
+    SetAllocatedNoArena(default_value, value);  // arena argument is ignored
+  }
+  void SetAllocatedNoArena(const std::string* default_value,
+                           std::string* value);
+  void Swap(InlinedStringField* from) PROTOBUF_ALWAYS_INLINE;
+  std::string* UnsafeMutablePointer();
+  void UnsafeSetDefault(const std::string* default_value);
+  std::string* UnsafeArenaRelease(const std::string* default_value,
+                                  Arena* arena);
+  void UnsafeArenaSetAllocated(const std::string* default_value,
+                               std::string* value, Arena* arena);
+
+  bool IsDefault(const std::string* /*default_value*/) { return false; }  // always false
+
+ private:
+  std::string value_;  // the wrapped string, held by value
+};
+
+inline InlinedStringField::InlinedStringField() {}  // value_ default-constructs empty
+
+inline InlinedStringField::InlinedStringField(const std::string& default_value)
+    : value_(default_value) {}  // value_ starts as a copy of default_value
+
+inline void InlinedStringField::AssignWithDefault(
+    const std::string* /*default_value*/, const InlinedStringField& from) {
+  value_ = from.value_;  // plain copy-assignment; default_value is unused
+}
+
+inline const std::string& InlinedStringField::GetNoArena() const {
+  return value_;  // reference to the inline string, no copy
+}
+
+inline std::string* InlinedStringField::MutableNoArena(const std::string*) {
+  return &value_;  // address of the inline string; the argument is unused
+}
+
+inline void InlinedStringField::SetAllocatedNoArena(
+    const std::string* default_value, std::string* value) {
+  if (value == NULL) {
+    value_.assign(*default_value);  // null input resets to the default
+  } else {
+    value_.assign(std::move(*value));  // take over the contents of *value
+    delete value;  // the passed-in string object is freed here
+  }
+}
+
+inline void InlinedStringField::DestroyNoArena(const std::string*) {
+  // This is invoked from the generated message's ArenaDtor, which is used to
+  // clean up objects not allocated on the Arena.
+  this->~InlinedStringField();  // explicit destructor call; no deallocation
+}
+
+inline void InlinedStringField::ClearNonDefaultToEmptyNoArena() {
+  value_.clear();  // leaves value_ as the empty string
+}
+
+inline void InlinedStringField::ClearToDefaultNoArena(
+    const std::string* default_value) {
+  value_.assign(*default_value);  // default_value is dereferenced unconditionally
+}
+
+inline std::string* InlinedStringField::ReleaseNonDefaultNoArena(
+    const std::string* default_value) {
+  std::string* released = new std::string(*default_value);  // heap copy of default
+  value_.swap(*released);  // released gets the old value; value_ gets the default
+  return released;  // allocated with new above
+}
+
+inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/,
+                                           StringPiece value) {
+  value_.assign(value.data(), value.length());  // copies length() bytes from data()
+}
+
+inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/,
+                                           const std::string& value) {
+  value_.assign(value);  // copy-assign; default_value is unused
+}
+
+inline void InlinedStringField::SetNoArena(const std::string* /*default_value*/,
+                                           std::string&& value) {
+  value_.assign(std::move(value));  // move-assign; default_value is unused
+}
+
+inline void InlinedStringField::Swap(InlinedStringField* from) {
+  value_.swap(from->value_);  // exchanges the two strings; from is dereferenced
+}
+
+inline std::string* InlinedStringField::UnsafeMutablePointer() {
+  return &value_;  // address of the inline string
+}
+
+inline void InlinedStringField::UnsafeSetDefault(
+    const std::string* default_value) {
+  value_.assign(*default_value);  // copies the default into value_
+}
+
+inline std::string* InlinedStringField::UnsafeArenaRelease(
+    const std::string* default_value, Arena* /*arena*/) {
+  return ReleaseNoArena(default_value);  // arena is unused; same as ReleaseNoArena
+}
+
+inline void InlinedStringField::UnsafeArenaSetAllocated(
+    const std::string* default_value, std::string* value, Arena* /*arena*/) {
+  if (value == NULL) {
+    value_.assign(*default_value);  // null input resets to the default
+  } else {
+    value_.assign(*value);  // copies; unlike SetAllocatedNoArena, value is not deleted
+  }
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_INLINED_STRING_FIELD_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h
new file mode 100644
index 0000000000000000000000000000000000000000..540c914b1d675aab8ba3a843500c098f4959863f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map.h
@@ -0,0 +1,1280 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file defines the map container and its helpers to support protobuf maps.
+//
+// The Map and MapIterator types are provided by this header file.
+// Please avoid using other types defined here, unless they are public
+// types within Map or MapIterator, such as Map::value_type.
+
+#ifndef GOOGLE_PROTOBUF_MAP_H__
+#define GOOGLE_PROTOBUF_MAP_H__
+
+#include <functional>
+#include <initializer_list>
+#include <iterator>
+#include <limits>  // To support Visual Studio 2008
+#include <map>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#if defined(__cpp_lib_string_view)
+#include <string_view>
+#endif  // defined(__cpp_lib_string_view)
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/generated_enum_util.h>
+#include <google/protobuf/map_type_handler.h>
+#include <google/protobuf/stubs/hash.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+
+template <typename Key, typename T>
+class Map;
+
+class MapIterator;
+
+template <typename Enum>
+struct is_proto_enum;
+
+namespace internal {
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType key_wire_type,
+          WireFormatLite::FieldType value_wire_type, int default_enum_value>
+class MapFieldLite;
+
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType key_wire_type,
+          WireFormatLite::FieldType value_wire_type, int default_enum_value>
+class MapField;
+
+template <typename Key, typename T>
+class TypeDefinedMapFieldBase;
+
+class DynamicMapField;
+
+class GeneratedMessageReflection;
+
+// Allocator with a std::allocator-like interface that obtains memory from an
+// Arena when one is supplied and from the global operator new otherwise.
+// Used for the Map implementation. Users should not use this class
+// directly.
+template <typename U>
+class MapAllocator {
+ public:
+  using value_type = U;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+
+  // Default allocator: no arena, so storage comes from the global heap.
+  MapAllocator() : arena_(nullptr) {}
+  // Allocator bound to `arena` (which may be nullptr).
+  explicit MapAllocator(Arena* arena) : arena_(arena) {}
+  // Converting constructor: shares the arena of an allocator for another
+  // element type.
+  template <typename X>
+  MapAllocator(const MapAllocator<X>& allocator)  // NOLINT(runtime/explicit)
+      : arena_(allocator.arena()) {}
+
+  // Returns raw, unconstructed storage for `n` objects of value_type.
+  pointer allocate(size_type n, const void* /* hint */ = nullptr) {
+    // Without an arena the storage comes from the global operator new; with
+    // an arena it comes from an arena-owned byte array. Neither path
+    // constructs any element object.
+    if (arena_ == nullptr) {
+      return static_cast<pointer>(::operator new(n * sizeof(value_type)));
+    } else {
+      return reinterpret_cast<pointer>(
+          Arena::CreateArray<uint8>(arena_, n * sizeof(value_type)));
+    }
+  }
+
+  // Releases storage obtained from allocate(). When an arena is in use this
+  // function does nothing.
+  void deallocate(pointer p, size_type n) {
+    if (arena_ == nullptr) {
+      // Use sized deallocation when the compiler advertises support for it.
+#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation)
+      ::operator delete(p, n * sizeof(value_type));
+#else
+      (void)n;
+      ::operator delete(p);
+#endif
+    }
+  }
+
+#if __cplusplus >= 201103L && !defined(GOOGLE_PROTOBUF_OS_APPLE) && \
+    !defined(GOOGLE_PROTOBUF_OS_NACL) &&                            \
+    !defined(GOOGLE_PROTOBUF_OS_EMSCRIPTEN)
+  // Constructs a NodeType at `p` by placement new, forwarding `args`.
+  template <class NodeType, class... Args>
+  void construct(NodeType* p, Args&&... args) {
+    // Clang 3.6 doesn't compile a static_cast to void* directly (Issue
+    // #1266). According to the C++ standard, 5.2.9/1: "The static_cast
+    // operator shall not cast away constness". So the possibly-const pointer
+    // is first cast to const void*, and then the constness is removed with
+    // const_cast.
+    new (const_cast<void*>(static_cast<const void*>(p)))
+        NodeType(std::forward<Args>(args)...);
+  }
+
+  // Runs the destructor of the object at `p` without releasing its storage.
+  template <class NodeType>
+  void destroy(NodeType* p) {
+    p->~NodeType();
+  }
+#else
+  // Fallback: copy-constructs a value_type at `p`.
+  void construct(pointer p, const_reference t) { new (p) value_type(t); }
+
+  // Fallback: runs the destructor of the value_type at `p`.
+  void destroy(pointer p) { p->~value_type(); }
+#endif
+
+  // Yields the MapAllocator for a different element type X.
+  template <typename X>
+  struct rebind {
+    using other = MapAllocator<X>;
+  };
+
+  // Two allocators compare equal exactly when they use the same arena
+  // pointer (including both being nullptr).
+  template <typename X>
+  bool operator==(const MapAllocator<X>& other) const {
+    return arena_ == other.arena_;
+  }
+
+  template <typename X>
+  bool operator!=(const MapAllocator<X>& other) const {
+    return arena_ != other.arena_;
+  }
+
+  // To support Visual Studio 2008
+  size_type max_size() const {
+    // The parentheses around (std::numeric_limits<...>::max) prevent a
+    // function-like max() macro from being expanded here.
+    return (std::numeric_limits<size_type>::max)();
+  }
+
+  // To support gcc-4.4, which does not properly
+  // support templated friend classes
+  Arena* arena() const { return arena_; }
+
+ private:
+  using DestructorSkippable_ = void;
+  // Arena used for allocation, or nullptr for the global heap. Fixed at
+  // construction.
+  Arena* const arena_;
+};
+
+// Key type stored in the tree representation of a bucket: T itself when T is
+// a scalar type, otherwise a std::reference_wrapper<const T> so that the key
+// object is referenced rather than copied.
+template <typename T>
+using KeyForTree =
+    typename std::conditional<std::is_scalar<T>::value, T,
+                              std::reference_wrapper<const T>>::type;
+
+// Default case: Not transparent.
+// We use std::hash<key_type>/std::less<key_type> and all the lookup functions
+// only accept `key_type`.
+template <typename key_type>
+struct TransparentSupport {
+  using hash = std::hash<key_type>;
+  using less = std::less<key_type>;
+
+  // Key equality is plain operator== on key_type.
+  static bool Equals(const key_type& a, const key_type& b) { return a == b; }
+
+  // Whatever K is supplied, the lookup argument type is always key_type.
+  template <typename K>
+  using key_arg = key_type;
+};
+
+#if defined(__cpp_lib_string_view)
+// If std::string_view is available, we add transparent support for std::string
+// keys. We use std::hash<std::string_view> as it supports the input types we
+// care about. The lookup functions accept arbitrary `K`. This will include any
+// key type that is convertible to std::string_view.
+template <>
+struct TransparentSupport<std::string> {
+  // Identity conversion for anything implicitly convertible to
+  // std::string_view.
+  static std::string_view ImplicitConvert(std::string_view str) { return str; }
+  // If the element is not convertible to std::string_view, try to convert to
+  // std::string first.
+  // Making this overload a template causes it to lose overload resolution to
+  // the non-template overload above when both would otherwise rank equally.
+  template <typename = void>
+  static std::string_view ImplicitConvert(const std::string& str) {
+    return str;
+  }
+
+  // Hash functor: converts its argument via ImplicitConvert and hashes the
+  // resulting string_view. Inherits privately from std::hash<string_view>.
+  struct hash : private std::hash<std::string_view> {
+    using is_transparent = void;
+
+    template <typename T>
+    size_t operator()(const T& str) const {
+      return base()(ImplicitConvert(str));
+    }
+
+   private:
+    // Access to the std::hash<std::string_view> base subobject.
+    const std::hash<std::string_view>& base() const { return *this; }
+  };
+  // Ordering functor: compares both arguments as string_views.
+  struct less {
+    using is_transparent = void;
+
+    template <typename T, typename U>
+    bool operator()(const T& t, const U& u) const {
+      return ImplicitConvert(t) < ImplicitConvert(u);
+    }
+  };
+
+  // Equality on the string_view form of both arguments.
+  template <typename T, typename U>
+  static bool Equals(const T& t, const U& u) {
+    return ImplicitConvert(t) == ImplicitConvert(u);
+  }
+
+  // Lookup functions take the caller's type K as-is.
+  template <typename K>
+  using key_arg = K;
+};
+#endif  // defined(__cpp_lib_string_view)
+
+}  // namespace internal
+
+// This is the class for Map's internal value_type. Instead of using
+// std::pair as value_type, we use this class, which gives us more control
+// over how it is constructed and destroyed.
+template <typename Key, typename T>
+struct MapPair {
+  using first_type = const Key;
+  using second_type = T;
+
+  // Copies both the key and the value.
+  MapPair(const Key& other_first, const T& other_second)
+      : first(other_first), second(other_second) {}
+  // Copies the key and value-initializes the mapped value.
+  explicit MapPair(const Key& other_first) : first(other_first), second() {}
+  MapPair(const MapPair& other) : first(other.first), second(other.second) {}
+
+  ~MapPair() {}
+
+  // Implicitly convertible to std::pair of compatible types.
+  template <typename T1, typename T2>
+  operator std::pair<T1, T2>() const {  // NOLINT(runtime/explicit)
+    return std::pair<T1, T2>(first, second);
+  }
+
+  // The key; const, so it cannot be assigned after construction.
+  const Key first;
+  // The mapped value; mutable through a non-const MapPair.
+  T second;
+
+ private:
+  friend class Arena;
+  friend class Map<Key, T>;
+};
+
+// Map is an associative container type used to store protobuf map
+// fields.  Each Map instance may or may not use a different hash function, a
+// different iteration order, and so on.  E.g., please don't examine
+// implementation details to decide if the following would work:
+//  Map<int, int> m0, m1;
+//  m0[0] = m1[0] = m0[1] = m1[1] = 0;
+//  assert(m0.begin()->first == m1.begin()->first);  // Bug!
+//
+// Map's interface is similar to std::unordered_map, except that Map is not
+// designed to play well with exceptions.
+template <typename Key, typename T>
+class Map {
+ public:
+  using key_type = Key;
+  using mapped_type = T;
+  using value_type = MapPair<Key, T>;
+
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+
+  using size_type = size_t;
+  using hasher = typename internal::TransparentSupport<Key>::hash;
+
+  // Constructs an empty map that is not associated with an arena.
+  Map() : arena_(nullptr), default_enum_value_(0) { Init(); }
+  // Constructs an empty map associated with `arena`; Init() passes that arena
+  // on when it creates the underlying InnerMap.
+  explicit Map(Arena* arena) : arena_(arena), default_enum_value_(0) { Init(); }
+
+  // Copy constructor. The new map never uses an arena (arena_ is nullptr)
+  // regardless of `other`'s arena; it copies default_enum_value_ and then
+  // inserts every element of `other`.
+  Map(const Map& other)
+      : arena_(nullptr), default_enum_value_(other.default_enum_value_) {
+    Init();
+    insert(other.begin(), other.end());
+  }
+
+  // Move constructor. Starts as an empty, arena-less map (delegating to
+  // Map()). If `other` has an arena, `*this = other` is used: `other` is a
+  // named variable and therefore an lvalue, so this is an lvalue assignment
+  // (presumably the copy-assignment operator declared outside this excerpt).
+  // Otherwise the two maps simply swap contents.
+  Map(Map&& other) noexcept : Map() {
+    if (other.arena_) {
+      *this = other;
+    } else {
+      swap(other);
+    }
+  }
+  // Move assignment. Self-assignment is a no-op. When both maps share the
+  // same arena pointer (including both being arena-less) their contents are
+  // swapped; otherwise `other` is assigned as an lvalue, i.e. not moved from.
+  Map& operator=(Map&& other) noexcept {
+    if (this == &other) return *this;
+    if (arena_ == other.arena_) {
+      swap(other);
+    } else {
+      *this = other;
+    }
+    return *this;
+  }
+
+  // Range constructor: builds an arena-less map and inserts the elements in
+  // [first, last).
+  template <class InputIt>
+  Map(const InputIt& first, const InputIt& last)
+      : arena_(nullptr), default_enum_value_(0) {
+    Init();
+    insert(first, last);
+  }
+
+  // Destructor. Only an arena-less map cleans up here: it clears its elements
+  // and deletes the InnerMap. When an arena is set this destructor does
+  // nothing.
+  ~Map() {
+    if (arena_ == nullptr) {
+      clear();
+      delete elements_;
+    }
+  }
+
+ private:
+  // Creates the InnerMap through Arena::CreateMessage using arena_ (which may
+  // be nullptr). The trailing 0 is presumably forwarded as InnerMap's initial
+  // size argument -- confirm against Arena::CreateMessage.
+  void Init() { elements_ = Arena::CreateMessage<InnerMap>(arena_, 0); }
+
+  using Allocator = internal::MapAllocator<void*>;
+
+  // InnerMap is a generic hash-based map.  It doesn't contain any
+  // protocol-buffer-specific logic.  It is a chaining hash map with the
+  // additional feature that some buckets can be converted to use an ordered
+  // container.  This ensures O(lg n) bounds on find, insert, and erase, while
+  // avoiding the overheads of ordered containers most of the time.
+  //
+  // The implementation doesn't need the full generality of unordered_map,
+  // and it doesn't have it.  More bells and whistles can be added as needed.
+  // Some implementation details:
+  // 1. The hash function has type hasher and the equality function
+  //    equal_to<Key>.  We inherit from hasher to save space
+  //    (empty-base-class optimization).
+  // 2. The number of buckets is a power of two.
+  // 3. Buckets are converted to trees in pairs: if we convert bucket b then
+  //    buckets b and b^1 will share a tree.  Invariant: buckets b and b^1 have
+  //    the same non-null value iff they are sharing a tree.  (An alternative
+  //    implementation strategy would be to have a tag bit per bucket.)
+  // 4. As is typical for hash_map and such, the Keys and Values are always
+  //    stored in linked list nodes.  Pointers to elements are never invalidated
+  //    until the element is deleted.
+  // 5. The trees' payload type is pointer to linked-list node.  Tree-converting
+  //    a bucket doesn't copy Key-Value pairs.
+  // 6. Once we've tree-converted a bucket, it is never converted back. However,
+  //    the items a tree contains may wind up assigned to trees or lists upon a
+  //    rehash.
+  // 7. The code requires no C++ features from C++14 or later.
+  // 8. Mutations to a map do not invalidate the map's iterators, pointers to
+  //    elements, or references to elements.
+  // 9. Except for erase(iterator), any non-const method can reorder iterators.
+  // 10. InnerMap uses KeyForTree<Key> when using the Tree representation, which
+  //    is either `Key`, if Key is a scalar, or `reference_wrapper<const Key>`
+  //    otherwise. This avoids unnecessary copies of string keys, for example.
+  class InnerMap : private hasher {
+   public:
+    // Arena-less construction; delegates to the two-argument constructor.
+    explicit InnerMap(size_type n) : InnerMap(nullptr, n) {}
+    // Builds an empty table. The requested size `n` is first passed through
+    // TableSize() (defined outside this excerpt) and an all-empty bucket
+    // array of that size is created. index_of_first_non_null_ is set equal to
+    // num_buckets_, which is the value used throughout for "no non-empty
+    // bucket".
+    InnerMap(Arena* arena, size_type n)
+        : hasher(),
+          num_elements_(0),
+          seed_(Seed()),
+          table_(nullptr),
+          alloc_(arena) {
+      n = TableSize(n);
+      table_ = CreateEmptyTable(n);
+      num_buckets_ = index_of_first_non_null_ = n;
+    }
+
+    // Destroys all elements and then releases the bucket array. Skipped
+    // entirely if table_ is null.
+    ~InnerMap() {
+      if (table_ != nullptr) {
+        clear();
+        Dealloc<void*>(table_, num_buckets_);
+      }
+    }
+
+   private:
+    enum { kMinTableSize = 8 };
+
+    // Linked-list nodes, as one would expect for a chaining hash table.
+    struct Node {
+      // The stored key/value pair.
+      value_type kv;
+      // Next node in the same bucket's list. For nodes held in a tree this is
+      // kept null (see InsertUniqueInTree).
+      Node* next;
+    };
+
+    // Trees. The tree's key type is internal::KeyForTree<Key>: a copy of Key
+    // for scalar keys, or a reference_wrapper<const Key> otherwise.
+    // The value is a void* pointing to Node. We use void* instead of Node* to
+    // avoid code bloat. That way there is only one instantiation of the tree
+    // class per key type.
+    using TreeAllocator = typename Allocator::template rebind<
+        std::pair<const internal::KeyForTree<Key>, void*>>::other;
+    using Tree = std::map<internal::KeyForTree<Key>, void*,
+                          typename internal::TransparentSupport<Key>::less,
+                          TreeAllocator>;
+    using TreeIterator = typename Tree::iterator;
+
+    // Recovers the Node* stored as the void* mapped value of a tree entry.
+    static Node* NodeFromTreeIterator(TreeIterator it) {
+      return static_cast<Node*>(it->second);
+    }
+
+    // iterator and const_iterator are instantiations of iterator_base.
+    // KeyValueType is value_type for iterator and const value_type for
+    // const_iterator.
+    template <typename KeyValueType>
+    class iterator_base {
+     public:
+      using reference = KeyValueType&;
+      using pointer = KeyValueType*;
+
+      // Invariants:
+      // node_ is always correct. This is handy because the most common
+      // operations are operator* and operator-> and they only use node_.
+      // When node_ is set to a non-null value, all the other non-const fields
+      // are updated to be correct also, but those fields can become stale
+      // if the underlying map is modified.  When those fields are needed they
+      // are rechecked, and updated if necessary.
+      //
+      // The default-constructed iterator has a null node_ and serves as the
+      // end iterator.
+      iterator_base() : node_(nullptr), m_(nullptr), bucket_index_(0) {}
+
+      // Positions the iterator on the first element of `m`, or leaves node_
+      // null if `m` has no elements.
+      explicit iterator_base(const InnerMap* m) : m_(m) {
+        SearchFrom(m->index_of_first_non_null_);
+      }
+
+      // Any iterator_base can convert to any other.  This is overkill, and we
+      // rely on the enclosing class to use it wisely.  The standard "iterator
+      // can convert to const_iterator" is OK but the reverse direction is not.
+      template <typename U>
+      explicit iterator_base(const iterator_base<U>& it)
+          : node_(it.node_), m_(it.m_), bucket_index_(it.bucket_index_) {}
+
+      // Iterator for node `n`, which lives in bucket `index` of `m`.
+      iterator_base(Node* n, const InnerMap* m, size_type index)
+          : node_(n), m_(m), bucket_index_(index) {}
+
+      // Iterator for the node referenced by `tree_it`, in the tree stored at
+      // bucket `index` of `m`.
+      iterator_base(TreeIterator tree_it, const InnerMap* m, size_type index)
+          : node_(NodeFromTreeIterator(tree_it)), m_(m), bucket_index_(index) {
+        // Invariant: iterators that use buckets with trees have an even
+        // bucket_index_.
+        GOOGLE_DCHECK_EQ(bucket_index_ % 2, 0u);
+      }
+
+      // Advance through buckets, looking for the first that isn't empty.
+      // If nothing non-empty is found then leave node_ == nullptr.
+      void SearchFrom(size_type start_bucket) {
+        GOOGLE_DCHECK(m_->index_of_first_non_null_ == m_->num_buckets_ ||
+               m_->table_[m_->index_of_first_non_null_] != nullptr);
+        node_ = nullptr;
+        for (bucket_index_ = start_bucket; bucket_index_ < m_->num_buckets_;
+             bucket_index_++) {
+          if (m_->TableEntryIsNonEmptyList(bucket_index_)) {
+            // List bucket: the head of the list is the first element.
+            node_ = static_cast<Node*>(m_->table_[bucket_index_]);
+            break;
+          } else if (m_->TableEntryIsTree(bucket_index_)) {
+            // Tree bucket: the tree's first entry is the first element.
+            Tree* tree = static_cast<Tree*>(m_->table_[bucket_index_]);
+            GOOGLE_DCHECK(!tree->empty());
+            node_ = NodeFromTreeIterator(tree->begin());
+            break;
+          }
+        }
+      }
+
+      reference operator*() const { return node_->kv; }
+      pointer operator->() const { return &(operator*()); }
+
+      // Iterators compare by node pointer only; m_ and bucket_index_ are not
+      // considered.
+      friend bool operator==(const iterator_base& a, const iterator_base& b) {
+        return a.node_ == b.node_;
+      }
+      friend bool operator!=(const iterator_base& a, const iterator_base& b) {
+        return a.node_ != b.node_;
+      }
+
+      // Pre-increment: moves to the next element, or to end (node_ == nullptr)
+      // if there is none.
+      iterator_base& operator++() {
+        if (node_->next == nullptr) {
+          // Either the last node of a list or a node inside a tree (tree
+          // nodes keep next == nullptr). Re-establish bucket_index_ first,
+          // since it may be stale.
+          TreeIterator tree_it;
+          const bool is_list = revalidate_if_necessary(&tree_it);
+          if (is_list) {
+            SearchFrom(bucket_index_ + 1);
+          } else {
+            GOOGLE_DCHECK_EQ(bucket_index_ & 1, 0u);
+            Tree* tree = static_cast<Tree*>(m_->table_[bucket_index_]);
+            if (++tree_it == tree->end()) {
+              // A tree occupies two adjacent buckets, so skip both.
+              SearchFrom(bucket_index_ + 2);
+            } else {
+              node_ = NodeFromTreeIterator(tree_it);
+            }
+          }
+        } else {
+          node_ = node_->next;
+        }
+        return *this;
+      }
+
+      // Post-increment: advances and returns the previous position.
+      iterator_base operator++(int /* unused */) {
+        iterator_base tmp = *this;
+        ++*this;
+        return tmp;
+      }
+
+      // Assumes node_ and m_ are correct and non-null, but other fields may be
+      // stale.  Fix them as needed.  Then return true iff node_ points to a
+      // Node in a list.  If false is returned then *it is modified to be
+      // a valid iterator for node_.
+      bool revalidate_if_necessary(TreeIterator* it) {
+        GOOGLE_DCHECK(node_ != nullptr && m_ != nullptr);
+        // Force bucket_index_ to be in range.
+        bucket_index_ &= (m_->num_buckets_ - 1);
+        // Common case: the bucket we think is relevant points to node_.
+        if (m_->table_[bucket_index_] == static_cast<void*>(node_)) return true;
+        // Less common: the bucket is a linked list with node_ somewhere in it,
+        // but not at the head.
+        if (m_->TableEntryIsNonEmptyList(bucket_index_)) {
+          Node* l = static_cast<Node*>(m_->table_[bucket_index_]);
+          while ((l = l->next) != nullptr) {
+            if (l == node_) {
+              return true;
+            }
+          }
+        }
+        // Well, bucket_index_ still might be correct, but probably
+        // not.  Revalidate just to be sure.  This case is rare enough that we
+        // don't worry about potential optimizations, such as having a custom
+        // find-like method that compares Node* instead of the key.
+        iterator_base i(m_->find(node_->kv.first, it));
+        bucket_index_ = i.bucket_index_;
+        return m_->TableEntryIsList(bucket_index_);
+      }
+
+      // Current element, or nullptr for the end iterator.
+      Node* node_;
+      // The map being iterated.
+      const InnerMap* m_;
+      // Bucket believed to hold node_; may be stale (see invariants above).
+      size_type bucket_index_;
+    };
+
+   public:
+    using iterator = iterator_base<value_type>;
+    using const_iterator = iterator_base<const value_type>;
+
+    // begin() yields an iterator positioned on the first non-empty bucket;
+    // end() yields a default-constructed iterator whose node pointer is null.
+    iterator begin() { return iterator(this); }
+    iterator end() { return iterator(); }
+    const_iterator begin() const { return const_iterator(this); }
+    const_iterator end() const { return const_iterator(); }
+
+    // Destroys every element and empties every bucket. The bucket array
+    // itself is kept at its current size.
+    void clear() {
+      for (size_type b = 0; b < num_buckets_; b++) {
+        if (TableEntryIsNonEmptyList(b)) {
+          // List bucket: detach the list, then destroy its nodes one by one.
+          Node* node = static_cast<Node*>(table_[b]);
+          table_[b] = nullptr;
+          do {
+            Node* next = node->next;
+            DestroyNode(node);
+            node = next;
+          } while (node != nullptr);
+        } else if (TableEntryIsTree(b)) {
+          // Tree bucket: the tree is shared by buckets b and b + 1.
+          Tree* tree = static_cast<Tree*>(table_[b]);
+          GOOGLE_DCHECK(table_[b] == table_[b + 1] && (b & 1) == 0);
+          table_[b] = table_[b + 1] = nullptr;
+          typename Tree::iterator tree_it = tree->begin();
+          do {
+            Node* node = NodeFromTreeIterator(tree_it);
+            // Take the successor before erasing the current entry.
+            typename Tree::iterator next = tree_it;
+            ++next;
+            tree->erase(tree_it);
+            DestroyNode(node);
+            tree_it = next;
+          } while (tree_it != tree->end());
+          DestroyTree(tree);
+          // Skip the partner bucket that shared this tree.
+          b++;
+        }
+      }
+      num_elements_ = 0;
+      index_of_first_non_null_ = num_buckets_;
+    }
+
+    // InnerMap derives privately from hasher, so the hash functor is the
+    // hasher base subobject of this map.
+    const hasher& hash_function() const { return *this; }
+
+    // Upper limit used when growing the table: 2^60 when pointers are at
+    // least 8 bytes wide, otherwise 2^28.
+    static size_type max_size() {
+      return static_cast<size_type>(1) << (sizeof(void**) >= 8 ? 60 : 28);
+    }
+    // Number of elements currently stored.
+    size_type size() const { return num_elements_; }
+    // True when the map holds no elements.
+    bool empty() const { return size() == 0; }
+
+    // Returns an iterator to the element whose key matches `k`, or end() if
+    // there is none. The const_iterator produced by FindHelper is converted
+    // to a mutable iterator.
+    template <typename K>
+    iterator find(const K& k) {
+      return iterator(FindHelper(k).first);
+    }
+
+    // Inserts the key into the map if it is not already present. In that
+    // case, the value will be value-initialized.
+    // Returns the iterator for the key together with a flag that is true when
+    // a new element was inserted and false when the key already existed.
+    std::pair<iterator, bool> insert(const Key& k) {
+      std::pair<const_iterator, size_type> p = FindHelper(k);
+      // Case 1: key was already present.
+      if (p.first.node_ != nullptr)
+        return std::make_pair(iterator(p.first), false);
+      // Case 2: insert.
+      if (ResizeIfLoadIsOutOfRange(num_elements_ + 1)) {
+        // The table was resized, so the bucket number must be recomputed.
+        p = FindHelper(k);
+      }
+      const size_type b = p.second;  // bucket number
+      Node* node;
+      if (alloc_.arena() == nullptr) {
+        // No arena: an ordinary heap allocation constructs the node.
+        node = new Node{value_type(k), nullptr};
+      } else {
+        // Arena: take raw storage and construct the key and value in place.
+        // The const_cast is needed because the key member is const.
+        node = Alloc<Node>(1);
+        Arena::CreateInArenaStorage(const_cast<Key*>(&node->kv.first),
+                                    alloc_.arena(), k);
+        Arena::CreateInArenaStorage(&node->kv.second, alloc_.arena());
+      }
+
+      iterator result = InsertUnique(b, node);
+      ++num_elements_;
+      return std::make_pair(result, true);
+    }
+
+    // Returns the key/value pair for `k`, inserting it with a
+    // value-initialized value first if it is absent.
+    value_type& operator[](const Key& k) { return *insert(k).first; }
+
+    // Removes and destroys the element `it` refers to. `it` must belong to
+    // this map (checked in debug builds).
+    void erase(iterator it) {
+      GOOGLE_DCHECK_EQ(it.m_, this);
+      typename Tree::iterator tree_it;
+      // The iterator's bucket index may be stale; refresh it and find out
+      // whether the node lives in a list or in a tree.
+      const bool is_list = it.revalidate_if_necessary(&tree_it);
+      size_type b = it.bucket_index_;
+      Node* const item = it.node_;
+      if (is_list) {
+        GOOGLE_DCHECK(TableEntryIsNonEmptyList(b));
+        Node* head = static_cast<Node*>(table_[b]);
+        head = EraseFromLinkedList(item, head);
+        table_[b] = static_cast<void*>(head);
+      } else {
+        GOOGLE_DCHECK(TableEntryIsTree(b));
+        Tree* tree = static_cast<Tree*>(table_[b]);
+        tree->erase(tree_it);
+        if (tree->empty()) {
+          // Force b to be the minimum of b and b ^ 1.  This is important
+          // only because we want index_of_first_non_null_ to be correct.
+          b &= ~static_cast<size_type>(1);
+          DestroyTree(tree);
+          table_[b] = table_[b + 1] = nullptr;
+        }
+      }
+      DestroyNode(item);
+      --num_elements_;
+      // If the erased element was in the first non-empty bucket, that bucket
+      // may now be empty; advance to the next non-empty one.
+      if (PROTOBUF_PREDICT_FALSE(b == index_of_first_non_null_)) {
+        while (index_of_first_non_null_ < num_buckets_ &&
+               table_[index_of_first_non_null_] == nullptr) {
+          ++index_of_first_non_null_;
+        }
+      }
+    }
+
+   private:
+    // Lookup variant that also reports the tree iterator: when the key is
+    // found inside a tree bucket and `it` is non-null, *it is set to the
+    // matching tree entry. Returns end() when the key is absent.
+    const_iterator find(const Key& k, TreeIterator* it) const {
+      return FindHelper(k, it).first;
+    }
+    // Convenience overload for callers that do not need the tree iterator.
+    template <typename K>
+    std::pair<const_iterator, size_type> FindHelper(const K& k) const {
+      return FindHelper(k, nullptr);
+    }
+    // Looks up `k`. Returns the iterator for the matching element (or end()
+    // if there is none) together with the bucket number that was searched.
+    // For a tree bucket the returned bucket number is the even one of the
+    // pair. If the key is found in a tree and `it` is non-null, *it receives
+    // the tree iterator.
+    template <typename K>
+    std::pair<const_iterator, size_type> FindHelper(const K& k,
+                                                    TreeIterator* it) const {
+      size_type b = BucketNumber(k);
+      if (TableEntryIsNonEmptyList(b)) {
+        // Walk the chain comparing keys.
+        Node* node = static_cast<Node*>(table_[b]);
+        do {
+          if (internal::TransparentSupport<Key>::Equals(node->kv.first, k)) {
+            return std::make_pair(const_iterator(node, this, b), b);
+          } else {
+            node = node->next;
+          }
+        } while (node != nullptr);
+      } else if (TableEntryIsTree(b)) {
+        GOOGLE_DCHECK_EQ(table_[b], table_[b ^ 1]);
+        // Normalize to the even bucket of the pair sharing the tree.
+        b &= ~static_cast<size_t>(1);
+        Tree* tree = static_cast<Tree*>(table_[b]);
+        auto tree_it = tree->find(k);
+        if (tree_it != tree->end()) {
+          if (it != nullptr) *it = tree_it;
+          return std::make_pair(const_iterator(tree_it, this, b), b);
+        }
+      }
+      return std::make_pair(end(), b);
+    }
+
+    // Insert the given Node in bucket b.  If that would make bucket b too big,
+    // and bucket b is not a tree, create a tree for buckets b and b^1 to share.
+    // Requires that the node's key is not already in the map (checked in
+    // debug builds) and that b is the correct bucket.  num_elements_ is not
+    // modified.
+    iterator InsertUnique(size_type b, Node* node) {
+      GOOGLE_DCHECK(index_of_first_non_null_ == num_buckets_ ||
+             table_[index_of_first_non_null_] != nullptr);
+      // In practice, the code that led to this point may have already
+      // determined whether we are inserting into an empty list, a short list,
+      // or whatever.  But it's probably cheap enough to recompute that here;
+      // it's likely that we're inserting into an empty or short list.
+      iterator result;
+      GOOGLE_DCHECK(find(node->kv.first) == end());
+      if (TableEntryIsEmpty(b)) {
+        result = InsertUniqueInList(b, node);
+      } else if (TableEntryIsNonEmptyList(b)) {
+        if (PROTOBUF_PREDICT_FALSE(TableEntryIsTooLong(b))) {
+          // The list has grown too long: convert to a tree, then insert.
+          TreeConvert(b);
+          result = InsertUniqueInTree(b, node);
+          GOOGLE_DCHECK_EQ(result.bucket_index_, b & ~static_cast<size_type>(1));
+        } else {
+          // Insert into a pre-existing list.  This case cannot modify
+          // index_of_first_non_null_, so we skip the code to update it.
+          return InsertUniqueInList(b, node);
+        }
+      } else {
+        // Insert into a pre-existing tree.  This case cannot modify
+        // index_of_first_non_null_, so we skip the code to update it.
+        return InsertUniqueInTree(b, node);
+      }
+      // parentheses around (std::min) prevents macro expansion of min(...)
+      index_of_first_non_null_ =
+          (std::min)(index_of_first_non_null_, result.bucket_index_);
+      return result;
+    }
+
+    // Returns whether we should insert after the head of the list. For
+    // non-optimized builds, we randomly decide whether to insert right at the
+    // head of the list or just after the head. This helps add a little bit of
+    // non-determinism to the map ordering.
+    // With NDEBUG defined this always returns false, i.e. new nodes always go
+    // to the head.
+    bool ShouldInsertAfterHead(void* node) {
+#ifdef NDEBUG
+      return false;
+#else
+      // The decision is derived from the node's address mixed with the map's
+      // seed. Doing modulo with a prime mixes the bits more.
+      return (reinterpret_cast<uintptr_t>(node) ^ seed_) % 13 > 6;
+#endif
+    }
+
+    // Helper for InsertUnique.  Links `node` into the list stored at bucket b,
+    // which may be empty.  The node becomes either the new head or, when the
+    // list is non-empty and ShouldInsertAfterHead() says so, the second
+    // element.  Returns an iterator to the inserted node.
+    iterator InsertUniqueInList(size_type b, Node* node) {
+      Node* const head = static_cast<Node*>(table_[b]);
+      if (head != nullptr && ShouldInsertAfterHead(node)) {
+        // Splice in directly behind the current head.
+        node->next = head->next;
+        head->next = node;
+      } else {
+        // Push to the front of the (possibly empty) list.
+        node->next = head;
+        table_[b] = static_cast<void*>(node);
+      }
+      return iterator(node, this, b);
+    }
+
+    // Helper for InsertUnique.  Handles the case where bucket b points to a
+    // Tree.  The tree entry maps the node's key to the node pointer; the
+    // returned iterator carries the even bucket index of the pair that shares
+    // the tree.
+    iterator InsertUniqueInTree(size_type b, Node* node) {
+      GOOGLE_DCHECK_EQ(table_[b], table_[b ^ 1]);
+      // Maintain the invariant that node->next is null for all Nodes in Trees.
+      node->next = nullptr;
+      return iterator(
+          static_cast<Tree*>(table_[b])->insert({node->kv.first, node}).first,
+          this, b & ~static_cast<size_t>(1));
+    }
+
+    // Returns whether it did resize.  Currently this is only used when
+    // num_elements_ increases, though it could be used in other situations.
+    // It checks for load too low as well as load too high: because any number
+    // of erases can occur between inserts, the load could be as low as 0 here.
+    // Resizing to a lower size is not always helpful, but failing to do so can
+    // destroy the expected big-O bounds for some operations. By having the
+    // policy that sometimes we resize down as well as up, clients can easily
+    // keep O(size()) = O(number of buckets) if they want that.
+    // `new_size` is the element count the table should accommodate.
+    bool ResizeIfLoadIsOutOfRange(size_type new_size) {
+      const size_type kMaxMapLoadTimes16 = 12;  // controls RAM vs CPU tradeoff
+      // Grow at 12/16 of the bucket count; shrink at a quarter of that.
+      const size_type hi_cutoff = num_buckets_ * kMaxMapLoadTimes16 / 16;
+      const size_type lo_cutoff = hi_cutoff / 4;
+      // We don't care how many elements are in trees.  If a lot are,
+      // we may resize even though there are many empty buckets.  In
+      // practice, this seems fine.
+      if (PROTOBUF_PREDICT_FALSE(new_size >= hi_cutoff)) {
+        // Double the table, unless that would exceed max_size().
+        if (num_buckets_ <= max_size() / 2) {
+          Resize(num_buckets_ * 2);
+          return true;
+        }
+      } else if (PROTOBUF_PREDICT_FALSE(new_size <= lo_cutoff &&
+                                        num_buckets_ > kMinTableSize)) {
+        size_type lg2_of_size_reduction_factor = 1;
+        // It's possible we want to shrink a lot here... size() could even be 0.
+        // So, estimate how much to shrink by making sure we don't shrink so
+        // much that we would need to grow the table after a few inserts.
+        const size_type hypothetical_size = new_size * 5 / 4 + 1;
+        while ((hypothetical_size << lg2_of_size_reduction_factor) <
+               hi_cutoff) {
+          ++lg2_of_size_reduction_factor;
+        }
+        // Never shrink below the minimum table size.
+        size_type new_num_buckets = std::max<size_type>(
+            kMinTableSize, num_buckets_ >> lg2_of_size_reduction_factor);
+        if (new_num_buckets != num_buckets_) {
+          Resize(new_num_buckets);
+          return true;
+        }
+      }
+      return false;
+    }
+
+    // Resize to the given number of buckets.  A fresh empty table is created,
+    // every existing node is re-inserted into it (nodes are relinked, not
+    // copied), and the old bucket array is released.
+    void Resize(size_t new_num_buckets) {
+      GOOGLE_DCHECK_GE(new_num_buckets, kMinTableSize);
+      void** const old_table = table_;
+      const size_type old_table_size = num_buckets_;
+      num_buckets_ = new_num_buckets;
+      table_ = CreateEmptyTable(num_buckets_);
+      // Buckets before the old index_of_first_non_null_ are empty, so the
+      // scan can start there.
+      const size_type start = index_of_first_non_null_;
+      index_of_first_non_null_ = num_buckets_;
+      for (size_type i = start; i < old_table_size; i++) {
+        if (TableEntryIsNonEmptyList(old_table, i)) {
+          TransferList(old_table, i);
+        } else if (TableEntryIsTree(old_table, i)) {
+          // The post-increment skips the partner bucket sharing this tree.
+          TransferTree(old_table, i++);
+        }
+      }
+      Dealloc<void*>(old_table, old_table_size);
+    }
+
+    // Re-insert every node of the linked list stored at table[index] into the
+    // current table_. The slot must hold a non-empty list.
+    void TransferList(void* const* table, size_type index) {
+      Node* current = static_cast<Node*>(table[index]);
+      for (;;) {
+        // Read the successor before handing the node to InsertUnique.
+        Node* const following = current->next;
+        InsertUnique(BucketNumber(current->kv.first), current);
+        if (following == nullptr) {
+          break;
+        }
+        current = following;
+      }
+    }
+
+    // Re-insert every node referenced by the tree stored at table[index] into
+    // the current table_, then dispose of the tree object itself. The tree
+    // must contain at least one element.
+    void TransferTree(void* const* table, size_type index) {
+      Tree* const source = static_cast<Tree*>(table[index]);
+      typename Tree::iterator position = source->begin();
+      for (;;) {
+        InsertUnique(BucketNumber(std::cref(position->first).get()),
+                     NodeFromTreeIterator(position));
+        ++position;
+        if (position == source->end()) {
+          break;
+        }
+      }
+      DestroyTree(source);
+    }
+
+    // Unlink `item` from the singly linked list starting at `head` and return
+    // the (possibly new) head of the list. `item` must be in the list.
+    Node* EraseFromLinkedList(Node* item, Node* head) {
+      if (head != item) {
+        // Not here: remove it from the tail and keep the current head.
+        head->next = EraseFromLinkedList(item, head->next);
+        return head;
+      }
+      // Found it: the remainder of the list takes its place.
+      return head->next;
+    }
+
+    // Member overloads: classify slot b of the live table_.
+    bool TableEntryIsEmpty(size_type b) const {
+      return TableEntryIsEmpty(table_, b);
+    }
+    bool TableEntryIsNonEmptyList(size_type b) const {
+      return TableEntryIsNonEmptyList(table_, b);
+    }
+    bool TableEntryIsTree(size_type b) const {
+      return TableEntryIsTree(table_, b);
+    }
+    bool TableEntryIsList(size_type b) const {
+      return TableEntryIsList(table_, b);
+    }
+
+    // Static overloads: classify slot b of an arbitrary table. A slot is
+    //   * empty            when it holds nullptr;
+    //   * a non-empty list when it holds a non-null pointer that differs from
+    //                      the one in its sibling slot b ^ 1;
+    //   * a tree           when it holds the same non-null pointer as its
+    //                      sibling slot b ^ 1.
+    static bool TableEntryIsEmpty(void* const* table, size_type b) {
+      return nullptr == table[b];
+    }
+    static bool TableEntryIsNonEmptyList(void* const* table, size_type b) {
+      void* const entry = table[b];
+      return entry != nullptr && entry != table[b ^ 1];
+    }
+    static bool TableEntryIsTree(void* const* table, size_type b) {
+      return !(TableEntryIsEmpty(table, b) ||
+               TableEntryIsNonEmptyList(table, b));
+    }
+    // A "list" is anything that is not a tree, including an empty slot.
+    static bool TableEntryIsList(void* const* table, size_type b) {
+      return TableEntryIsEmpty(table, b) ||
+             TableEntryIsNonEmptyList(table, b);
+    }
+
+    // Replace the two linked lists in sibling slots b and b ^ 1 with a single
+    // tree that both slots point to. Neither slot may already hold a tree.
+    void TreeConvert(size_type b) {
+      const size_type sibling = b ^ 1;
+      GOOGLE_DCHECK(!TableEntryIsTree(b) && !TableEntryIsTree(sibling));
+      Tree* const merged =
+          Arena::Create<Tree>(alloc_.arena(), typename Tree::key_compare(),
+                              typename Tree::allocator_type(alloc_));
+      size_type moved = CopyListToTree(b, merged);
+      moved += CopyListToTree(sibling, merged);
+      GOOGLE_DCHECK_EQ(moved, merged->size());
+      // Sharing one pointer between both slots is what marks them as a tree.
+      void* const shared_entry = static_cast<void*>(merged);
+      table_[sibling] = shared_entry;
+      table_[b] = shared_entry;
+    }
+
+    // Copy a linked list in the given bucket to a tree.
+    // Returns the number of things it copied.
+    size_type CopyListToTree(size_type b, Tree* tree) {
+      size_type copied = 0;
+      for (Node* current = static_cast<Node*>(table_[b]); current != nullptr;
+           ++copied) {
+        tree->insert({current->kv.first, current});
+        // Detach the node from the list once the tree references it.
+        Node* const remainder = current->next;
+        current->next = nullptr;
+        current = remainder;
+      }
+      return copied;
+    }
+
+    // Return whether table_[b] is a linked list that seems awfully long.
+    // Requires table_[b] to point to a non-empty linked list.
+    bool TableEntryIsTooLong(size_type b) {
+      const size_type kMaxLength = 8;
+      // The list is non-empty, so start at one and walk the remaining nodes.
+      size_type length = 1;
+      for (const Node* cursor = static_cast<Node*>(table_[b])->next;
+           cursor != nullptr; cursor = cursor->next) {
+        ++length;
+      }
+      // Invariant: no linked list ever is more than kMaxLength in length.
+      GOOGLE_DCHECK_LE(length, kMaxLength);
+      return !(length < kMaxLength);
+    }
+
+    // Map key `k` to a bucket index in [0, num_buckets_).
+    template <typename K>
+    size_type BucketNumber(const K& k) const {
+      // Multiplicative hashing constant suggested by Knuth: roughly
+      // (sqrt(5) - 1) / 2 * 2^64.
+      constexpr uint64 kPhi = uint64{0x9e3779b97f4a7c15};
+
+      // Xor-ing with the per-map seed effectively randomizes the hash
+      // function.
+      const uint64 seeded_hash = hash_function()(k) ^ seed_;
+      const uint64 scrambled = kPhi * seeded_hash;
+
+      // Drop the low 32 bits of the product, then mask down to the table
+      // size (num_buckets_ is a power of two).
+      return (scrambled >> 32) & (num_buckets_ - 1);
+    }
+
+    // Return a power of two no less than max(kMinTableSize, n).
+    // Assumes either n < kMinTableSize or n is a power of two.
+    size_type TableSize(size_type n) {
+      // Clamp from below to the minimum table size; otherwise keep n as is.
+      const size_type minimum = static_cast<size_type>(kMinTableSize);
+      if (n < minimum) {
+        return minimum;
+      }
+      return n;
+    }
+
+    // Use alloc_ to allocate an array of n objects of type U.
+    template <typename U>
+    U* Alloc(size_type n) {
+      // Rebind the map's allocator to U and request storage for n objects.
+      using alloc_type = typename Allocator::template rebind<U>::other;
+      alloc_type rebound(alloc_);
+      return rebound.allocate(n);
+    }
+
+    // Use alloc_ to deallocate an array of n objects of type U.
+    template <typename U>
+    void Dealloc(U* t, size_type n) {
+      // Rebind the map's allocator to U and hand the storage back to it.
+      using alloc_type = typename Allocator::template rebind<U>::other;
+      alloc_type rebound(alloc_);
+      rebound.deallocate(t, n);
+    }
+
+    // Delete `node` only when the map is not arena-backed; with an arena the
+    // node is left untouched here.
+    void DestroyNode(Node* node) {
+      if (alloc_.arena() != nullptr) {
+        return;
+      }
+      delete node;
+    }
+
+    // Delete `tree` only when the map is not arena-backed; with an arena the
+    // tree is left untouched here.
+    void DestroyTree(Tree* tree) {
+      if (alloc_.arena() != nullptr) {
+        return;
+      }
+      delete tree;
+    }
+
+    // Allocate a table of n slots with every slot set to null. n must be a
+    // power of two and at least kMinTableSize.
+    void** CreateEmptyTable(size_type n) {
+      GOOGLE_DCHECK(n >= kMinTableSize);
+      GOOGLE_DCHECK_EQ(n & (n - 1), 0);
+      void** const buckets = Alloc<void*>(n);
+      memset(buckets, 0, sizeof(void*) * n);
+      return buckets;
+    }
+
+    // Return a randomish value.
+    size_type Seed() const {
+      // We get a little bit of randomness from the address of the map. The
+      // lower bits are not very random, due to alignment, so we discard them
+      // and shift the higher bits into their place.
+      size_type s = reinterpret_cast<uintptr_t>(this) >> 12;
+      // On x86-64 GCC-compatible builds (unless GOOGLE_PROTOBUF_NO_RDTSC is
+      // defined) the 64-bit value produced by the rdtsc instruction is mixed
+      // in as well: the low half arrives in `lo`, the high half in `hi`.
+#if defined(__x86_64__) && defined(__GNUC__) && \
+    !defined(GOOGLE_PROTOBUF_NO_RDTSC)
+      uint32 hi, lo;
+      asm("rdtsc" : "=a"(lo), "=d"(hi));
+      s += ((static_cast<uint64>(hi) << 32) | lo);
+#endif
+      // On every other configuration the seed is derived from the address
+      // alone.
+      return s;
+    }
+
+    friend class Arena;
+    using InternalArenaConstructable_ = void;
+    using DestructorSkippable_ = void;
+
+    // NOTE(review): presumably the number of stored elements; it is maintained
+    // outside this part of the file — confirm.
+    size_type num_elements_;
+    // Number of slots in table_; BucketNumber masks with (num_buckets_ - 1).
+    size_type num_buckets_;
+    // Value xor-ed into every hash in BucketNumber.
+    size_type seed_;
+    // Resize starts scanning the old table at this index and resets it to
+    // num_buckets_ before re-inserting the elements.
+    size_type index_of_first_non_null_;
+    void** table_;  // an array with num_buckets_ entries
+    // Allocator used for the table; its arena() decides whether nodes and
+    // trees are deleted explicitly (see DestroyNode / DestroyTree).
+    Allocator alloc_;
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(InnerMap);
+  };  // end of class InnerMap
+
+  // Parameter type used by the lookup-style member templates (find, at,
+  // count, contains, erase, equal_range). It is computed by
+  // internal::TransparentSupport<key_type> from the requested LookupKey.
+  // NOTE(review): presumably this enables heterogeneous lookup for some key
+  // types — confirm against TransparentSupport.
+  template <typename LookupKey>
+  using key_arg = typename internal::TransparentSupport<
+      key_type>::template key_arg<LookupKey>;
+
+ public:
+  // Iterators
+  // Read-only forward iterator over the map's entries. It is a thin wrapper
+  // that forwards every operation to an InnerMap::const_iterator.
+  class const_iterator {
+    using InnerIt = typename InnerMap::const_iterator;
+
+   public:
+    using value_type = typename Map::value_type;
+    using reference = const value_type&;
+    using pointer = const value_type*;
+    using difference_type = ptrdiff_t;
+    using iterator_category = std::forward_iterator_tag;
+
+    const_iterator() {}
+    explicit const_iterator(const InnerIt& it) : it_(it) {}
+
+    const_reference operator*() const { return *it_; }
+    const_pointer operator->() const { return &**this; }
+
+    // Pre-increment: advance the wrapped iterator in place.
+    const_iterator& operator++() {
+      ++it_;
+      return *this;
+    }
+    // Post-increment: wrap whatever the inner post-increment returns.
+    const_iterator operator++(int) { return const_iterator(it_++); }
+
+    // Two iterators are equal exactly when their wrapped iterators are.
+    friend bool operator==(const const_iterator& lhs,
+                           const const_iterator& rhs) {
+      return lhs.it_ == rhs.it_;
+    }
+    friend bool operator!=(const const_iterator& lhs,
+                           const const_iterator& rhs) {
+      return !(lhs.it_ == rhs.it_);
+    }
+
+   private:
+    InnerIt it_;
+  };
+
+  // Mutable forward iterator over the map's entries. It is a thin wrapper
+  // that forwards every operation to an InnerMap::iterator, and it converts
+  // implicitly to const_iterator.
+  class iterator {
+    using InnerIt = typename InnerMap::iterator;
+
+   public:
+    using value_type = typename Map::value_type;
+    using reference = value_type&;
+    using pointer = value_type*;
+    using difference_type = ptrdiff_t;
+    using iterator_category = std::forward_iterator_tag;
+
+    iterator() {}
+    explicit iterator(const InnerIt& it) : it_(it) {}
+
+    reference operator*() const { return *it_; }
+    pointer operator->() const { return &**this; }
+
+    // Pre-increment: advance the wrapped iterator in place.
+    iterator& operator++() {
+      ++it_;
+      return *this;
+    }
+    // Post-increment: wrap whatever the inner post-increment returns.
+    iterator operator++(int) { return iterator(it_++); }
+
+    // Allow implicit conversion to const_iterator.
+    operator const_iterator() const {  // NOLINT(runtime/explicit)
+      typename InnerMap::const_iterator inner(it_);
+      return const_iterator(inner);
+    }
+
+    // Two iterators are equal exactly when their wrapped iterators are.
+    friend bool operator==(const iterator& lhs, const iterator& rhs) {
+      return lhs.it_ == rhs.it_;
+    }
+    friend bool operator!=(const iterator& lhs, const iterator& rhs) {
+      return !(lhs.it_ == rhs.it_);
+    }
+
+   private:
+    // Map::erase reads it_ directly.
+    friend class Map;
+
+    InnerIt it_;
+  };
+
+  // begin()/end() wrap the InnerMap iterators obtained through elements_.
+  iterator begin() { return iterator(elements_->begin()); }
+  iterator end() { return iterator(elements_->end()); }
+  // The const overloads first build a mutable iterator and then convert it
+  // to const_iterator through iterator's conversion operator.
+  const_iterator begin() const {
+    return const_iterator(iterator(elements_->begin()));
+  }
+  const_iterator end() const {
+    return const_iterator(iterator(elements_->end()));
+  }
+  // cbegin()/cend() are aliases for the const begin()/end().
+  const_iterator cbegin() const { return begin(); }
+  const_iterator cend() const { return end(); }
+
+  // Capacity
+  // size() is forwarded to the InnerMap; empty() is defined in terms of it.
+  size_type size() const { return elements_->size(); }
+  bool empty() const { return size() == 0; }
+
+  // Element access
+  // Returns the mapped value of the entry InnerMap::operator[] yields for key.
+  // NOTE(review): presumably inserts a default value when the key is absent,
+  // as std::map does — confirm against InnerMap::operator[].
+  T& operator[](const key_type& key) { return (*elements_)[key].second; }
+
+  // Checked lookup (read-only): returns the value mapped to `key`. A missing
+  // key trips the GOOGLE_CHECK below.
+  template <typename K = key_type>
+  const T& at(const key_arg<K>& key) const {
+    const const_iterator found = find(key);
+    GOOGLE_CHECK(found != end()) << "key not found: " << static_cast<Key>(key);
+    return found->second;
+  }
+
+  // Checked lookup (mutable): returns the value mapped to `key`. A missing
+  // key trips the GOOGLE_CHECK below.
+  template <typename K = key_type>
+  T& at(const key_arg<K>& key) {
+    const iterator found = find(key);
+    GOOGLE_CHECK(found != end()) << "key not found: " << static_cast<Key>(key);
+    return found->second;
+  }
+
+  // Lookup
+  // Number of entries with the given key: always 0 or 1.
+  template <typename K = key_type>
+  size_type count(const key_arg<K>& key) const {
+    if (find(key) == end()) {
+      return 0;
+    }
+    return 1;
+  }
+
+  // Locate `key`; yields end() when it is absent. The const overload goes
+  // through a mutable iterator and converts it to const_iterator.
+  template <typename K = key_type>
+  const_iterator find(const key_arg<K>& key) const {
+    iterator located(elements_->find(key));
+    return const_iterator(located);
+  }
+  template <typename K = key_type>
+  iterator find(const key_arg<K>& key) {
+    iterator located(elements_->find(key));
+    return located;
+  }
+
+  // True when an entry with the given key exists.
+  template <typename K = key_type>
+  bool contains(const key_arg<K>& key) const {
+    return !(find(key) == end());
+  }
+
+  // Range of entries matching `key` (read-only). Keys are unique, so the
+  // range is either empty, (end, end), or holds exactly one entry,
+  // (match, next-after-match).
+  template <typename K = key_type>
+  std::pair<const_iterator, const_iterator> equal_range(
+      const key_arg<K>& key) const {
+    const_iterator cursor = find(key);
+    if (cursor == end()) {
+      return std::make_pair(cursor, cursor);
+    }
+    const const_iterator match = cursor++;
+    return std::make_pair(match, cursor);
+  }
+
+  // Range of entries matching `key` (mutable); same shape as the const
+  // overload.
+  template <typename K = key_type>
+  std::pair<iterator, iterator> equal_range(const key_arg<K>& key) {
+    iterator cursor = find(key);
+    if (cursor == end()) {
+      return std::make_pair(cursor, cursor);
+    }
+    const iterator match = cursor++;
+    return std::make_pair(match, cursor);
+  }
+
+  // insert
+  // Insert `value` unless its key is already present. Returns an iterator to
+  // the entry for that key and whether a new entry was created; an existing
+  // entry keeps its current value.
+  std::pair<iterator, bool> insert(const value_type& value) {
+    std::pair<typename InnerMap::iterator, bool> slot =
+        elements_->insert(value.first);
+    const bool created = slot.second;
+    if (created) {
+      slot.first->second = value.second;
+    }
+    return std::pair<iterator, bool>(iterator(slot.first), created);
+  }
+  // Insert every element of [first, last) whose key is not yet present;
+  // existing entries are left untouched.
+  template <class InputIt>
+  void insert(InputIt first, InputIt last) {
+    InputIt source = first;
+    while (source != last) {
+      if (find(source->first) == end()) {
+        (*this)[source->first] = source->second;
+      }
+      ++source;
+    }
+  }
+  // Insert the elements of an initializer list via the range overload.
+  void insert(std::initializer_list<value_type> values) {
+    insert(values.begin(), values.end());
+  }
+
+  // Erase and clear
+  // Remove the entry with the given key; returns how many entries were
+  // removed (0 or 1).
+  template <typename K = key_type>
+  size_type erase(const key_arg<K>& key) {
+    iterator target = find(key);
+    if (target == end()) {
+      return 0;
+    }
+    erase(target);
+    return 1;
+  }
+  // Remove the entry at `pos` and return the iterator that followed it.
+  iterator erase(iterator pos) {
+    // Advance first so the returned iterator never refers to the erased
+    // entry.
+    iterator doomed = pos++;
+    elements_->erase(doomed.it_);
+    return pos;
+  }
+  // Remove every entry in [first, last), one at a time.
+  void erase(iterator first, iterator last) {
+    for (; first != last; first = erase(first)) {
+    }
+  }
+  // Remove all entries by clearing the underlying InnerMap.
+  void clear() { elements_->clear(); }
+
+  // Assign
+  // Copy-assign: replace this map's contents with a copy of `other`'s.
+  // Self-assignment is a no-op.
+  Map& operator=(const Map& other) {
+    if (this == &other) {
+      return *this;
+    }
+    clear();
+    insert(other.begin(), other.end());
+    return *this;
+  }
+
+  // Exchange contents with `other`. Maps on the same arena (or both on none)
+  // just trade their internals; otherwise the contents are copied across.
+  void swap(Map& other) {
+    if (arena_ != other.arena_) {
+      // TODO(zuguang): optimize this. The temporary copy can be allocated
+      // in the same arena as the other message, and the "other = copy" can
+      // be replaced with the fast-path swap below.
+      Map copy = *this;
+      *this = other;
+      other = copy;
+      return;
+    }
+    // Fast path: swap the enum default and the InnerMap pointer.
+    std::swap(default_enum_value_, other.default_enum_value_);
+    std::swap(elements_, other.elements_);
+  }
+
+  // Access to hasher.  Currently this returns a copy, but it may
+  // be modified to return a const reference in the future.
+  // The hasher is obtained from the underlying InnerMap (elements_).
+  hasher hash_function() const { return elements_->hash_function(); }
+
+ private:
+  // Set default enum value only for proto2 map field whose value is enum type.
+  // Simply records the value in default_enum_value_; private, so only
+  // friends of Map can call it.
+  void SetDefaultEnumValue(int default_enum_value) {
+    default_enum_value_ = default_enum_value;
+  }
+
+  // Arena this map is associated with; swap() compares the arenas of the two
+  // maps to choose between the pointer swap and the copying path.
+  Arena* arena_;
+  // Value recorded by SetDefaultEnumValue().
+  int default_enum_value_;
+  // Underlying hash table; the public operations above forward to it.
+  InnerMap* elements_;
+
+  friend class Arena;
+  using InternalArenaConstructable_ = void;
+  using DestructorSkippable_ = void;
+  template <typename Derived, typename K, typename V,
+            internal::WireFormatLite::FieldType key_wire_type,
+            internal::WireFormatLite::FieldType value_wire_type,
+            int default_enum_value>
+  friend class internal::MapFieldLite;
+};
+
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_MAP_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc4a6cc718cd19e17c4d00398147b0846d9bfbe6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_inl.h
@@ -0,0 +1,362 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_MAP_FIELD_INL_H__
+#define GOOGLE_PROTOBUF_MAP_FIELD_INL_H__
+
+#include <memory>
+
+#include <google/protobuf/stubs/casts.h>
+#include <google/protobuf/map.h>
+#include <google/protobuf/map_field.h>
+#include <google/protobuf/map_type_handler.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+namespace internal {
+// UnwrapMapKey template
+// Extracts the value held by a MapKey as the requested C++ key type T. Only
+// the primary template declaration and the explicit specializations below
+// appear here, one per supported key type (int32, uint32, int64, uint64,
+// bool, std::string); each forwards to the matching MapKey getter.
+template <typename T>
+T UnwrapMapKey(const MapKey& map_key);
+template <>
+inline int32 UnwrapMapKey<int32>(const MapKey& map_key) {
+  return map_key.GetInt32Value();
+}
+template <>
+inline uint32 UnwrapMapKey<uint32>(const MapKey& map_key) {
+  return map_key.GetUInt32Value();
+}
+template <>
+inline int64 UnwrapMapKey<int64>(const MapKey& map_key) {
+  return map_key.GetInt64Value();
+}
+template <>
+inline uint64 UnwrapMapKey<uint64>(const MapKey& map_key) {
+  return map_key.GetUInt64Value();
+}
+template <>
+inline bool UnwrapMapKey<bool>(const MapKey& map_key) {
+  return map_key.GetBoolValue();
+}
+// Note: the string specialization returns the key by value.
+template <>
+inline std::string UnwrapMapKey<std::string>(const MapKey& map_key) {
+  return map_key.GetStringValue();
+}
+
+// SetMapKey template
+// Stores `value` into *map_key using the MapKey setter that matches T. Only
+// the primary template declaration and the explicit specializations below
+// appear here, one per supported key type (int32, uint32, int64, uint64,
+// bool, std::string).
+template <typename T>
+inline void SetMapKey(MapKey* map_key, const T& value);
+template <>
+inline void SetMapKey<int32>(MapKey* map_key, const int32& value) {
+  map_key->SetInt32Value(value);
+}
+template <>
+inline void SetMapKey<uint32>(MapKey* map_key, const uint32& value) {
+  map_key->SetUInt32Value(value);
+}
+template <>
+inline void SetMapKey<int64>(MapKey* map_key, const int64& value) {
+  map_key->SetInt64Value(value);
+}
+template <>
+inline void SetMapKey<uint64>(MapKey* map_key, const uint64& value) {
+  map_key->SetUInt64Value(value);
+}
+template <>
+inline void SetMapKey<bool>(MapKey* map_key, const bool& value) {
+  map_key->SetBoolValue(value);
+}
+template <>
+inline void SetMapKey<std::string>(MapKey* map_key, const std::string& value) {
+  map_key->SetStringValue(value);
+}
+
+// ------------------------TypeDefinedMapFieldBase---------------
+// Reinterprets the opaque iter_ pointer stored in a MapIterator as the typed
+// Map<Key, T>::const_iterator it refers to and returns a reference to that
+// object.
+template <typename Key, typename T>
+typename Map<Key, T>::const_iterator&
+TypeDefinedMapFieldBase<Key, T>::InternalGetIterator(
+    const MapIterator* map_iter) const {
+  typedef typename Map<Key, T>::const_iterator TypedIterator;
+  TypedIterator* const typed = reinterpret_cast<TypedIterator*>(map_iter->iter_);
+  return *typed;
+}
+
+// Point map_iter at the first entry of the map and refresh its cached
+// key/value.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::MapBegin(MapIterator* map_iter) const {
+  typename Map<Key, T>::const_iterator& position =
+      InternalGetIterator(map_iter);
+  position = GetMap().begin();
+  SetMapIteratorValue(map_iter);
+}
+
+// Point map_iter at the past-the-end position. The cached key/value are not
+// refreshed here.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::MapEnd(MapIterator* map_iter) const {
+  typename Map<Key, T>::const_iterator& position =
+      InternalGetIterator(map_iter);
+  position = GetMap().end();
+}
+
+// Two MapIterators are equal when their typed iterators compare equal.
+template <typename Key, typename T>
+bool TypeDefinedMapFieldBase<Key, T>::EqualIterator(
+    const MapIterator& a, const MapIterator& b) const {
+  const typename Map<Key, T>::const_iterator& lhs = InternalGetIterator(&a);
+  const typename Map<Key, T>::const_iterator& rhs = InternalGetIterator(&b);
+  return lhs == rhs;
+}
+
+// Advance map_iter by one entry and refresh its cached key/value.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::IncreaseIterator(
+    MapIterator* map_iter) const {
+  typename Map<Key, T>::const_iterator& position =
+      InternalGetIterator(map_iter);
+  ++position;
+  SetMapIteratorValue(map_iter);
+}
+
+// Allocate the typed iterator that backs map_iter and store it in iter_.
+// DeleteIterator is its counterpart.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::InitializeIterator(
+    MapIterator* map_iter) const {
+  typedef typename Map<Key, T>::const_iterator TypedIterator;
+  TypedIterator* const fresh = new TypedIterator;
+  map_iter->iter_ = fresh;
+  GOOGLE_CHECK(map_iter->iter_ != NULL);
+}
+
+// Destroy the typed iterator previously stored in map_iter->iter_.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::DeleteIterator(
+    MapIterator* map_iter) const {
+  typedef typename Map<Key, T>::const_iterator TypedIterator;
+  TypedIterator* const owned = reinterpret_cast<TypedIterator*>(map_iter->iter_);
+  delete owned;
+}
+
+// Make this_iter refer to the same position as that_iter, copying the key
+// and value type tags and then refreshing this_iter's cached key/value.
+template <typename Key, typename T>
+void TypeDefinedMapFieldBase<Key, T>::CopyIterator(
+    MapIterator* this_iter, const MapIterator& that_iter) const {
+  typename Map<Key, T>::const_iterator& destination =
+      InternalGetIterator(this_iter);
+  destination = InternalGetIterator(&that_iter);
+  this_iter->key_.SetType(that_iter.key_.type());
+  // The raw type_ field is read instead of calling MapValueRef::type(),
+  // because type() fails when the contained data is null, and the data can
+  // be null when this_iter points to MapEnd.
+  const FieldDescriptor::CppType value_type =
+      static_cast<FieldDescriptor::CppType>(that_iter.value_.type_);
+  this_iter->value_.SetType(value_type);
+  SetMapIteratorValue(this_iter);
+}
+
+// ----------------------------------------------------------------------
+
+// Number of entries in the map, as an int. The map is synchronized with the
+// repeated-field representation first.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+int MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+             default_enum_value>::size() const {
+  MapFieldBase::SyncMapWithRepeatedField();
+  const Map<Key, T>& synced_map = impl_.GetMap();
+  return static_cast<int>(synced_map.size());
+}
+
+// Empty both representations of the field: the repeated field (when one has
+// been created) and the map.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::Clear() {
+  if (this->MapFieldBase::repeated_field_ != nullptr) {
+    reinterpret_cast<RepeatedPtrField<EntryType>*>(
+        this->MapFieldBase::repeated_field_)
+        ->Clear();
+  }
+
+  impl_.MutableMap()->clear();
+  // Both containers are empty now, yet the state must not become CLEAN:
+  // Clear is a generated API, so references to the map that callers obtained
+  // earlier must stay valid.
+  MapFieldBase::SetMapDirty();
+}
+
+// Refresh the key and value cached inside map_iter from the entry its typed
+// iterator currently points at. Nothing is changed when the iterator is at
+// the end of the map.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::SetMapIteratorValue(MapIterator* map_iter)
+    const {
+  const Map<Key, T>& entries = impl_.GetMap();
+  typename Map<Key, T>::const_iterator current =
+      TypeDefinedMapFieldBase<Key, T>::InternalGetIterator(map_iter);
+  if (!(current == entries.end())) {
+    SetMapKey(&map_iter->key_, current->first);
+    map_iter->value_.SetValue(&current->second);
+  }
+}
+
+// True when the map holds an entry whose key equals the one wrapped in
+// map_key.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+bool MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::ContainsMapKey(const MapKey& map_key) const {
+  const Map<Key, T>& entries = impl_.GetMap();
+  const Key& key = UnwrapMapKey<Key>(map_key);
+  return entries.find(key) != entries.end();
+}
+
+// Make *val refer to the value mapped to map_key, creating the entry when it
+// does not exist yet. Returns true when a new entry was created and false
+// when the key was already present.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+bool MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::InsertOrLookupMapValue(const MapKey& map_key,
+                                                          MapValueRef* val) {
+  // The mutable map is always used because callers may modify the value
+  // through the MapValueRef.
+  Map<Key, T>* const target = MutableMap();
+  const Key& key = UnwrapMapKey<Key>(map_key);
+  const typename Map<Key, T>::iterator existing = target->find(key);
+  if (existing != target->end()) {
+    // Key is already in the map. Deliberately avoid (*target)[key] here:
+    // operator[] may reorder the map and iterators.
+    val->SetValue(&(existing->second));
+    return false;
+  }
+  val->SetValue(&((*target)[key]));
+  return true;
+}
+
+// Erase the entry for map_key from the mutable map. The erase count is
+// converted to bool, so the result is true exactly when an entry was
+// removed.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+bool MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::DeleteMapValue(const MapKey& map_key) {
+  const Key& key = UnwrapMapKey<Key>(map_key);
+  Map<Key, T>* const target = MutableMap();
+  return target->erase(key);
+}
+
+// Merge the entries of `other` into this field. Both sides are synchronized
+// to their map representation first, and this field is marked dirty
+// afterwards. `other` must be a MapField of the same instantiation.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::MergeFrom(const MapFieldBase& other) {
+  MapFieldBase::SyncMapWithRepeatedField();
+  const MapField& source = static_cast<const MapField&>(other);
+  source.SyncMapWithRepeatedField();
+  impl_.MergeFrom(source.impl_);
+  MapFieldBase::SetMapDirty();
+}
+
+// Exchange the repeated field, the map implementation and the sync state
+// with `other`, which must be a MapField of the same instantiation.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::Swap(MapFieldBase* other) {
+  MapField* const peer = down_cast<MapField*>(other);
+  std::swap(this->MapFieldBase::repeated_field_, peer->repeated_field_);
+  impl_.Swap(&peer->impl_);
+  // The atomic state is swapped by hand with relaxed loads and stores: read
+  // both values, then write each into the opposite object.
+  const auto peer_state = peer->state_.load(std::memory_order_relaxed);
+  const auto own_state =
+      this->MapFieldBase::state_.load(std::memory_order_relaxed);
+  peer->state_.store(own_state, std::memory_order_relaxed);
+  this->MapFieldBase::state_.store(peer_state, std::memory_order_relaxed);
+}
+
+// Rebuild the repeated-field representation from the map: create the
+// repeated field on first use, clear it, and append one entry message per
+// map element.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::SyncRepeatedFieldWithMapNoLock() const {
+  if (this->MapFieldBase::repeated_field_ == NULL) {
+    // Heap-allocate without an arena, otherwise allocate on the arena.
+    this->MapFieldBase::repeated_field_ =
+        (this->MapFieldBase::arena_ == NULL)
+            ? new RepeatedPtrField<Message>()
+            : Arena::CreateMessage<RepeatedPtrField<Message> >(
+                  this->MapFieldBase::arena_);
+  }
+  const Map<Key, T>& entries = impl_.GetMap();
+  RepeatedPtrField<EntryType>* const typed_entries =
+      reinterpret_cast<RepeatedPtrField<EntryType>*>(
+          this->MapFieldBase::repeated_field_);
+
+  typed_entries->Clear();
+
+  // This point is only reachable through reflection, and the reflection
+  // object is only obtainable by calling GetReflection on the enclosing
+  // field. That type therefore already exists, which means the MapEntry
+  // default instance has been constructed too, so calling
+  // internal_default_instance() is safe.
+  const Message* const prototype = Derived::internal_default_instance();
+  typename Map<Key, T>::const_iterator cursor = entries.begin();
+  const typename Map<Key, T>::const_iterator stop = entries.end();
+  while (cursor != stop) {
+    EntryType* const entry =
+        down_cast<EntryType*>(prototype->New(this->MapFieldBase::arena_));
+    typed_entries->AddAllocated(entry);
+    (*entry->mutable_key()) = cursor->first;
+    (*entry->mutable_value()) = cursor->second;
+    ++cursor;
+  }
+}
+
+// Rebuild the map from the repeated-field representation: clear the map and
+// copy every entry message's key/value pair into it. The repeated field
+// must already exist.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+void MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+              default_enum_value>::SyncMapWithRepeatedFieldNoLock() const {
+  Map<Key, T>* const target = const_cast<MapField*>(this)->impl_.MutableMap();
+  RepeatedPtrField<EntryType>* const typed_entries =
+      reinterpret_cast<RepeatedPtrField<EntryType>*>(
+          this->MapFieldBase::repeated_field_);
+  GOOGLE_CHECK(this->MapFieldBase::repeated_field_ != NULL);
+  target->clear();
+  typename RepeatedPtrField<EntryType>::iterator entry = typed_entries->begin();
+  for (; entry != typed_entries->end(); ++entry) {
+    // The cast is required because, for enum values, Map's API type and its
+    // internal storage type differ: an int cannot be cast to an enum, so the
+    // value has to be copied. For all other types the exposed and stored
+    // types are the same and no copy should be introduced. CastValueType
+    // achieves this by being a value type for enums and a reference type
+    // otherwise.
+    (*target)[entry->key()] = static_cast<CastValueType>(entry->value());
+  }
+}
+
+// Estimate the memory used by this field, excluding the MapField object
+// itself: the repeated field (when present), the Map object, and the
+// per-entry usage reported by the key and value type handlers.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+size_t MapField<Derived, Key, T, kKeyFieldType, kValueFieldType,
+                default_enum_value>::SpaceUsedExcludingSelfNoLock() const {
+  size_t total = 0;
+  if (this->MapFieldBase::repeated_field_ != NULL) {
+    total += this->MapFieldBase::repeated_field_->SpaceUsedExcludingSelfLong();
+  }
+  Map<Key, T>* const entries = const_cast<MapField*>(this)->impl_.MutableMap();
+  total += sizeof(*entries);
+  typename Map<Key, T>::iterator entry = entries->begin();
+  while (entry != entries->end()) {
+    total += KeyTypeHandler::SpaceUsedInMapLong(entry->first);
+    total += ValueTypeHandler::SpaceUsedInMapLong(entry->second);
+    ++entry;
+  }
+  return total;
+}
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_MAP_FIELD_INL_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8e04ca67aa1cfbe4f980ad5292140dcd04e022a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_field_lite.h
@@ -0,0 +1,195 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__
+#define GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__
+
+#include <type_traits>
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/map.h>
+#include <google/protobuf/map_entry_lite.h>
+#include <google/protobuf/port.h>
+#include <google/protobuf/wire_format_lite.h>
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// This class provides access to map field using generated api. It is used for
+// internal generated message implementation only. Users should never use this
+// directly.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType key_wire_type,
+          WireFormatLite::FieldType value_wire_type, int default_enum_value = 0>
+class MapFieldLite {
+  // Entry message type; this is simply the Derived template argument.
+  typedef Derived EntryType;
+
+ public:
+  typedef Map<Key, T> MapType;
+  typedef EntryType EntryTypeTrait;
+
+  MapFieldLite() { SetDefaultEnumValue(); }
+
+  explicit MapFieldLite(Arena* arena) : map_(arena) { SetDefaultEnumValue(); }
+
+  // Accessors
+  const Map<Key, T>& GetMap() const { return map_; }
+  Map<Key, T>* MutableMap() { return &map_; }
+
+  // Convenient methods for generated message implementation.
+  int size() const { return static_cast<int>(map_.size()); }  // narrowed to int
+  void Clear() { return map_.clear(); }
+  void MergeFrom(const MapFieldLite& other) {
+    for (typename Map<Key, T>::const_iterator it = other.map_.begin();
+         it != other.map_.end(); ++it) {
+      map_[it->first] = it->second;  // a key already present is overwritten
+    }
+  }
+  void Swap(MapFieldLite* other) { map_.swap(other->map_); }
+
+  // Set default enum value only for proto2 map field whose value is enum type.
+  void SetDefaultEnumValue() {
+    MutableMap()->SetDefaultEnumValue(default_enum_value);
+  }
+
+  // Used in the implementation of parsing. Caller should take the ownership iff
+  // map_.arena_ is NULL.
+  EntryType* NewEntry() const {
+    return Arena::CreateMessage<EntryType>(map_.arena_);
+  }
+  // Used in the implementation of serializing enum value type. Caller should
+  // take the ownership iff map_.arena_ is NULL.
+  EntryType* NewEnumEntryWrapper(const Key& key, const T t) const {
+    return EntryType::EnumWrap(key, t, map_.arena_);
+  }
+  // Used in the implementation of serializing other value types. Caller should
+  // take the ownership iff map_.arena_ is NULL.
+  EntryType* NewEntryWrapper(const Key& key, const T& t) const {
+    return EntryType::Wrap(key, t, map_.arena_);
+  }
+
+  const char* _InternalParse(const char* ptr, ParseContext* ctx) {
+    typename Derived::template Parser<MapFieldLite, Map<Key, T>> parser(this);
+    return parser._InternalParse(ptr, ctx);  // delegated to Derived::Parser
+  }
+
+  template <typename UnknownType>
+  const char* ParseWithEnumValidation(const char* ptr, ParseContext* ctx,
+                                      bool (*is_valid)(int), uint32 field_num,
+                                      InternalMetadata* metadata) {
+    typename Derived::template Parser<MapFieldLite, Map<Key, T>> parser(this);
+    return parser.template ParseWithEnumValidation<UnknownType>(
+        ptr, ctx, is_valid, field_num, metadata);
+  }
+
+ private:
+  typedef void DestructorSkippable_;  // NOTE(review): presumably an Arena marker typedef -- confirm in arena.h
+
+  Map<Key, T> map_;  // the only data member; every method above acts on it
+
+  friend class ::PROTOBUF_NAMESPACE_ID::Arena;
+};
+
+template <typename UnknownType, typename T>
+struct EnumParseWrapper {
+  const char* _InternalParse(const char* ptr, ParseContext* ctx) {
+    return map_field->template ParseWithEnumValidation<UnknownType>(
+        ptr, ctx, is_valid, field_num, metadata);
+  }
+  T* map_field;  // never deleted here; its ParseWithEnumValidation does the work
+  bool (*is_valid)(int);  // forwarded unchanged to ParseWithEnumValidation
+  uint32 field_num;  // forwarded unchanged
+  InternalMetadata* metadata;  // forwarded unchanged
+};
+
+// Helper function because the typenames of maps are horrendous to print. This
+// leverages compiler type deduction, to keep all type data out of the
+// generated code. T is deduced from map_field; UnknownType must be explicit.
+template <typename UnknownType, typename T>
+EnumParseWrapper<UnknownType, T> InitEnumParseWrapper(
+    T* map_field, bool (*is_valid)(int), uint32 field_num,
+    InternalMetadata* metadata) {
+  return EnumParseWrapper<UnknownType, T>{map_field, is_valid, field_num,
+                                          metadata};
+}
+
+// True if IsInitialized() is true for the value of every entry in field. T is
+// expected to be message.  It's useful to have this helper here to keep the
+// protobuf compiler from ever having to emit loops in IsInitialized() methods.
+// We want the C++ compiler to inline this or not as it sees fit.
+template <typename Derived, typename Key, typename T,
+          WireFormatLite::FieldType key_wire_type,
+          WireFormatLite::FieldType value_wire_type, int default_enum_value>
+bool AllAreInitialized(
+    const MapFieldLite<Derived, Key, T, key_wire_type, value_wire_type,
+                       default_enum_value>& field) {
+  const auto& t = field.GetMap();
+  for (typename Map<Key, T>::const_iterator it = t.begin(); it != t.end();
+       ++it) {
+    if (!it->second.IsInitialized()) return false;  // stop at first failure
+  }
+  return true;  // also reached when the map is empty
+}
+
+template <typename MEntry>  // generic case: defer to MEntry::SuperType
+struct MapEntryToMapField : MapEntryToMapField<typename MEntry::SuperType> {};
+
+template <typename T, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+struct MapEntryToMapField<MapEntryLite<T, Key, Value, kKeyFieldType,
+                                       kValueFieldType, default_enum_value>> {
+  typedef MapFieldLite<MapEntryLite<T, Key, Value, kKeyFieldType,
+                                    kValueFieldType, default_enum_value>,
+                       Key, Value, kKeyFieldType, kValueFieldType,
+                       default_enum_value>
+      MapFieldType;  // MapFieldLite whose Derived is this exact MapEntryLite
+};
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_MAP_FIELD_LITE_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0169bef30b5f7a3a41bfe301750b8ca3aa93a08
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_type_handler.h
@@ -0,0 +1,812 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_TYPE_HANDLER_H__
+#define GOOGLE_PROTOBUF_TYPE_HANDLER_H__
+
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/wire_format_lite.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Used for compile time type selection. MapIf::type will be TrueType if Flag is
+// true and FalseType otherwise.
+template <bool Flag, typename TrueType, typename FalseType>
+struct MapIf;  // declared only; the two specializations below provide ::type
+
+template <typename TrueType, typename FalseType>
+struct MapIf<true, TrueType, FalseType> {
+  typedef TrueType type;  // Flag == true
+};
+
+template <typename TrueType, typename FalseType>
+struct MapIf<false, TrueType, FalseType> {
+  typedef FalseType type;  // Flag == false
+};
+
+// In proto2 Map, enum needs to be initialized to given default value, while
+// other types' default value can be inferred from the type.
+template <bool IsEnum, typename Type>
+class MapValueInitializer {
+ public:
+  static inline void Initialize(Type& type, int default_enum_value);  // declared only; specializations define it
+};
+
+template <typename Type>
+class MapValueInitializer<true, Type> {
+ public:
+  static inline void Initialize(Type& value, int default_enum_value) {
+    value = static_cast<Type>(default_enum_value);  // enum: take the default
+  }
+};
+
+template <typename Type>
+class MapValueInitializer<false, Type> {
+ public:
+  static inline void Initialize(Type& /* value */,
+                                int /* default_enum_value */) {}  // non-enum: no-op
+};
+
+template <typename Type, bool is_arena_constructable>
+class MapArenaMessageCreator {
+ public:
+  // Use arena to create message if Type is arena constructable. Otherwise,
+  // create the message on heap.
+  static inline Type* CreateMessage(Arena* arena);  // defined by the specializations
+};
+template <typename Type>
+class MapArenaMessageCreator<Type, true> {
+ public:
+  static inline Type* CreateMessage(Arena* arena) {
+    return Arena::CreateMessage<Type>(arena);  // arena-constructable path
+  }
+};
+template <typename Type>
+class MapArenaMessageCreator<Type, false> {
+ public:
+  static inline Type* CreateMessage(Arena* arena) {
+    return Arena::Create<Type>(arena);  // path for non-arena-constructable Type
+  }
+};
+
+// Define constants and storage/accessor typedefs for a given wire field type
+template <WireFormatLite::FieldType field_type, typename Type>
+class MapWireFieldTypeTraits {};  // empty unless specialized by TYPE_TRAITS
+
+#define TYPE_TRAITS(FieldType, CType, WireFormatType, IsMessage, IsEnum)   \
+  template <typename Type>                                                 \
+  class MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType, Type> {   \
+   public:                                                                 \
+    static const bool kIsMessage = IsMessage;                              \
+    static const bool kIsEnum = IsEnum;                                    \
+    typedef typename MapIf<kIsMessage, Type*, CType>::type TypeOnMemory;   \
+    typedef typename MapIf<kIsEnum, int, Type>::type MapEntryAccessorType; \
+    static const WireFormatLite::WireType kWireType =                      \
+        WireFormatLite::WIRETYPE_##WireFormatType;                         \
+  };
+
+TYPE_TRAITS(MESSAGE, Type, LENGTH_DELIMITED, true, false)  // TypeOnMemory is Type*
+TYPE_TRAITS(STRING, ArenaStringPtr, LENGTH_DELIMITED, false, false)
+TYPE_TRAITS(BYTES, ArenaStringPtr, LENGTH_DELIMITED, false, false)
+TYPE_TRAITS(INT64, int64, VARINT, false, false)
+TYPE_TRAITS(UINT64, uint64, VARINT, false, false)
+TYPE_TRAITS(INT32, int32, VARINT, false, false)
+TYPE_TRAITS(UINT32, uint32, VARINT, false, false)
+TYPE_TRAITS(SINT64, int64, VARINT, false, false)
+TYPE_TRAITS(SINT32, int32, VARINT, false, false)
+TYPE_TRAITS(ENUM, int, VARINT, false, true)  // MapEntryAccessorType is int
+TYPE_TRAITS(DOUBLE, double, FIXED64, false, false)
+TYPE_TRAITS(FLOAT, float, FIXED32, false, false)
+TYPE_TRAITS(FIXED64, uint64, FIXED64, false, false)
+TYPE_TRAITS(FIXED32, uint32, FIXED32, false, false)
+TYPE_TRAITS(SFIXED64, int64, FIXED64, false, false)
+TYPE_TRAITS(SFIXED32, int32, FIXED32, false, false)
+TYPE_TRAITS(BOOL, bool, VARINT, false, false)
+
+#undef TYPE_TRAITS
+
+template <WireFormatLite::FieldType field_type, typename Type>
+class MapTypeHandler {};  // empty primary; specialized per field type below
+
+template <typename Type>
+class MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type> {
+ public:
+  // Enum type cannot be used for MapTypeHandler::Read. Define a type which will
+  // replace Enum with int.
+  typedef typename MapWireFieldTypeTraits<WireFormatLite::TYPE_MESSAGE,
+                                          Type>::MapEntryAccessorType
+      MapEntryAccessorType;
+  // Internal stored type in MapEntryLite for given wire field type.
+  typedef typename MapWireFieldTypeTraits<WireFormatLite::TYPE_MESSAGE,
+                                          Type>::TypeOnMemory TypeOnMemory;
+  // Corresponding wire type for field type.
+  static constexpr WireFormatLite::WireType kWireType =
+      MapWireFieldTypeTraits<WireFormatLite::TYPE_MESSAGE, Type>::kWireType;
+  // Whether wire type is for message.
+  static constexpr bool kIsMessage =
+      MapWireFieldTypeTraits<WireFormatLite::TYPE_MESSAGE, Type>::kIsMessage;
+  // Whether wire type is for enum.
+  static constexpr bool kIsEnum =
+      MapWireFieldTypeTraits<WireFormatLite::TYPE_MESSAGE, Type>::kIsEnum;
+
+  // Functions used in parsing and serialization. ===================
+  static inline size_t ByteSize(const MapEntryAccessorType& value);  // size_t here, int in MAP_HANDLER
+  static inline int GetCachedSize(const MapEntryAccessorType& value);
+  static inline bool Read(io::CodedInputStream* input,
+                          MapEntryAccessorType* value);
+  static inline const char* Read(const char* ptr, ParseContext* ctx,
+                                 MapEntryAccessorType* value);
+
+  static inline uint8* Write(int field, const MapEntryAccessorType& value,
+                             uint8* ptr, io::EpsCopyOutputStream* stream);
+
+  // Functions to manipulate data on memory. ========================
+  static inline const Type& GetExternalReference(const Type* value);
+  static inline void DeleteNoArena(const Type* x);
+  static inline void Merge(const Type& from, Type** to, Arena* arena);
+  static inline void Clear(Type** value, Arena* arena);
+  static inline void ClearMaybeByDefaultEnum(Type** value, Arena* arena,
+                                             int default_enum_value);
+  static inline void Initialize(Type** x, Arena* arena);
+
+  static inline void InitializeMaybeByDefaultEnum(Type** x,
+                                                  int default_enum_value,
+                                                  Arena* arena);
+  static inline Type* EnsureMutable(Type** value, Arena* arena);
+  // SpaceUsedInMapEntry: Return bytes used by value in MapEntry, excluding
+  // those already counted in sizeof(MapField).
+  static inline size_t SpaceUsedInMapEntryLong(const Type* value);
+  // Return bytes used by value in Map.
+  static inline size_t SpaceUsedInMapLong(const Type& value);
+  // Assign default value to given instance.
+  static inline void AssignDefaultValue(Type** value);
+  // Return default instance if value is not initialized when calling const
+  // reference accessor.
+  static inline const Type& DefaultIfNotInitialized(const Type* value,
+                                                    const Type* default_value);
+  // Check if all required fields have values set.
+  static inline bool IsInitialized(Type* value);
+};
+
+#define MAP_HANDLER(FieldType)                                                \
+  template <typename Type>                                                    \
+  class MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type> {              \
+   public:                                                                    \
+    typedef typename MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType, \
+                                            Type>::MapEntryAccessorType       \
+        MapEntryAccessorType;                                                 \
+    typedef typename MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType, \
+                                            Type>::TypeOnMemory TypeOnMemory; \
+    static const WireFormatLite::WireType kWireType =                         \
+        MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType,              \
+                               Type>::kWireType;                              \
+    static const bool kIsMessage =                                            \
+        MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType,              \
+                               Type>::kIsMessage;                             \
+    static const bool kIsEnum =                                               \
+        MapWireFieldTypeTraits<WireFormatLite::TYPE_##FieldType,              \
+                               Type>::kIsEnum;                                \
+    static inline int ByteSize(const MapEntryAccessorType& value);            \
+    static inline int GetCachedSize(const MapEntryAccessorType& value);       \
+    static inline bool Read(io::CodedInputStream* input,                      \
+                            MapEntryAccessorType* value);                     \
+    static inline const char* Read(const char* begin, ParseContext* ctx,      \
+                                   MapEntryAccessorType* value);              \
+    static inline uint8* Write(int field, const MapEntryAccessorType& value,  \
+                               uint8* ptr, io::EpsCopyOutputStream* stream);  \
+    static inline const MapEntryAccessorType& GetExternalReference(           \
+        const TypeOnMemory& value);                                           \
+    static inline void DeleteNoArena(const TypeOnMemory& x);                  \
+    static inline void Merge(const MapEntryAccessorType& from,                \
+                             TypeOnMemory* to, Arena* arena);                 \
+    static inline void Clear(TypeOnMemory* value, Arena* arena);              \
+    static inline void ClearMaybeByDefaultEnum(TypeOnMemory* value,           \
+                                               Arena* arena,                  \
+                                               int default_enum);             \
+    static inline size_t SpaceUsedInMapEntryLong(const TypeOnMemory& value);  \
+    static inline size_t SpaceUsedInMapLong(const TypeOnMemory& value);       \
+    static inline size_t SpaceUsedInMapLong(ConstStringParam value);          \
+    static inline void AssignDefaultValue(TypeOnMemory* value);               \
+    static inline const MapEntryAccessorType& DefaultIfNotInitialized(        \
+        const TypeOnMemory& value, const TypeOnMemory& default_value);        \
+    static inline bool IsInitialized(const TypeOnMemory& value);              \
+    static void DeleteNoArena(TypeOnMemory& value); /* non-const overload */  \
+    static inline void Initialize(TypeOnMemory* value, Arena* arena);         \
+    static inline void InitializeMaybeByDefaultEnum(TypeOnMemory* value,      \
+                                                    int default_enum_value,   \
+                                                    Arena* arena);            \
+    static inline MapEntryAccessorType* EnsureMutable(TypeOnMemory* value,    \
+                                                      Arena* arena);          \
+  };
+MAP_HANDLER(STRING)  // one specialization per non-message field type follows
+MAP_HANDLER(BYTES)
+MAP_HANDLER(INT64)
+MAP_HANDLER(UINT64)
+MAP_HANDLER(INT32)
+MAP_HANDLER(UINT32)
+MAP_HANDLER(SINT64)
+MAP_HANDLER(SINT32)
+MAP_HANDLER(ENUM)
+MAP_HANDLER(DOUBLE)
+MAP_HANDLER(FLOAT)
+MAP_HANDLER(FIXED64)
+MAP_HANDLER(FIXED32)
+MAP_HANDLER(SFIXED64)
+MAP_HANDLER(SFIXED32)
+MAP_HANDLER(BOOL)
+#undef MAP_HANDLER  // TYPE_MESSAGE is not generated here; it is written out by hand above
+
+template <typename Type>
+inline size_t MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::ByteSize(
+    const MapEntryAccessorType& value) {
+  return WireFormatLite::MessageSizeNoVirtual(value);  // returned as size_t; the scalar handlers return int
+}
+
+#define GOOGLE_PROTOBUF_BYTE_SIZE(FieldType, DeclaredType)                     \
+  template <typename Type>                                                     \
+  inline int MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::ByteSize( \
+      const MapEntryAccessorType& value) {                                     \
+    return static_cast<int>(WireFormatLite::DeclaredType##Size(value));        \
+  }
+
+GOOGLE_PROTOBUF_BYTE_SIZE(STRING, String)  // -> WireFormatLite::StringSize(value)
+GOOGLE_PROTOBUF_BYTE_SIZE(BYTES, Bytes)
+GOOGLE_PROTOBUF_BYTE_SIZE(INT64, Int64)
+GOOGLE_PROTOBUF_BYTE_SIZE(UINT64, UInt64)
+GOOGLE_PROTOBUF_BYTE_SIZE(INT32, Int32)
+GOOGLE_PROTOBUF_BYTE_SIZE(UINT32, UInt32)
+GOOGLE_PROTOBUF_BYTE_SIZE(SINT64, SInt64)
+GOOGLE_PROTOBUF_BYTE_SIZE(SINT32, SInt32)
+GOOGLE_PROTOBUF_BYTE_SIZE(ENUM, Enum)
+
+#undef GOOGLE_PROTOBUF_BYTE_SIZE  // value-dependent sizes only; fixed-width types use FIXED_BYTE_SIZE
+
+#define FIXED_BYTE_SIZE(FieldType, DeclaredType)                               \
+  template <typename Type>                                                     \
+  inline int MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::ByteSize( \
+      const MapEntryAccessorType& /* value */) {                               \
+    return WireFormatLite::k##DeclaredType##Size;                              \
+  }
+
+FIXED_BYTE_SIZE(DOUBLE, Double)  // -> WireFormatLite::kDoubleSize; value is ignored
+FIXED_BYTE_SIZE(FLOAT, Float)
+FIXED_BYTE_SIZE(FIXED64, Fixed64)
+FIXED_BYTE_SIZE(FIXED32, Fixed32)
+FIXED_BYTE_SIZE(SFIXED64, SFixed64)
+FIXED_BYTE_SIZE(SFIXED32, SFixed32)
+FIXED_BYTE_SIZE(BOOL, Bool)  // BOOL uses the constant kBoolSize too
+
+#undef FIXED_BYTE_SIZE
+
+template <typename Type>
+inline int MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::GetCachedSize(
+    const MapEntryAccessorType& value) {
+  return static_cast<int>(WireFormatLite::LengthDelimitedSize(  // narrowed to int
+      static_cast<size_t>(value.GetCachedSize())));  // uses the message's cached size
+}
+
+#define GET_CACHED_SIZE(FieldType, DeclaredType)                         \
+  template <typename Type>                                               \
+  inline int                                                             \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::GetCachedSize( \
+      const MapEntryAccessorType& value) {                               \
+    return static_cast<int>(WireFormatLite::DeclaredType##Size(value));  \
+  }
+
+GET_CACHED_SIZE(STRING, String)  // expands to the same DeclaredType##Size(value) call that ByteSize uses
+GET_CACHED_SIZE(BYTES, Bytes)
+GET_CACHED_SIZE(INT64, Int64)
+GET_CACHED_SIZE(UINT64, UInt64)
+GET_CACHED_SIZE(INT32, Int32)
+GET_CACHED_SIZE(UINT32, UInt32)
+GET_CACHED_SIZE(SINT64, SInt64)
+GET_CACHED_SIZE(SINT32, SInt32)
+GET_CACHED_SIZE(ENUM, Enum)
+
+#undef GET_CACHED_SIZE
+
+#define GET_FIXED_CACHED_SIZE(FieldType, DeclaredType)                   \
+  template <typename Type>                                               \
+  inline int                                                             \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::GetCachedSize( \
+      const MapEntryAccessorType& /* value */) {                         \
+    return WireFormatLite::k##DeclaredType##Size;                        \
+  }
+
+GET_FIXED_CACHED_SIZE(DOUBLE, Double)  // -> WireFormatLite::kDoubleSize; value is ignored
+GET_FIXED_CACHED_SIZE(FLOAT, Float)
+GET_FIXED_CACHED_SIZE(FIXED64, Fixed64)
+GET_FIXED_CACHED_SIZE(FIXED32, Fixed32)
+GET_FIXED_CACHED_SIZE(SFIXED64, SFixed64)
+GET_FIXED_CACHED_SIZE(SFIXED32, SFixed32)
+GET_FIXED_CACHED_SIZE(BOOL, Bool)
+
+#undef GET_FIXED_CACHED_SIZE
+
+template <typename Type>
+inline uint8* MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Write(
+    int field, const MapEntryAccessorType& value, uint8* ptr,
+    io::EpsCopyOutputStream* stream) {
+  ptr = stream->EnsureSpace(ptr);  // EnsureSpace may hand back a different ptr
+  return WireFormatLite::InternalWriteMessage(field, value, ptr, stream);
+}
+
+#define WRITE_METHOD(FieldType, DeclaredType)                                  \
+  template <typename Type>                                                     \
+  inline uint8* MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Write( \
+      int field, const MapEntryAccessorType& value, uint8* ptr,                \
+      io::EpsCopyOutputStream* stream) {                                       \
+    ptr = stream->EnsureSpace(ptr);                                            \
+    return stream->Write##DeclaredType(field, value, ptr);                     \
+  }
+
+WRITE_METHOD(STRING, String)  // -> stream->WriteString(field, value, ptr)
+WRITE_METHOD(BYTES, Bytes)  // -> stream->WriteBytes(field, value, ptr)
+
+#undef WRITE_METHOD  // redefined immediately below for the remaining types
+#define WRITE_METHOD(FieldType, DeclaredType)                                  \
+  template <typename Type>                                                     \
+  inline uint8* MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Write( \
+      int field, const MapEntryAccessorType& value, uint8* ptr,                \
+      io::EpsCopyOutputStream* stream) {                                       \
+    ptr = stream->EnsureSpace(ptr);                                            \
+    return WireFormatLite::Write##DeclaredType##ToArray(field, value, ptr);    \
+  }
+
+WRITE_METHOD(INT64, Int64)  // -> WireFormatLite::WriteInt64ToArray(field, value, ptr)
+WRITE_METHOD(UINT64, UInt64)
+WRITE_METHOD(INT32, Int32)
+WRITE_METHOD(UINT32, UInt32)
+WRITE_METHOD(SINT64, SInt64)
+WRITE_METHOD(SINT32, SInt32)
+WRITE_METHOD(ENUM, Enum)
+WRITE_METHOD(DOUBLE, Double)
+WRITE_METHOD(FLOAT, Float)
+WRITE_METHOD(FIXED64, Fixed64)
+WRITE_METHOD(FIXED32, Fixed32)
+WRITE_METHOD(SFIXED64, SFixed64)
+WRITE_METHOD(SFIXED32, SFixed32)
+WRITE_METHOD(BOOL, Bool)
+
+#undef WRITE_METHOD  // stream is used only for EnsureSpace in this variant
+
+template <typename Type>
+inline bool MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Read(
+    io::CodedInputStream* input, MapEntryAccessorType* value) {
+  return WireFormatLite::ReadMessageNoVirtual(input, value);  // result forwarded as-is
+}
+
+template <typename Type>
+inline bool MapTypeHandler<WireFormatLite::TYPE_STRING, Type>::Read(
+    io::CodedInputStream* input, MapEntryAccessorType* value) {
+  return WireFormatLite::ReadString(input, value);  // result forwarded as-is
+}
+
+template <typename Type>
+inline bool MapTypeHandler<WireFormatLite::TYPE_BYTES, Type>::Read(
+    io::CodedInputStream* input, MapEntryAccessorType* value) {
+  return WireFormatLite::ReadBytes(input, value);  // differs from STRING only in the reader called
+}
+
+template <typename Type>
+const char* MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Read(
+    const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) {
+  return ctx->ParseMessage(value, ptr);  // fully delegated to ParseContext
+}
+
+template <typename Type>
+const char* MapTypeHandler<WireFormatLite::TYPE_STRING, Type>::Read(
+    const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) {
+  int size = ReadSize(&ptr);  // reads the length prefix and updates ptr
+  GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);  // NOTE(review): presumably bails out on a null ptr -- confirm macro
+  return ctx->ReadString(ptr, size, value);
+}
+
+template <typename Type>
+const char* MapTypeHandler<WireFormatLite::TYPE_BYTES, Type>::Read(
+    const char* ptr, ParseContext* ctx, MapEntryAccessorType* value) {
+  int size = ReadSize(&ptr);  // same body as the STRING overload above
+  GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+  return ctx->ReadString(ptr, size, value);
+}
+
+inline const char* ReadINT64(const char* ptr, int64* value) {
+  return VarintParse(ptr, reinterpret_cast<uint64*>(value));  // parsed into the same storage viewed as unsigned
+}
+inline const char* ReadUINT64(const char* ptr, uint64* value) {
+  return VarintParse(ptr, value);
+}
+inline const char* ReadINT32(const char* ptr, int32* value) {
+  return VarintParse(ptr, reinterpret_cast<uint32*>(value));  // parsed into the same storage viewed as unsigned
+}
+inline const char* ReadUINT32(const char* ptr, uint32* value) {
+  return VarintParse(ptr, value);
+}
+inline const char* ReadSINT64(const char* ptr, int64* value) {
+  *value = ReadVarintZigZag64(&ptr);  // helper updates ptr through &ptr
+  return ptr;
+}
+inline const char* ReadSINT32(const char* ptr, int32* value) {
+  *value = ReadVarintZigZag32(&ptr);  // helper updates ptr through &ptr
+  return ptr;
+}
+template <typename E>
+inline const char* ReadENUM(const char* ptr, E* value) {
+  *value = static_cast<E>(ReadVarint32(&ptr));  // 32-bit read, then cast to E; no validity check here
+  return ptr;
+}
+inline const char* ReadBOOL(const char* ptr, bool* value) {  // varint -> bool
+  *value = static_cast<bool>(ReadVarint64(&ptr));  // keep all 64 bits: a 32-bit read would turn a varint with only high bits set into false
+  return ptr;  // ptr as updated by ReadVarint64
+}
+
+template <typename F>
+inline const char* ReadUnaligned(const char* ptr, F* value) {
+  *value = UnalignedLoad<F>(ptr);  // no bounds check is done in this function
+  return ptr + sizeof(F);  // advance past the bytes just loaded
+}
+inline const char* ReadFLOAT(const char* ptr, float* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as float
+}
+inline const char* ReadDOUBLE(const char* ptr, double* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as double
+}
+inline const char* ReadFIXED64(const char* ptr, uint64* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as uint64
+}
+inline const char* ReadFIXED32(const char* ptr, uint32* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as uint32
+}
+inline const char* ReadSFIXED64(const char* ptr, int64* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as int64
+}
+inline const char* ReadSFIXED32(const char* ptr, int32* value) {
+  return ReadUnaligned(ptr, value);  // F deduced as int32
+}
+
+#define READ_METHOD(FieldType)                                              \
+  template <typename Type>                                                  \
+  inline bool MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Read( \
+      io::CodedInputStream* input, MapEntryAccessorType* value) {           \
+    return WireFormatLite::ReadPrimitive<TypeOnMemory,                      \
+                                         WireFormatLite::TYPE_##FieldType>( \
+        input, value);                                                      \
+  }                                                                         \
+  template <typename Type>                                                  \
+  const char* MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Read( \
+      const char* begin, ParseContext* ctx, MapEntryAccessorType* value) {  \
+    (void)ctx;                                                              \
+    return Read##FieldType(begin, value);                                   \
+  }
+
+READ_METHOD(INT64)  // ParseContext overload expands to ReadINT64(begin, value)
+READ_METHOD(UINT64)
+READ_METHOD(INT32)
+READ_METHOD(UINT32)
+READ_METHOD(SINT64)
+READ_METHOD(SINT32)
+READ_METHOD(ENUM)  // ParseContext overload calls the ReadENUM template
+READ_METHOD(DOUBLE)
+READ_METHOD(FLOAT)
+READ_METHOD(FIXED64)
+READ_METHOD(FIXED32)
+READ_METHOD(SFIXED64)
+READ_METHOD(SFIXED32)
+READ_METHOD(BOOL)
+
+#undef READ_METHOD  // ctx is unused by the ParseContext overloads generated above
+
+// Definition for message handler
+
+template <typename Type>
+inline const Type&
+MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::GetExternalReference(
+    const Type* value) {
+  return *value;  // Dereferences unconditionally; value must not be null.
+}
+
+template <typename Type>
+inline size_t MapTypeHandler<WireFormatLite::TYPE_MESSAGE,
+                             Type>::SpaceUsedInMapEntryLong(const Type* value) {
+  return value->SpaceUsedLong();  // Delegates to the message; no null check.
+}
+
+template <typename Type>
+size_t MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::SpaceUsedInMapLong(
+    const Type& value) {
+  return value.SpaceUsedLong();  // Delegates to the message's own estimate.
+}
+
+template <typename Type>
+inline void MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Clear(
+    Type** value, Arena* /* arena */) {
+  if (Type* msg = *value) msg->Clear();  // Unallocated slots stay null.
+}
+template <typename Type>
+inline void
+MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::ClearMaybeByDefaultEnum(
+    Type** value, Arena* /* arena */, int /* default_enum_value */) {
+  if (Type* msg = *value) msg->Clear();  // Enum default unused for messages.
+}
+template <typename Type>
+inline void MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Merge(
+    const Type& from, Type** to, Arena* /* arena */) {
+  (*to)->MergeFrom(from);  // No null check: *to must already be allocated.
+}
+
+template <typename Type>
+void MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::DeleteNoArena(
+    const Type* ptr) {
+  delete ptr;  // Plain delete; a null ptr is a no-op.
+}
+
+template <typename Type>  // Stores Type's default instance pointer in *value.
+inline void MapTypeHandler<WireFormatLite::TYPE_MESSAGE,
+                           Type>::AssignDefaultValue(Type** value) {
+  *value = const_cast<Type*>(Type::internal_default_instance());
+}  // NOTE(review): const is cast away; presumably callers must not mutate it.
+
+template <typename Type>
+inline void MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::Initialize(
+    Type** x, Arena* /* arena */) {
+  *x = NULL;  // Starts with no message allocated; the arena is not used.
+}
+
+template <typename Type>
+inline void MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::
+    InitializeMaybeByDefaultEnum(Type** x, int /* default_enum_value */,
+                                 Arena* /* arena */) {
+  *x = NULL;  // Enum default and arena are both ignored for messages.
+}
+
+template <typename Type>
+inline Type* MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::EnsureMutable(
+    Type** value, Arena* arena) {
+  if (*value != NULL) return *value;  // Already allocated; reuse it.
+  typedef MapArenaMessageCreator<
+      Type, Arena::is_arena_constructable<Type>::type::value>
+      Creator;
+  *value = Creator::CreateMessage(arena);  // Created lazily on first use.
+  return *value;
+}
+
+template <typename Type>
+inline const Type&
+MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::DefaultIfNotInitialized(
+    const Type* value, const Type* default_value) {
+  return *(value == NULL ? default_value : value);  // Fall back when unset.
+}
+
+template <typename Type>
+inline bool MapTypeHandler<WireFormatLite::TYPE_MESSAGE, Type>::IsInitialized(
+    Type* value) {
+  return value != NULL && value->IsInitialized();  // Null counts as false.
+}
+
+// Definition for string/bytes handler
+
+#define STRING_OR_BYTES_HANDLER_FUNCTIONS(FieldType)  /* string/bytes ops */  \
+  template <typename Type>                                                    \
+  inline const typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,      \
+                                       Type>::MapEntryAccessorType&           \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType,                            \
+                 Type>::GetExternalReference(const TypeOnMemory& value) {     \
+    return value.Get();  /* expose stored value */                            \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline size_t                                                               \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType,                            \
+                 Type>::SpaceUsedInMapEntryLong(const TypeOnMemory& value) {  \
+    return sizeof(value);  /* holder object only */                           \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline size_t                                                               \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::SpaceUsedInMapLong( \
+      const TypeOnMemory& value) {                                            \
+    return sizeof(value);  /* holder object only */                           \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline size_t                                                               \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::SpaceUsedInMapLong( \
+      ConstStringParam /* value */) {                                         \
+    return sizeof(std::string);  /* ignores the contents */                   \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Clear(  \
+      TypeOnMemory* value, Arena* arena) {                                    \
+    value->ClearToEmpty(&internal::GetEmptyStringAlreadyInited(), arena);     \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::        \
+      ClearMaybeByDefaultEnum(TypeOnMemory* value, Arena* arena,              \
+                              int /* default_enum */) {                       \
+    Clear(value, arena);  /* enum default is irrelevant */                    \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Merge(  \
+      const MapEntryAccessorType& from, TypeOnMemory* to, Arena* arena) {     \
+    to->Set(&internal::GetEmptyStringAlreadyInited(), from, arena);           \
+  }                                                                           \
+  template <typename Type>                                                    \
+  void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::DeleteNoArena( \
+      TypeOnMemory& value) {                                                  \
+    value.DestroyNoArena(&internal::GetEmptyStringAlreadyInited());           \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::AssignDefaultValue( \
+      TypeOnMemory* /* value */) {}  /* nothing to assign */                  \
+  template <typename Type>                                                    \
+  inline void                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Initialize(         \
+      TypeOnMemory* value, Arena* /* arena */) {                              \
+    value->UnsafeSetDefault(&internal::GetEmptyStringAlreadyInited());        \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::        \
+      InitializeMaybeByDefaultEnum(                                           \
+          TypeOnMemory* value, int /* default_enum_value */, Arena* arena) {  \
+    Initialize(value, arena);  /* same as Initialize() */                     \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,            \
+                                 Type>::MapEntryAccessorType*                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::EnsureMutable(      \
+      TypeOnMemory* value, Arena* arena) {                                    \
+    return value->Mutable(&internal::GetEmptyStringAlreadyInited(), arena);   \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline const typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,      \
+                                       Type>::MapEntryAccessorType&           \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::                    \
+      DefaultIfNotInitialized(const TypeOnMemory& value,                      \
+                              const TypeOnMemory& /* default_value */) {      \
+    return value.Get();  /* default_value is ignored */                       \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline bool                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::IsInitialized(      \
+      const TypeOnMemory& /* value */) {                                      \
+    return true;  /* unconditionally */                                       \
+  }
+STRING_OR_BYTES_HANDLER_FUNCTIONS(STRING)
+STRING_OR_BYTES_HANDLER_FUNCTIONS(BYTES)
+#undef STRING_OR_BYTES_HANDLER_FUNCTIONS
+
+#define PRIMITIVE_HANDLER_FUNCTIONS(FieldType)  /* scalar/enum ops */         \
+  template <typename Type>                                                    \
+  inline const typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,      \
+                                       Type>::MapEntryAccessorType&           \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType,                            \
+                 Type>::GetExternalReference(const TypeOnMemory& value) {     \
+    return value;  /* returned as-is */                                       \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline size_t MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::      \
+      SpaceUsedInMapEntryLong(const TypeOnMemory& /* value */) {              \
+    return 0;  /* no extra space reported */                                  \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline size_t                                                               \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::SpaceUsedInMapLong( \
+      const TypeOnMemory& /* value */) {                                      \
+    return sizeof(Type);  /* static size of Type */                           \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Clear(  \
+      TypeOnMemory* value, Arena* /* arena */) {                              \
+    *value = 0;  /* reset to zero */                                          \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::        \
+      ClearMaybeByDefaultEnum(TypeOnMemory* value, Arena* /* arena */,        \
+                              int default_enum_value) {                       \
+    *value = static_cast<TypeOnMemory>(default_enum_value);  /* to default */ \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Merge(  \
+      const MapEntryAccessorType& from, TypeOnMemory* to,                     \
+      Arena* /* arena */) {                                                   \
+    *to = from;  /* overwrite, not combine */                                 \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType,                \
+                             Type>::DeleteNoArena(TypeOnMemory& /* x */) {}   \
+  template <typename Type>                                                    \
+  inline void                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::AssignDefaultValue( \
+      TypeOnMemory* /* value */) {}  /* nothing to assign */                  \
+  template <typename Type>                                                    \
+  inline void                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::Initialize(         \
+      TypeOnMemory* value, Arena* /* arena */) {                              \
+    *value = 0;  /* start at zero */                                          \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline void MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::        \
+      InitializeMaybeByDefaultEnum(                                           \
+          TypeOnMemory* value, int default_enum_value, Arena* /* arena */) {  \
+    *value = static_cast<TypeOnMemory>(default_enum_value);  /* to default */ \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,            \
+                                 Type>::MapEntryAccessorType*                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::EnsureMutable(      \
+      TypeOnMemory* value, Arena* /* arena */) {                              \
+    return value;  /* already mutable in place */                             \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline const typename MapTypeHandler<WireFormatLite::TYPE_##FieldType,      \
+                                       Type>::MapEntryAccessorType&           \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::                    \
+      DefaultIfNotInitialized(const TypeOnMemory& value,                      \
+                              const TypeOnMemory& /* default_value */) {      \
+    return value;  /* default_value is ignored */                             \
+  }                                                                           \
+  template <typename Type>                                                    \
+  inline bool                                                                 \
+  MapTypeHandler<WireFormatLite::TYPE_##FieldType, Type>::IsInitialized(      \
+      const TypeOnMemory& /* value */) {                                      \
+    return true;  /* unconditionally */                                       \
+  }
+PRIMITIVE_HANDLER_FUNCTIONS(INT64)
+PRIMITIVE_HANDLER_FUNCTIONS(UINT64)
+PRIMITIVE_HANDLER_FUNCTIONS(INT32)
+PRIMITIVE_HANDLER_FUNCTIONS(UINT32)
+PRIMITIVE_HANDLER_FUNCTIONS(SINT64)
+PRIMITIVE_HANDLER_FUNCTIONS(SINT32)
+PRIMITIVE_HANDLER_FUNCTIONS(ENUM)
+PRIMITIVE_HANDLER_FUNCTIONS(DOUBLE)
+PRIMITIVE_HANDLER_FUNCTIONS(FLOAT)
+PRIMITIVE_HANDLER_FUNCTIONS(FIXED64)
+PRIMITIVE_HANDLER_FUNCTIONS(FIXED32)
+PRIMITIVE_HANDLER_FUNCTIONS(SFIXED64)
+PRIMITIVE_HANDLER_FUNCTIONS(SFIXED32)
+PRIMITIVE_HANDLER_FUNCTIONS(BOOL)
+#undef PRIMITIVE_HANDLER_FUNCTIONS
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_TYPE_HANDLER_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h
new file mode 100644
index 0000000000000000000000000000000000000000..89761c62ed239aaff1103627725864c56afea253
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/message.h
@@ -0,0 +1,1344 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// Defines Message, the abstract interface implemented by non-lite
+// protocol message objects.  Although it's possible to implement this
+// interface manually, most users will use the protocol compiler to
+// generate implementations.
+//
+// Example usage:
+//
+// Say you have a message defined as:
+//
+//   message Foo {
+//     optional string text = 1;
+//     repeated int32 numbers = 2;
+//   }
+//
+// Then, if you used the protocol compiler to generate a class from the above
+// definition, you could use it like so:
+//
+//   std::string data;  // Will store a serialized version of the message.
+//
+//   {
+//     // Create a message and serialize it.
+//     Foo foo;
+//     foo.set_text("Hello World!");
+//     foo.add_numbers(1);
+//     foo.add_numbers(5);
+//     foo.add_numbers(42);
+//
+//     foo.SerializeToString(&data);
+//   }
+//
+//   {
+//     // Parse the serialized message and check that it contains the
+//     // correct data.
+//     Foo foo;
+//     foo.ParseFromString(data);
+//
+//     assert(foo.text() == "Hello World!");
+//     assert(foo.numbers_size() == 3);
+//     assert(foo.numbers(0) == 1);
+//     assert(foo.numbers(1) == 5);
+//     assert(foo.numbers(2) == 42);
+//   }
+//
+//   {
+//     // Same as the last block, but do it dynamically via the Message
+//     // reflection interface.
+//     Message* foo = new Foo;
+//     const Descriptor* descriptor = foo->GetDescriptor();
+//
+//     // Get the descriptors for the fields we're interested in and verify
+//     // their types.
+//     const FieldDescriptor* text_field = descriptor->FindFieldByName("text");
+//     assert(text_field != nullptr);
+//     assert(text_field->type() == FieldDescriptor::TYPE_STRING);
+//     assert(text_field->label() == FieldDescriptor::LABEL_OPTIONAL);
+//     const FieldDescriptor* numbers_field = descriptor->
+//                                            FindFieldByName("numbers");
+//     assert(numbers_field != nullptr);
+//     assert(numbers_field->type() == FieldDescriptor::TYPE_INT32);
+//     assert(numbers_field->label() == FieldDescriptor::LABEL_REPEATED);
+//
+//     // Parse the message.
+//     foo->ParseFromString(data);
+//
+//     // Use the reflection interface to examine the contents.
+//     const Reflection* reflection = foo->GetReflection();
+//     assert(reflection->GetString(*foo, text_field) == "Hello World!");
+//     assert(reflection->FieldSize(*foo, numbers_field) == 3);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 0) == 1);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 1) == 5);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 2) == 42);
+//
+//     delete foo;
+//   }
+
+#ifndef GOOGLE_PROTOBUF_MESSAGE_H__
+#define GOOGLE_PROTOBUF_MESSAGE_H__
+
+#include <iosfwd>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <google/protobuf/stubs/casts.h>
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/port.h>
+
+
+#define GOOGLE_PROTOBUF_HAS_ONEOF
+#define GOOGLE_PROTOBUF_HAS_ARENAS
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+
+// Defined in this file.
+class Message;
+class Reflection;
+class MessageFactory;
+
+// Defined in other files.
+class AssignDescriptorsHelper;
+class DynamicMessageFactory;
+class MapKey;
+class MapValueRef;
+class MapIterator;
+class MapReflectionTester;
+
+namespace internal {
+struct DescriptorTable;
+class MapFieldBase;
+}  // namespace internal
+class UnknownFieldSet;  // unknown_field_set.h
+namespace io {
+class ZeroCopyInputStream;   // zero_copy_stream.h
+class ZeroCopyOutputStream;  // zero_copy_stream.h
+class CodedInputStream;      // coded_stream.h
+class CodedOutputStream;     // coded_stream.h
+}  // namespace io
+namespace python {
+class MapReflectionFriend;  // scalar_map_container.h
+}  // namespace python
+namespace expr {
+class CelMapReflectionFriend;  // field_backed_map_impl.cc
+}  // namespace expr
+
+namespace internal {
+class MapFieldPrinterHelper;  // text_format.cc
+}  // namespace internal
+
+
+namespace internal {
+class ReflectionAccessor;      // message.cc
+class ReflectionOps;           // reflection_ops.h
+class MapKeySorter;            // wire_format.cc
+class WireFormat;              // wire_format.h
+class MapFieldReflectionTest;  // map_test.cc
+}  // namespace internal
+
+template <typename T>
+class RepeatedField;  // repeated_field.h
+
+template <typename T>
+class RepeatedPtrField;  // repeated_field.h
+
+// A container to hold message metadata.
+struct Metadata {
+  const Descriptor* descriptor;  // Returned by Message::GetDescriptor().
+  const Reflection* reflection;  // Returned by Message::GetReflection().
+};
+
+namespace internal {
+template <class To>
+inline To* GetPointerAtOffset(Message* message, uint32 offset) {
+  char* const base = reinterpret_cast<char*>(message);  // byte-wise view
+  return reinterpret_cast<To*>(base + offset);
+}
+
+template <class To>
+const To* GetConstPointerAtOffset(const Message* message, uint32 offset) {
+  const char* const base = reinterpret_cast<const char*>(message);
+  return reinterpret_cast<const To*>(base + offset);
+}
+
+template <class To>
+const To& GetConstRefAtOffset(const Message& message, uint32 offset) {
+  return *GetConstPointerAtOffset<To>(&message, offset);  // same bytes, as ref
+}
+bool CreateUnknownEnumValues(const FieldDescriptor* field);
+}  // namespace internal
+
+// Abstract interface for protocol messages.
+//
+// See also MessageLite, which contains most every-day operations.  Message
+// adds descriptors and reflection on top of that.
+//
+// The methods of this class that are virtual but not pure-virtual have
+// default implementations based on reflection.  Message classes which are
+// optimized for speed will want to override these with faster implementations,
+// but classes optimized for code size may be happy with keeping them.  See
+// the optimize_for option in descriptor.proto.
+//
+// Users must not derive from this class. Only the protocol compiler and
+// the internal library are allowed to create subclasses.
+class PROTOBUF_EXPORT Message : public MessageLite {
+ public:
+  inline Message() {}  // Empty body; only base-class construction runs.
+
+  // Basic Operations ------------------------------------------------
+
+  // Construct a new instance of the same type.  Ownership is passed to the
+  // caller.  (This is also defined in MessageLite, but is defined again here
+  // for return-type covariance.)
+  Message* New() const override = 0;
+
+  // Construct a new instance on the arena. Ownership is passed to the caller
+  // if arena is a nullptr. Default implementation allows for API compatibility
+  // during the Arena transition.
+  Message* New(Arena* arena) const override {
+    Message* message = New();  // Delegates to the pure-virtual New().
+    if (arena != nullptr) {
+      arena->Own(message);  // Hand the new object over to the arena.
+    }
+    return message;
+  }
+
+  // Make this message into a copy of the given message.  The given message
+  // must have the same descriptor, but need not necessarily be the same class.
+  // By default this is just implemented as "Clear(); MergeFrom(from);".
+  virtual void CopyFrom(const Message& from);
+
+  // Merge the fields from the given message into this message.  Singular
+  // fields will be overwritten, if specified in from, except for embedded
+  // messages which will be merged.  Repeated fields will be concatenated.
+  // The given message must be of the same type as this message (i.e. the
+  // exact same class).
+  virtual void MergeFrom(const Message& from);
+
+  // Verifies that IsInitialized() returns true.  GOOGLE_CHECK-fails otherwise, with
+  // a nice error message.
+  void CheckInitialized() const;
+
+  // Slowly build a list of all required fields that are not set.
+  // This is much, much slower than IsInitialized() as it is implemented
+  // purely via reflection.  Generally, you should not call this unless you
+  // have already determined that an error exists by calling IsInitialized().
+  void FindInitializationErrors(std::vector<std::string>* errors) const;
+
+  // Like FindInitializationErrors, but joins all the strings, delimited by
+  // commas, and returns them.
+  std::string InitializationErrorString() const override;
+
+  // Clears all unknown fields from this message and all embedded messages.
+  // Normally, if unknown tag numbers are encountered when parsing a message,
+  // the tag and value are stored in the message's UnknownFieldSet and
+  // then written back out when the message is serialized.  This allows servers
+  // which simply route messages to other servers to pass through messages
+  // that have new field definitions which they don't yet know about.  However,
+  // this behavior can have security implications.  To avoid it, call this
+  // method after parsing.
+  //
+  // See Reflection::GetUnknownFields() for more on unknown fields.
+  virtual void DiscardUnknownFields();
+
+  // Computes (an estimate of) the total number of bytes currently used for
+  // storing the message in memory.  The default implementation calls the
+  // Reflection object's SpaceUsed() method.
+  //
+  // SpaceUsedLong() is noticeably slower than ByteSize(), as it is
+  // implemented using reflection (rather than the generated code
+  // implementation for ByteSize()). Like ByteSize(), its CPU time is linear
+  // in the number of fields defined for the proto.
+  virtual size_t SpaceUsedLong() const;
+
+  PROTOBUF_DEPRECATED_MSG("Please use SpaceUsedLong() instead")
+  int SpaceUsed() const { return internal::ToIntSize(SpaceUsedLong()); }
+
+  // Debugging & Testing ---------------------------------------------
+
+  // Generates a human readable form of this message, useful for debugging
+  // and other purposes.
+  std::string DebugString() const;
+  // Like DebugString(), but with less whitespace.
+  std::string ShortDebugString() const;
+  // Like DebugString(), but do not escape UTF-8 byte sequences.
+  std::string Utf8DebugString() const;
+  // Convenience function useful in GDB.  Prints DebugString() to stdout.
+  void PrintDebugString() const;
+
+  // Reflection-based methods ----------------------------------------
+  // These methods are pure-virtual in MessageLite, but Message provides
+  // reflection-based default implementations.
+
+  std::string GetTypeName() const override;
+  void Clear() override;
+
+  // Returns whether all required fields have been set. Note that required
+  // fields no longer exist starting in proto3.
+  bool IsInitialized() const override;
+
+  void CheckTypeAndMergeFrom(const MessageLite& other) override;
+  // Reflective parser
+  const char* _InternalParse(const char* ptr,
+                             internal::ParseContext* ctx) override;
+  size_t ByteSizeLong() const override;
+  uint8* _InternalSerialize(uint8* target,
+                            io::EpsCopyOutputStream* stream) const override;
+
+ private:
+  // This is called only by the default implementation of ByteSize(), to
+  // update the cached size.  If you override ByteSize(), you do not need
+  // to override this.  If you do not override ByteSize(), you MUST override
+  // this; the default implementation will crash.
+  //
+  // The method is private because subclasses should never call it; only
+  // override it.  Yes, C++ lets you do that.  Crazy, huh?
+  virtual void SetCachedSize(int size) const;
+
+ public:
+  // Introspection ---------------------------------------------------
+
+
+  // Get a non-owning pointer to a Descriptor for this message's type.  This
+  // describes what fields the message contains, the types of those fields, etc.
+  // This object remains property of the Message.
+  const Descriptor* GetDescriptor() const { return GetMetadata().descriptor; }
+
+  // Get a non-owning pointer to the Reflection interface for this Message,
+  // which can be used to read and modify the fields of the Message dynamically
+  // (in other words, without knowing the message type at compile time).  This
+  // object remains property of the Message.
+  const Reflection* GetReflection() const { return GetMetadata().reflection; }
+
+ protected:
+  // Get a struct containing the metadata for the Message, which is used in turn
+  // to implement GetDescriptor() and GetReflection() above.
+  virtual Metadata GetMetadata() const = 0;  // Must be supplied by subclasses.
+
+  inline explicit Message(Arena* arena) : MessageLite(arena) {}
+
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Message);  // presumably forbids copying
+};
+
+namespace internal {
+// Forward-declare interfaces used to implement RepeatedFieldRef.
+// These are protobuf internals that users shouldn't care about.
+class RepeatedFieldAccessor;
+}  // namespace internal
+
+// Forward-declare RepeatedFieldRef templates. The second type parameter is
+// used for SFINAE tricks. Users should ignore it.
+template <typename T, typename Enable = void>
+class RepeatedFieldRef;
+
+template <typename T, typename Enable = void>
+class MutableRepeatedFieldRef;
+
+// This interface contains methods that can be used to dynamically access
+// and modify the fields of a protocol message.  Their semantics are
+// similar to the accessors the protocol compiler generates.
+//
+// To get the Reflection for a given Message, call Message::GetReflection().
+//
+// This interface is separate from Message only for efficiency reasons;
+// the vast majority of implementations of Message will share the same
+// implementation of Reflection (GeneratedMessageReflection,
+// defined in generated_message.h), and all Messages of a particular class
+// should share the same Reflection object (though you should not rely on
+// the latter fact).
+//
+// There are several ways that these methods can be used incorrectly.  For
+// example, any of the following conditions will lead to undefined
+// results (probably assertion failures):
+// - The FieldDescriptor is not a field of this message type.
+// - The method called is not appropriate for the field's type.  For
+//   each field type in FieldDescriptor::TYPE_*, there is only one
+//   Get*() method, one Set*() method, and one Add*() method that is
+//   valid for that type.  It should be obvious which (except maybe
+//   for TYPE_BYTES, which are represented using strings in C++).
+// - A Get*() or Set*() method for singular fields is called on a repeated
+//   field.
+// - GetRepeated*(), SetRepeated*(), or Add*() is called on a non-repeated
+//   field.
+// - The Message object passed to any method is not of the right type for
+//   this Reflection object (i.e. message.GetReflection() != reflection).
+//
+// You might wonder why there is not any abstract representation for a field
+// of arbitrary type.  E.g., why isn't there just a "GetField()" method that
+// returns "const Field&", where "Field" is some class with accessors like
+// "GetInt32Value()".  The problem is that someone would have to deal with
+// allocating these Field objects.  For generated message classes, having to
+// allocate space for an additional object to wrap every field would at least
+// double the message's memory footprint, probably worse.  Allocating the
+// objects on-demand, on the other hand, would be expensive and prone to
+// memory leaks.  So, instead we ended up with this flat interface.
+class PROTOBUF_EXPORT Reflection final {
+ public:
+  // Get the UnknownFieldSet for the message.  This contains fields which
+  // were seen when the Message was parsed but were not recognized according
+  // to the Message's definition.
+  const UnknownFieldSet& GetUnknownFields(const Message& message) const;
+  // Get a mutable pointer to the UnknownFieldSet for the message.  This
+  // contains fields which were seen when the Message was parsed but were not
+  // recognized according to the Message's definition.
+  UnknownFieldSet* MutableUnknownFields(Message* message) const;
+
+  // Estimate the amount of memory used by the message object.
+  size_t SpaceUsedLong(const Message& message) const;
+  // Deprecated: SpaceUsedLong() converted to int by internal::ToIntSize().
+  PROTOBUF_DEPRECATED_MSG("Please use SpaceUsedLong() instead")
+  int SpaceUsed(const Message& message) const {
+    return internal::ToIntSize(SpaceUsedLong(message));
+  }
+
+  // Check if the given non-repeated field is set.
+  bool HasField(const Message& message, const FieldDescriptor* field) const;
+
+  // Get the number of elements of a repeated field.
+  int FieldSize(const Message& message, const FieldDescriptor* field) const;
+
+  // Clear the value of a field, so that HasField() returns false or
+  // FieldSize() returns zero.
+  void ClearField(Message* message, const FieldDescriptor* field) const;
+
+  // Check if the oneof is set. Returns true if any field in oneof
+  // is set, false otherwise.
+  bool HasOneof(const Message& message,
+                const OneofDescriptor* oneof_descriptor) const;
+
+  void ClearOneof(Message* message,
+                  const OneofDescriptor* oneof_descriptor) const;
+
+  // Returns the field descriptor if the oneof is set. nullptr otherwise.
+  const FieldDescriptor* GetOneofFieldDescriptor(
+      const Message& message, const OneofDescriptor* oneof_descriptor) const;
+
+  // Removes the last element of a repeated field.
+  // We don't provide a way to remove any element other than the last
+  // because it invites inefficient use, such as O(n^2) filtering loops
+  // that should have been O(n).  If you want to remove an element other
+  // than the last, the best way to do it is to re-arrange the elements
+  // (using Swap()) so that the one you want removed is at the end, then
+  // call RemoveLast().
+  void RemoveLast(Message* message, const FieldDescriptor* field) const;
+  // Removes the last element of a repeated message field, and returns the
+  // pointer to the caller.  Caller takes ownership of the returned pointer.
+  Message* ReleaseLast(Message* message, const FieldDescriptor* field) const;
+
+  // Swap the complete contents of two messages.
+  void Swap(Message* message1, Message* message2) const;
+
+  // Swap fields listed in fields vector of two messages.
+  void SwapFields(Message* message1, Message* message2,
+                  const std::vector<const FieldDescriptor*>& fields) const;
+
+  // Swap two elements of a repeated field.
+  void SwapElements(Message* message, const FieldDescriptor* field, int index1,
+                    int index2) const;
+
+  // List all fields of the message which are currently set, except for unknown
+  // fields, but including extension known to the parser (i.e. compiled in).
+  // Singular fields will only be listed if HasField(field) would return true
+  // and repeated fields will only be listed if FieldSize(field) would return
+  // non-zero.  Fields (both normal fields and extension fields) will be listed
+  // ordered by field number.
+  // Use Reflection::GetUnknownFields() or message.unknown_fields() to also get
+  // access to fields/extensions unknown to the parser.
+  void ListFields(const Message& message,
+                  std::vector<const FieldDescriptor*>* output) const;
+
+  // Singular field getters ------------------------------------------
+  // These get the value of a non-repeated field.  They return the default
+  // value for fields that aren't set.
+
+  int32 GetInt32(const Message& message, const FieldDescriptor* field) const;
+  int64 GetInt64(const Message& message, const FieldDescriptor* field) const;
+  uint32 GetUInt32(const Message& message, const FieldDescriptor* field) const;
+  uint64 GetUInt64(const Message& message, const FieldDescriptor* field) const;
+  float GetFloat(const Message& message, const FieldDescriptor* field) const;
+  double GetDouble(const Message& message, const FieldDescriptor* field) const;
+  bool GetBool(const Message& message, const FieldDescriptor* field) const;
+  std::string GetString(const Message& message,
+                        const FieldDescriptor* field) const;
+  const EnumValueDescriptor* GetEnum(const Message& message,
+                                     const FieldDescriptor* field) const;
+
+  // GetEnumValue() returns an enum field's value as an integer rather than
+  // an EnumValueDescriptor*. If the integer value does not correspond to a
+  // known value descriptor, a new value descriptor is created. (Such a value
+  // will only be present when the new unknown-enum-value semantics are enabled
+  // for a message.)
+  int GetEnumValue(const Message& message, const FieldDescriptor* field) const;
+
+  // See MutableMessage() for the meaning of the "factory" parameter.
+  const Message& GetMessage(const Message& message,
+                            const FieldDescriptor* field,
+                            MessageFactory* factory = nullptr) const;
+
+  // Get a string value without copying, if possible.
+  //
+  // GetString() necessarily returns a copy of the string.  This can be
+  // inefficient when the std::string is already stored in a std::string object
+  // in the underlying message.  GetStringReference() will return a reference to
+  // the underlying std::string in this case.  Otherwise, it will copy the
+  // string into *scratch and return that.
+  //
+  // Note:  It is perfectly reasonable and useful to write code like:
+  //     str = reflection->GetStringReference(message, field, &str);
+  //   This line would ensure that only one copy of the string is made
+  //   regardless of the field's underlying representation.  When initializing
+  //   a newly-constructed string, though, it's just as fast and more
+  //   readable to use code like:
+  //     std::string str = reflection->GetString(message, field);
+  const std::string& GetStringReference(const Message& message,
+                                        const FieldDescriptor* field,
+                                        std::string* scratch) const;
+
+
+  // Singular field mutators -----------------------------------------
+  // These mutate the value of a non-repeated field.
+
+  void SetInt32(Message* message, const FieldDescriptor* field,
+                int32 value) const;
+  void SetInt64(Message* message, const FieldDescriptor* field,
+                int64 value) const;
+  void SetUInt32(Message* message, const FieldDescriptor* field,
+                 uint32 value) const;
+  void SetUInt64(Message* message, const FieldDescriptor* field,
+                 uint64 value) const;
+  void SetFloat(Message* message, const FieldDescriptor* field,
+                float value) const;
+  void SetDouble(Message* message, const FieldDescriptor* field,
+                 double value) const;
+  void SetBool(Message* message, const FieldDescriptor* field,
+               bool value) const;
+  void SetString(Message* message, const FieldDescriptor* field,
+                 std::string value) const;
+  void SetEnum(Message* message, const FieldDescriptor* field,
+               const EnumValueDescriptor* value) const;
+  // Set an enum field's value with an integer rather than EnumValueDescriptor.
+  // For proto3 this is just setting the enum field to the value specified, for
+  // proto2 it's more complicated. If value is a known enum value the field is
+  // set as usual. If the value is unknown then it is added to the unknown field
+  // set. Note this matches the behavior of parsing unknown enum values.
+  // If multiple calls with unknown values happen then they are all added to the
+  // unknown field set in order of the calls.
+  void SetEnumValue(Message* message, const FieldDescriptor* field,
+                    int value) const;
+
+  // Get a mutable pointer to a field with a message type.  If a MessageFactory
+  // is provided, it will be used to construct instances of the sub-message;
+  // otherwise, the default factory is used.  If the field is an extension that
+  // does not live in the same pool as the containing message's descriptor (e.g.
+  // it lives in an overlay pool), then a MessageFactory must be provided.
+  // If you have no idea what that meant, then you probably don't need to worry
+  // about it (don't provide a MessageFactory).  WARNING:  If the
+  // FieldDescriptor is for a compiled-in extension, then
+  // factory->GetPrototype(field->message_type()) MUST return an instance of
+  // the compiled-in class for this type, NOT DynamicMessage.
+  Message* MutableMessage(Message* message, const FieldDescriptor* field,
+                          MessageFactory* factory = nullptr) const;
+  // Replaces the message specified by 'field' with the already-allocated object
+  // sub_message, passing ownership to the message.  If the field contained a
+  // message, that message is deleted.  If sub_message is nullptr, the field is
+  // cleared.
+  void SetAllocatedMessage(Message* message, Message* sub_message,
+                           const FieldDescriptor* field) const;
+  // Releases the message specified by 'field' and returns the pointer.
+  // ReleaseMessage() will return the message object if it exists.
+  // Otherwise, it may or may not return nullptr.  In any case, if the return
+  // value is non-null, the caller takes ownership of the pointer.
+  // If the field existed (HasField() is true), then the returned pointer will
+  // be the same as the pointer returned by MutableMessage().
+  // This function has the same effect as ClearField().
+  Message* ReleaseMessage(Message* message, const FieldDescriptor* field,
+                          MessageFactory* factory = nullptr) const;
+
+
+  // Repeated field getters ------------------------------------------
+  // These get the value of one element of a repeated field.
+
+  int32 GetRepeatedInt32(const Message& message, const FieldDescriptor* field,
+                         int index) const;
+  int64 GetRepeatedInt64(const Message& message, const FieldDescriptor* field,
+                         int index) const;
+  uint32 GetRepeatedUInt32(const Message& message, const FieldDescriptor* field,
+                           int index) const;
+  uint64 GetRepeatedUInt64(const Message& message, const FieldDescriptor* field,
+                           int index) const;
+  float GetRepeatedFloat(const Message& message, const FieldDescriptor* field,
+                         int index) const;
+  double GetRepeatedDouble(const Message& message, const FieldDescriptor* field,
+                           int index) const;
+  bool GetRepeatedBool(const Message& message, const FieldDescriptor* field,
+                       int index) const;
+  std::string GetRepeatedString(const Message& message,
+                                const FieldDescriptor* field, int index) const;
+  const EnumValueDescriptor* GetRepeatedEnum(const Message& message,
+                                             const FieldDescriptor* field,
+                                             int index) const;
+  // GetRepeatedEnumValue() returns an enum field's value as an integer rather
+  // than an EnumValueDescriptor*. If the integer value does not correspond to a
+  // known value descriptor, a new value descriptor is created. (Such a value
+  // will only be present when the new unknown-enum-value semantics are enabled
+  // for a message.)
+  int GetRepeatedEnumValue(const Message& message, const FieldDescriptor* field,
+                           int index) const;
+  const Message& GetRepeatedMessage(const Message& message,
+                                    const FieldDescriptor* field,
+                                    int index) const;
+
+  // See GetStringReference(), above.
+  const std::string& GetRepeatedStringReference(const Message& message,
+                                                const FieldDescriptor* field,
+                                                int index,
+                                                std::string* scratch) const;
+
+
+  // Repeated field mutators -----------------------------------------
+  // These mutate the value of one element of a repeated field.
+
+  void SetRepeatedInt32(Message* message, const FieldDescriptor* field,
+                        int index, int32 value) const;
+  void SetRepeatedInt64(Message* message, const FieldDescriptor* field,
+                        int index, int64 value) const;
+  void SetRepeatedUInt32(Message* message, const FieldDescriptor* field,
+                         int index, uint32 value) const;
+  void SetRepeatedUInt64(Message* message, const FieldDescriptor* field,
+                         int index, uint64 value) const;
+  void SetRepeatedFloat(Message* message, const FieldDescriptor* field,
+                        int index, float value) const;
+  void SetRepeatedDouble(Message* message, const FieldDescriptor* field,
+                         int index, double value) const;
+  void SetRepeatedBool(Message* message, const FieldDescriptor* field,
+                       int index, bool value) const;
+  void SetRepeatedString(Message* message, const FieldDescriptor* field,
+                         int index, std::string value) const;
+  void SetRepeatedEnum(Message* message, const FieldDescriptor* field,
+                       int index, const EnumValueDescriptor* value) const;
+  // Set an enum field's value with an integer rather than EnumValueDescriptor.
+  // For proto3 this is just setting the enum field to the value specified, for
+  // proto2 it's more complicated. If value is a known enum value the field is
+  // set as usual. If the value is unknown then it is added to the unknown field
+  // set. Note this matches the behavior of parsing unknown enum values.
+  // If multiple calls with unknown values happen then they are all added to the
+  // unknown field set in order of the calls.
+  void SetRepeatedEnumValue(Message* message, const FieldDescriptor* field,
+                            int index, int value) const;
+  // Get a mutable pointer to an element of a repeated field with a message
+  // type.
+  Message* MutableRepeatedMessage(Message* message,
+                                  const FieldDescriptor* field,
+                                  int index) const;
+
+
+  // Repeated field adders -------------------------------------------
+  // These add an element to a repeated field.
+
+  void AddInt32(Message* message, const FieldDescriptor* field,
+                int32 value) const;
+  void AddInt64(Message* message, const FieldDescriptor* field,
+                int64 value) const;
+  void AddUInt32(Message* message, const FieldDescriptor* field,
+                 uint32 value) const;
+  void AddUInt64(Message* message, const FieldDescriptor* field,
+                 uint64 value) const;
+  void AddFloat(Message* message, const FieldDescriptor* field,
+                float value) const;
+  void AddDouble(Message* message, const FieldDescriptor* field,
+                 double value) const;
+  void AddBool(Message* message, const FieldDescriptor* field,
+               bool value) const;
+  void AddString(Message* message, const FieldDescriptor* field,
+                 std::string value) const;
+  void AddEnum(Message* message, const FieldDescriptor* field,
+               const EnumValueDescriptor* value) const;
+  // Add an integer value to a repeated enum field rather than
+  // EnumValueDescriptor. For proto3 this is just setting the enum field to the
+  // value specified, for proto2 it's more complicated. If value is a known enum
+  // value the field is set as usual. If the value is unknown then it is added
+  // to the unknown field set. Note this matches the behavior of parsing unknown
+  // enum values. If multiple calls with unknown values happen then they are all
+  // added to the unknown field set in order of the calls.
+  void AddEnumValue(Message* message, const FieldDescriptor* field,
+                    int value) const;
+  // See MutableMessage() for comments on the "factory" parameter.
+  Message* AddMessage(Message* message, const FieldDescriptor* field,
+                      MessageFactory* factory = nullptr) const;
+
+  // Appends an already-allocated object 'new_entry' to the repeated field
+  // specified by 'field' passing ownership to the message.
+  void AddAllocatedMessage(Message* message, const FieldDescriptor* field,
+                           Message* new_entry) const;
+
+
+  // Get a RepeatedFieldRef object that can be used to read the underlying
+  // repeated field. The type parameter T must be set according to the
+  // field's cpp type. The following table shows the mapping from cpp type
+  // to acceptable T.
+  //
+  //   field->cpp_type()      T
+  //   CPPTYPE_INT32        int32
+  //   CPPTYPE_UINT32       uint32
+  //   CPPTYPE_INT64        int64
+  //   CPPTYPE_UINT64       uint64
+  //   CPPTYPE_DOUBLE       double
+  //   CPPTYPE_FLOAT        float
+  //   CPPTYPE_BOOL         bool
+  //   CPPTYPE_ENUM         generated enum type or int32
+  //   CPPTYPE_STRING       std::string
+  //   CPPTYPE_MESSAGE      generated message type or google::protobuf::Message
+  //
+  // A RepeatedFieldRef object can be copied and the resulted object will point
+  // to the same repeated field in the same message. The object can be used as
+  // long as the message is not destroyed.
+  //
+  // Note that to use this method users need to include the header file
+  // "reflection.h" (which defines the RepeatedFieldRef class templates).
+  template <typename T>
+  RepeatedFieldRef<T> GetRepeatedFieldRef(const Message& message,
+                                          const FieldDescriptor* field) const;
+
+  // Like GetRepeatedFieldRef() but return an object that can also be used
+  // to manipulate the underlying repeated field.
+  template <typename T>
+  MutableRepeatedFieldRef<T> GetMutableRepeatedFieldRef(
+      Message* message, const FieldDescriptor* field) const;
+
+  // DEPRECATED. Please use Get(Mutable)RepeatedFieldRef() for repeated field
+  // access. The following repeated field accessors will be removed in the
+  // future.
+  //
+  // Repeated field accessors  -------------------------------------------------
+  // The methods above, e.g. GetRepeatedInt32(msg, fd, index), provide singular
+  // access to the data in a RepeatedField.  The methods below provide aggregate
+  // access by exposing the RepeatedField object itself with the Message.
+  // Applying these templates to inappropriate types will lead to an undefined
+  // reference at link time (e.g. GetRepeatedField<***double>), or possibly a
+  // template matching error at compile time (e.g. GetRepeatedPtrField<File>).
+  //
+  // Usage example: my_doubs = refl->GetRepeatedField<double>(msg, fd);
+
+  // DEPRECATED. Please use GetRepeatedFieldRef().  This is a thin public
+  // wrapper that forwards to GetRepeatedFieldInternal<T>().
+  // For T = Cord and all protobuf scalar types except enums.
+  template <typename T>
+  PROTOBUF_DEPRECATED_MSG("Please use GetRepeatedFieldRef() instead")
+  const RepeatedField<T>& GetRepeatedField(const Message& msg,
+                                           const FieldDescriptor* d) const {
+    return GetRepeatedFieldInternal<T>(msg, d);
+  }
+
+  // DEPRECATED. Please use GetMutableRepeatedFieldRef().  This is a thin
+  // public wrapper that forwards to MutableRepeatedFieldInternal<T>().
+  // For T = Cord and all protobuf scalar types except enums.
+  template <typename T>
+  PROTOBUF_DEPRECATED_MSG("Please use GetMutableRepeatedFieldRef() instead")
+  RepeatedField<T>* MutableRepeatedField(Message* msg,
+                                         const FieldDescriptor* d) const {
+    return MutableRepeatedFieldInternal<T>(msg, d);
+  }
+
+  // DEPRECATED. Please use GetRepeatedFieldRef().  This is a thin public
+  // wrapper that forwards to GetRepeatedPtrFieldInternal<T>().
+  // For T = std::string, google::protobuf::internal::StringPieceField,
+  //         google::protobuf::Message & descendants.
+  template <typename T>
+  PROTOBUF_DEPRECATED_MSG("Please use GetRepeatedFieldRef() instead")
+  const RepeatedPtrField<T>& GetRepeatedPtrField(
+      const Message& msg, const FieldDescriptor* d) const {
+    return GetRepeatedPtrFieldInternal<T>(msg, d);
+  }
+
+  // DEPRECATED. Please use GetMutableRepeatedFieldRef().  This is a thin
+  // public wrapper that forwards to MutableRepeatedPtrFieldInternal<T>().
+  // For T = std::string, google::protobuf::internal::StringPieceField,
+  //         google::protobuf::Message & descendants.
+  template <typename T>
+  PROTOBUF_DEPRECATED_MSG("Please use GetMutableRepeatedFieldRef() instead")
+  RepeatedPtrField<T>* MutableRepeatedPtrField(Message* msg,
+                                               const FieldDescriptor* d) const {
+    return MutableRepeatedPtrFieldInternal<T>(msg, d);
+  }
+
+  // Extensions ----------------------------------------------------------------
+
+  // Try to find an extension of this message type by fully-qualified field
+  // name.  Returns nullptr if no extension is known for this name or number.
+  const FieldDescriptor* FindKnownExtensionByName(
+      const std::string& name) const;
+
+  // Try to find an extension of this message type by field number.
+  // Returns nullptr if no extension is known for this name or number.
+  const FieldDescriptor* FindKnownExtensionByNumber(int number) const;
+
+  // Feature Flags -------------------------------------------------------------
+
+  // Does this message support storing arbitrary integer values in enum fields?
+  // If |true|, GetEnumValue/SetEnumValue and associated repeated-field versions
+  // take arbitrary integer values, and the legacy GetEnum() getter will
+  // dynamically create an EnumValueDescriptor for any integer value without
+  // one. If |false|, setting an unknown enum value via the integer-based
+  // setters results in undefined behavior (in practice, GOOGLE_DCHECK-fails).
+  //
+  // Generic code that uses reflection to handle messages with enum fields
+  // should check this flag before using the integer-based setter, and either
+  // downgrade to a compatible value or use the UnknownFieldSet if not. For
+  // example:
+  //
+  //   int new_value = GetValueFromApplicationLogic();
+  //   if (reflection->SupportsUnknownEnumValues()) {
+  //     reflection->SetEnumValue(message, field, new_value);
+  //   } else {
+  //     if (field_descriptor->enum_type()->
+  //             FindValueByNumber(new_value) != nullptr) {
+  //       reflection->SetEnumValue(message, field, new_value);
+  //     } else if (emit_unknown_enum_values) {
+  //       reflection->MutableUnknownFields(message)->AddVarint(
+  //           field->number(), new_value);
+  //     } else {
+  //       // convert value to a compatible/default value.
+  //       new_value = CompatibleDowngrade(new_value);
+  //       reflection->SetEnumValue(message, field, new_value);
+  //     }
+  //   }
+  bool SupportsUnknownEnumValues() const;
+
+  // Returns the MessageFactory associated with this message.  This can be
+  // useful for determining if a message is a generated message or not, for
+  // example:
+  //   if (message->GetReflection()->GetMessageFactory() ==
+  //       google::protobuf::MessageFactory::generated_factory()) {
+  //     // This is a generated message.
+  //   }
+  // It can also be used to create more messages of this type, though
+  // Message::New() is an easier way to accomplish this.
+  MessageFactory* GetMessageFactory() const;
+
+ private:
+  template <typename T>
+  const RepeatedField<T>& GetRepeatedFieldInternal(
+      const Message& message, const FieldDescriptor* field) const;
+  template <typename T>
+  RepeatedField<T>* MutableRepeatedFieldInternal(
+      Message* message, const FieldDescriptor* field) const;
+  template <typename T>
+  const RepeatedPtrField<T>& GetRepeatedPtrFieldInternal(
+      const Message& message, const FieldDescriptor* field) const;
+  template <typename T>
+  RepeatedPtrField<T>* MutableRepeatedPtrFieldInternal(
+      Message* message, const FieldDescriptor* field) const;
+  // Obtain a pointer to a Repeated Field Structure and do some type checking:
+  //   on field->cpp_type(),
+  //   on field->field_option().ctype() (if ctype >= 0)
+  //   of field->message_type() (if message_type != nullptr).
+  // We use 2 routines rather than 4 (const vs mutable) x (scalar vs pointer).
+  void* MutableRawRepeatedField(Message* message, const FieldDescriptor* field,
+                                FieldDescriptor::CppType, int ctype,
+                                const Descriptor* message_type) const;
+
+  const void* GetRawRepeatedField(const Message& message,
+                                  const FieldDescriptor* field,
+                                  FieldDescriptor::CppType cpptype, int ctype,
+                                  const Descriptor* message_type) const;
+
+  // The following methods are used to implement (Mutable)RepeatedFieldRef.
+  // A Ref object will store a raw pointer to the repeated field data (obtained
+  // from RepeatedFieldData()) and a pointer to an Accessor (obtained from
+  // RepeatedFieldAccessor) which will be used to access the raw data.
+
+  // Returns a raw pointer to the repeated field
+  //
+  // "cpp_type" and "message_type" are deduced from the type parameter T passed
+  // to Get(Mutable)RepeatedFieldRef. If T is a generated message type,
+  // "message_type" should be set to its descriptor. Otherwise "message_type"
+  // should be set to nullptr. Implementations of this method should check
+  // whether "cpp_type"/"message_type" is consistent with the actual type of the
+  // field. We use 1 routine rather than 2 (const vs mutable) because it is
+  // private and it doesn't change the message.
+  void* RepeatedFieldData(Message* message, const FieldDescriptor* field,
+                          FieldDescriptor::CppType cpp_type,
+                          const Descriptor* message_type) const;
+
+  // The returned pointer should point to a singleton instance which implements
+  // the RepeatedFieldAccessor interface.
+  const internal::RepeatedFieldAccessor* RepeatedFieldAccessor(
+      const FieldDescriptor* field) const;
+
+  // Lists all fields of the message which are currently set, except for unknown
+  // fields and stripped fields. See ListFields for details.
+  void ListFieldsOmitStripped(
+      const Message& message,
+      std::vector<const FieldDescriptor*>* output) const;
+  // Whether `descriptor` is stripped, as reported by the reflection schema_.
+  bool IsMessageStripped(const Descriptor* descriptor) const {
+    return schema_.IsMessageStripped(descriptor);
+  }
+
+  friend class TextFormat;
+
+  void ListFieldsMayFailOnStripped(
+      const Message& message, bool should_fail,
+      std::vector<const FieldDescriptor*>* output) const;
+
+  const Descriptor* const descriptor_;
+  const internal::ReflectionSchema schema_;
+  const DescriptorPool* const descriptor_pool_;
+  MessageFactory* const message_factory_;
+
+  // Last non weak field index. This is an optimization when most weak fields
+  // are at the end of the containing message. If a message proto doesn't
+  // contain weak fields, then this field equals descriptor_->field_count().
+  int last_non_weak_field_index_;
+
+  template <typename T, typename Enable>
+  friend class RepeatedFieldRef;
+  template <typename T, typename Enable>
+  friend class MutableRepeatedFieldRef;
+  friend class ::PROTOBUF_NAMESPACE_ID::MessageLayoutInspector;
+  friend class ::PROTOBUF_NAMESPACE_ID::AssignDescriptorsHelper;
+  friend class DynamicMessageFactory;
+  friend class python::MapReflectionFriend;
+#define GOOGLE_PROTOBUF_HAS_CEL_MAP_REFLECTION_FRIEND
+  friend class expr::CelMapReflectionFriend;
+  friend class internal::MapFieldReflectionTest;
+  friend class internal::MapKeySorter;
+  friend class internal::WireFormat;
+  friend class internal::ReflectionOps;
+  // Needed for implementing text format for map.
+  friend class internal::MapFieldPrinterHelper;
+
+  Reflection(const Descriptor* descriptor,
+             const internal::ReflectionSchema& schema,
+             const DescriptorPool* pool, MessageFactory* factory);
+
+  // Special version for specialized implementations of string.  We can't
+  // call MutableRawRepeatedField directly here because we don't have access to
+  // FieldOptions::* which are defined in descriptor.pb.h.  Including that
+  // file here is not possible because it would cause a circular include cycle.
+  // We use 1 routine rather than 2 (const vs mutable) because it is private
+  // and obtaining a mutable repeated string field doesn't change the message.
+  void* MutableRawRepeatedString(Message* message, const FieldDescriptor* field,
+                                 bool is_string) const;
+
+  friend class MapReflectionTester;
+  // Returns true if key is in map. Returns false if key is not in map field.
+  bool ContainsMapKey(const Message& message, const FieldDescriptor* field,
+                      const MapKey& key) const;
+
+  // If key is in map field: Saves the value pointer to val and returns
+  // false. If key is not in map field: Insert the key into map, saves
+  // value pointer to val and returns true.
+  bool InsertOrLookupMapValue(Message* message, const FieldDescriptor* field,
+                              const MapKey& key, MapValueRef* val) const;
+
+  // Deletes the key and returns true if it is in the map field. Returns false
+  // otherwise.
+  bool DeleteMapValue(Message* message, const FieldDescriptor* field,
+                      const MapKey& key) const;
+
+  // Returns a MapIterator referring to the first element in the map field.
+  // If the map field is empty, this function returns the same as
+  // Reflection::MapEnd. Mutation to the field may invalidate the iterator.
+  MapIterator MapBegin(Message* message, const FieldDescriptor* field) const;
+
+  // Returns a MapIterator referring to the theoretical element that would
+  // follow the last element in the map field. It does not point to any
+  // real element. Mutation to the field may invalidate the iterator.
+  MapIterator MapEnd(Message* message, const FieldDescriptor* field) const;
+
+  // Gets the number of <key, value> pairs in a map field. The result may be
+  // different from FieldSize, which can have duplicate keys.
+  int MapSize(const Message& message, const FieldDescriptor* field) const;
+
+  // Helper methods for MapIterator.
+  friend class MapIterator;
+  friend class WireFormatForMapFieldTest;
+  internal::MapFieldBase* MutableMapData(Message* message,
+                                         const FieldDescriptor* field) const;
+
+  const internal::MapFieldBase* GetMapData(const Message& message,
+                                           const FieldDescriptor* field) const;
+
+  template <class T>
+  const T& GetRawNonOneof(const Message& message,
+                          const FieldDescriptor* field) const;
+  template <class T>
+  T* MutableRawNonOneof(Message* message, const FieldDescriptor* field) const;
+
+  template <typename Type>
+  const Type& GetRaw(const Message& message,
+                     const FieldDescriptor* field) const;
+  template <typename Type>
+  inline Type* MutableRaw(Message* message, const FieldDescriptor* field) const;
+  template <typename Type>
+  const Type& DefaultRaw(const FieldDescriptor* field) const;
+
+  inline const uint32* GetHasBits(const Message& message) const;
+  inline uint32* MutableHasBits(Message* message) const;
+  inline uint32 GetOneofCase(const Message& message,
+                             const OneofDescriptor* oneof_descriptor) const;
+  inline uint32* MutableOneofCase(
+      Message* message, const OneofDescriptor* oneof_descriptor) const;
+  inline bool HasExtensionSet(const Message& message) const {
+    return schema_.HasExtensionSet();  // |message| is not consulted.
+  }
+  const internal::ExtensionSet& GetExtensionSet(const Message& message) const;
+  internal::ExtensionSet* MutableExtensionSet(Message* message) const;
+  inline Arena* GetArena(Message* message) const;
+
+  inline const internal::InternalMetadata& GetInternalMetadata(
+      const Message& message) const;
+
+  internal::InternalMetadata* MutableInternalMetadata(Message* message) const;
+
+  inline bool IsInlined(const FieldDescriptor* field) const;
+
+  inline bool HasBit(const Message& message,
+                     const FieldDescriptor* field) const;
+  inline void SetBit(Message* message, const FieldDescriptor* field) const;
+  inline void ClearBit(Message* message, const FieldDescriptor* field) const;
+  inline void SwapBit(Message* message1, Message* message2,
+                      const FieldDescriptor* field) const;
+
+  // This function only swaps the field. Should swap corresponding has_bit
+  // before or after using this function.
+  void SwapField(Message* message1, Message* message2,
+                 const FieldDescriptor* field) const;
+
+  void SwapOneofField(Message* message1, Message* message2,
+                      const OneofDescriptor* oneof_descriptor) const;
+
+  inline bool HasOneofField(const Message& message,
+                            const FieldDescriptor* field) const;
+  inline void SetOneofCase(Message* message,
+                           const FieldDescriptor* field) const;
+  inline void ClearOneofField(Message* message,
+                              const FieldDescriptor* field) const;
+
+  template <typename Type>
+  inline const Type& GetField(const Message& message,
+                              const FieldDescriptor* field) const;
+  template <typename Type>
+  inline void SetField(Message* message, const FieldDescriptor* field,
+                       const Type& value) const;
+  template <typename Type>
+  inline Type* MutableField(Message* message,
+                            const FieldDescriptor* field) const;
+  template <typename Type>
+  inline const Type& GetRepeatedField(const Message& message,
+                                      const FieldDescriptor* field,
+                                      int index) const;
+  template <typename Type>
+  inline const Type& GetRepeatedPtrField(const Message& message,
+                                         const FieldDescriptor* field,
+                                         int index) const;
+  template <typename Type>
+  inline void SetRepeatedField(Message* message, const FieldDescriptor* field,
+                               int index, Type value) const;
+  template <typename Type>
+  inline Type* MutableRepeatedField(Message* message,
+                                    const FieldDescriptor* field,
+                                    int index) const;
+  template <typename Type>
+  inline void AddField(Message* message, const FieldDescriptor* field,
+                       const Type& value) const;
+  template <typename Type>
+  inline Type* AddField(Message* message, const FieldDescriptor* field) const;
+
+  int GetExtensionNumberOrDie(const Descriptor* type) const;
+
+  // Internal versions of EnumValue API perform no checking. Called after checks
+  // by public methods.
+  void SetEnumValueInternal(Message* message, const FieldDescriptor* field,
+                            int value) const;
+  void SetRepeatedEnumValueInternal(Message* message,
+                                    const FieldDescriptor* field, int index,
+                                    int value) const;
+  void AddEnumValueInternal(Message* message, const FieldDescriptor* field,
+                            int value) const;
+
+  Message* UnsafeArenaReleaseMessage(Message* message,
+                                     const FieldDescriptor* field,
+                                     MessageFactory* factory = nullptr) const;
+
+  void UnsafeArenaSetAllocatedMessage(Message* message, Message* sub_message,
+                                      const FieldDescriptor* field) const;
+
+  friend inline  // inline so nobody can call this function.
+      void
+      RegisterAllTypesInternal(const Metadata* file_level_metadata, int size);
+  friend inline const char* ParseLenDelim(int field_number,
+                                          const FieldDescriptor* field,
+                                          Message* msg,
+                                          const Reflection* reflection,
+                                          const char* ptr,
+                                          internal::ParseContext* ctx);
+  friend inline const char* ParsePackedField(const FieldDescriptor* field,
+                                             Message* msg,
+                                             const Reflection* reflection,
+                                             const char* ptr,
+                                             internal::ParseContext* ctx);
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Reflection);
+};
+
+// Abstract interface for a factory for message objects.
+class PROTOBUF_EXPORT MessageFactory {
+ public:
+  inline MessageFactory() {}
+  virtual ~MessageFactory();
+
+  // Given a Descriptor, gets or constructs the default (prototype) Message
+  // of that type.  You can then call that message's New() method to construct
+  // a mutable message of that type.
+  //
+  // Calling this method twice with the same Descriptor returns the same
+  // object.  The returned object remains property of the factory.  Also, any
+  // objects created by calling the prototype's New() method share some data
+  // with the prototype, so these must be destroyed before the MessageFactory
+  // is destroyed.
+  //
+  // The given descriptor must outlive the returned message, and hence must
+  // outlive the MessageFactory.
+  //
+  // Some implementations do not support all types.  GetPrototype() will
+  // return nullptr if the descriptor passed in is not supported.
+  //
+  // This method may or may not be thread-safe depending on the implementation.
+  // Each implementation should document its own degree of thread-safety.
+  virtual const Message* GetPrototype(const Descriptor* type) = 0;
+
+  // Gets a MessageFactory which supports all generated, compiled-in messages.
+  // In other words, for any compiled-in type FooMessage, the following is true:
+  //   MessageFactory::generated_factory()->GetPrototype(
+  //     FooMessage::descriptor()) == FooMessage::default_instance()
+  // This factory supports all types which are found in
+  // DescriptorPool::generated_pool().  If given a descriptor from any other
+  // pool, GetPrototype() will return nullptr.  (You can also check if a
+  // descriptor is for a generated message by checking if
+  // descriptor->file()->pool() == DescriptorPool::generated_pool().)
+  //
+  // This factory is 100% thread-safe; calling GetPrototype() does not modify
+  // any shared data.
+  //
+  // This factory is a singleton.  The caller must not delete the object.
+  static MessageFactory* generated_factory();
+
+  // For internal use only:  Registers a .proto file at static initialization
+  // time, to be placed in generated_factory.  On the first GetPrototype() call
+  // with a descriptor from this file, |register_messages| is called with the
+  // file name; it must call InternalRegisterGeneratedMessage() (below) for
+  // each message type.  Needed because descriptors are built lazily, so types
+  // can't be registered by descriptor until it exists.  |filename| must be a
+  // permanent string.  NOTE(review): |register_messages| and |filename| are
+  // not parameters of this signature (only |table|) -- likely stale; confirm.
+  static void InternalRegisterGeneratedFile(
+      const google::protobuf::internal::DescriptorTable* table);
+
+  // For internal use only:  Registers a message type.  Called only by the
+  // functions which are registered with InternalRegisterGeneratedFile(),
+  // above.
+  static void InternalRegisterGeneratedMessage(const Descriptor* descriptor,
+                                               const Message* prototype);
+
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MessageFactory);
+};
+
+#define DECLARE_GET_REPEATED_FIELD(TYPE)                           \
+  template <>                                                      \
+  PROTOBUF_EXPORT const RepeatedField<TYPE>&                       \
+  Reflection::GetRepeatedFieldInternal<TYPE>(                      \
+      const Message& message, const FieldDescriptor* field) const; \
+                                                                   \
+  template <>                                                      \
+  PROTOBUF_EXPORT RepeatedField<TYPE>*                             \
+  Reflection::MutableRepeatedFieldInternal<TYPE>(                  \
+      Message * message, const FieldDescriptor* field) const;
+
+DECLARE_GET_REPEATED_FIELD(int32)
+DECLARE_GET_REPEATED_FIELD(int64)
+DECLARE_GET_REPEATED_FIELD(uint32)
+DECLARE_GET_REPEATED_FIELD(uint64)
+DECLARE_GET_REPEATED_FIELD(float)
+DECLARE_GET_REPEATED_FIELD(double)
+DECLARE_GET_REPEATED_FIELD(bool)
+
+#undef DECLARE_GET_REPEATED_FIELD
+
+// Tries to downcast this message to a generated message type.  Returns nullptr
+// if this class is not an instance of T.  This works even if RTTI is disabled.
+//
+// This also has the effect of creating a strong reference to T that will
+// prevent the linker from stripping it out at link time.  This can be important
+// if you are using a DynamicMessageFactory that delegates to the generated
+// factory.
+template <typename T>
+const T* DynamicCastToGenerated(const Message* from) {
+  // Taking the address of T::default_instance (without invoking it) fails to
+  // compile unless T exposes that generated-message accessor.
+  const T& (*default_instance_fn)() = &T::default_instance;
+  static_cast<void>(default_instance_fn);
+  // The T* -> const Message* conversion only compiles if T derives from it.
+  const Message* upcast_probe = static_cast<T*>(nullptr);
+  static_cast<void>(upcast_probe);
+#if PROTOBUF_RTTI
+  return dynamic_cast<const T*>(from);
+#else
+  // No RTTI: T matches iff both sides report the same GetReflection().
+  if (T::default_instance().GetReflection() != from->GetReflection())
+    return nullptr;
+  return down_cast<const T*>(from);
+#endif
+}
+
+template <typename T>
+T* DynamicCastToGenerated(Message* from) {
+  const T* typed = DynamicCastToGenerated<T>(static_cast<const Message*>(from));
+  return const_cast<T*>(typed);  // the const came from the overload above
+}
+
+// Call this function to ensure that this message's reflection is linked into
+// the binary:
+//
+//   google::protobuf::LinkMessageReflection<FooMessage>();
+//
+// This will ensure that the following lookup will succeed:
+//
+//   DescriptorPool::generated_pool()->FindMessageTypeByName("FooMessage");
+//
+// As a side-effect, it will also guarantee that anything else from the same
+// .proto file will also be available for lookup in the generated pool.
+//
+// This function does not actually register the message, so it does not need
+// to be called before the lookup.  However it does need to occur in a function
+// that cannot be stripped from the binary (ie. it must be reachable from main).
+//
+// Best practice is to call this function as close as possible to where the
+// reflection is actually needed.  This function is very cheap to call, so you
+// should not need to worry about its runtime overhead except in the tightest
+// of loops (on x86-64 it compiles into two "mov" instructions).
+template <typename T>
+void LinkMessageReflection() {
+  internal::StrongReference(T::default_instance);  // passed, not called
+}
+
+// =============================================================================
+// Implementation details for {Get,Mutable}RawRepeatedPtrField.  We provide
+// specializations for <std::string>, <StringPieceField> and <Message> and
+// handle everything else with the default template which will match any type
+// having a method with signature "static const google::protobuf::Descriptor*
+// descriptor()". Such a type presumably is a descendant of google::protobuf::Message.
+
+template <>
+inline const RepeatedPtrField<std::string>&
+Reflection::GetRepeatedPtrFieldInternal<std::string>(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<RepeatedPtrField<std::string>*>(
+      MutableRawRepeatedString(const_cast<Message*>(&message), field, true));
+}
+
+template <>
+inline RepeatedPtrField<std::string>*
+Reflection::MutableRepeatedPtrFieldInternal<std::string>(
+    Message* message, const FieldDescriptor* field) const {
+  void* raw = MutableRawRepeatedString(message, field, /*is_string=*/true);
+  return static_cast<RepeatedPtrField<std::string>*>(raw);
+}
+
+
+// -----
+
+template <>
+inline const RepeatedPtrField<Message>& Reflection::GetRepeatedPtrFieldInternal(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<const RepeatedPtrField<Message>*>(GetRawRepeatedField(
+      message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1, nullptr));
+}
+
+template <>
+inline RepeatedPtrField<Message>* Reflection::MutableRepeatedPtrFieldInternal(
+    Message* message, const FieldDescriptor* field) const {
+  return static_cast<RepeatedPtrField<Message>*>(MutableRawRepeatedField(
+      message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1, nullptr));
+}
+
+template <typename PB>
+inline const RepeatedPtrField<PB>& Reflection::GetRepeatedPtrFieldInternal(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<const RepeatedPtrField<PB>*>(
+      GetRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE, -1,
+                          PB::default_instance().GetDescriptor()));
+}
+
+template <typename PB>
+inline RepeatedPtrField<PB>* Reflection::MutableRepeatedPtrFieldInternal(
+    Message* message, const FieldDescriptor* field) const {
+  return static_cast<RepeatedPtrField<PB>*>(
+      MutableRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE,
+                              -1, PB::default_instance().GetDescriptor()));
+}
+
+template <typename Type>
+const Type& Reflection::DefaultRaw(const FieldDescriptor* field) const {
+  return *reinterpret_cast<const Type*>(schema_.GetFieldDefault(field));
+}
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_MESSAGE_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..857faa035fb8c2f0501dd69eac03a24b70f401a9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection_ops.h
@@ -0,0 +1,96 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This header is logically internal, but is made public because it is used
+// from protocol-compiler-generated code, which may reside in other components.
+
+#ifndef GOOGLE_PROTOBUF_REFLECTION_OPS_H__
+#define GOOGLE_PROTOBUF_REFLECTION_OPS_H__
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/message.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Basic operations that can be performed using reflection.
+// These can be used as a cheap way to implement the corresponding
+// methods of the Message interface, though they are likely to be
+// slower than implementations tailored for the specific message type.
+//
+// This class should stay limited to operations needed to implement
+// the Message interface.
+//
+// This class is really a namespace that contains only static methods.
+class PROTOBUF_EXPORT ReflectionOps {
+ public:
+  static void Copy(const Message& from, Message* to);
+  static void Merge(const Message& from, Message* to);
+  static void Clear(Message* message);
+  static bool IsInitialized(const Message& message);
+  static bool IsInitialized(const Message& message, bool check_fields,
+                            bool check_descendants);
+  static void DiscardUnknownFields(Message* message);
+
+  // Finds all unset required fields in the message and adds their full
+  // paths (e.g. "foo.bar[5].baz") to *errors.  "prefix" will be attached to
+  // the front of each name.
+  static void FindInitializationErrors(const Message& message,
+                                       const std::string& prefix,
+                                       std::vector<std::string>* errors);
+
+ private:
+  // All methods are static.  No need to construct.
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ReflectionOps);
+};
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_REFLECTION_OPS_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3d36923e0507a71255343c2c56d6853d774babe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/source_context.pb.h
@@ -0,0 +1,300 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/source_context.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fsource_5fcontext_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fsource_5fcontext_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class SourceContext;
+class SourceContextDefaultTypeInternal;
+PROTOBUF_EXPORT extern SourceContextDefaultTypeInternal _SourceContext_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::SourceContext* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::SourceContext>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT SourceContext PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.SourceContext) */ {
+ public:
+  inline SourceContext() : SourceContext(nullptr) {}
+  virtual ~SourceContext();
+
+  SourceContext(const SourceContext& from);
+  SourceContext(SourceContext&& from) noexcept
+    : SourceContext() {
+    *this = ::std::move(from);
+  }
+
+  inline SourceContext& operator=(const SourceContext& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline SourceContext& operator=(SourceContext&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const SourceContext& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const SourceContext* internal_default_instance() {
+    return reinterpret_cast<const SourceContext*>(
+               &_SourceContext_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  friend void swap(SourceContext& a, SourceContext& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(SourceContext* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(SourceContext* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline SourceContext* New() const final {
+    return CreateMaybeMessage<SourceContext>(nullptr);
+  }
+
+  SourceContext* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<SourceContext>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const SourceContext& from);
+  void MergeFrom(const SourceContext& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(SourceContext* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.SourceContext";
+  }
+  protected:
+  explicit SourceContext(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fsource_5fcontext_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kFileNameFieldNumber = 1,
+  };
+  // string file_name = 1;
+  void clear_file_name();
+  const std::string& file_name() const;
+  void set_file_name(const std::string& value);
+  void set_file_name(std::string&& value);
+  void set_file_name(const char* value);
+  void set_file_name(const char* value, size_t size);
+  std::string* mutable_file_name();
+  std::string* release_file_name();
+  void set_allocated_file_name(std::string* file_name);
+  private:
+  const std::string& _internal_file_name() const;
+  void _internal_set_file_name(const std::string& value);
+  std::string* _internal_mutable_file_name();
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.SourceContext)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr file_name_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fsource_5fcontext_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// SourceContext
+
+// string file_name = 1;
+inline void SourceContext::clear_file_name() {
+  file_name_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline const std::string& SourceContext::file_name() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.SourceContext.file_name)
+  return _internal_file_name();
+}
+inline void SourceContext::set_file_name(const std::string& value) {
+  _internal_set_file_name(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.SourceContext.file_name)
+}
+inline std::string* SourceContext::mutable_file_name() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.SourceContext.file_name)
+  return _internal_mutable_file_name();
+}
+inline const std::string& SourceContext::_internal_file_name() const {
+  return file_name_.Get();
+}
+inline void SourceContext::_internal_set_file_name(const std::string& value) {
+  
+  file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void SourceContext::set_file_name(std::string&& value) {
+  
+  file_name_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.SourceContext.file_name)
+}
+inline void SourceContext::set_file_name(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  
+  file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
+              GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.SourceContext.file_name)
+}
+inline void SourceContext::set_file_name(const char* value,
+    size_t size) {
+  
+  file_name_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size), GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.SourceContext.file_name)
+}
+inline std::string* SourceContext::_internal_mutable_file_name() {
+  
+  return file_name_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* SourceContext::release_file_name() {
+  // @@protoc_insertion_point(field_release:google.protobuf.SourceContext.file_name)
+  return file_name_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline void SourceContext::set_allocated_file_name(std::string* file_name) {
+  if (file_name != nullptr) {
+    
+  } else {
+    
+  }
+  file_name_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), file_name,
+      GetArena());
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.SourceContext.file_name)
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fsource_5fcontext_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e629b9ea5ced2c75792919eeed8be612fc15108
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/wire_format.h
@@ -0,0 +1,412 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//         atenasio@google.com (Chris Atenasio) (ZigZag transform)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This header is logically internal, but is made public because it is used
+// from protocol-compiler-generated code, which may reside in other components.
+
+#ifndef GOOGLE_PROTOBUF_WIRE_FORMAT_H__
+#define GOOGLE_PROTOBUF_WIRE_FORMAT_H__
+
+#include <string>
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/wire_format_lite.h>
+#include <google/protobuf/stubs/casts.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+class UnknownFieldSet;  // unknown_field_set.h
+}  // namespace protobuf
+}  // namespace google
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// This class is for internal use by the protocol buffer library and by
+// protocol-compiler-generated message classes.  It must not be called
+// directly by clients.
+//
+// This class contains code for implementing the binary protocol buffer
+// wire format via reflection.  The WireFormatLite class implements the
+// non-reflection based routines.
+//
+// This class is really a namespace that contains only static methods
+class PROTOBUF_EXPORT WireFormat {
+ public:
+  // Given a field return its WireType
+  static inline WireFormatLite::WireType WireTypeForField(
+      const FieldDescriptor* field);
+
+  // Given a FieldDescriptor::Type return its WireType
+  static inline WireFormatLite::WireType WireTypeForFieldType(
+      FieldDescriptor::Type type);
+
+  // Compute the byte size of a tag.  For groups, this includes both the start
+  // and end tags.
+  static inline size_t TagSize(int field_number, FieldDescriptor::Type type);
+
+  // These procedures can be used to implement the methods of Message which
+  // handle parsing and serialization of the protocol buffer wire format
+  // using only the Reflection interface.  When you ask the protocol
+  // compiler to optimize for code size rather than speed, it will implement
+  // those methods in terms of these procedures.  Of course, these are much
+  // slower than the specialized implementations which the protocol compiler
+  // generates when told to optimize for speed.
+
+  // Read a message in protocol buffer wire format.
+  //
+  // This procedure reads either to the end of the input stream or through
+  // a WIRETYPE_END_GROUP tag ending the message, whichever comes first.
+  // It returns false if the input is invalid.
+  //
+  // Required fields are NOT checked by this method.  You must call
+  // IsInitialized() on the resulting message yourself.
+  static bool ParseAndMergePartial(io::CodedInputStream* input,
+                                   Message* message);
+
+  // This is meant for internal protobuf use (WireFormat is an internal class).
+  // This is the reflective implementation of the _InternalParse functionality.
+  static const char* _InternalParse(Message* msg, const char* ptr,
+                                    internal::ParseContext* ctx);
+
+  // Writes `message` to `output` in protocol buffer wire format.
+  //
+  // Every embedded message must already have its size cached; the top-level
+  // message need not, because its size is supplied through `size`.
+  //
+  // Returns nothing: a mismatch between the bytes actually written and `size`
+  // trips the GOOGLE_CHECK_EQ below instead of being reported to the caller.
+  static void SerializeWithCachedSizes(const Message& message, int size,
+                                       io::CodedOutputStream* output) {
+    const int expected_endpoint = output->ByteCount() + size;
+    uint8* end = _InternalSerialize(message, output->Cur(), output->EpsCopy());
+    output->SetCur(end);
+    GOOGLE_CHECK_EQ(output->ByteCount(), expected_endpoint)
+        << ": Protocol message serialized to a size different from what was "
+           "originally expected.  Perhaps it was modified by another thread "
+           "during serialization?";
+  }
+  static uint8* _InternalSerialize(const Message& message, uint8* target,
+                                   io::EpsCopyOutputStream* stream);
+
+  // Implements Message::ByteSize() via reflection.  WARNING:  The result
+  // of this method is *not* cached anywhere.  However, all embedded messages
+  // will have their ByteSize() methods called, so their sizes will be cached.
+  // Therefore, calling this method is sufficient to allow you to call
+  // WireFormat::SerializeWithCachedSizes() on the same object.
+  static size_t ByteSize(const Message& message);
+
+  // -----------------------------------------------------------------
+  // Helpers for dealing with unknown fields
+
+  // Skips a field value of the given WireType.  The input should start
+  // positioned immediately after the tag.  If unknown_fields is non-NULL,
+  // the contents of the field will be added to it.
+  static bool SkipField(io::CodedInputStream* input, uint32 tag,
+                        UnknownFieldSet* unknown_fields);
+
+  // Reads and ignores a message from the input.  If unknown_fields is
+  // non-NULL, the contents will be added to it.
+  static bool SkipMessage(io::CodedInputStream* input,
+                          UnknownFieldSet* unknown_fields);
+
+  // Read a packed enum field. If the is_valid function is not NULL, values
+  // for which is_valid(value) returns false are appended to
+  // unknown_fields.
+  static bool ReadPackedEnumPreserveUnknowns(io::CodedInputStream* input,
+                                             uint32 field_number,
+                                             bool (*is_valid)(int),
+                                             UnknownFieldSet* unknown_fields,
+                                             RepeatedField<int>* values);
+
+  // Writes unknown_fields at output->Cur(); SetCur() takes the returned pointer.
+  static void SerializeUnknownFields(const UnknownFieldSet& unknown_fields,
+                                     io::CodedOutputStream* output) {
+    output->SetCur(InternalSerializeUnknownFieldsToArray(
+        unknown_fields, output->Cur(), output->EpsCopy()));
+  }
+  // Buffer-based variant of SerializeUnknownFields(): `target` must have room
+  // for ComputeUnknownFieldsSize(unknown_fields) bytes.  Returns a pointer
+  // past the last written byte.
+  static uint8* SerializeUnknownFieldsToArray(
+      const UnknownFieldSet& unknown_fields, uint8* target) {
+    const int capacity =
+        static_cast<int>(ComputeUnknownFieldsSize(unknown_fields));
+    const auto deterministic =
+        io::CodedOutputStream::IsDefaultSerializationDeterministic();
+    io::EpsCopyOutputStream stream(target, capacity, deterministic);
+    return InternalSerializeUnknownFieldsToArray(unknown_fields, target,
+                                                 &stream);
+  }
+  static uint8* InternalSerializeUnknownFieldsToArray(
+      const UnknownFieldSet& unknown_fields, uint8* target,
+      io::EpsCopyOutputStream* stream);
+
+  // Writes unknown fields for message_set_wire_format messages to `output`.
+  static void SerializeUnknownMessageSetItems(
+      const UnknownFieldSet& unknown_fields, io::CodedOutputStream* output) {
+    uint8* end = InternalSerializeUnknownMessageSetItemsToArray(
+        unknown_fields, output->Cur(), output->EpsCopy());
+    output->SetCur(end);
+  }
+  // Same as above, except writing directly to the provided buffer.
+  // Requires that the buffer have sufficient capacity for
+  // ComputeUnknownMessageSetItemsSize(unknown_fields).
+  //
+  // Returns a pointer past the last written byte.
+  static uint8* SerializeUnknownMessageSetItemsToArray(
+      const UnknownFieldSet& unknown_fields, uint8* target);
+  static uint8* InternalSerializeUnknownMessageSetItemsToArray(
+      const UnknownFieldSet& unknown_fields, uint8* target,
+      io::EpsCopyOutputStream* stream);
+
+  // Compute the size of the UnknownFieldSet on the wire.
+  static size_t ComputeUnknownFieldsSize(const UnknownFieldSet& unknown_fields);
+
+  // Same thing except for messages that have the message_set_wire_format
+  // option.
+  static size_t ComputeUnknownMessageSetItemsSize(
+      const UnknownFieldSet& unknown_fields);
+
+  // Helper functions for encoding and decoding tags.  (Inlined below and in
+  // _inl.h)
+  //
+  // This is different from MakeTag(field->number(), field->type()) in the
+  // case of packed repeated fields.
+  static uint32 MakeTag(const FieldDescriptor* field);
+
+  // Parse a single field.  The input should start out positioned immediately
+  // after the tag.
+  static bool ParseAndMergeField(
+      uint32 tag,
+      const FieldDescriptor* field,  // May be NULL for unknown
+      Message* message, io::CodedInputStream* input);
+
+  // Serializes one field of `message` at output->Cur(), then updates the cursor.
+  static void SerializeFieldWithCachedSizes(
+      const FieldDescriptor* field,  // Cannot be NULL
+      const Message& message, io::CodedOutputStream* output) {
+    output->SetCur(InternalSerializeField(field, message, output->Cur(),
+                                          output->EpsCopy()));
+  }
+  static uint8* InternalSerializeField(
+      const FieldDescriptor* field,  // Cannot be NULL
+      const Message& message, uint8* target, io::EpsCopyOutputStream* stream);
+
+  // Compute size of a single field.  If the field is a message type, this
+  // will call ByteSize() for the embedded message, ensuring that it caches
+  // its size.
+  static size_t FieldByteSize(const FieldDescriptor* field,  // Cannot be NULL
+                              const Message& message);
+
+  // MessageSet::Item group I/O, for option message_set_wire_format = true.
+  static bool ParseAndMergeMessageSetItem(io::CodedInputStream* input,
+                                          Message* message);
+  static void SerializeMessageSetItemWithCachedSizes(
+      const FieldDescriptor* field, const Message& message,
+      io::CodedOutputStream* output) {
+    uint8* end = InternalSerializeMessageSetItem(
+        field, message, output->Cur(), output->EpsCopy());
+    output->SetCur(end);
+  }
+  static uint8* InternalSerializeMessageSetItem(
+      const FieldDescriptor* field, const Message& message, uint8* target,
+      io::EpsCopyOutputStream* stream);
+  static size_t MessageSetItemByteSize(const FieldDescriptor* field,
+                                       const Message& message);
+
+  // Computes the byte size of a field, excluding tags. For packed fields, it
+  // only includes the size of the raw data, and not the size of the total
+  // length, but for other length-delimited types, the size of the length is
+  // included.
+  static size_t FieldDataOnlyByteSize(
+      const FieldDescriptor* field,  // Cannot be NULL
+      const Message& message);
+
+  enum Operation {
+    PARSE = 0,
+    SERIALIZE = 1,
+  };
+
+  // Verifies that a string field is valid UTF8, logging an error if not.
+  // This function will not be called by newly generated protobuf code
+  // but remains present to support existing code.
+  static void VerifyUTF8String(const char* data, int size, Operation op);
+  // The NamedField variant takes a field name in order to produce an
+  // informative error message if verification fails.
+  static void VerifyUTF8StringNamedField(const char* data, int size,
+                                         Operation op, const char* field_name);
+
+ private:
+  struct MessageSetParser;
+  // Skip a MessageSet field.
+  static bool SkipMessageSetField(io::CodedInputStream* input,
+                                  uint32 field_number,
+                                  UnknownFieldSet* unknown_fields);
+
+  // Parse a MessageSet field.
+  static bool ParseAndMergeMessageSetField(uint32 field_number,
+                                           const FieldDescriptor* field,
+                                           Message* message,
+                                           io::CodedInputStream* input);
+  // Parses the value from the wire that belongs to tag.
+  static const char* _InternalParseAndMergeField(Message* msg, const char* ptr,
+                                                 internal::ParseContext* ctx,
+                                                 uint64 tag,
+                                                 const Reflection* reflection,
+                                                 const FieldDescriptor* field);
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(WireFormat);
+};
+
+// FieldSkipper that records what it skips in the UnknownFieldSet it was given.
+class PROTOBUF_EXPORT UnknownFieldSetFieldSkipper : public FieldSkipper {
+ public:
+  UnknownFieldSetFieldSkipper(UnknownFieldSet* unknown_fields)
+      : unknown_fields_(unknown_fields) {}  // NOTE(review): ctor is not explicit
+  ~UnknownFieldSetFieldSkipper() override {}
+
+  // FieldSkipper overrides; bodies are not defined in this header. -----------
+  bool SkipField(io::CodedInputStream* input, uint32 tag) override;
+  bool SkipMessage(io::CodedInputStream* input) override;
+  void SkipUnknownEnum(int field_number, int value) override;
+
+ protected:
+  UnknownFieldSet* unknown_fields_;  // pointer exactly as passed to the ctor
+};
+
+// inline methods ====================================================
+
+inline WireFormatLite::WireType WireFormat::WireTypeForField(
+    const FieldDescriptor* field) {
+  // Packed fields are length-delimited; others follow their declared type.
+  if (!field->is_packed()) {
+    return WireTypeForFieldType(field->type());
+  }
+  return WireFormatLite::WIRETYPE_LENGTH_DELIMITED;
+}
+
+inline WireFormatLite::WireType WireFormat::WireTypeForFieldType(
+    FieldDescriptor::Type type) {
+  // Convert via int: some compilers dislike direct enum -> enum casts.
+  const int type_as_int = implicit_cast<int>(type);
+  const auto lite_type = static_cast<WireFormatLite::FieldType>(type_as_int);
+  return WireFormatLite::WireTypeForFieldType(lite_type);
+}
+
+inline uint32 WireFormat::MakeTag(const FieldDescriptor* field) {
+  return WireFormatLite::MakeTag(field->number(), WireTypeForField(field));
+}  // MakeTag: wire type comes from WireTypeForField(), not the raw type
+
+inline size_t WireFormat::TagSize(int field_number,
+                                  FieldDescriptor::Type type) {
+  // Go through int rather than casting enum -> enum directly, which some
+  // compilers reject.
+  const int type_as_int = implicit_cast<int>(type);
+  const auto lite_type = static_cast<WireFormatLite::FieldType>(type_as_int);
+  return WireFormatLite::TagSize(field_number, lite_type);
+}
+
+inline void WireFormat::VerifyUTF8String(const char* data, int size,
+                                         WireFormat::Operation op) {
+#ifndef GOOGLE_PROTOBUF_UTF8_VALIDATION_ENABLED
+  // Validation is compiled out; touch the parameters to silence warnings.
+  (void)data;
+  (void)size;
+  (void)op;
+#else
+  const auto lite_op = static_cast<WireFormatLite::Operation>(op);
+  WireFormatLite::VerifyUtf8String(data, size, lite_op, NULL);
+#endif
+}
+
+inline void WireFormat::VerifyUTF8StringNamedField(const char* data, int size,
+                                                   WireFormat::Operation op,
+                                                   const char* field_name) {
+#ifndef GOOGLE_PROTOBUF_UTF8_VALIDATION_ENABLED
+  // Validation is compiled out; touch the parameters to silence warnings.
+  (void)data;
+  (void)size;
+  (void)op;
+  (void)field_name;
+#else
+  const auto lite_op = static_cast<WireFormatLite::Operation>(op);
+  WireFormatLite::VerifyUtf8String(data, size, lite_op, field_name);
+#endif
+}
+
+// Namespace-scope shim forwarding to the WireFormat static of the same name.
+inline uint8* InternalSerializeUnknownMessageSetItemsToArray(
+    const UnknownFieldSet& unknown_fields, uint8* target,
+    io::EpsCopyOutputStream* stream) {
+  return WireFormat::InternalSerializeUnknownMessageSetItemsToArray(
+      unknown_fields, target, stream);
+}
+
+inline size_t ComputeUnknownMessageSetItemsSize(
+    const UnknownFieldSet& unknown_fields) {
+  return WireFormat::ComputeUnknownMessageSetItemsSize(unknown_fields);
+}  // forwards to the WireFormat static member of the same name
+
+// Compute the size of the UnknownFieldSet on the wire.
+PROTOBUF_EXPORT
+size_t ComputeUnknownFieldsSize(const InternalMetadata& metadata, size_t size,
+                                CachedSize* cached_size);
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_WIRE_FORMAT_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h
new file mode 100644
index 0000000000000000000000000000000000000000..14ff7bfb910fea278c05f79a2613669a9cf9e080
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.h
@@ -0,0 +1,3993 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2016-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C API
+
+#ifndef ONEAPI_DNNL_DNNL_H
+#define ONEAPI_DNNL_DNNL_H
+
+#include "oneapi/dnnl/dnnl_common.h"
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+#include "oneapi/dnnl/dnnl_version.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Changes the primitive descriptor to point to the next available
+/// implementation.
+///
+/// @param primitive_desc A primitive descriptor to change.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+/// @returns #dnnl_last_impl_reached if no more implementations available,
+/// in which case the primitive descriptor itself is kept unchanged.
+dnnl_status_t DNNL_API dnnl_primitive_desc_next_impl(
+        dnnl_primitive_desc_t primitive_desc);
+
+/// Clones a primitive descriptor. The resulting primitive descriptor must be
+/// destroyed separately.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param existing_primitive_desc Primitive descriptor to clone.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_desc_clone(
+        dnnl_primitive_desc_t *primitive_desc,
+        const_dnnl_primitive_desc_t existing_primitive_desc);
+
+/// Returns a constant reference to the attributes of a primitive descriptor.
+///
+/// @warning
+///     It is an error to destroy the resulting @p attr.
+///
+/// @warning
+///     The lifetime of an @p attr is the same as that of a @p
+///     primitive_desc, so it is an error to use the @p attr once the @p
+///     primitive_desc has been destroyed.
+///
+/// @param primitive_desc Primitive descriptor.
+/// @param attr Output primitive attributes.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_desc_get_attr(
+        const_dnnl_primitive_desc_t primitive_desc,
+        const_dnnl_primitive_attr_t *attr);
+
+/// Destroys a primitive descriptor.
+///
+/// @param primitive_desc Primitive descriptor to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_desc_destroy(
+        dnnl_primitive_desc_t primitive_desc);
+
+/// Queries a primitive descriptor for various pieces of information.
+///
+/// The most common use case is to query a primitive descriptor, created with
+/// source, weights, and destination memory descriptors with format tags set
+/// to #dnnl_format_tag_any, for the corresponding memory descriptors (in this
+/// case the @p what is set to #dnnl_query_src_md, #dnnl_query_weights_md, and
+/// #dnnl_query_dst_md respectively) so that it is possible to create memory
+/// objects and reorder primitives if necessary.
+///
+/// Another typical use case is to query a primitive descriptor for workspace
+/// memory descriptor (with @p what set to #dnnl_query_workspace_md). If this
+/// query returns #dnnl_not_required status, then workspace memory is not
+/// required.
+///
+/// @note
+///     When querying for a memory descriptor for a scratchpad, a workspace,
+///     or an optional parameter, the query will return a pointer to a zero
+///     memory descriptor if the parameter is not needed.
+///
+/// A few other use cases:
+///  - query a primitive descriptor for the implementation information string
+///    (#dnnl_query_impl_info_str)
+///  - query a primitive descriptor for the number of inputs and outputs
+///    (#dnnl_query_num_of_inputs_s32 and #dnnl_query_num_of_outputs_s32
+///    respectively)
+///
+/// @sa dnnl_query_t for more options
+///
+/// @param primitive_desc Primitive descriptor.
+/// @param what Parameter to query.
+/// @param index Index of the parameter to query for.
+/// @param result Output result. The type depends on the query. For example,
+///     it must be a @c dnnl_memory_desc_t* if querying for a memory
+///     descriptor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_desc_query(
+        const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what,
+        int index, void *result);
+
+/// Queries primitive descriptor for a memory descriptor.
+///
+/// @note
+///     This function is a convenience version of
+///     #dnnl_primitive_desc_query().
+///
+/// @param primitive_desc Primitive descriptor.
+/// @param what Kind of memory descriptor parameter to query for.
+/// @param index Index of the parameter to query.
+/// @returns A pointer to the requested memory descriptor.
+/// @returns A pointer to a zero memory descriptor if the parameter is not
+///          needed.
+/// @returns NULL in case of any error.
+///
+const_dnnl_memory_desc_t DNNL_API dnnl_primitive_desc_query_md(
+        const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what,
+        int index);
+
+/// Queries primitive descriptor for a signed 32bit int.
+///
+/// @note
+///     This function is a convenience version of
+///     #dnnl_primitive_desc_query().
+///
+/// @param primitive_desc Primitive descriptor.
+/// @param what Kind of the value to query for.
+/// @param index Index of the parameter to query.
+/// @returns The requested value.
+/// @returns 0 in case of any error (in particular if the queried entity is
+///     not of type int32_t). Note that 0 may also be the actual returned
+///     value.
+int DNNL_API dnnl_primitive_desc_query_s32(
+        const_dnnl_primitive_desc_t primitive_desc, dnnl_query_t what,
+        int index);
+
+/// Creates a primitive.
+///
+/// @param primitive Output primitive.
+/// @param primitive_desc Primitive descriptor used to create the primitive.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_create(dnnl_primitive_t *primitive,
+        const_dnnl_primitive_desc_t primitive_desc);
+
+/// Creates a primitive from a cache blob.
+///
+/// @param primitive Output primitive.
+/// @param primitive_desc Primitive descriptor used to create the primitive.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_create_from_cache_blob(
+        dnnl_primitive_t *primitive, const_dnnl_primitive_desc_t primitive_desc,
+        size_t size, const uint8_t *cache_blob);
+
+/// Executes a primitive.
+///
+/// @param primitive Primitive to execute.
+/// @param stream Stream to use.
+/// @param nargs Number of arguments.
+/// @param args Array of arguments. Each argument is an
+///     <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
+///     values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
+///     #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
+///     descriptor as that returned by
+///     #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+///
+/// @note If any argument in @p args is padded (padded_dims >
+/// dims), the primitive execution will assume properly zero-padded
+/// input arguments, and produce zero-padded output arguments.
+dnnl_status_t DNNL_API dnnl_primitive_execute(const_dnnl_primitive_t primitive,
+        dnnl_stream_t stream, int nargs, const dnnl_exec_arg_t *args);
+
+/// Retrieves a constant reference to the primitive descriptor of a given
+/// primitive.
+///
+/// @warning
+///     It is an error to destroy the returned object. It is owned by the
+///     primitive. The @c const qualifier of the returned object prevents
+///     such attempts.
+///
+/// @param primitive Primitive to query for the primitive descriptor.
+/// @param primitive_desc Output primitive descriptor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_get_primitive_desc(
+        const_dnnl_primitive_t primitive,
+        const_dnnl_primitive_desc_t *primitive_desc);
+
+/// Retrieves a cache blob associated with the given primitive.
+///
+/// @param primitive Primitive to query for the cache blob.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is
+///     nullptr then the size of the cache blob is returned in @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+///
+/// @note The cache blob can be empty. It's the user's responsibility to check
+///     whether it's empty prior to passing it to
+///     #dnnl_primitive_create_from_cache_blob().
+dnnl_status_t DNNL_API dnnl_primitive_get_cache_blob(
+        const_dnnl_primitive_t primitive, size_t *size, uint8_t *cache_blob);
+
+/// Destroys a primitive.
+///
+/// @param primitive The primitive to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_destroy(dnnl_primitive_t primitive);
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_attributes
+/// @{
+
+/// Creates an empty (default) primitive attributes with all the parameters
+/// set to their default values.
+///
+/// Empty attributes are implied whenever the respective argument is NULL.
+///
+/// @param attr Output primitive attributes.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_create(dnnl_primitive_attr_t *attr);
+
+/// Clones primitive attributes.
+///
+/// @param attr Output primitive attributes.
+/// @param existing_attr Primitive attributes to clone.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_clone(
+        dnnl_primitive_attr_t *attr, const_dnnl_primitive_attr_t existing_attr);
+
+/// Destroys primitive attributes.
+///
+/// @param attr Primitive attributes to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_destroy(dnnl_primitive_attr_t attr);
+
+/// Returns probability for output dropout primitive attribute.
+///
+/// @param attr Primitive attributes.
+/// @param dropout_desc Output dropout memory descriptor
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_dropout(
+        const_dnnl_primitive_attr_t attr,
+        const_dnnl_memory_desc_t *dropout_desc);
+
+/// Sets probability for output dropout primitive attribute.
+///
+/// @param attr Primitive attributes.
+/// @param dropout_desc Output dropout memory descriptor
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_dropout(
+        dnnl_primitive_attr_t attr, const_dnnl_memory_desc_t dropout_desc);
+
+/// Returns the floating-point math mode primitive attribute.
+///
+/// @param attr Primitive attributes.
+/// @param mode Output FP math mode.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_fpmath_mode(
+        const_dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t *mode);
+
+/// Sets the floating-point math mode primitive attributes.
+///
+/// @param attr Primitive attributes.
+/// @param mode FP math mode. The possible values are:
+///     #dnnl_fpmath_mode_strict (default),
+///     #dnnl_fpmath_mode_bf16,
+///     #dnnl_fpmath_mode_f16,
+///     #dnnl_fpmath_mode_tf32,
+///     #dnnl_fpmath_mode_any.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_fpmath_mode(
+        dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t mode);
+
+/// Returns the floating-point math mode and its integer-primitive flag.
+///
+/// @param attr Primitive attributes.
+/// @param mode Output FP math mode.
+/// @param apply_to_int Output flag: use floating-point arithmetic for integer primitives.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_fpmath_mode_v2(
+        const_dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t *mode,
+        int *apply_to_int);
+
+/// Sets the floating-point math mode and its integer-primitive flag.
+///
+/// @param attr Primitive attributes.
+/// @param mode FP math mode. The possible values are:
+///     #dnnl_fpmath_mode_strict (default),
+///     #dnnl_fpmath_mode_bf16,
+///     #dnnl_fpmath_mode_f16,
+///     #dnnl_fpmath_mode_tf32,
+///     #dnnl_fpmath_mode_any.
+/// @param apply_to_int Boolean. Whether to use floating-point arithmetic for integer primitives.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_fpmath_mode_v2(
+        dnnl_primitive_attr_t attr, dnnl_fpmath_mode_t mode, int apply_to_int);
+
+/// Returns the deterministic primitive attribute value.
+///
+/// @param attr Primitive attributes.
+/// @param value Output deterministic attribute value.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_deterministic(
+        const_dnnl_primitive_attr_t attr, int *value);
+
+/// Sets the deterministic primitive attribute value.
+///
+/// @param attr Primitive attributes.
+/// @param value Boolean value to set deterministic attribute.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_deterministic(
+        dnnl_primitive_attr_t attr, int value);
+
+/// Returns the accumulation mode primitive attribute.
+///
+/// @param attr Primitive attributes.
+/// @param mode Output accumulation mode.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_accumulation_mode(
+        const_dnnl_primitive_attr_t attr, dnnl_accumulation_mode_t *mode);
+
+/// Sets the accumulation mode primitive attribute.
+///
+/// @param attr Primitive attributes.
+/// @param mode Accumulation mode. The possible values are:
+///     #dnnl_accumulation_mode_strict (default), which is s32 for quantized primitives, f32/f64 otherwise,
+///     #dnnl_accumulation_mode_relaxed, which is the same as strict but allows intermediate accumulators to be in src/dst datatype,
+///     #dnnl_accumulation_mode_any, which allows accumulators to be src/dst datatype or any wider type,
+///     #dnnl_accumulation_mode_f32,
+///     #dnnl_accumulation_mode_s32,
+///     #dnnl_accumulation_mode_f16.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_accumulation_mode(
+        dnnl_primitive_attr_t attr, dnnl_accumulation_mode_t mode);
+
+/// Returns the primitive attributes scratchpad mode.
+///
+/// @param attr Primitive attributes.
+/// @param mode Output scratchpad mode.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_scratchpad_mode(
+        const_dnnl_primitive_attr_t attr, dnnl_scratchpad_mode_t *mode);
+
+/// Sets primitive attributes scratchpad mode.
+///
+/// @param attr Primitive attributes.
+/// @param mode Scratchpad mode. The possible values are:
+///     #dnnl_scratchpad_mode_library (default) and
+///     #dnnl_scratchpad_mode_user.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_scratchpad_mode(
+        dnnl_primitive_attr_t attr, dnnl_scratchpad_mode_t mode);
+
+/// Sets primitive attributes scaling factors for primitive operations for a
+/// given memory argument. The scaling factors must be passed at execution time
+/// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+///
+/// @sa dnnl_primitive_attr_set_scales
+///
+///
+/// @param attr Primitive attributes.
+/// @param arg Parameter argument index as passed to the
+///     dnnl_primitive_execute() call.
+/// @param mask Scaling factors correspondence mask that defines the
+///     correspondence between the tensor dimensions and the @p scales array.
+///     The set i-th bit indicates that a dedicated scaling factor is used for
+///     each index along that dimension. Set the mask to 0 to use a common
+///     scaling factor for the whole output tensor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales_mask(
+        dnnl_primitive_attr_t attr, int arg, int mask);
+
+/// Sets primitive attributes scaling factors for primitive operations for a
+/// given memory argument. The scaling factors must be passed at execution time
+/// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+///
+/// @sa dnnl_primitive_attr_set_scales_mask
+///
+///
+/// @param attr Primitive attributes.
+/// @param arg Parameter argument index as passed to the
+///     dnnl_primitive_execute() call.
+/// @param mask Scaling factors correspondence mask that defines the
+///     correspondence between the tensor dimensions and the @p scales array.
+///     The set i-th bit indicates that a dedicated scaling factor is used for
+///     each index along that dimension. Set the mask to 0 to use a common
+///     scaling factor for the whole output tensor.
+/// @param ndims Number of group dimensions.
+/// @param group_dims Scaling factors correspondence groups that define the
+///     correspondence between the tensor dimensions and the scales array.
+///     The group dimensions should only be provided for each logical dimension
+///     that has its bit set in the correspondence mask @p mask.
+/// @param data_type Scaling factors data type.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales(
+        dnnl_primitive_attr_t attr, int arg, int mask, int ndims,
+        const dnnl_dims_t group_dims, dnnl_data_type_t data_type);
+
+/// Sets primitive attributes zero points for primitive operations for a given
+/// memory argument. The zero points must be passed at execution time
+/// as an argument with index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+///
+/// @sa dnnl_primitive_attr_set_zero_points
+///
+///
+/// @param attr Primitive attributes.
+/// @param arg Parameter argument index as passed to the
+///     dnnl_primitive_execute() call.
+/// @param mask Zero point correspondence mask that defines the
+///     correspondence between the tensor dimensions and the @p
+///     zero_points array. The set i-th bit indicates that a dedicated
+///     zero point is used for each index along that dimension. Set the
+///     mask to 0 to use a common zero point for the whole output tensor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points_mask(
+        dnnl_primitive_attr_t attr, int arg, int mask);
+
+/// Sets primitive attributes zero points for primitive operations for a given
+/// memory argument. The zero points must be passed at execution time
+/// as an argument with index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+///
+/// @sa dnnl_primitive_attr_set_zero_points_mask
+///
+///
+/// @param attr Primitive attributes.
+/// @param arg Parameter argument index as passed to the
+///     dnnl_primitive_execute() call.
+/// @param mask Zero point correspondence mask that defines the
+///     correspondence between the tensor dimensions and the @p
+///     zero_points array. The set i-th bit indicates that a dedicated
+///     zero point is used for each index along that dimension. Set the
+///     mask to 0 to use a common zero point for the whole output tensor.
+/// @param ndims Number of group dimensions.
+/// @param group_dims Zero point correspondence groups that define the
+///     correspondence between the tensor dimensions and the zero_points array.
+///     The group dimensions should only be provided for each logical dimension
+///     that has its bit set in the correspondence mask @p mask.
+/// @param data_type Zero points data type.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points(
+        dnnl_primitive_attr_t attr, int arg, int mask, int ndims,
+        const dnnl_dims_t group_dims, dnnl_data_type_t data_type);
+
+/// Sets the rounding mode attribute value for a given argument.
+///
+/// @param attr Primitive attributes.
+/// @param arg Argument for which rounding mode should be set.
+/// @param mode Rounding mode to apply to the argument.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_rounding(
+        dnnl_primitive_attr_t attr, int arg, dnnl_rounding_mode_t mode);
+
+/// Returns the rounding mode attribute value for a given argument.
+///
+/// @param attr Primitive attributes.
+/// @param arg Argument for which rounding mode query applies.
+/// @param mode Output rounding mode.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_rounding(
+        dnnl_primitive_attr_t attr, int arg, dnnl_rounding_mode_t *mode);
+
+/// Returns primitive attributes post-ops.
+///
+/// @warning
+///     The output @p post_ops points to the internal @p attr field, so it is
+///     an error to modify or destroy them. The lifetime of @p post_ops is
+///     the same as that of the @p attr it belongs to, so it is an error to
+///     use @p post_ops after @p attr has been destroyed.
+///
+/// @param attr Primitive attributes.
+/// @param post_ops Output post-ops.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_post_ops(
+        const_dnnl_primitive_attr_t attr, const_dnnl_post_ops_t *post_ops);
+
+/// Sets primitive attributes post-ops.
+///
+/// @note
+///     There is no way to check whether the post-ops would be supported by
+///     the target primitive. Any error will be reported by the
+///     dnnl_<primitive name>_[propagation kind]_primitive_desc_create() function call.
+///
+/// @param attr Primitive attributes.
+/// @param post_ops Post-ops to set.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_post_ops(
+        dnnl_primitive_attr_t attr, const_dnnl_post_ops_t post_ops);
+
+/// Creates empty post-ops sequence.
+///
+/// @param post_ops Output post-ops.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_create(dnnl_post_ops_t *post_ops);
+
+/// Clones post-ops primitive attribute.
+///
+/// @param post_ops Output post-ops primitive attribute.
+/// @param existing_post_ops Post-ops primitive attribute to clone.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_clone(
+        dnnl_post_ops_t *post_ops, const_dnnl_post_ops_t existing_post_ops);
+
+/// Destroys post-ops.
+///
+/// @param post_ops Post-ops to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_destroy(dnnl_post_ops_t post_ops);
+
+/// Returns the length of post-ops.
+///
+/// @param post_ops Post-ops.
+/// @returns The number of post-ops entries.
+int DNNL_API dnnl_post_ops_len(const_dnnl_post_ops_t post_ops);
+
+/// Returns the kind of a post-op entry.
+///
+/// @param post_ops Post-ops.
+/// @param index Post-op entry index.
+/// @returns The kind of the post-op with the specified index.
+/// @returns #dnnl_undefined_primitive if there is no post-op at the specified
+///     index.
+dnnl_primitive_kind_t DNNL_API dnnl_post_ops_get_kind(
+        const_dnnl_post_ops_t post_ops, int index);
+
+/// Appends an accumulation (sum) post-op. Prior to accumulating the result,
+/// the zero point is subtracted from the previous value and the difference
+/// is multiplied by the scale.
+///
+/// The kind of this post-op is #dnnl_sum.
+///
+/// This feature may improve performance for cases like dequantizing the
+/// asymmetrically quantized sum's src1 tensor to the f32 domain before
+/// performing the sum operation by subtracting the @p zero_point before the scaling.
+///
+/// In the simplest case where accumulation is the only post-op, the
+/// computations will be:
+///
+///     dst[:] <- scale * (dst[:] - zero_point) + op(...)
+///                                             // instead of dst[:] <- op(...)
+///
+/// If @p data_type is specified, the original dst tensor will be reinterpreted
+/// as a tensor with the provided data type. Since it is a reinterpretation,
+/// @p data_type and the dst data type should have the same size.
+/// As a result, computations will be:
+///
+///     dst[:] <- scale * (as_data_type(dst[:]) - zero_point) + op(...)
+///                                        // instead of dst[:] <- op(...)
+/// @note
+///     This post-op executes in-place and does not change the
+///     destination layout.
+///
+/// @param post_ops Post-ops.
+/// @param scale Accumulation scaling factor.
+/// @param zero_point Single scalar int32_t value of zero point.
+/// @param data_type Accumulation data type.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_append_sum(dnnl_post_ops_t post_ops,
+        float scale, int32_t zero_point, dnnl_data_type_t data_type);
+
+/// Returns the parameters of an accumulation (sum) post-op, including its
+/// zero point and data type.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the sum post-op.
+/// @param scale Output accumulation scaling factor.
+/// @param zero_point Output zero point.
+/// @param data_type Output data type for accumulation.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_sum(
+        const_dnnl_post_ops_t post_ops, int index, float *scale,
+        int32_t *zero_point, dnnl_data_type_t *data_type);
+
+/// Appends an elementwise post-op.
+///
+/// The kind of this post operation is #dnnl_eltwise.
+///
+/// In the simplest case when the elementwise is the only post operation, the
+/// computations would be:
+///
+///     dst[:] <- eltwise_op (op(...)) // instead of dst[:] <- op(...)
+///
+/// where eltwise_op is configured with the given parameters.
+///
+/// @param post_ops Post-ops.
+/// @param alg_kind Elementwise algorithm for the post-op.
+/// @param alpha Alpha parameter for the elementwise algorithm.
+/// @param beta Beta parameter for the elementwise algorithm.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_append_eltwise(dnnl_post_ops_t post_ops,
+        dnnl_alg_kind_t alg_kind, float alpha, float beta);
+
+/// Returns the parameters of an elementwise post-op.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the elementwise post-op.
+/// @param alg_kind Output elementwise algorithm kind.
+/// @param alpha Output alpha parameter for the elementwise algorithm.
+/// @param beta Output beta parameter for the elementwise algorithm.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+/// @returns #dnnl_invalid_arguments if @p index does not refer to an
+///     elementwise post-op.
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_eltwise(
+        const_dnnl_post_ops_t post_ops, int index, dnnl_alg_kind_t *alg_kind,
+        float *alpha, float *beta);
+
+/// Appends a depthwise post-op convolution.
+///
+/// This post-op can only be fused with a 2D 1x1 convolution (convolution with
+/// weights spatial dimensions equal to 1, i.e., kh=kw=1).
+///
+/// The kind of this post-op is #dnnl_convolution.
+///
+/// The number of outputs for primitive with fusion is one. The output spatial
+/// size can be derived as below:
+///
+/// output_height = ceil(output_height_1x1_convolution, stride)
+/// output_width = ceil(output_width_1x1_convolution, stride)
+///
+/// See @ref dev_guide_attributes_post_ops_depthwise and
+/// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info.
+///
+/// @param post_ops Post-ops.
+/// @param weights_data_type Weights data type of depthwise post-op.
+/// @param bias_data_type Bias data type of depthwise post-op.
+/// @param dst_data_type Output data type of depthwise post-op.
+/// @param kernel_size Size of kernel of depthwise post-op.
+/// @param stride_size Size of stride of depthwise post-op.
+/// @param padding_l_size Size of left and top paddings of depthwise post-op.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_append_dw(dnnl_post_ops_t post_ops,
+        dnnl_data_type_t weights_data_type, dnnl_data_type_t bias_data_type,
+        dnnl_data_type_t dst_data_type, dnnl_dim_t kernel_size,
+        dnnl_dim_t stride_size, dnnl_dim_t padding_l_size);
+
+/// Returns the parameters of a depthwise post-op.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the depthwise post-op.
+/// @param weights_data_type Output weights data type of depthwise post-op.
+/// @param bias_data_type Output bias data type of depthwise post-op.
+/// @param dst_data_type Output destination data type of depthwise post-op.
+/// @param kernel_size Output size of kernel of depthwise post-op.
+/// @param stride_size Output size of stride of depthwise post-op.
+/// @param padding_l_size Output size of left and top paddings of depthwise post-op.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_dw(
+        const_dnnl_post_ops_t post_ops, int index,
+        dnnl_data_type_t *weights_data_type, dnnl_data_type_t *bias_data_type,
+        dnnl_data_type_t *dst_data_type, dnnl_dim_t *kernel_size,
+        dnnl_dim_t *stride_size, dnnl_dim_t *padding_l_size);
+
+/// Appends a binary post-op.
+///
+/// The kind of this post operation is #dnnl_binary.
+///
+/// In the simplest case when the binary is the only post operation, the
+/// computations would be:
+///
+///     dst[:] <- binary_op (dst[:], another_input[:])
+///
+/// where binary_op is configured with the given parameters. binary_op supports
+/// broadcast semantics for a second operand.
+///
+/// @param post_ops Post-ops.
+/// @param alg_kind Binary algorithm for the post-op.
+/// @param src1_desc Memory descriptor of a second operand.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_append_binary(dnnl_post_ops_t post_ops,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src1_desc);
+
+/// Returns the parameters of a binary post-op.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the binary post-op.
+/// @param alg_kind Output binary algorithm kind.
+/// @param src1_desc Output memory descriptor of a second operand.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+/// @returns #dnnl_invalid_arguments if @p index does not refer to a binary
+///     post-op.
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_binary(
+        const_dnnl_post_ops_t post_ops, int index, dnnl_alg_kind_t *alg_kind,
+        const_dnnl_memory_desc_t *src1_desc);
+
+/// Appends a prelu forward post-op.
+///
+/// The kind of this post-op is #dnnl_prelu.
+///
+/// The post-op can be defined as:
+///
+///      dst[:] <- prelu(dst[:], weights[:])
+///      prelu:
+///      dst[:] <- dst[:] if dst[:] > 0
+///      dst[:] <- dst[:] * weights[:] if dst[:] <= 0
+///
+///
+/// @note
+///     The order of dimensions does not depend on how elements are laid
+///     out in memory. For example:
+///     - for a 2D CNN activations tensor the order is always (n, c)
+///     - for a 4D CNN activations tensor the order is always (n, c, h, w)
+///     - for a 5D CNN weights tensor the order is always
+///        (g, oc, ic, kh, kw)
+///
+///     The prelu weights tensor is passed at the runtime execution phase.
+///     Its data type is implicitly assumed to be f32 with a plain
+///     layout (a, ab, acb, acdb, acdeb).
+///
+/// @param post_ops Post-ops.
+/// @param mask Defines the correspondence between the output tensor
+///     dimensions and the prelu weights tensor. The set i-th bit indicates
+///     that a dedicated weights value is used for each index along that
+///     dimension. Set the mask to 0 to use a common weights value
+///     for the whole output tensor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_append_prelu(
+        dnnl_post_ops_t post_ops, int mask);
+
+/// Returns the parameters of a prelu post-op.
+///
+/// @param post_ops Post-ops.
+/// @param index Index of the prelu post-op.
+/// @param mask Output mask of the prelu post-op.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_post_ops_get_params_prelu(
+        const_dnnl_post_ops_t post_ops, int index, int *mask);
+
+/// @} dnnl_api_attributes
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_memory
+/// @{
+
+/// Destroys a memory descriptor.
+///
+/// @param memory_desc Memory descriptor to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_destroy(dnnl_memory_desc_t memory_desc);
+
+/// Clones a memory descriptor. The resulting memory descriptor must be
+/// destroyed separately.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param existing_memory_desc Memory descriptor to clone.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_clone(dnnl_memory_desc_t *memory_desc,
+        const_dnnl_memory_desc_t existing_memory_desc);
+
+/// Retrieves a binary blob associated with the given memory descriptor.
+///
+/// @param blob Output pointer to binary blob.
+///     If not nullptr, size bytes of the memory descriptor blob are written.
+/// @param size Output pointer to the size of the binary blob in bytes.
+///     Size is written if blob is nullptr.
+/// @param memory_desc Input memory descriptor to serialize.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_get_blob(
+        uint8_t *blob, size_t *size, const_dnnl_memory_desc_t memory_desc);
+
+/// Creates a memory descriptor from a memory descriptor binary blob.
+///
+/// @param memory_desc Output pointer to a newly allocated memory descriptor.
+/// @param blob Pointer to a memory descriptor binary blob.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_blob(
+        dnnl_memory_desc_t *memory_desc, const uint8_t *blob);
+
+/// Creates a memory descriptor using dimensions and strides.
+///
+/// @note
+///     As always, the logical order of dimensions corresponds to the `abc...`
+///     format tag, and the physical meaning of the dimensions depends on both
+///     the primitive that consumes the memory and the context of that
+///     consumption.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param data_type Elements data type.
+/// @param strides Strides in each dimension.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_strides(
+        dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims,
+        dnnl_data_type_t data_type, const dnnl_dims_t strides);
+
+/// Creates a memory descriptor using dimensions and memory format tag.
+///
+/// @note
+///     As always, the logical order of dimensions corresponds to the `abc...`
+///     format tag, and the physical meaning of the dimensions depends on both
+///     the primitive that consumes the memory and the context of that
+///     consumption.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param data_type Elements data type.
+/// @param tag Memory format tag. Can be #dnnl_format_tag_any which would
+///     allow a primitive to choose the final memory format. In this case the
+///     format_kind field of the memory descriptor would be set to
+///     #dnnl_format_kind_any.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_tag(
+        dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims,
+        dnnl_data_type_t data_type, dnnl_format_tag_t tag);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory descriptor for CSR encoding.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param data_type Elements data type.
+/// @param nnz Number of non-zero entries.
+/// @param indices_dt Data type of indices.
+/// @param pointers_dt Data type of pointers.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_csr_encoding(
+        dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims,
+        dnnl_data_type_t data_type, dnnl_dim_t nnz, dnnl_data_type_t indices_dt,
+        dnnl_data_type_t pointers_dt);
+
+/// Creates a memory descriptor for COO encoding.
+///
+/// The created memory descriptor will describe a memory object that
+/// contains n+1 buffers for an n-dimensional tensor.
+/// The buffers have the following meaning and assigned numbers (index):
+///  - 0: values
+///  - 1: indices for dimension 0
+///  - 2: indices for dimension 1 ...
+///  - n: indices for dimension n-1
+///
+/// @param memory_desc Output memory descriptor.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param data_type Elements data type.
+/// @param nnz Number of non-zero entries.
+/// @param indices_dt Data type of indices.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_coo_encoding(
+        dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims,
+        dnnl_data_type_t data_type, dnnl_dim_t nnz,
+        dnnl_data_type_t indices_dt);
+
+/// Creates a memory descriptor for packed sparse encoding.
+///
+/// The created memory descriptor cannot be used to create a memory
+/// object. It can only be used to create a primitive descriptor to
+/// query the actual memory descriptor (similar to the format tag
+/// `any`).
+///
+/// @warning
+///     The meaning and content of the handles of the memory object that
+///     is created using the queried memory descriptor are unspecified;
+///     therefore, using the content is undefined behavior.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param data_type Elements data type.
+/// @param nnz Number of non-zero entries.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_with_packed_encoding(
+        dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims,
+        dnnl_data_type_t data_type, dnnl_dim_t nnz);
+#endif
+
+/// Creates a memory descriptor for a region inside an area
+/// described by an existing memory descriptor.
+///
+/// @warning
+///     Some combinations of physical memory layout and/or offsets or dims may
+///     result in a failure to create a submemory.
+///
+/// @param memory_desc Output memory descriptor.
+/// @param parent_memory_desc An existing memory descriptor.
+/// @param dims Sizes of the region.
+/// @param offsets Offsets to the region from the encompassing
+///     memory object in each dimension.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_create_submemory(
+        dnnl_memory_desc_t *memory_desc,
+        const_dnnl_memory_desc_t parent_memory_desc, const dnnl_dims_t dims,
+        const dnnl_dims_t offsets);
+
+/// Creates a memory descriptor by reshaping an existing one. The new
+/// memory descriptor inherits the data type. This operation is valid only for
+/// memory descriptors that have format_kind #dnnl_blocked or
+/// #dnnl_format_kind_any.
+///
+/// The resulting memory descriptor must be destroyed separately.
+///
+/// The operation ensures the transformation of the physical memory format
+/// corresponds to the transformation of the logical dimensions. If such
+/// transformation is impossible, the function returns #dnnl_invalid_arguments.
+///
+/// The reshape operation can be described as a combination of the following
+/// basic operations:
+/// 1. Add a dimension of size `1`. This is always possible.
+/// 2. Remove a dimension of size `1`. This is possible only if the dimension
+///    has no padding (i.e. `padded_dims[dim] == dims[dim] && dims[dim] == 1`).
+/// 3. Split a dimension into multiple ones. This is possible only if the size
+///    of the dimension is exactly equal to the product of the split ones and
+///    the dimension does not have padding (i.e.
+///    `padded_dims[dim] == dims[dim]`).
+/// 4. Join multiple consecutive dimensions into a single one. As in the
+///    cases above, this requires that the dimensions do not have padding and
+///    that the memory format is such that in physical memory these dimensions
+///    are dense and have the same order as their logical counterparts. This
+///    also assumes that these dimensions are not blocked.
+///    - Here, dense means:
+///      `stride for dim[i] == (stride for dim[i + 1]) * dim[i + 1]`;
+///    - And same order means:
+///      `i < j` if and only if `stride for dim[j] <= stride for dim[i]`.
+///
+/// @warning
+///     Some combinations of physical memory layout and/or offsets or
+///     dimensions may result in a failure to make a reshape.
+///
+/// @param out_memory_desc Output memory descriptor.
+/// @param in_memory_desc An existing memory descriptor. Must have format_kind
+///     set to #dnnl_blocked or #dnnl_format_kind_any.
+/// @param ndims Number of dimensions for the output memory descriptor.
+/// @param dims Dimensions for the output memory descriptor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_reshape(
+        dnnl_memory_desc_t *out_memory_desc,
+        const_dnnl_memory_desc_t in_memory_desc, int ndims,
+        const dnnl_dims_t dims);
+
+/// Creates a memory descriptor by permuting axes in an existing one.
+///
+/// The physical memory layout representation is adjusted accordingly to
+/// maintain the consistency between the logical and physical parts of the
+/// memory descriptor.
+///
+/// The resulting memory descriptor must be destroyed separately.
+///
+/// The new memory descriptor inherits the data type. This operation is valid
+/// only for memory descriptors that have format_kind set to #dnnl_blocked or
+/// #dnnl_format_kind_any.
+///
+/// The logical axes will be permuted in the following manner:
+/// ```
+/// for (i: 0 .. in_memory_desc->ndims)
+///     out_memory_desc->dims[permutation[i]] = in_memory_desc->dims[i];
+/// ```
+///
+/// Example:
+/// @code
+///     dnnl_memory_desc_t in_md, out_md, expect_out_md;
+///
+///     const int permutation[] = {1, 0}; // swap the first and the second axes
+///
+///     dnnl_dims_t in_dims = {2, 3}, out_dims = {3, 2};
+///     dnnl_format_tag_t in_tag = dnnl_ab, out_tag = dnnl_ba;
+///
+///     dnnl_memory_desc_create_with_tag(
+///             &in_md, 2, in_dims, data_type, in_tag);
+///     dnnl_memory_desc_create_with_tag(
+///             &expect_out_md, 2, out_dims, data_type, out_tag);
+///
+///     dnnl_memory_desc_permute_axes(&out_md, in_md, permutation);
+///     assert(dnnl_memory_desc_equal(out_md, expect_out_md));
+///
+///     dnnl_memory_desc_destroy(in_md);
+///     dnnl_memory_desc_destroy(out_md);
+///     dnnl_memory_desc_destroy(expect_out_md);
+/// @endcode
+///
+/// @param out_memory_desc Output memory descriptor.
+/// @param in_memory_desc An existing memory descriptor. Must have format_kind
+///     set to #dnnl_blocked or #dnnl_format_kind_any.
+/// @param permutation Axes permutation (of size `in_memory_desc->ndims`).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_permute_axes(
+        dnnl_memory_desc_t *out_memory_desc,
+        const_dnnl_memory_desc_t in_memory_desc, const int *permutation);
+
+/// Queries a memory descriptor for various pieces of information.
+///
+/// The following information can be queried:
+///  - Number of dimensions (#dnnl_query_ndims_s32)
+///  - Dimensions (#dnnl_query_dims) in the following order:
+///    - CNN data tensors: mini-batch, channel, spatial
+///      (<code>{N, C, [[D,] H,] W}</code>)
+///    - CNN weight tensors: group (optional), output channel, input channel,
+///      spatial (<code>{[G,] O, I, [[D,] H,] W}</code>)
+///    - RNN data tensors: time, mini-batch, channels (<code>{T, N, C}</code>)
+///      or layers, directions, states, mini-batch, channels
+///      (<code>{L, D, S, N, C}</code>)
+///    - RNN weight tensor: layers, directions, input channel, gates, output
+///      channels (<code>{L, D, I, G, O}</code>)
+///  - Data type of the tensor elements (#dnnl_query_data_type)
+///  - Padded dimensions (#dnnl_query_padded_dims) - size of the data including
+///    padding in each dimension
+///  - Padded offsets (#dnnl_query_padded_offsets) - per-dimension offset from
+///    the padding to actual data, the top-level tensor with offsets applied
+///    must lie within the padding area.
+///  - Submemory offset (#dnnl_query_submemory_offset_s64) - offset from memory
+///    origin to the current block, non-zero only in a description of a memory
+///    sub-block.
+///  - Format kind (#dnnl_query_format_kind) - memory format kind
+///
+/// @note
+///    The order of dimensions does not depend on the memory format, so
+///    whether the data is laid out in #dnnl_nchw or #dnnl_nhwc
+///    the dims for a 4D CNN data tensor would be <code>{N, C, H, W}</code>.
+///
+/// The following queries are applicable only to format kind #dnnl_blocked.
+///  - Strides (#dnnl_query_strides) between the outermost blocks or in case
+///    of plain (non-blocked) formats the strides between dimensions
+///  - Number of innermost blocks (#dnnl_query_inner_nblks_s32), e.g. 3 in
+///    case of `OIhw_4i16o4i`
+///  - Sizes of the innermost blocks (#dnnl_query_inner_blks), e.g.
+///    `{4, 16, 4}` in case of `OIhw_4i16o4i`
+///  - Logical indices of the blocks (#dnnl_query_inner_idxs), e.g. `{1, 0, 1}`
+///    in case of `4i16o4i`, because `i` is the 1st dim and `o` is the 0th dim
+///
+/// @param memory_desc Memory descriptor.
+/// @param what Parameter to query.
+/// @param result Output result. The type depends on the query. For example,
+///     it must be a @c dnnl_dims_t** if querying for strides.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_query(
+        const_dnnl_memory_desc_t memory_desc, dnnl_query_t what, void *result);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Queries a memory descriptor for various pieces of information. This version
+/// supports additional queries: #dnnl_query_sparse_encoding,
+/// #dnnl_query_nnz_s64, #dnnl_query_num_handles_s32, and #dnnl_query_data_type
+/// for a particular buffer.
+///
+/// The following information can be queried:
+///  - Number of dimensions (#dnnl_query_ndims_s32)
+///  - Dimensions (#dnnl_query_dims) in the following order:
+///    - CNN data tensors: mini-batch, channel, spatial
+///      (<code>{N, C, [[D,] H,] W}</code>)
+///    - CNN weight tensors: group (optional), output channel, input channel,
+///      spatial (<code>{[G,] O, I, [[D,] H,] W}</code>)
+///    - RNN data tensors: time, mini-batch, channels (<code>{T, N, C}</code>)
+///      or layers, directions, states, mini-batch, channels
+///      (<code>{L, D, S, N, C}</code>)
+///    - RNN weight tensor: layers, directions, input channel, gates, output
+///      channels (<code>{L, D, I, G, O}</code>)
+///  - Data type of the tensor elements (#dnnl_query_data_type)
+///  - Padded dimensions (#dnnl_query_padded_dims) - size of the data including
+///    padding in each dimension
+///  - Padded offsets (#dnnl_query_padded_offsets) - per-dimension offset from
+///    the padding to actual data, the top-level tensor with offsets applied
+///    must lie within the padding area.
+///  - Submemory offset (#dnnl_query_submemory_offset_s64) - offset from memory
+///    origin to the current block, non-zero only in a description of a memory
+///    sub-block.
+///  - Format kind (#dnnl_query_format_kind) - memory format kind
+///
+/// @note
+///    The order of dimensions does not depend on the memory format, so
+///    whether the data is laid out in #dnnl_nchw or #dnnl_nhwc
+///    the dims for a 4D CNN data tensor would be <code>{N, C, H, W}</code>.
+///
+/// The following queries are applicable only to format kind #dnnl_blocked.
+///  - Strides (#dnnl_query_strides) between the outermost blocks or in case
+///    of plain (non-blocked) formats the strides between dimensions
+///  - Number of innermost blocks (#dnnl_query_inner_nblks_s32), e.g. 3 in
+///    case of `OIhw_4i16o4i`
+///  - Sizes of the innermost blocks (#dnnl_query_inner_blks), e.g.
+///    `{4, 16, 4}` in case of `OIhw_4i16o4i`
+///  - Logical indices of the blocks (#dnnl_query_inner_idxs), e.g. `{1, 0, 1}`
+///    in case of `4i16o4i`, because `i` is the 1st dim and `o` is the 0th dim
+///
+/// @param memory_desc Memory descriptor.
+/// @param what Parameter to query.
+/// @param index Index of the parameter to query for. It is mostly used with
+///     #dnnl_query_data_type to specify which data type is being queried.
+///     The main data type (data type of values) has always index 0. For other
+///     indices please refer to the API for creating a memory descriptor for
+///     sparse encoding.
+/// @param result Output result. The type depends on the query. For example,
+///     it must be a @c dnnl_dims_t** if querying for strides.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_desc_query_v2(
+        const_dnnl_memory_desc_t memory_desc, dnnl_query_t what, int index,
+        void *result);
+#endif
+
+/// Compares two memory descriptors.
+///
+/// Use this function to identify whether a reorder is required between the
+/// two memories.
+///
+/// @param lhs Left-hand side of the comparison.
+/// @param rhs Right-hand side of the comparison.
+/// @returns 1 if the descriptors are the same.
+/// @returns 0 if the descriptors are different.
+int DNNL_API dnnl_memory_desc_equal(
+        const_dnnl_memory_desc_t lhs, const_dnnl_memory_desc_t rhs);
+
+/// Returns the size of a memory descriptor.
+///
+/// @param memory_desc Memory descriptor.
+/// @returns The number of bytes required for memory described by a memory
+///     descriptor.
+size_t DNNL_API dnnl_memory_desc_get_size(const_dnnl_memory_desc_t memory_desc);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Returns the size of the data that corresponds to the given index.
+///
+/// @param memory_desc Memory descriptor.
+/// @param index Index of the buffer.
+///
+/// @returns The number of bytes required for the requested data.
+size_t DNNL_API dnnl_memory_desc_get_size_v2(
+        const_dnnl_memory_desc_t memory_desc, int index);
+#endif
+
+/// Returns the size of data type.
+///
+/// @param data_type Data type.
+/// @returns The number of bytes occupied by data type.
+size_t DNNL_API dnnl_data_type_size(dnnl_data_type_t data_type);
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE, the constructed memory
+/// object will have the underlying buffer set. In this case, the buffer will
+/// be initialized as if dnnl_memory_set_data_handle() had been called.
+///
+/// @sa dnnl_memory_set_data_handle()
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer for the memory object. In this case the library
+///       owns the buffer.
+///     - DNNL_MEMORY_NONE to create dnnl_memory without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_create(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to skip
+///       allocation of the memory buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        int nhandles, void **handles);
+#endif
+
+/// Returns the memory descriptor for a memory object.
+///
+/// @param memory Memory object.
+/// @param memory_desc Output memory descriptor (a copy).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_get_memory_desc(
+        const_dnnl_memory_t memory, const_dnnl_memory_desc_t *memory_desc);
+
+/// Returns the engine of a memory object.
+///
+/// @param memory Memory object.
+/// @param engine Output engine on which the memory is located.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_get_engine(
+        const_dnnl_memory_t memory, dnnl_engine_t *engine);
+
+/// Maps a memory object and returns a host-side pointer to a memory buffer
+/// with a copy of its contents.
+///
+/// Mapping enables explicit direct access to memory contents for the engines
+/// that do not support it implicitly.
+///
+/// Mapping is an exclusive operation - a memory object cannot be used in
+/// other operations until this memory object is unmapped.
+///
+/// @note
+///     Any primitives working with @p memory should be completed before
+///     the memory is mapped. Use dnnl_stream_wait to synchronize the
+///     corresponding execution stream.
+///
+/// @note
+///     The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are
+///     mainly provided for debug and testing purposes, and their performance
+///     may be suboptimal.
+///
+/// @param memory Memory object.
+/// @param mapped_ptr Output pointer to the mapped buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_map_data(
+        const_dnnl_memory_t memory, void **mapped_ptr);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Maps a memory object and returns a host-side pointer to a memory buffer
+/// with a copy of its contents. The memory buffer corresponds to the given
+/// index.
+///
+/// Mapping enables explicit direct access to memory contents for the engines
+/// that do not support it implicitly.
+///
+/// Mapping is an exclusive operation - a memory object cannot be used in
+/// other operations until this memory object is unmapped.
+///
+/// @note
+///     Any primitives working with @p memory should be completed before
+///     the memory is mapped. Use dnnl_stream_wait to synchronize the
+///     corresponding execution stream.
+///
+/// @note
+///     The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are
+///     mainly provided for debug and testing purposes, and their performance
+///     may be suboptimal.
+///
+/// @param memory Memory object.
+/// @param mapped_ptr Output pointer to the mapped buffer.
+/// @param index Index of the buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_map_data_v2(
+        const_dnnl_memory_t memory, void **mapped_ptr, int index);
+#endif
+
+/// Unmaps a memory object and writes back any changes made to the previously
+/// mapped memory buffer. The pointer to the mapped buffer must be obtained
+/// via the dnnl_memory_map_data() call.
+///
+/// @note
+///     The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are
+///     mainly provided for debug and testing purposes, and their performance
+///     may be suboptimal.
+///
+/// @param memory Memory object.
+/// @param mapped_ptr Pointer to the mapped buffer that must have been
+///     obtained using the dnnl_memory_map_data() function.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_unmap_data(
+        const_dnnl_memory_t memory, void *mapped_ptr);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Unmaps a memory object and writes back any changes made to the previously
+/// mapped memory buffer. The pointer to the mapped buffer must be obtained
+/// via the dnnl_memory_map_data() call. The buffer corresponds to the given
+/// index.
+///
+/// @note
+///     The dnnl_memory_map_data() and dnnl_memory_unmap_data() functions are
+///     mainly provided for debug and testing purposes, and their performance
+///     may be suboptimal.
+///
+/// @param memory Memory object.
+/// @param mapped_ptr Pointer to the mapped buffer that must have been
+///     obtained using the dnnl_memory_map_data() function.
+/// @param index Index of the buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_unmap_data_v2(
+        const_dnnl_memory_t memory, void *mapped_ptr, int index);
+#endif
+
+/// Returns memory object's data handle.
+///
+/// @param memory Memory object.
+/// @param handle Output data handle. For the CPU engine, the data handle is a
+///     pointer to the actual data. For OpenCL it is a cl_mem.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_get_data_handle(
+        const_dnnl_memory_t memory, void **handle);
+
+/// Sets the underlying memory buffer.
+///
+/// @param memory Memory object.
+/// @param handle Data handle. For the CPU engine or when USM is used, the
+///     memory buffer is a pointer to the actual data. For OpenCL it is a
+///     `cl_mem`.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_set_data_handle(
+        dnnl_memory_t memory, void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Returns an underlying memory buffer that corresponds to the given index.
+///
+/// @param memory Memory object.
+/// @param handle Output data handle. For the CPU engine or when USM is used,
+///     the memory buffer is a pointer to the actual data. For OpenCL it is a
+///     `cl_mem`.
+/// @param index Index of the buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_get_data_handle_v2(
+        const_dnnl_memory_t memory, void **handle, int index);
+
+/// Sets an underlying memory buffer that corresponds to the given index.
+///
+/// @param memory Memory object.
+/// @param handle Data handle. For the CPU engine or when USM is used, the
+///     memory buffer is a pointer to the actual data. For OpenCL it is a
+///     `cl_mem`.
+/// @param index Index of the buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_set_data_handle_v2(
+        dnnl_memory_t memory, void *handle, int index);
+#endif
+
+/// Destroys a memory object.
+///
+/// @param memory Memory object to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_memory_destroy(dnnl_memory_t memory);
+
+/// @} dnnl_api_memory
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+
+/// @addtogroup dnnl_api_reorder
+/// @{
+
+/// Creates a primitive descriptor for a reorder primitive.
+///
+/// @param reorder_primitive_desc Output primitive descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param src_engine Engine on which the source memory object will be
+///     located.
+/// @param dst_desc Destination memory descriptor.
+/// @param dst_engine Engine on which the destination memory object
+///     will be located.
+/// @param attr Primitive attributes to use (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_reorder_primitive_desc_create(
+        dnnl_primitive_desc_t *reorder_primitive_desc,
+        const_dnnl_memory_desc_t src_desc, dnnl_engine_t src_engine,
+        const_dnnl_memory_desc_t dst_desc, dnnl_engine_t dst_engine,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_reorder
+
+/// @addtogroup dnnl_api_concat
+/// @{
+
+/// Creates a primitive descriptor for an out-of-place concatenation
+/// primitive.
+///
+/// @param concat_primitive_desc Output primitive descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param n Number of source parameters.
+/// @param concat_dimension Source tensors will be concatenated over
+///     dimension with this index. Note that order of dimensions does
+///     not depend on memory format.
+/// @param src_descs Array of source memory descriptors with @p n elements.
+/// @param attr Primitive attributes to use (can be NULL).
+/// @param engine Engine to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_concat_primitive_desc_create(
+        dnnl_primitive_desc_t *concat_primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t dst_desc, int n, int concat_dimension,
+        const_dnnl_memory_desc_t const *src_descs,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_concat
+
+/// @addtogroup dnnl_api_sum
+/// @{
+
+/// Creates a primitive descriptor for an (out-of-place) sum primitive.
+///
+/// @param sum_primitive_desc Output primitive descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param n Number of source parameters.
+/// @param scales Vector of scales to multiply data in each source
+///     memory by.
+/// @param src_descs Array of source memory descriptors having @p n elements.
+/// @param attr Primitive attributes to use (can be NULL).
+/// @param engine Engine to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sum_primitive_desc_create(
+        dnnl_primitive_desc_t *sum_primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t dst_desc, int n, const float *scales,
+        const_dnnl_memory_desc_t const *src_descs,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_sum
+
+/// @addtogroup dnnl_api_binary
+/// @{
+
+/// Creates a primitive descriptor for a binary primitive.
+///
+/// @note
+///     Memory descriptors @p src1_desc and @p dst_desc are allowed to be
+///     initialized with #dnnl_format_tag_any or with format_kind set to
+///     #dnnl_format_kind_any.
+///
+/// @note
+///     Both memory descriptors must have the same number of dimensions.
+///     Element broadcasting is supported for memory descriptor @p src1_desc
+///     and is applied to @p src1_desc dimensions that have a size equal to 1.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Algorithm kind. Valid values are #dnnl_binary_add,
+///     #dnnl_binary_mul, #dnnl_binary_max, #dnnl_binary_min, #dnnl_binary_div,
+///     #dnnl_binary_sub, #dnnl_binary_ge, #dnnl_binary_gt, #dnnl_binary_le,
+///     #dnnl_binary_lt, #dnnl_binary_eq and #dnnl_binary_ne.
+/// @param src0_desc Source 0 memory descriptor.
+/// @param src1_desc Source 1 memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src0_desc,
+        const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t dst_desc,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a binary primitive with support of
+/// ternary operators.
+///
+/// @note
+///     Memory descriptors @p src1_desc, @p src2_desc and @p dst_desc are
+///     allowed to be initialized with #dnnl_format_tag_any or with format_kind
+///     set to #dnnl_format_kind_any.
+///
+/// @note
+///     All memory descriptors must have the same number of dimensions.
+///     Element broadcasting is supported for memory descriptor @p src1_desc
+///     and is applied to @p src1_desc dimensions that have a size equal to 1.
+///     There is no broadcasting support for @p src2_desc.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Algorithm kind.
+/// @param src0_desc Source 0 memory descriptor.
+/// @param src1_desc Source 1 memory descriptor.
+/// @param src2_desc Source memory descriptor for ternary operations. Might
+///     be empty.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create_v2(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src0_desc,
+        const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t src2_desc,
+        const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_binary
+
+/// @addtogroup dnnl_api_convolution
+/// @{
+
+/// Creates a primitive descriptor for a convolution forward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind Convolution algorithm. Possible values are
+///     #dnnl_convolution_direct, #dnnl_convolution_winograd,
+///     #dnnl_convolution_auto.
+/// @param src_desc Source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory
+///     descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param dst_desc Destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_convolution_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc,
+        const dnnl_dims_t strides, const dnnl_dims_t dilates,
+        const dnnl_dims_t padding_l, const dnnl_dims_t padding_r,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a convolution backward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Convolution algorithm. Possible values are
+///     #dnnl_convolution_direct, #dnnl_convolution_winograd,
+///     #dnnl_convolution_auto.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_convolution_backward_data_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides,
+        const dnnl_dims_t dilates, const dnnl_dims_t padding_l,
+        const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a convolution weights gradient primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Convolution algorithm. Possible values are
+///     #dnnl_convolution_direct, #dnnl_convolution_winograd,
+///     #dnnl_convolution_auto.
+/// @param src_desc Source memory descriptor.
+/// @param diff_weights_desc Diff weights memory descriptor.
+/// @param diff_bias_desc Diff bias memory descriptor. Passing NULL, a zero
+///     memory descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_convolution_backward_weights_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t diff_weights_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides,
+        const dnnl_dims_t dilates, const dnnl_dims_t padding_l,
+        const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_convolution
+
+/// @addtogroup dnnl_api_deconvolution
+/// @{
+
+/// Creates a primitive descriptor for a deconvolution forward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind Deconvolution algorithm. Possible values are
+///     #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd.
+/// @param src_desc Source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory
+///     descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param dst_desc Destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_deconvolution_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc,
+        const dnnl_dims_t strides, const dnnl_dims_t dilates,
+        const dnnl_dims_t padding_l, const dnnl_dims_t padding_r,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a deconvolution backward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Deconvolution algorithm. Possible values are
+///     #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_deconvolution_backward_data_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides,
+        const dnnl_dims_t dilates, const dnnl_dims_t padding_l,
+        const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a deconvolution weights gradient
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r contain
+/// values for spatial dimensions only and hence must have the same number of
+/// elements as there are spatial dimensions. The order of values is the same
+/// as in the tensor: depth (for 3D tensors), height (for 3D and 2D tensors),
+/// and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Deconvolution algorithm. Possible values are
+///     #dnnl_deconvolution_direct, #dnnl_deconvolution_winograd.
+/// @param src_desc Source memory descriptor.
+/// @param diff_weights_desc Diff weights memory descriptor.
+/// @param diff_bias_desc Diff bias memory descriptor. Passing NULL, a zero
+///     memory descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param dilates Array of dilations for spatial dimension. A zero value
+///     means no dilation in the corresponding dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API
+dnnl_deconvolution_backward_weights_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t diff_weights_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides,
+        const dnnl_dims_t dilates, const dnnl_dims_t padding_l,
+        const dnnl_dims_t padding_r, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_deconvolution
+
+/// @addtogroup dnnl_api_shuffle
+/// @{
+
+/// Creates a primitive descriptor for a shuffle forward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param axis The axis along which the data is shuffled.
+/// @param group_size Shuffle group size.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_shuffle_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, int axis, dnnl_dim_t group_size,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a shuffle backward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param axis The axis along which the data is shuffled.
+/// @param group_size Shuffle group size.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_shuffle_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, int axis, dnnl_dim_t group_size,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_shuffle
+
+/// @addtogroup dnnl_api_eltwise
+/// @{
+
+/// Creates a primitive descriptor for an eltwise forward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind Elementwise algorithm kind.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param alpha The alpha parameter for the elementwise operation. Specific
+///     meaning depends on the algorithm.
+/// @param beta The beta parameter for the elementwise operation. Specific
+///     meaning depends on the algorithm.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_eltwise_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc,
+        float alpha, float beta, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an eltwise backward propagation
+///     primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Elementwise algorithm kind.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param data_desc Destination memory descriptor if one of the
+///     "use_dst_for_bwd" algorithms is used (such as
+///     #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor otherwise.
+/// @param alpha The alpha parameter for the elementwise operation. Specific
+///     meaning depends on the algorithm.
+/// @param beta The beta parameter for the elementwise operation. Specific
+///     meaning depends on the algorithm.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_eltwise_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t data_desc, float alpha, float beta,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_eltwise
+
+/// @addtogroup dnnl_api_softmax
+/// @{
+
+/// Creates a primitive descriptor for a softmax forward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind Softmax algorithm kind: either #dnnl_softmax_accurate, or
+///     #dnnl_softmax_log.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param softmax_axis Axis over which softmax is computed.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_softmax_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc,
+        int softmax_axis, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a softmax backward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Softmax algorithm kind: either #dnnl_softmax_accurate, or
+///     #dnnl_softmax_log.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param softmax_axis Axis over which softmax is computed.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_softmax_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t dst_desc, int softmax_axis,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_softmax
+
+/// @addtogroup dnnl_api_pooling
+/// @{
+
+/// Creates a primitive descriptor for a pooling forward propagation
+///     primitive.
+///
+/// Arrays @p strides, @p kernel, @p dilation, @p padding_l and @p padding_r
+/// contain values for spatial dimensions only and hence must have the same
+/// number of elements as there are spatial dimensions. The order of values
+/// is the same as in the tensor: depth (for 3D tensors),
+/// height (for 3D and 2D tensors), and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind Pooling algorithm kind: either #dnnl_pooling_max,
+///     #dnnl_pooling_avg_include_padding, or #dnnl_pooling_avg_exclude_padding.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param kernel Array of kernel spatial dimensions.
+/// @param dilation Array of dilations for spatial dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_pooling_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc,
+        const dnnl_dims_t strides, const dnnl_dims_t kernel,
+        const dnnl_dims_t dilation, const dnnl_dims_t padding_l,
+        const dnnl_dims_t padding_r, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a pooling backward propagation
+///     primitive.
+///
+/// Arrays @p strides, @p kernel, @p dilation, @p padding_l and @p padding_r
+/// contain values for spatial dimensions only and hence must have the same
+/// number of elements as there are spatial dimensions. The order of values
+/// is the same as in the tensor: depth (for 3D tensors),
+/// height (for 3D and 2D tensors), and width.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind Pooling algorithm kind: either #dnnl_pooling_max,
+///     #dnnl_pooling_avg_include_padding, or #dnnl_pooling_avg_exclude_padding.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param strides Array of strides for spatial dimension.
+/// @param kernel Array of kernel spatial dimensions.
+/// @param dilation Array of dilations for spatial dimension.
+/// @param padding_l Array of padding values for low indices for each spatial
+///     dimension `([[front,] top,] left)`.
+/// @param padding_r Array of padding values for high indices for each spatial
+///     dimension `([[back,] bottom,] right)`. Can be NULL in which case
+///     padding is considered to be symmetrical.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_pooling_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc, const dnnl_dims_t strides,
+        const dnnl_dims_t kernel, const dnnl_dims_t dilation,
+        const dnnl_dims_t padding_l, const dnnl_dims_t padding_r,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_pooling
+
+/// @addtogroup dnnl_api_prelu
+/// @{
+
+/// Creates a primitive descriptor for a PReLU (leaky ReLU with trainable
+///     alpha parameter) forward propagation primitive.
+///
+/// @note
+///     The weights descriptor is allowed to be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param weights_desc Alpha parameters memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_prelu_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a PReLU (leaky ReLU with trainable
+///     alpha parameter) backward propagation primitive.
+///
+/// @note
+///     The weights descriptor and diff_weights descriptor are allowed
+///     to be initialized with #dnnl_format_tag_any or with format_kind
+///     set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param src_desc Source memory descriptor.
+/// @param weights_desc Alpha parameters memory descriptor.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_weights_desc Diff alpha parameters memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_prelu_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_weights_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_prelu
+
+/// @addtogroup dnnl_api_lrn
+/// @{
+
+/// Creates a primitive descriptor for an LRN forward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind LRN algorithm kind: either #dnnl_lrn_across_channels or
+///     #dnnl_lrn_within_channel.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param local_size Regularization local size.
+/// @param alpha The alpha regularization parameter.
+/// @param beta The beta regularization parameter.
+/// @param k The k regularization parameter.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lrn_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t dst_desc,
+        dnnl_dim_t local_size, float alpha, float beta, float k,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an LRN backward propagation primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind LRN algorithm kind: either #dnnl_lrn_across_channels or
+///     #dnnl_lrn_within_channel.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param local_size Regularization local size.
+/// @param alpha The alpha regularization parameter.
+/// @param beta The beta regularization parameter.
+/// @param k The k regularization parameter.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lrn_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t src_desc, dnnl_dim_t local_size, float alpha,
+        float beta, float k, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_lrn
+
+/// @addtogroup dnnl_api_batch_normalization
+/// @{
+
+/// Creates a primitive descriptor for a batch normalization forward propagation
+///     primitive.
+///
+/// @note
+///     In-place operation is supported: the dst can refer to the same memory
+///     as the src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param epsilon Batch normalization epsilon parameter.
+/// @param flags Batch normalization flags (@ref dnnl_normalization_flags_t).
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_batch_normalization_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, float epsilon, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a batch normalization backward
+///     propagation primitive.
+///
+/// @note
+///     In-place operation is supported: the diff_dst can refer to the same
+///     memory as the diff_src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_backward_data and #dnnl_backward (diffs for all parameters are
+///     computed in this case).
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param epsilon Batch normalization epsilon parameter.
+/// @param flags Batch normalization flags (@ref dnnl_normalization_flags_t).
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_batch_normalization_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t src_desc, float epsilon, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_batch_normalization
+
+/// @addtogroup dnnl_api_group_normalization
+/// @{
+
+/// Creates a primitive descriptor for a group normalization forward propagation
+///     primitive.
+///
+/// @note
+///     In-place operation is supported: the dst can refer to the same memory
+///     as the src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param groups Group normalization groups parameter.
+/// @param epsilon Group normalization epsilon parameter.
+/// @param flags Group normalization flags (@ref dnnl_normalization_flags_t).
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_group_normalization_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, dnnl_dim_t groups, float epsilon,
+        unsigned flags, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a group normalization backward
+///     propagation primitive.
+///
+/// @note
+///     In-place operation is supported: the diff_dst can refer to the same
+///     memory as the diff_src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_backward_data and #dnnl_backward (diffs for all parameters are
+///     computed in this case).
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param groups Group normalization groups parameter.
+/// @param epsilon Group normalization epsilon parameter.
+/// @param flags Group normalization flags (@ref dnnl_normalization_flags_t).
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_group_normalization_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t src_desc, dnnl_dim_t groups, float epsilon,
+        unsigned flags, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_group_normalization
+
+/// @addtogroup dnnl_api_layer_normalization
+/// @{
+
+/// Creates a primitive descriptor for a layer normalization forward propagation
+///     primitive.
+///
+/// @note
+///     In-place operation is supported: the dst can refer to the same memory
+///     as the src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param stat_desc Memory descriptor for mean and variance. If this
+///     parameter is NULL, a zero memory descriptor, or a memory descriptor
+///     with format_kind set to #dnnl_format_kind_undef, then the memory
+///     descriptor for stats is derived from @p src_desc by removing the last
+///     dimension.
+/// @param epsilon Layer normalization epsilon parameter.
+/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t).
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_layer_normalization_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, const_dnnl_memory_desc_t stat_desc,
+        float epsilon, unsigned flags, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a layer normalization backward
+///     propagation primitive.
+///
+/// @note
+///     In-place operation is supported: the diff_dst can refer to the same
+///     memory as the diff_src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_backward_data and #dnnl_backward (diffs for all parameters are
+///     computed in this case).
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param stat_desc Memory descriptor for mean and variance. If this
+///     parameter is NULL, a zero memory descriptor, or a memory descriptor
+///     with format_kind set to #dnnl_format_kind_undef, then the memory
+///     descriptor for stats is derived from @p src_desc by removing the last
+///     dimension.
+/// @param epsilon Layer normalization epsilon parameter.
+/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t).
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_layer_normalization_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t stat_desc,
+        float epsilon, unsigned flags, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a layer normalization forward propagation
+///     primitive with a user-provided data type for the scale and shift
+///     memory objects.
+///
+/// @note
+///     In-place operation is supported: the dst can refer to the same memory
+///     as the src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param stat_desc Memory descriptor for mean and variance. If this
+///     parameter is NULL, a zero memory descriptor, or a memory descriptor
+///     with format_kind set to #dnnl_format_kind_undef, then the memory
+///     descriptor for stats is derived from @p src_desc by removing the last
+///     dimension.
+/// @param scale_shift_data_type Data type of scale and shift memory. Ignored
+///     if neither the scale nor the shift flag is specified.
+/// @param epsilon Layer normalization epsilon parameter.
+/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t).
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API
+dnnl_layer_normalization_forward_primitive_desc_create_v2(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, const_dnnl_memory_desc_t stat_desc,
+        dnnl_data_type_t scale_shift_data_type, float epsilon, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a layer normalization backward
+///     propagation primitive with a user-provided data type for the
+///     scale and shift memory objects.
+///
+/// @note
+///     In-place operation is supported: the diff_dst can refer to the same
+///     memory as the diff_src.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_backward_data and #dnnl_backward (diffs for all parameters are
+///     computed in this case).
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param src_desc Source memory descriptor.
+/// @param stat_desc Memory descriptor for mean and variance. If this
+///     parameter is NULL, a zero memory descriptor, or a memory descriptor
+///     with format_kind set to #dnnl_format_kind_undef, then the memory
+///     descriptor for stats is derived from @p src_desc by removing the last
+///     dimension.
+/// @param diff_scale_shift_data_type Data type of diff scale and shift
+///     memory. Ignored if neither the scale nor the shift flag is specified.
+/// @param scale_shift_data_type Data type of scale and shift memory. Ignored
+///     if neither the scale nor the shift flag is specified.
+/// @param epsilon Layer normalization epsilon parameter.
+/// @param flags Layer normalization flags (@ref dnnl_normalization_flags_t).
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API
+dnnl_layer_normalization_backward_primitive_desc_create_v2(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_memory_desc_t src_desc, const_dnnl_memory_desc_t stat_desc,
+        dnnl_data_type_t diff_scale_shift_data_type,
+        dnnl_data_type_t scale_shift_data_type, float epsilon, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_layer_normalization
+
+/// @addtogroup dnnl_api_inner_product
+/// @{
+
+/// Creates a primitive descriptor for an inner product forward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param src_desc Source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory
+///     descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_inner_product_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an inner product backward propagation
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param weights_desc Weights memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_inner_product_backward_data_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an inner product weights gradient
+///     primitive.
+///
+/// @note
+///     Memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param src_desc Source memory descriptor.
+/// @param diff_weights_desc Diff weights memory descriptor.
+/// @param diff_bias_desc Diff bias memory descriptor. Passing NULL, a zero
+///     memory descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API
+dnnl_inner_product_backward_weights_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t diff_weights_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_inner_product
+
+/// @addtogroup dnnl_api_attributes
+/// @{
+
+/// Sets quantization scale and shift parameters for RNN data tensors.
+///
+/// For performance reasons, the low-precision configuration of the RNN
+/// primitives expects input activations to have the unsigned 8-bit integer
+/// data type. The scale and shift parameters are used to quantize
+/// floating-point data to unsigned integer and must be passed to the RNN
+/// primitive using attributes.
+///
+/// The quantization formula is `scale * data + shift`.
+///
+/// @note
+///     Quantization scale and shift are common for src_layer, src_iter,
+///     dst_iter, and dst_layer.
+///
+/// Example usage:
+/// @code
+///     // RNN parameters
+///     int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32;
+///     // Activations quantization parameters
+///     float scale = 63.f, shift = 64.f;
+///
+///     dnnl_primitive_attr_t rnn_attr;
+///     // Create default attributes
+///     dnnl_primitive_attr_create(&rnn_attr);
+///
+///     // Set scale and shift for int8 quantization of activation
+///     dnnl_primitive_attr_set_rnn_data_qparams(rnn_attr, scale, shift);
+///
+///     // Create an RNN primitive descriptor.
+///     dnnl_primitive_desc_t rnn_pd;
+///     dnnl_vanilla_rnn_forward_primitive_desc_create(&rnn_pd,
+///             engine, /* arguments */, rnn_attr);
+/// @endcode
+///
+/// @param attr Primitive attributes.
+/// @param scale The value to scale the data by.
+/// @param shift The value to shift the data by.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_data_qparams(
+        dnnl_primitive_attr_t attr, const float scale, const float shift);
+
+/// Returns the quantization scale and shift parameters for RNN data tensors.
+///
+/// @note
+///     Quantization scale and shift are common for src_layer, src_iter,
+///     dst_iter, and dst_layer.
+///
+/// @param attr Primitive attributes.
+/// @param scale Output value the data is scaled by.
+/// @param shift Output value the data is shifted by.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_data_qparams(
+        const_dnnl_primitive_attr_t attr, float *scale, float *shift);
+
+/// Sets quantization scaling factors for RNN weights tensors. The
+/// low-precision configuration of the RNN primitives expects input weights to
+/// use the signed 8-bit integer data type. The scaling factors are used to
+/// quantize floating-point data to signed integer and must be passed to RNN
+/// primitives using attributes.
+///
+/// @note
+///     The dimension order is always native and does not depend on the actual
+///     layout used. For example, five-dimensional weights always have (l, d,
+///     i, g, o) logical dimension ordering.
+///
+/// @note
+///     Quantization scales are common for weights_layer and weights_iteration.
+///
+/// @param attr Primitive attributes.
+/// @param count Number of elements in the @p scales array.
+/// @param mask Scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales vector. The set i-th bit indicates that a dedicated scaling
+///     factor should be used for each index along that dimension. Set the
+///     mask to 0 to use a common scaling factor for the whole output
+///     tensor.
+/// @param scales Array of output scaling factors. It must contain @p count
+///     values, and the following equality must hold:
+///     \f[count = \prod\limits_{d \in mask} weights.dims[d].\f]
+///     Violations can only be detected when the attributes are used to create
+///     a primitive descriptor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_weights_qparams(
+        dnnl_primitive_attr_t attr, dnnl_dim_t count, int mask,
+        const float *scales);
+
+/// Returns the quantization scaling factors for RNN weights tensors.
+///
+/// @param attr Primitive attributes.
+/// @param count Output number of elements in the @p scales array.
+/// @param mask Output scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales vector. The set i-th bit indicates that a dedicated scaling
+///     factor is used for each index along that dimension. A mask of 0
+///     means that a common scaling factor is used for the whole output
+///     tensor.
+/// @param scales Output array of scaling factors that contains @p count
+///     values, for which the following equality holds:
+///     \f[count = \prod\limits_{d \in mask} weights.dims[d].\f]
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_weights_qparams(
+        const_dnnl_primitive_attr_t attr, dnnl_dim_t *count, int *mask,
+        const float **scales);
+
+/// Sets quantization scaling factors for RNN projection weights tensors. The
+/// low-precision configuration of the RNN primitives expects input weights to
+/// use the signed 8-bit integer data type. The scaling factors are used to
+/// quantize floating-point data to signed integer and must be passed to RNN
+/// primitives using attributes.
+///
+/// @note
+///     The dimension order is always native and does not depend on the actual
+///     layout used. For example, five-dimensional weights always have (l, d,
+///     i, g, o) logical dimension ordering.
+///
+/// @param attr Primitive attributes.
+/// @param count Number of elements in the @p scales array.
+/// @param mask Scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales vector. The set i-th bit indicates that a dedicated scaling
+///     factor should be used for each index along that dimension. Set the
+///     mask to 0 to use a common scaling factor for the whole output
+///     tensor.
+/// @param scales Array of output scaling factors. It must contain @p count
+///     values, and the following equality must hold:
+///     \f[count = \prod\limits_{d \in mask} weights.dims[d].\f]
+///     Violations can only be detected when the attributes are used to create
+///     a primitive descriptor.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_set_rnn_weights_projection_qparams(
+        dnnl_primitive_attr_t attr, dnnl_dim_t count, int mask,
+        const float *scales);
+
+/// Returns the quantization scaling factors for RNN projection weights tensors.
+///
+/// @param attr Primitive attributes.
+/// @param count Output number of elements in the @p scales array.
+/// @param mask Output scaling factors correspondence mask that defines the
+///     correspondence between the output tensor dimensions and the @p
+///     scales vector. The set i-th bit indicates that a dedicated scaling
+///     factor is used for each index along that dimension. A mask of 0
+///     means that a common scaling factor is used for the whole output
+///     tensor.
+/// @param scales Output array of scaling factors that contains @p count
+///     values, for which the following equality holds:
+///     \f[count = \prod\limits_{d \in mask} weights.dims[d].\f]
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_weights_projection_qparams(
+        const_dnnl_primitive_attr_t attr, dnnl_dim_t *count, int *mask,
+        const float **scales);
+
+/// @} dnnl_api_attributes
+
+/// @addtogroup dnnl_api_rnn
+/// @{
+
+/// Creates a primitive descriptor for a vanilla RNN forward propagation
+///     primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc.
+///
+/// This would then indicate that the RNN forward propagation primitive should
+/// not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param activation Activation kind. Possible values are #dnnl_eltwise_relu,
+///     #dnnl_eltwise_tanh or #dnnl_eltwise_logistic.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param flags Unused.
+/// @param alpha Negative slope if activation is #dnnl_eltwise_relu.
+/// @param beta Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_vanilla_rnn_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const dnnl_alg_kind_t activation,
+        const dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc, unsigned flags, float alpha,
+        float beta, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a vanilla RNN backward propagation
+///     primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p diff_src_iter_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+///
+/// This would then indicate that the RNN backward propagation primitive should
+/// not use the respective data and should use zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param activation Activation kind. Possible values are #dnnl_eltwise_relu,
+///     #dnnl_eltwise_tanh or #dnnl_eltwise_logistic.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param flags Unused.
+/// @param alpha Negative slope if activation is #dnnl_eltwise_relu.
+/// @param beta Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_vanilla_rnn_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, const dnnl_alg_kind_t activation,
+        const dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags,
+        float alpha, float beta, const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an LSTM forward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p src_iter_c_desc,
+/// - @p weights_peephole_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc together with @p dst_iter_c_desc.
+///
+/// This would then indicate that the LSTM forward propagation primitive should
+/// not use them and should default to zero values instead.
+///
+/// The @p weights_projection_desc could either be @c NULL or point to a zero
+/// memory descriptor. This would then indicate that the LSTM doesn't have
+/// a recurrent projection layer.
+///
+/// @note
+///     All memory descriptors can be initialized with #dnnl_format_tag_any or
+///     with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param src_iter_c_desc Memory descriptor for the input recurrent cell
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param weights_peephole_desc Memory descriptor for the weights applied to
+///     the cell states (according to the Peephole LSTM formula).
+/// @param weights_projection_desc Memory descriptor for the weights applied to
+///     the hidden states to get the recurrent projection (according to the
+///     Projection LSTM formula).
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param dst_iter_c_desc Memory descriptor for the output recurrent cell
+///     state vector.
+/// @param flags Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lstm_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t src_iter_c_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t weights_peephole_desc,
+        const_dnnl_memory_desc_t weights_projection_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t dst_iter_c_desc, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for an LSTM backward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p src_iter_c_desc, @p diff_src_iter_desc,
+///   and @p diff_src_iter_c_desc,
+/// - @p weights_peephole_desc together with @p diff_weights_peephole_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p dst_iter_c_desc, @p diff_dst_iter_desc,
+///   and @p diff_dst_iter_c_desc.
+///
+/// This would then indicate that the LSTM backward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// The @p weights_projection_desc together with @p
+/// diff_weights_projection_desc could either be @c NULL or point to a zero
+/// memory descriptor. This would then indicate that the LSTM doesn't have
+/// a recurrent projection layer.
+///
+/// @note
+///     All memory descriptors can be initialized with #dnnl_format_tag_any or
+///     with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param src_iter_c_desc Memory descriptor for the input recurrent cell
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param weights_peephole_desc Memory descriptor for the weights applied to
+///     the cell states (according to the Peephole LSTM formula).
+/// @param weights_projection_desc Memory descriptor for the weights applied to
+///     the hidden states to get the recurrent projection (according to the
+///     Projection LSTM formula).
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param dst_iter_c_desc Memory descriptor for the output recurrent cell
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_src_iter_c_desc Memory descriptor for the diff of input
+///     recurrent cell state vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_weights_peephole_desc Memory descriptor for the diff of weights
+///     applied to the cell states (according to the Peephole LSTM formula).
+/// @param diff_weights_projection_desc Memory descriptor for the diff of
+///     weights applied to the hidden states to get the recurrent projection
+///     (according to the Projection LSTM formula).
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param diff_dst_iter_c_desc Memory descriptor for the diff of output
+///     recurrent cell state vector.
+/// @param flags Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lstm_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t src_iter_c_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t weights_peephole_desc,
+        const_dnnl_memory_desc_t weights_projection_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t dst_iter_c_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_src_iter_c_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_weights_peephole_desc,
+        const_dnnl_memory_desc_t diff_weights_projection_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_c_desc, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a GRU forward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc.
+///
+/// This would then indicate that the GRU forward propagation primitive should
+/// not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param flags Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_gru_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for GRU backward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p diff_src_iter_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+///
+/// This would then indicate that the GRU backward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param flags Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_gru_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for LBR GRU forward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc.
+///
+/// This would then indicate that the LBR GRU forward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param flags Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lbr_gru_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for LBR GRU backward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p diff_src_iter_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+///
+/// This would then indicate that the LBR GRU backward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param flags Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lbr_gru_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for AUGRU forward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc.
+///
+/// This would then indicate that the AUGRU forward propagation primitive should
+/// not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param attention_desc Memory descriptor for the attention vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param flags Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_augru_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t attention_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for AUGRU backward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p diff_src_iter_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+///
+/// This would then indicate that the AUGRU backward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param attention_desc Memory descriptor for the attention vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_attention_desc Memory descriptor for the diff of attention vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param flags Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_augru_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t attention_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_attention_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for LBR AUGRU forward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc,
+/// - @p bias_desc,
+/// - @p dst_iter_desc.
+///
+/// This would then indicate that the LBR AUGRU forward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param attention_desc Memory descriptor for the attention vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param flags Unused.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lbr_augru_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t attention_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for LBR AUGRU backward propagation primitive.
+///
+/// The following arguments may either be @c NULL or point to a zero memory
+/// descriptor:
+/// - @p src_iter_desc together with @p diff_src_iter_desc,
+/// - @p bias_desc together with @p diff_bias_desc,
+/// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+///
+/// This would then indicate that the LBR AUGRU backward propagation primitive
+/// should not use them and should default to zero values instead.
+///
+/// @note
+///     All memory descriptors can be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Must be #dnnl_backward.
+/// @param direction RNN direction. See @ref dnnl_rnn_direction_t for more
+///     info.
+/// @param src_layer_desc Memory descriptor for the input vector.
+/// @param src_iter_desc Memory descriptor for the input recurrent hidden
+///     state vector.
+/// @param attention_desc Memory descriptor for the attention vector.
+/// @param weights_layer_desc Memory descriptor for the weights applied to the
+///     layer input.
+/// @param weights_iter_desc Memory descriptor for the weights applied to the
+///     recurrent input.
+/// @param bias_desc Bias memory descriptor.
+/// @param dst_layer_desc Memory descriptor for the output vector.
+/// @param dst_iter_desc Memory descriptor for the output recurrent hidden
+///     state vector.
+/// @param diff_src_layer_desc Memory descriptor for the diff of input vector.
+/// @param diff_src_iter_desc Memory descriptor for the diff of input recurrent
+///     hidden state vector.
+/// @param diff_attention_desc Memory descriptor for the diff of attention vector.
+/// @param diff_weights_layer_desc Memory descriptor for the diff of weights
+///     applied to the layer input.
+/// @param diff_weights_iter_desc Memory descriptor for the diff of weights
+///     applied to the recurrent input.
+/// @param diff_bias_desc Diff bias memory descriptor.
+/// @param diff_dst_layer_desc Memory descriptor for the diff of output
+///     vector.
+/// @param diff_dst_iter_desc Memory descriptor for the diff of output
+///     recurrent hidden state vector.
+/// @param flags Unused.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_lbr_augru_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_rnn_direction_t direction,
+        const_dnnl_memory_desc_t src_layer_desc,
+        const_dnnl_memory_desc_t src_iter_desc,
+        const_dnnl_memory_desc_t attention_desc,
+        const_dnnl_memory_desc_t weights_layer_desc,
+        const_dnnl_memory_desc_t weights_iter_desc,
+        const_dnnl_memory_desc_t bias_desc,
+        const_dnnl_memory_desc_t dst_layer_desc,
+        const_dnnl_memory_desc_t dst_iter_desc,
+        const_dnnl_memory_desc_t diff_src_layer_desc,
+        const_dnnl_memory_desc_t diff_src_iter_desc,
+        const_dnnl_memory_desc_t diff_attention_desc,
+        const_dnnl_memory_desc_t diff_weights_layer_desc,
+        const_dnnl_memory_desc_t diff_weights_iter_desc,
+        const_dnnl_memory_desc_t diff_bias_desc,
+        const_dnnl_memory_desc_t diff_dst_layer_desc,
+        const_dnnl_memory_desc_t diff_dst_iter_desc, unsigned flags,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_rnn
+
+/// @addtogroup dnnl_api_matmul
+/// @{
+
+/// Creates a primitive descriptor for a matrix multiplication primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param src_desc Source memory descriptor (matrix A).
+/// @param weights_desc Weights memory descriptor (matrix B).
+/// @param bias_desc Bias memory descriptor. Passing NULL, a zero memory
+///     descriptor, or a memory descriptor with format_kind set to
+///     #dnnl_format_kind_undef disables the bias term.
+/// @param dst_desc Destination memory descriptor (matrix C).
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_matmul_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t weights_desc,
+        const_dnnl_memory_desc_t bias_desc, const_dnnl_memory_desc_t dst_desc,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_matmul
+
+/// @addtogroup dnnl_api_resampling Resampling
+/// @{
+
+/// Creates a primitive descriptor for a resampling forward propagation
+///     primitive.
+///
+/// @note
+///     Destination memory descriptor is allowed to be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param prop_kind Propagation kind. Possible values are
+///     #dnnl_forward_training and #dnnl_forward_inference.
+/// @param alg_kind resampling algorithm kind: either #dnnl_resampling_nearest,
+///     or #dnnl_resampling_linear.
+/// @param factors Array of scaling factors for spatial dimension.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_resampling_forward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_prop_kind_t prop_kind, dnnl_alg_kind_t alg_kind,
+        const float *factors, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr);
+
+/// Creates a primitive descriptor for a resampling backward propagation
+///     primitive.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind resampling algorithm kind: either
+///     #dnnl_resampling_nearest, or #dnnl_resampling_linear.
+/// @param factors Array of scaling factors for spatial dimension.
+/// @param diff_src_desc Diff source memory descriptor.
+/// @param diff_dst_desc Diff destination memory descriptor.
+/// @param hint_fwd_pd Primitive descriptor for a respective forward propagation
+///     primitive.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+///
+dnnl_status_t DNNL_API dnnl_resampling_backward_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const float *factors,
+        const_dnnl_memory_desc_t diff_src_desc,
+        const_dnnl_memory_desc_t diff_dst_desc,
+        const_dnnl_primitive_desc_t hint_fwd_pd,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_resampling
+
+/// @addtogroup dnnl_api_reduction Reduction
+/// @{
+
+/// Creates a primitive descriptor for a reduction primitive.
+///
+/// @note
+///     Destination memory descriptor is allowed to be initialized with
+///     #dnnl_format_tag_any or with format_kind set to #dnnl_format_kind_any.
+///
+/// @param primitive_desc Output primitive descriptor.
+/// @param engine Engine to use.
+/// @param alg_kind reduction algorithm kind. Possible values:
+///     #dnnl_reduction_max, #dnnl_reduction_min, #dnnl_reduction_sum,
+///     #dnnl_reduction_mul, #dnnl_reduction_mean, #dnnl_reduction_norm_lp_max,
+///     #dnnl_reduction_norm_lp_sum, #dnnl_reduction_norm_lp_power_p_max,
+///     #dnnl_reduction_norm_lp_power_p_sum.
+/// @param p Algorithm specific parameter.
+/// @param eps Algorithm specific parameter.
+/// @param src_desc Source memory descriptor.
+/// @param dst_desc Destination memory descriptor.
+/// @param attr Primitive attributes (can be NULL).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_reduction_primitive_desc_create(
+        dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine,
+        dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src_desc,
+        const_dnnl_memory_desc_t dst_desc, float p, float eps,
+        const_dnnl_primitive_attr_t attr);
+
+/// @} dnnl_api_reduction
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_primitive_cache
+/// @{
+
+/// Returns the number of primitives that can be held in the primitive cache
+/// at the same time.
+///
+/// @param capacity Primitive cache capacity to query. Concurrently
+/// accessing @p capacity is safe.
+/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the
+///     @p capacity value is invalid, and #dnnl_success/#dnnl::status::success on
+///     success.
+dnnl_status_t DNNL_API dnnl_get_primitive_cache_capacity(int *capacity);
+
+/// Sets the number of primitives that can be held in the primitive cache
+/// at a time.
+///
+/// @param capacity Primitive cache capacity to set. If the new @p capacity is
+///     less than the number of primitives that the primitive cache already
+///     holds, then the excess entries will be evicted. Setting @p capacity to
+///     0 clears the primitive cache and disables it. Concurrently modifying
+///     @p capacity is safe.
+/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the
+///     @p capacity value is invalid, and #dnnl_success/#dnnl::status::success on
+///     success.
+dnnl_status_t DNNL_API dnnl_set_primitive_cache_capacity(int capacity);
+
+/// @} dnnl_api_primitive_cache
+
+/// @addtogroup dnnl_api_service
+/// @{
+
+/// Configures dumping of JIT-generated code.
+///
+/// @note
+///     This setting overrides the DNNL_JIT_DUMP environment variable.
+///
+/// @param enable Flag value. Set to 0 to disable and set to 1 to enable.
+/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the
+///     @p enable value is invalid, and #dnnl_success/#dnnl::status::success on
+///     success.
+dnnl_status_t DNNL_API dnnl_set_jit_dump(int enable);
+
+/// Sets library profiling flags. The flags define which profilers are
+/// supported.
+///
+/// @note
+///     This setting overrides DNNL_JIT_PROFILE environment variable.
+///
+/// @sa @ref dev_guide_profilers
+///
+/// @param flags Profiling flags that can contain the following bits:
+///     - @ref DNNL_JIT_PROFILE_VTUNE -- integration with VTune Profiler
+///         (on by default)
+///     - @ref DNNL_JIT_PROFILE_LINUX_JITDUMP -- produce Linux-specific
+///         jit-pid.dump output (off by default). The location of the output
+///         is controlled via JITDUMPDIR environment variable or via
+///         dnnl_set_jit_profiling_jitdumpdir() function.
+///     - @ref DNNL_JIT_PROFILE_LINUX_PERFMAP -- produce Linux-specific
+///         perf-pid.map output (off by default). The output is always placed
+///         into /tmp.
+///
+///     Passing @ref DNNL_JIT_PROFILE_NONE disables profiling completely.
+///
+/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the
+///     @p flags value is invalid, and #dnnl_success/#dnnl::status::success on
+///     success.
+dnnl_status_t DNNL_API dnnl_set_jit_profiling_flags(unsigned flags);
+
+/// Sets the JIT dump output path. Only applicable to Linux and is only
+/// used when profiling flags have DNNL_JIT_PROFILE_LINUX_PERF bit set.
+///
+/// After the first JIT kernel is generated, the jitdump output will be placed
+/// into a temporary directory created using the mkdtemp template
+/// 'dir/.debug/jit/dnnl.XXXXXX'.
+///
+/// @sa @ref dev_guide_profilers
+///
+/// @note
+///     This setting overrides the JITDUMPDIR environment variable. If
+///     JITDUMPDIR is not set, and this function is never called, the path
+///     defaults to HOME. Passing NULL reverts the value to default.
+///
+/// @note
+///     The directory is accessed only when the first JIT kernel is being
+///     created. JIT profiling will be disabled in case of any errors
+///     accessing or creating this directory.
+///
+/// @param dir JIT dump output path.
+/// @returns #dnnl_success/#dnnl::status::success if the
+///     output directory was set correctly and an error status otherwise.
+/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented on Windows.
+dnnl_status_t DNNL_API dnnl_set_jit_profiling_jitdumpdir(const char *dir);
+
+/// Sets the maximal ISA the library can dispatch to on the CPU. See
+/// #dnnl_cpu_isa_t and #dnnl::cpu_isa for the list of the values accepted by
+/// the C and C++ API functions respectively.
+///
+/// This function has effect only once, and returns an error on subsequent
+/// calls. It should also be invoked before any other oneDNN API call, otherwise
+/// it may return an error.
+///
+/// This function overrides the DNNL_MAX_CPU_ISA environment variable. The
+/// environment variable can be set to the desired maximal ISA name in upper
+/// case and with dnnl_cpu_isa prefix removed. For example:
+/// `DNNL_MAX_CPU_ISA=AVX2`.
+///
+/// @note
+///     The ISAs are only partially ordered:
+///         - SSE41 < AVX < AVX2 < AVX2_VNNI < AVX2_VNNI_2,
+///         - AVX2 < AVX512_CORE < AVX512_CORE_VNNI < AVX512_CORE_BF16
+///           < AVX10_1_512 < AVX10_1_512_AMX < AVX10_1_512_AMX_FP16,
+///         - AVX2_VNNI < AVX10_1_512.
+///     Aliases:
+///         - AVX512_CORE_FP16 = AVX10_1_512
+///         - AVX512_CORE_AMX = AVX10_1_512_AMX
+///         - AVX512_CORE_AMX_FP16 = AVX10_1_512_AMX_FP16
+///
+/// @sa @ref dev_guide_cpu_dispatcher_control for more details
+///
+/// @param isa Maximal ISA the library should dispatch to. Pass
+///     #dnnl_cpu_isa_default/#dnnl::cpu_isa::isa_default to remove ISA restrictions
+///     (except for ISAs with initial support in the library).
+/// @returns #dnnl_success/#dnnl::status::success on success and a
+///     #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the @p isa
+///     parameter is invalid or the ISA cannot be changed at this time.
+/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented if the feature
+///     was disabled at build time (see @ref dev_guide_build_options for more
+///     details).
+dnnl_status_t DNNL_API dnnl_set_max_cpu_isa(dnnl_cpu_isa_t isa);
+
+/// Gets the maximal ISA the library can dispatch to on the CPU. See
+/// #dnnl_cpu_isa_t and #dnnl::cpu_isa for the list of the values returned by
+/// the C and C++ API functions respectively.
+///
+/// @sa @ref dev_guide_cpu_dispatcher_control for more details
+///
+/// @returns #dnnl_cpu_isa_t value reflecting the maximal ISA the library may
+///     dispatch to.
+dnnl_cpu_isa_t DNNL_API dnnl_get_effective_cpu_isa(void);
+
+/// Sets the hints flag for the CPU ISA. See #dnnl_cpu_isa_hints_t and
+/// #dnnl::cpu_isa_hints for the list of the values accepted by the C and C++
+/// API functions respectively.
+///
+/// This function has effect only once, and returns an error on subsequent
+/// calls. It should also be invoked before any other oneDNN API call, otherwise
+/// it may return an error.
+///
+/// This function overrides the DNNL_CPU_ISA_HINTS environment variable.
+/// @sa @ref dev_guide_cpu_isa_hints for more details
+///
+/// @param isa_hints CPU ISA hints to be passed over to the implementation.
+///     Pass #dnnl_cpu_isa_no_hints/#dnnl::cpu_isa_hints::no_hints to use
+///     default features i.e. no hints.
+/// @returns #dnnl_success/#dnnl::status::success on success and a
+///     #dnnl_runtime_error/#dnnl::status::runtime_error if the ISA hints cannot
+///     be specified at the current time.
+/// @returns #dnnl_unimplemented/#dnnl::status::unimplemented if the feature
+///     was disabled at build time (see @ref dev_guide_build_options for more
+///     details).
+dnnl_status_t DNNL_API dnnl_set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints);
+
+/// Gets the ISA-specific hints that the library can follow. See
+/// #dnnl_cpu_isa_hints_t and #dnnl::cpu_isa_hints for the list of the values
+/// returned by the C and C++ API functions respectively.
+///
+/// @sa @ref dev_guide_cpu_isa_hints for more details
+///
+/// @returns #dnnl_cpu_isa_hints_t value reflecting the ISA specific hints the
+/// library can follow.
+dnnl_cpu_isa_hints_t DNNL_API dnnl_get_cpu_isa_hints(void);
+
+/// @} dnnl_api_service
+
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+
+/// @addtogroup dnnl_api_profiling Profiling
+/// @{
+
+/// Resets a profiler's state.
+///
+/// @param stream Stream associated with the profiler.
+///
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_reset_profiling(dnnl_stream_t stream);
+
+/// Queries profiling data. The profiling data accumulates for each primitive
+/// execution. The @p num_entries will be equal to the number of executions
+/// since the last `dnnl_reset_profiling` call. In order to query the
+/// @p num_entries the @p data parameter should be NULL. When @p data is NULL
+/// then the @p data_kind parameter is ignored.
+///
+/// The profiling data can be reset by calling #dnnl_reset_profiling.
+///
+/// @note
+///     It is required to wait for all submitted primitives to complete
+///     using #dnnl_stream_wait prior to querying profiling data.
+///
+/// @param stream Stream that was used for executing a primitive that
+/// is being profiled.
+/// @param data_kind Profiling data kind to query.
+/// @param num_entries Number of profiling data entries.
+/// @param data Profiling data.
+///
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_query_profiling_data(dnnl_stream_t stream,
+        dnnl_profiling_data_kind_t data_kind, int *num_entries, uint64_t *data);
+
+/// @} dnnl_api_profiling
+#endif
+
+/// @addtogroup dnnl_api_blas
+/// @{
+
+/// Performs single-precision matrix-matrix multiply.
+///
+/// The operation is defined as:
+///
+/// `C := alpha * op( A ) * op( B ) + beta * C`
+///
+/// where
+///  - `op( X ) = X` or `op( X ) = X**T`,
+///  - `alpha` and `beta` are scalars, and
+///  - `A`, `B`, and `C` are matrices:
+///     - `op( A )` is an `MxK` matrix,
+///     - `op( B )` is a `KxN` matrix,
+///     - `C` is an `MxN` matrix.
+///
+/// The matrices are assumed to be stored in row-major order (the elements in
+/// each of the matrix rows are contiguous in memory).
+///
+/// @note
+///     This API does not support XERBLA. Instead, unlike the standard BLAS
+///     functions, this one returns a dnnl_status_t value to allow error
+///     handling.
+///
+/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not
+///     transposed, and 'T' or 't' means that A is transposed.
+/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not
+///     transposed, and 'T' or 't' means that B is transposed.
+/// @param M The M dimension.
+/// @param N The N dimension.
+/// @param K The K dimension.
+/// @param alpha The alpha parameter that is used to scale the product of
+///     matrices A and B.
+/// @param A A pointer to the A matrix data.
+/// @param lda The leading dimension for the matrix A.
+/// @param B A pointer to the B matrix data.
+/// @param ldb The leading dimension for the matrix B.
+/// @param beta The beta parameter that is used to scale the matrix C.
+/// @param C A pointer to the C matrix data.
+/// @param ldc The leading dimension for the matrix C.
+/// @returns #dnnl_success/#dnnl::status::success on success and a status
+///     describing the error otherwise.
+dnnl_status_t DNNL_API dnnl_sgemm(char transa, char transb, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda,
+        const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc);
+
+/// Performs integer matrix-matrix multiply on 8-bit unsigned matrix A, 8-bit
+/// signed matrix B, and 32-bit signed resulting matrix C.
+///
+/// The operation is defined as:
+///
+/// `C := alpha * (op(A) - A_offset) * (op(B) - B_offset) + beta * C + C_offset`
+///
+/// where
+///  - `op( X ) = X` or `op( X ) = X**T`,
+///  - `alpha` and `beta` are scalars, and
+///  - `A`, `B`, and `C` are matrices:
+///     - `op( A )` is an `MxK` matrix,
+///     - `op( B )` is a `KxN` matrix,
+///     - `C` is an `MxN` matrix.
+///  - `A_offset` is an `MxK` matrix with every element equal to the `ao` value,
+///  - `B_offset` is a `KxN` matrix with every element equal to the `bo` value,
+///  - `C_offset` is an `MxN` matrix which is defined by the `co` array of size `len`:
+///    - if `offsetc = F`: the `len` must be at least `1`,
+///    - if `offsetc = C`: the `len` must be at least `max(1, M)`,
+///    - if `offsetc = R`: the `len` must be at least `max(1, N)`.
+///
+/// The matrices are assumed to be stored in row-major order (the elements in
+/// each of the matrix rows are contiguous in memory).
+///
+/// @note
+///     This API does not support XERBLA. Instead, unlike the standard BLAS
+///     functions, this one returns a dnnl_status_t value to allow error
+///     handling.
+///
+/// @warning
+///     On some architectures saturation may happen during intermediate
+///     computations, which would lead to unexpected results. For more
+///     details, refer to @ref dev_guide_int8_computations.
+///
+/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not
+///     transposed, and 'T' or 't' means that A is transposed.
+/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not
+///     transposed, and 'T' or 't' means that B is transposed.
+/// @param offsetc Flag specifying how offsets should be applied to matrix C:
+///     - 'F' means that the same offset will be applied to each element of
+///         the matrix C,
+///     - 'C' means that individual offset will be applied to each element
+///         within each column,
+///     - 'R' means that individual offset will be applied to each element
+///         within each row.
+/// @param M The M dimension.
+/// @param N The N dimension.
+/// @param K The K dimension.
+/// @param alpha The alpha parameter that is used to scale the product of
+///     matrices A and B.
+/// @param A A pointer to the A matrix data.
+/// @param lda The leading dimension for the matrix A.
+/// @param ao The offset value for the matrix A.
+/// @param B A pointer to the B matrix data.
+/// @param ldb The leading dimension for the matrix B.
+/// @param bo The offset value for the matrix B.
+/// @param beta The beta parameter that is used to scale the matrix C.
+/// @param C A pointer to the C matrix data.
+/// @param ldc The leading dimension for the matrix C.
+/// @param co An array of offset values for the matrix C. The number of
+///     elements in the array depends on the value of @p offsetc.
+/// @returns #dnnl_success/#dnnl::status::success on success and a status
+///     describing the error otherwise.
+dnnl_status_t DNNL_API dnnl_gemm_u8s8s32(char transa, char transb, char offsetc,
+        dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A,
+        dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co);
+
+/// Performs integer matrix-matrix multiply on 8-bit signed matrix A, 8-bit
+/// signed matrix B, and 32-bit signed resulting matrix C.
+///
+/// The operation is defined as:
+///
+/// `C := alpha * (op(A) - A_offset) * (op(B) - B_offset) + beta * C + C_offset`
+///
+/// where
+///  - `op( X ) = X` or `op( X ) = X**T`,
+///  - `alpha` and `beta` are scalars, and
+///  - `A`, `B`, and `C` are matrices:
+///     - `op( A )` is an `MxK` matrix,
+///     - `op( B )` is a `KxN` matrix,
+///     - `C` is an `MxN` matrix.
+///  - `A_offset` is an `MxK` matrix with every element equal to the `ao` value,
+///  - `B_offset` is a `KxN` matrix with every element equal to the `bo` value,
+///  - `C_offset` is an `MxN` matrix which is defined by the `co` array of size `len`:
+///    - if `offsetc = F`: the `len` must be at least `1`,
+///    - if `offsetc = C`: the `len` must be at least `max(1, M)`,
+///    - if `offsetc = R`: the `len` must be at least `max(1, N)`.
+///
+/// The matrices are assumed to be stored in row-major order (the elements in
+/// each of the matrix rows are contiguous in memory).
+///
+/// @note
+///     This API does not support XERBLA. Instead, unlike the standard BLAS
+///     functions, this one returns a dnnl_status_t value to allow error
+///     handling.
+///
+/// @warning
+///     On some architectures saturation may happen during intermediate
+///     computations, which would lead to unexpected results. For more
+///     details, refer to @ref dev_guide_int8_computations.
+///
+/// @param transa Transposition flag for matrix A: 'N' or 'n' means A is not
+///     transposed, and 'T' or 't' means that A is transposed.
+/// @param transb Transposition flag for matrix B: 'N' or 'n' means B is not
+///     transposed, and 'T' or 't' means that B is transposed.
+/// @param offsetc Flag specifying how offsets should be applied to matrix C:
+///     - 'F' means that the same offset will be applied to each element of
+///         the matrix C,
+///     - 'C' means that individual offset will be applied to each element
+///         within each column,
+///     - 'R' means that individual offset will be applied to each element
+///         within each row.
+/// @param M The M dimension.
+/// @param N The N dimension.
+/// @param K The K dimension.
+/// @param alpha The alpha parameter that is used to scale the product of
+///     matrices A and B.
+/// @param A A pointer to the A matrix data.
+/// @param lda The leading dimension for the matrix A.
+/// @param ao The offset value for the matrix A.
+/// @param B A pointer to the B matrix data.
+/// @param ldb The leading dimension for the matrix B.
+/// @param bo The offset value for the matrix B.
+/// @param beta The beta parameter that is used to scale the matrix C.
+/// @param C A pointer to the C matrix data.
+/// @param ldc The leading dimension for the matrix C.
+/// @param co An array of offset values for the matrix C. The number of
+///     elements in the array depends on the value of @p offsetc.
+/// @returns #dnnl_success/#dnnl::status::success on success and a status
+///     describing the error otherwise.
+dnnl_status_t DNNL_API dnnl_gemm_s8s8s32(char transa, char transb, char offsetc,
+        dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A,
+        dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co);
+
+/// @} dnnl_api_blas
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ONEAPI_DNNL_DNNL_H */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..92dd01c09049bec24644e16d78861ee9c829b875
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl.hpp
@@ -0,0 +1,14071 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2016-2025 Intel Corporation
+* Copyright 2024 FUJITSU LIMITED
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C++ API
+
+#ifndef ONEAPI_DNNL_DNNL_HPP
+#define ONEAPI_DNNL_DNNL_HPP
+
+#include "oneapi/dnnl/dnnl_config.h"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "oneapi/dnnl/dnnl.h"
+#include "oneapi/dnnl/dnnl_common.hpp"
+
+/// @endcond
+
+/// @addtogroup dnnl_api oneDNN API
+/// @{
+
+/// oneDNN namespace
+namespace dnnl {
+
+/// @addtogroup dnnl_api_utils Utilities
+/// Utility types and definitions.
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+template <typename T>
+void validate_container_size(const T &v, const char *error_message,
+        int min_size = 1, int max_size = -1) {
+    const int size = (int)v.size();
+    if (size < min_size || (max_size >= 0 && size > max_size))
+        DNNL_THROW_ERROR(dnnl_invalid_arguments, error_message);
+}
+/// @endcond
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Destruction trait for dnnl_memory_desc_t: forwards the handle to
+// dnnl_memory_desc_destroy() and hands back the resulting status.
+template <>
+struct handle_traits<dnnl_memory_desc_t> {
+    static dnnl_status_t destructor(dnnl_memory_desc_t md) {
+        const dnnl_status_t status = dnnl_memory_desc_destroy(md);
+        return status;
+    }
+};
+
+// Destruction trait for dnnl_memory_t: forwards the handle to
+// dnnl_memory_destroy() and hands back the resulting status.
+template <>
+struct handle_traits<dnnl_memory_t> {
+    static dnnl_status_t destructor(dnnl_memory_t mem) {
+        const dnnl_status_t status = dnnl_memory_destroy(mem);
+        return status;
+    }
+};
+
+// Destruction trait for dnnl_primitive_desc_t: forwards the handle to
+// dnnl_primitive_desc_destroy() and hands back the resulting status.
+template <>
+struct handle_traits<dnnl_primitive_desc_t> {
+    static dnnl_status_t destructor(dnnl_primitive_desc_t pd) {
+        const dnnl_status_t status = dnnl_primitive_desc_destroy(pd);
+        return status;
+    }
+};
+
+// Destruction trait for dnnl_primitive_t: forwards the handle to
+// dnnl_primitive_destroy() and hands back the resulting status.
+template <>
+struct handle_traits<dnnl_primitive_t> {
+    static dnnl_status_t destructor(dnnl_primitive_t prim) {
+        const dnnl_status_t status = dnnl_primitive_destroy(prim);
+        return status;
+    }
+};
+
+/// @endcond
+
+/// @} dnnl_api_utils
+
+struct stream;
+struct memory;
+struct primitive_desc;
+
+/// @addtogroup dnnl_api_primitives Primitives
+/// Compute primitives
+/// @sa @ref dev_guide_basic_concepts
+/// @{
+
+/// @addtogroup dnnl_api_primitives_common Common
+/// Common operations to create, destroy and inspect primitives
+/// @{
+
+/// Base class for all computational primitives.
+struct primitive : public handle<dnnl_primitive_t> {
+    /// Kinds of primitives supported by the library.
+    enum class kind {
+        /// Undefined primitive.
+        undef = dnnl_undefined_primitive,
+        /// A reorder primitive.
+        reorder = dnnl_reorder,
+        /// A shuffle primitive.
+        shuffle = dnnl_shuffle,
+        /// An (out-of-place) tensor concatenation primitive.
+        concat = dnnl_concat,
+        /// A summation primitive.
+        sum = dnnl_sum,
+        /// A convolution primitive.
+        convolution = dnnl_convolution,
+        /// A deconvolution primitive.
+        deconvolution = dnnl_deconvolution,
+        /// An element-wise primitive.
+        eltwise = dnnl_eltwise,
+        /// An LRN primitive.
+        lrn = dnnl_lrn,
+        /// A batch normalization primitive.
+        batch_normalization = dnnl_batch_normalization,
+        /// An inner product primitive.
+        inner_product = dnnl_inner_product,
+        /// An RNN primitive.
+        rnn = dnnl_rnn,
+        /// A binary primitive.
+        binary = dnnl_binary,
+        /// A matmul (matrix multiplication) primitive.
+        matmul = dnnl_matmul,
+        /// A resampling primitive.
+        resampling = dnnl_resampling,
+        /// A pooling primitive.
+        pooling = dnnl_pooling,
+        /// A reduction primitive.
+        reduction = dnnl_reduction,
+        /// A PReLU primitive.
+        prelu = dnnl_prelu,
+        /// A softmax primitive.
+        softmax = dnnl_softmax,
+        /// A layer normalization primitive.
+        layer_normalization = dnnl_layer_normalization,
+        /// A group normalization primitive.
+        group_normalization = dnnl_group_normalization,
+    };
+
+    using handle::handle;
+
+    /// Default constructor. Constructs an empty object.
+    primitive() = default;
+
+    /// Constructs a primitive from a C API primitive descriptor.
+    ///
+    /// @param c_pd C API primitive descriptor.
+    primitive(const_dnnl_primitive_desc_t c_pd);
+
+    /// Constructs a primitive from a C API primitive descriptor and a cache blob.
+    ///
+    /// @param c_pd C API primitive descriptor.
+    /// @param cache_blob Cache blob.
+    primitive(const_dnnl_primitive_desc_t c_pd,
+            const std::vector<uint8_t> &cache_blob);
+
+    /// Constructs a primitive from a primitive descriptor.
+    ///
+    /// @param pd Primitive descriptor.
+    primitive(const primitive_desc &pd);
+
+    /// Constructs a primitive from a primitive descriptor and a cache blob.
+    ///
+    /// @param pd Primitive descriptor.
+    /// @param cache_blob Cache blob.
+    primitive(const primitive_desc &pd, const std::vector<uint8_t> &cache_blob);
+
+    /// Returns the C API primitive descriptor of the underlying C API
+    /// primitive.
+    ///
+    /// @returns The underlying C API primitive descriptor.
+    inline const_dnnl_primitive_desc_t get_primitive_desc() const;
+
+    /// Returns the kind of the primitive.
+    ///
+    /// @returns The primitive kind.
+    inline kind get_kind() const;
+
+    /// Returns a cache blob for the primitive.
+    ///
+    /// @returns Vector containing the cache blob.
+    ///
+    /// @note The cache blob can be empty. It is the user's responsibility to
+    ///     check whether it is empty prior to passing it to the primitive
+    ///     constructor.
+    inline std::vector<uint8_t> get_cache_blob() const;
+
+    /// Executes computations specified by the primitive in a specified stream.
+    ///
+    /// Arguments are passed via an arguments map containing <index,
+    /// memory object> pairs. The index must be one of the `DNNL_ARG_*` values
+    /// such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
+    /// matching the one returned by
+    /// primitive_desc::query_md(#query::exec_arg_md, index) unless using
+    /// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
+    ///
+    /// @param astream Stream object. The stream must belong to the same engine
+    ///     as the primitive.
+    /// @param args Arguments map.
+    void execute(const stream &astream,
+            const std::unordered_map<int, memory> &args) const;
+};
+
+/// Maps a C++ API primitive kind onto the matching C API enumerator.
+///
+/// @param akind Primitive kind expressed with the C++ API enum.
+/// @returns The same kind expressed as a #dnnl_primitive_kind_t value.
+inline dnnl_primitive_kind_t convert_to_c(primitive::kind akind) {
+    const auto c_kind = static_cast<dnnl_primitive_kind_t>(akind);
+    return c_kind;
+}
+
+const_dnnl_primitive_desc_t primitive::get_primitive_desc() const {
+    // Filled in by the C API call below; any non-success status is routed
+    // through error::wrap_c_api together with the message.
+    const_dnnl_primitive_desc_t c_pd;
+    const auto status = dnnl_primitive_get_primitive_desc(get(), &c_pd);
+    error::wrap_c_api(
+            status, "could not get a primitive descriptor from a primitive");
+    return c_pd;
+}
+
+dnnl::primitive::kind primitive::get_kind() const {
+    // TODO (Roma): the query below is only needed because get_primitive_desc
+    // returns a C type.
+    const_dnnl_primitive_desc_t c_pd = get_primitive_desc();
+    dnnl_primitive_kind_t c_kind;
+    const auto status = dnnl_primitive_desc_query(
+            c_pd, dnnl_query_primitive_kind, 0, (void *)&c_kind);
+    error::wrap_c_api(status,
+            "could not get a primitive kind from a primitive descriptor");
+    // Re-express the C enumerator with the C++ API enum.
+    return static_cast<dnnl::primitive::kind>(c_kind);
+}
+
+std::vector<uint8_t> primitive::get_cache_blob() const {
+    // First call passes a null buffer so that only the blob size is reported.
+    size_t blob_size;
+    const auto size_status
+            = dnnl_primitive_get_cache_blob(get(), &blob_size, nullptr);
+    error::wrap_c_api(
+            size_status, "could not get cache blob size from a primitive");
+
+    // Second call writes the blob into a buffer of the reported size.
+    std::vector<uint8_t> blob(blob_size);
+    const auto copy_status
+            = dnnl_primitive_get_cache_blob(get(), &blob_size, blob.data());
+    error::wrap_c_api(
+            copy_status, "could not get a cache blob from a primitive");
+    return blob;
+}
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_attributes
+///
+/// A container for parameters that extend primitives behavior.
+///
+/// Attributes can also contain Post-ops, which are computations executed
+/// after the primitive.
+///
+/// @sa @ref dev_guide_attributes
+/// @sa @ref dev_guide_attributes_post_ops
+///
+/// @{
+
+/// Scratchpad mode: selects whether the library or the user manages it.
+enum class scratchpad_mode {
+    /// The library manages the scratchpad allocation according to the policy
+    /// specified by the `DNNL_ENABLE_CONCURRENT_EXEC`
+    /// [build option](@ref dev_guide_build_options) (default).
+    ///
+    /// When `DNNL_ENABLE_CONCURRENT_EXEC=OFF` (default), the library
+    /// scratchpad is common to all primitives to reduce the memory footprint.
+    /// This configuration comes with limited thread-safety properties, namely
+    /// primitives can be created and executed in parallel but cannot migrate
+    /// between threads (in other words, each primitive should be executed in
+    /// the same thread it was created in).
+    ///
+    /// When `DNNL_ENABLE_CONCURRENT_EXEC=ON`, the library scratchpad is
+    /// private to each primitive. The memory footprint is larger than when
+    /// using `DNNL_ENABLE_CONCURRENT_EXEC=OFF`, but different primitives can
+    /// be created and run concurrently (the same primitive cannot be run
+    /// concurrently from two different threads, though).
+    library = dnnl_scratchpad_mode_library,
+    /// The user manages the scratchpad allocation by querying and providing
+    /// the scratchpad memory to primitives. This mode is thread-safe as long
+    /// as the scratchpad buffers are not used concurrently by two primitive
+    /// executions.
+    user = dnnl_scratchpad_mode_user,
+};
+
+/// Maps a C++ API scratchpad mode onto the matching C API enumerator.
+///
+/// @param mode Scratchpad mode expressed with the C++ API enum.
+/// @returns The same mode expressed as a #dnnl_scratchpad_mode_t value.
+inline dnnl_scratchpad_mode_t convert_to_c(scratchpad_mode mode) {
+    const auto c_mode = static_cast<dnnl_scratchpad_mode_t>(mode);
+    return c_mode;
+}
+
+/// Rounding mode.
+enum class rounding_mode {
+    /// Rounding mode dictated by the floating-point environment.
+    environment = dnnl_rounding_mode_environment,
+    /// Stochastic rounding mode, where a random bias is added to the
+    /// trailing mantissa bits before conversion.
+    stochastic = dnnl_rounding_mode_stochastic
+};
+
+/// Maps a C++ API rounding mode onto the matching C API enumerator.
+///
+/// @param mode Rounding mode expressed with the C++ API enum.
+/// @returns The same mode expressed as a #dnnl_rounding_mode_t value.
+inline dnnl_rounding_mode_t convert_to_c(rounding_mode mode) {
+    const auto c_mode = static_cast<dnnl_rounding_mode_t>(mode);
+    return c_mode;
+}
+
+/// Propagation kind.
+enum class prop_kind {
+    /// Undefined propagation kind.
+    undef = dnnl_prop_kind_undef,
+    /// Forward data propagation (training mode). In this mode, primitives
+    /// perform computations necessary for subsequent backward propagation.
+    forward_training = dnnl_forward_training,
+    /// Forward data propagation (inference mode). In this mode, primitives
+    /// perform only computations that are necessary for inference and omit
+    /// computations that are necessary only for backward propagation.
+    forward_inference = dnnl_forward_inference,
+    /// Forward data propagation;
+    /// an alias for #dnnl::prop_kind::forward_training.
+    forward = dnnl_forward,
+    /// Backward propagation (with respect to all parameters).
+    backward = dnnl_backward,
+    /// Backward data propagation.
+    backward_data = dnnl_backward_data,
+    /// Backward weights propagation.
+    backward_weights = dnnl_backward_weights,
+    /// Backward bias propagation.
+    backward_bias = dnnl_backward_bias
+};
+
+/// Maps a C++ API propagation kind onto the matching C API enumerator.
+///
+/// @param akind Propagation kind expressed with the C++ API enum.
+/// @returns The same kind expressed as a #dnnl_prop_kind_t value.
+inline dnnl_prop_kind_t convert_to_c(prop_kind akind) {
+    const auto c_kind = static_cast<dnnl_prop_kind_t>(akind);
+    return c_kind;
+}
+
+/// Kinds of algorithms.
+enum class algorithm {
+    /// Undefined algorithm
+    undef = dnnl_alg_kind_undef,
+    /// Convolution algorithm that is chosen to be either direct or Winograd
+    /// automatically
+    convolution_auto = dnnl_convolution_auto,
+    /// Direct convolution
+    convolution_direct = dnnl_convolution_direct,
+    /// Winograd convolution
+    convolution_winograd = dnnl_convolution_winograd,
+    /// Direct deconvolution
+    deconvolution_direct = dnnl_deconvolution_direct,
+    /// Winograd deconvolution
+    deconvolution_winograd = dnnl_deconvolution_winograd,
+    /// Elementwise: rectified linear unit (ReLU)
+    eltwise_relu = dnnl_eltwise_relu,
+    /// Elementwise: hyperbolic tangent non-linearity (tanh)
+    eltwise_tanh = dnnl_eltwise_tanh,
+    /// Elementwise: exponential linear unit (ELU)
+    eltwise_elu = dnnl_eltwise_elu,
+    /// Elementwise: square
+    eltwise_square = dnnl_eltwise_square,
+    /// Elementwise: abs
+    eltwise_abs = dnnl_eltwise_abs,
+    /// Elementwise: square root
+    eltwise_sqrt = dnnl_eltwise_sqrt,
+    /// Elementwise: swish (\f$x \cdot sigmoid(a \cdot x)\f$)
+    eltwise_swish = dnnl_eltwise_swish,
+    /// Elementwise: linear
+    eltwise_linear = dnnl_eltwise_linear,
+    /// Elementwise: soft_relu
+    eltwise_soft_relu = dnnl_eltwise_soft_relu,
+    /// Elementwise: mish
+    eltwise_mish = dnnl_eltwise_mish,
+    /// Elementwise: logistic
+    eltwise_logistic = dnnl_eltwise_logistic,
+    /// Elementwise: exponent
+    eltwise_exp = dnnl_eltwise_exp,
+    /// Elementwise: tanh-based gelu
+    eltwise_gelu_tanh = dnnl_eltwise_gelu_tanh,
+    /// Elementwise: erf-based gelu
+    eltwise_gelu_erf = dnnl_eltwise_gelu_erf,
+    /// Elementwise: natural logarithm
+    eltwise_log = dnnl_eltwise_log,
+    /// Elementwise: clip
+    eltwise_clip = dnnl_eltwise_clip,
+    /// Elementwise: clip version 2
+    eltwise_clip_v2 = dnnl_eltwise_clip_v2,
+    /// Elementwise: pow
+    eltwise_pow = dnnl_eltwise_pow,
+    /// Elementwise: round
+    eltwise_round = dnnl_eltwise_round,
+    /// Elementwise: hardswish
+    eltwise_hardswish = dnnl_eltwise_hardswish,
+    /// Elementwise: hardsigmoid
+    eltwise_hardsigmoid = dnnl_eltwise_hardsigmoid,
+    /// Elementwise: rectified linear unit (ReLU) (dst for backward)
+    eltwise_relu_use_dst_for_bwd = dnnl_eltwise_relu_use_dst_for_bwd,
+    /// Elementwise: hyperbolic tangent non-linearity (tanh) (dst for backward)
+    eltwise_tanh_use_dst_for_bwd = dnnl_eltwise_tanh_use_dst_for_bwd,
+    /// Elementwise: exponential linear unit (ELU) (dst for backward)
+    eltwise_elu_use_dst_for_bwd = dnnl_eltwise_elu_use_dst_for_bwd,
+    /// Elementwise: square root (dst for backward)
+    eltwise_sqrt_use_dst_for_bwd = dnnl_eltwise_sqrt_use_dst_for_bwd,
+    /// Elementwise: logistic (dst for backward)
+    eltwise_logistic_use_dst_for_bwd = dnnl_eltwise_logistic_use_dst_for_bwd,
+    /// Elementwise: exponent (dst for backward)
+    eltwise_exp_use_dst_for_bwd = dnnl_eltwise_exp_use_dst_for_bwd,
+    /// Elementwise: clip version 2 (dst for backward)
+    eltwise_clip_v2_use_dst_for_bwd = dnnl_eltwise_clip_v2_use_dst_for_bwd,
+    /// Local response normalization (LRN) across multiple channels
+    lrn_across_channels = dnnl_lrn_across_channels,
+    /// LRN within a single channel
+    lrn_within_channel = dnnl_lrn_within_channel,
+    /// Max pooling
+    pooling_max = dnnl_pooling_max,
+    /// Average pooling include padding
+    pooling_avg_include_padding = dnnl_pooling_avg_include_padding,
+    /// Average pooling exclude padding
+    pooling_avg_exclude_padding = dnnl_pooling_avg_exclude_padding,
+    /// RNN cell
+    vanilla_rnn = dnnl_vanilla_rnn,
+    /// LSTM cell
+    vanilla_lstm = dnnl_vanilla_lstm,
+    /// GRU cell
+    vanilla_gru = dnnl_vanilla_gru,
+    /// GRU cell with linear before reset. Differs from the vanilla GRU
+    /// in how the new memory gate is calculated:
+    /// \f$c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f$
+    /// LBR GRU expects 4 bias tensors on input:
+    /// \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$
+    lbr_gru = dnnl_lbr_gru,
+    /// AUGRU cell
+    vanilla_augru = dnnl_vanilla_augru,
+    /// AUGRU cell with linear before reset
+    lbr_augru = dnnl_lbr_augru,
+    /// Binary add
+    binary_add = dnnl_binary_add,
+    /// Binary mul
+    binary_mul = dnnl_binary_mul,
+    /// Binary max
+    binary_max = dnnl_binary_max,
+    /// Binary min
+    binary_min = dnnl_binary_min,
+    /// Binary div
+    binary_div = dnnl_binary_div,
+    /// Binary sub
+    binary_sub = dnnl_binary_sub,
+    /// Binary greater than or equal
+    binary_ge = dnnl_binary_ge,
+    /// Binary greater than
+    binary_gt = dnnl_binary_gt,
+    /// Binary less than or equal
+    binary_le = dnnl_binary_le,
+    /// Binary less than
+    binary_lt = dnnl_binary_lt,
+    /// Binary equal
+    binary_eq = dnnl_binary_eq,
+    /// Binary not equal
+    binary_ne = dnnl_binary_ne,
+    /// Binary select
+    binary_select = dnnl_binary_select,
+    /// Nearest Neighbor resampling method
+    resampling_nearest = dnnl_resampling_nearest,
+    /// Linear (Bilinear, Trilinear) resampling method
+    resampling_linear = dnnl_resampling_linear,
+    /// Reduction using max operation
+    reduction_max = dnnl_reduction_max,
+    /// Reduction using min operation
+    reduction_min = dnnl_reduction_min,
+    /// Reduction using sum operation
+    reduction_sum = dnnl_reduction_sum,
+    /// Reduction using mul operation
+    reduction_mul = dnnl_reduction_mul,
+    /// Reduction using mean operation
+    reduction_mean = dnnl_reduction_mean,
+    /// Reduction using norm_lp_max operation
+    reduction_norm_lp_max = dnnl_reduction_norm_lp_max,
+    /// Reduction using norm_lp_sum operation
+    reduction_norm_lp_sum = dnnl_reduction_norm_lp_sum,
+    /// Reduction using norm_lp_power_p_max operation
+    reduction_norm_lp_power_p_max = dnnl_reduction_norm_lp_power_p_max,
+    /// Reduction using norm_lp_power_p_sum operation
+    reduction_norm_lp_power_p_sum = dnnl_reduction_norm_lp_power_p_sum,
+    /// Softmax, numerically stable
+    softmax_accurate = dnnl_softmax_accurate,
+    /// LogSoftmax, numerically stable
+    softmax_log = dnnl_softmax_log,
+};
+
+/// Maps a C++ API algorithm kind onto its C API counterpart.
+/// @param aalgorithm Algorithm kind expressed with the C++ API enum.
+/// @returns The same algorithm kind expressed as a ::dnnl_alg_kind_t value.
+inline dnnl_alg_kind_t convert_to_c(algorithm aalgorithm) {
+    // A plain cast: the enumerator's numeric value is carried over unchanged.
+    const auto c_alg_kind = static_cast<dnnl_alg_kind_t>(aalgorithm);
+    return c_alg_kind;
+}
+
+/// @} dnnl_api_attributes
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Flags for normalization primitives.
+///
+/// The underlying type is `unsigned` and every enumerator takes its value
+/// from the corresponding C API constant.
+enum class normalization_flags : unsigned {
+    /// Use no normalization flags. If specified, the library computes mean and
+    /// variance on forward propagation for training and inference, outputs them
+    /// on forward propagation for training, and computes the respective
+    /// derivatives on backward propagation.
+    none = dnnl_normalization_flags_none,
+
+    /// Use global statistics. If specified, the library uses mean and
+    /// variance provided by the user as an input on forward propagation and
+    /// does not compute their derivatives on backward propagation. Otherwise,
+    /// the library computes mean and variance on forward propagation for
+    /// training and inference, outputs them on forward propagation for
+    /// training, and computes the respective derivatives on backward
+    /// propagation.
+    use_global_stats = dnnl_use_global_stats,
+
+    /// Use scale parameter. If specified, the user is expected to pass scale as
+    /// input on forward propagation. On backward propagation of type
+    /// #dnnl::prop_kind::backward, the library computes its derivative.
+    use_scale = dnnl_use_scale,
+
+    /// Use shift parameter. If specified, the user is expected to pass shift as
+    /// input on forward propagation. On backward propagation of type
+    /// #dnnl::prop_kind::backward, the library computes its derivative.
+    use_shift = dnnl_use_shift,
+
+    /// Fuse normalization with ReLU. On training, normalization will require
+    /// the workspace to implement backward propagation. On inference, the
+    /// workspace is not required and behavior is the same as when normalization
+    /// is fused with ReLU using the post-ops API.
+    fuse_norm_relu = dnnl_fuse_norm_relu,
+
+    /// Fuse normalization with elementwise binary Add and then fuse with ReLU.
+    /// On training, normalization will require the workspace to implement
+    /// backward propagation. On inference, the workspace is not required.
+    fuse_norm_add_relu = dnnl_fuse_norm_add_relu,
+};
+
+/// Maps C++ API normalization flags onto their C API counterpart.
+/// @param flags Normalization flags expressed with the C++ API enum.
+/// @returns The same flags expressed as a ::dnnl_normalization_flags_t value.
+inline dnnl_normalization_flags_t convert_to_c(normalization_flags flags) {
+    // A plain cast: the numeric flag value is carried over unchanged.
+    const auto c_flags = static_cast<dnnl_normalization_flags_t>(flags);
+    return c_flags;
+}
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_rnn
+/// @{
+
+/// RNN cell flags.
+///
+/// The underlying type is `unsigned` and every enumerator takes its value
+/// from the corresponding C API constant.
+enum class rnn_flags : unsigned {
+    /// Undefined RNN flags.
+    undef = dnnl_rnn_flags_undef,
+    /// Do not add weights gradient to existing diff_weights memory.
+    diff_weights_overwrite = dnnl_rnn_flags_diff_weights_overwrite,
+};
+
+/// Maps C++ API RNN cell flags onto their C API counterpart.
+/// @param flags RNN cell flags expressed with the C++ API enum.
+/// @returns The same flags expressed as a ::dnnl_rnn_flags_t value.
+inline dnnl_rnn_flags_t convert_to_c(rnn_flags flags) {
+    // A plain cast: the numeric flag value is carried over unchanged.
+    const auto c_flags = static_cast<dnnl_rnn_flags_t>(flags);
+    return c_flags;
+}
+
+// Instantiate DNNL_DEFINE_BITMASK_OPS for both flag enums declared above.
+// NOTE(review): presumably this supplies the bitwise operators needed to
+// combine flag values -- confirm against the macro definition.
+DNNL_DEFINE_BITMASK_OPS(normalization_flags)
+DNNL_DEFINE_BITMASK_OPS(rnn_flags)
+
+/// A direction of RNN primitive execution.
+///
+/// Every enumerator takes its value from the corresponding C API constant.
+enum class rnn_direction {
+    /// Undefined RNN direction.
+    undef = dnnl_rnn_direction_undef,
+    /// Unidirectional execution of RNN primitive from left to right.
+    unidirectional_left2right = dnnl_unidirectional_left2right,
+    /// Unidirectional execution of RNN primitive from right to left.
+    unidirectional_right2left = dnnl_unidirectional_right2left,
+    /// Bidirectional execution of RNN primitive with concatenation of the
+    /// results.
+    bidirectional_concat = dnnl_bidirectional_concat,
+    /// Bidirectional execution of RNN primitive with summation of the
+    /// results.
+    bidirectional_sum = dnnl_bidirectional_sum,
+};
+
+/// Maps a C++ API RNN direction onto its C API counterpart.
+/// @param dir RNN direction expressed with the C++ API enum.
+/// @returns The same direction expressed as a ::dnnl_rnn_direction_t value.
+inline dnnl_rnn_direction_t convert_to_c(rnn_direction dir) {
+    // A plain cast: the enumerator's numeric value is carried over unchanged.
+    const auto c_direction = static_cast<dnnl_rnn_direction_t>(dir);
+    return c_direction;
+}
+
+/// @} dnnl_api_rnn
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Primitive descriptor query specification.
+///
+/// In general, queries are not used with the C++ API because most queries are
+/// implemented as class members.
+///
+/// Each enumerator takes its value from the identically named
+/// `dnnl_query_*` constant of the C API.
+///
+/// See @ref dnnl_query_t for more information.
+enum class query {
+    /// no query
+    undef = dnnl_query_undef,
+
+    /// execution engine
+    engine = dnnl_query_engine,
+    /// primitive kind
+    primitive_kind = dnnl_query_primitive_kind,
+
+    /// number of inputs expected
+    num_of_inputs_s32 = dnnl_query_num_of_inputs_s32,
+    /// number of outputs expected
+    num_of_outputs_s32 = dnnl_query_num_of_outputs_s32,
+
+    /// runtime estimation (seconds), unimplemented
+    time_estimate_f64 = dnnl_query_time_estimate_f64,
+    /// memory required for scratchpad (bytes)
+    ///
+    /// @sa @ref dev_guide_attributes_scratchpad
+    memory_consumption_s64 = dnnl_query_memory_consumption_s64,
+
+    /// scratchpad engine
+    ///
+    /// engine to be used for creating scratchpad memory
+    scratchpad_engine = dnnl_query_scratchpad_engine,
+
+    /// reorder source engine
+    reorder_src_engine = dnnl_query_reorder_src_engine,
+    /// reorder destination engine
+    reorder_dst_engine = dnnl_query_reorder_dst_engine,
+
+    /// implementation name
+    impl_info_str = dnnl_query_impl_info_str,
+
+    /// propagation kind
+    prop_kind = dnnl_query_prop_kind,
+
+    /// size of cache blob ID in bytes
+    cache_blob_id_size_s64 = dnnl_query_cache_blob_id_size_s64,
+
+    /// cache blob ID (pointer to array)
+    cache_blob_id = dnnl_query_cache_blob_id,
+
+    /// strides
+    strides = dnnl_query_strides,
+    /// dilations
+    dilations = dnnl_query_dilations,
+    /// left padding
+    padding_l = dnnl_query_padding_l,
+    /// right padding
+    padding_r = dnnl_query_padding_r,
+    /// epsilon
+    epsilon_f32 = dnnl_query_epsilon_f32,
+    /// flags
+    flags = dnnl_query_flags,
+    /// algorithm kind
+    alg_kind = dnnl_query_alg_kind,
+    /// alpha
+    alpha_f32 = dnnl_query_alpha_f32,
+    /// beta
+    beta_f32 = dnnl_query_beta_f32,
+    /// axis
+    axis_s32 = dnnl_query_axis_s32,
+    /// LRN parameter local size
+    local_size_s64 = dnnl_query_local_size_s64,
+    /// LRN parameter K
+    k_f32 = dnnl_query_k_f32,
+    /// Reduction parameter P
+    p_f32 = dnnl_query_p_f32,
+    /// Resampling parameter factors
+    factors = dnnl_query_factors,
+    /// RNN parameter cell kind
+    cell_kind = dnnl_query_cell_kind,
+    /// RNN parameter direction
+    direction = dnnl_query_direction,
+    /// RNN parameter activation kind
+    activation_kind = dnnl_query_activation_kind,
+    /// Pooling parameter kernel
+    kernel = dnnl_query_kernel,
+    /// Shuffle parameter group size
+    group_size_s64 = dnnl_query_group_size_s64,
+
+    /// source memory desc
+    src_md = dnnl_query_src_md,
+    /// source gradient (diff) memory desc
+    diff_src_md = dnnl_query_diff_src_md,
+    /// weights memory desc
+    weights_md = dnnl_query_weights_md,
+    /// weights gradient (diff) memory desc
+    diff_weights_md = dnnl_query_diff_weights_md,
+    /// destination memory desc
+    dst_md = dnnl_query_dst_md,
+    /// destination gradient (diff) memory desc
+    diff_dst_md = dnnl_query_diff_dst_md,
+    /// workspace memory desc
+    workspace_md = dnnl_query_workspace_md,
+    /// scratchpad memory desc
+    scratchpad_md = dnnl_query_scratchpad_md,
+    /// memory desc of an execute argument
+    exec_arg_md = dnnl_query_exec_arg_md,
+
+    /// number of dimensions
+    ndims_s32 = dnnl_query_ndims_s32,
+    /// vector of dimensions
+    dims = dnnl_query_dims,
+    /// data type
+    data_type = dnnl_query_data_type,
+    /// submemory offset
+    submemory_offset_s64 = dnnl_query_submemory_offset_s64,
+    /// vector of padded dimensions
+    padded_dims = dnnl_query_padded_dims,
+    /// vector of padded offsets
+    padded_offsets = dnnl_query_padded_offsets,
+    /// format kind
+    format_kind = dnnl_query_format_kind,
+    /// number of innermost blocks
+    inner_nblks_s32 = dnnl_query_inner_nblks_s32,
+    /// vector of sizes of the innermost blocks
+    inner_blks = dnnl_query_inner_blks,
+    /// vector of logical indices of the blocks
+    inner_idxs = dnnl_query_inner_idxs,
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    // The enumerators below exist only when DNNL_EXPERIMENTAL_SPARSE is defined.
+    /// Sparse encoding
+    sparse_encoding = dnnl_query_sparse_encoding,
+    /// Number of non-zero entries
+    nnz_s64 = dnnl_query_nnz_s64,
+    /// Number of buffers required for a memory descriptor
+    num_handles_s32 = dnnl_query_num_handles_s32,
+#endif
+};
+
+/// Maps a C++ API query onto its C API counterpart.
+/// @param aquery Query expressed with the C++ API enum.
+/// @returns The same query expressed as a ::dnnl_query_t value.
+inline dnnl_query_t convert_to_c(query aquery) {
+    // A plain cast: the enumerator's numeric value is carried over unchanged.
+    const auto c_query = static_cast<dnnl_query_t>(aquery);
+    return c_query;
+}
+
+/// @} dnnl_api_primitives_common
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_memory Memory
+///
+/// A container that describes and stores data. Memory objects can contain
+/// data of various types and formats. There are two levels of abstraction:
+///
+/// 1. **Memory descriptor** -- engine-agnostic logical description of data
+///     (number of dimensions, dimension sizes, and data type), and,
+///     optionally, the information about the physical format of data in
+///     memory. If this information is not known yet, a memory descriptor can
+///     be created with #dnnl::memory::format_tag::any. This allows
+///     compute-intensive primitives to choose the best format for
+///     computation. The user is responsible for reordering the data into the
+///     chosen format when formats do not match.
+///
+///     A memory descriptor can be initialized either by specifying dimensions
+///     and a memory format tag or strides for each of them, or by
+///     manipulating the dnnl_memory_desc_t structure directly.
+///
+///     @warning
+///         The latter approach requires understanding how the physical data
+///         representation is mapped to the structure and is discouraged. This
+///         topic is discussed in @ref dev_guide_understanding_memory_formats.
+///
+///     The user can query the amount of memory required by a memory
+///     descriptor using the #dnnl::memory::desc::get_size() function. The
+///     size of data in general cannot be computed as the product of
+///     dimensions multiplied by the size of the data type. So users are
+///     required to use this function for better code portability.
+///
+///     Two memory descriptors can be compared using the equality and
+///     inequality operators.  The comparison is especially useful when
+///     checking whether it is necessary to reorder data from the user's data
+///     format to a primitive's format.
+///
+/// 2. **Memory object** -- an engine-specific object that handles the memory
+///     buffer and its description (a memory descriptor). For the CPU engine or
+///     with USM, the memory buffer handle is simply a pointer to @c void. The
+///     memory buffer can be queried using #dnnl::memory::get_data_handle() and
+///     set using #dnnl::memory::set_data_handle(). The underlying SYCL buffer,
+///     when used, can be queried using #dnnl::sycl_interop::get_buffer and set
+///     using #dnnl::sycl_interop::set_buffer. A memory object can also be
+///     queried for the underlying memory descriptor and for its engine using
+///     #dnnl::memory::get_desc() and #dnnl::memory::get_engine().
+///
+/// Along with ordinary memory descriptors with all dimensions being positive,
+/// the library supports *zero-volume*  memory descriptors with one or more
+/// dimensions set to zero. This is used to support the NumPy\* convention.
+/// If a zero-volume memory is passed to a primitive, the primitive typically
+/// does not perform any computations with this memory. For example:
+///
+/// - A concatenation primitive would ignore all memory objects with zeroes in
+///   the concat dimension / axis.
+///
+/// - A forward convolution with a source memory object with zero in the
+///   minibatch dimension would always produce a destination memory object
+///   with a zero in the minibatch dimension and perform no computations.
+///
+/// - However, a forward convolution with a zero in one of the weights
+///   dimensions is ill-defined and is considered to be an error by the
+///   library because there is no clear definition of what the output values
+///   should be.
+///
+/// Memory buffer of a zero-volume memory is never accessed.
+///
+/// @{
+
+/// Memory object.
+///
+/// A memory object encapsulates a handle to a memory buffer allocated on a
+/// specific engine, tensor dimensions, data type, and memory format, which is
+/// the way tensor indices map to offsets in linear memory space. Memory
+/// objects are passed to primitives during execution.
+struct memory : public handle<dnnl_memory_t> {
+    using handle::handle;
+
+    /// Integer type for representing dimension sizes and indices.
+    typedef dnnl_dim_t dim;
+    /// Vector of dimensions. Implementations are free to force a limit on the
+    /// vector's length.
+    typedef std::vector<dim> dims;
+
+    /// Checks that an `std::vector` of dimensions can be safely converted to
+    /// the C API array ::dnnl_dims_t. The check itself is delegated to
+    /// validate_container_size(). Throws if validation fails.
+    ///
+    /// @param v Vector of dimensions.
+    /// @param min_size Minimum expected size of the vector.
+    template <typename T>
+    static void validate_dims(const std::vector<T> &v, int min_size = 0) {
+        // Message handed to the validator for reporting a failed check.
+        const char *const failure_text = "dimensions are invalid";
+        // NOTE(review): DNNL_MAX_NDIMS is presumably the upper bound on the
+        // vector's length -- confirm against validate_container_size().
+        validate_container_size(v, failure_text, min_size, DNNL_MAX_NDIMS);
+    }
+
+    /// Data type specification.
+    ///
+    /// Every enumerator takes its value from the corresponding C API constant.
+    enum class data_type {
+        /// Undefined data type (used for empty memory descriptors).
+        undef = dnnl_data_type_undef,
+        /// 4-bit float data type with 3-bit exponent and 0-bit mantissa.
+        f4_e3m0 = dnnl_f4_e3m0,
+        /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2-bit exponent and 1-bit mantissa.
+        f4_e2m1 = dnnl_f4_e2m1,
+        /// [MX-compliant 8-bit scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 8-bit exponent.
+        e8m0 = dnnl_e8m0,
+        /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+        /// with a 5-bit exponent and a 2-bit mantissa.
+        f8_e5m2 = dnnl_f8_e5m2,
+        /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+        /// with a 4-bit exponent and a 3-bit mantissa.
+        f8_e4m3 = dnnl_f8_e4m3,
+        /// [16-bit/half-precision floating point](https://en.wikipedia.org/wiki/Half-precision_floating-point_format).
+        f16 = dnnl_f16,
+        /// Non-standard
+        /// [16-bit floating point with 7-bit mantissa](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format).
+        bf16 = dnnl_bf16,
+        /// [32-bit/single-precision floating point](https://en.wikipedia.org/wiki/Single-precision_floating-point_format).
+        f32 = dnnl_f32,
+        /// [64-bit/double-precision floating point](https://en.wikipedia.org/wiki/Double-precision_floating-point_format).
+        f64 = dnnl_f64,
+        /// 32-bit signed integer.
+        s32 = dnnl_s32,
+        /// 8-bit signed integer.
+        s8 = dnnl_s8,
+        /// 8-bit unsigned integer.
+        u8 = dnnl_u8,
+        /// 4-bit signed integer.
+        s4 = dnnl_s4,
+        /// 4-bit unsigned integer.
+        u4 = dnnl_u4,
+    };
+
+    /// Reports how many bytes a data type occupies, as computed by the C API
+    /// function dnnl_data_type_size().
+    /// @param adata_type Data type to measure.
+    /// @returns The number of bytes occupied by data type.
+    static size_t data_type_size(data_type adata_type) {
+        // Translate to the C API enum first, then ask the C API for the size.
+        const auto c_data_type = convert_to_c(adata_type);
+        return dnnl_data_type_size(c_data_type);
+    }
+
+    /// Memory format kind.
+    ///
+    /// Every enumerator takes its value from the corresponding C API constant.
+    enum class format_kind {
+        /// Undefined memory format kind, used for empty memory descriptors.
+        undef = dnnl_format_kind_undef,
+        /// A special format kind that indicates that the actual format will be
+        /// selected by a primitive automatically.
+        any = dnnl_format_kind_any,
+        /// A tensor in a generic format described by the stride and blocking
+        /// values in each dimension.
+        blocked = dnnl_blocked,
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+        /// Format kind for sparse tensors. Only declared when
+        /// DNNL_EXPERIMENTAL_SPARSE is defined.
+        sparse = dnnl_format_kind_sparse,
+#endif
+        /// A special format kind that indicates that tensor format is opaque.
+        opaque = dnnl_format_kind_opaque,
+    };
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    /// Sparse encodings. This enum is declared only when
+    /// DNNL_EXPERIMENTAL_SPARSE is defined; every enumerator takes its value
+    /// from the corresponding C API constant.
+    enum class sparse_encoding {
+            /// Undefined sparse encoding kind, used for empty memory descriptors.
+            undef = dnnl_sparse_encoding_undef,
+            /// Compressed Sparse Row (CSR) encoding.
+            csr = dnnl_csr,
+            /// An encoding that is used for an opaque storage schema for
+            /// tensors with unstructured sparsity. A memory descriptor with the
+            /// packed encoding cannot be used to create a memory object. It can
+            /// only be used to create a primitive descriptor to query the
+            /// actual memory descriptor (similar to the format tag `any`).
+            packed = dnnl_packed,
+            /// Coordinate Sparse (COO) encoding.
+            coo = dnnl_coo,
+    };
+#endif
+
+    /// Memory format tag specification.
+    ///
+    /// Memory format tags can be further divided into two categories:
+    ///
+    ///  - Domain-agnostic names, i.e. names that do not depend on the tensor
+    ///    usage in the specific primitive. These names use letters from `a`
+    ///    to `f` to denote logical dimensions and form the order in which the
+    ///    dimensions are laid in memory. For example,
+    ///    #dnnl::memory::format_tag::ab is used to denote a 2D tensor where the
+    ///    second logical dimension (denoted as `b`) is the innermost, i.e.
+    ///    has stride = 1, and the first logical dimension (`a`) is laid out in
+    ///    memory with stride equal to the size of the second dimension. On the
+    ///    other hand, #dnnl::memory::format_tag::ba is the transposed version
+    ///    of the same tensor: the outermost dimension (`a`) becomes the
+    ///    innermost one.
+    ///
+    ///  - Domain-specific names, i.e. names that make sense only in the
+    ///    context of a certain domain, such as CNN. These names are
+    ///    aliases to the corresponding domain-agnostic tags and used mostly
+    ///    for convenience. For example, #dnnl::memory::format_tag::nc
+    ///    is used to denote 2D CNN activations tensor memory format, where
+    ///    the channels dimension is the innermost one and the batch dimension
+    ///    is the outermost one. Moreover, #dnnl::memory::format_tag::nc is
+    ///    an alias for #dnnl::memory::format_tag::ab, because for
+    ///    CNN primitives the logical dimensions of activations tensors come
+    ///    in order: batch, channels, spatial.  In other words, batch
+    ///    corresponds to the first logical dimension (`a`), and channels
+    ///    correspond to the second one (`b`).
+    ///
+    /// The following domain-specific notation applies to memory format tags:
+    ///  - @c 'n' denotes the mini-batch dimension
+    ///  - @c 'c' denotes a channels dimension
+    ///  - When there are multiple channel dimensions (for example,
+    ///    in convolution weights tensor), @c 'i' and @c 'o' denote dimensions
+    ///    of input and output channels
+    ///  - @c 'g' denotes a groups dimension for convolution weights
+    ///  - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width
+    ///    respectively
+    ///
+    /// See @ref dnnl_format_tag_t for a detailed description.
+    enum class format_tag {
+        /// Undefined memory format tag
+        undef = dnnl_format_tag_undef,
+        /// Placeholder memory format tag. Used to instruct the primitive to
+        /// select a format automatically.
+        any = dnnl_format_tag_any,
+
+        /// plain 1D tensor
+        a = dnnl_a,
+
+        /// plain 2D tensor
+        ab = dnnl_ab,
+        /// permuted 2D tensor
+        ba = dnnl_ba,
+
+        /// plain 3D tensor
+        abc = dnnl_abc,
+        /// permuted 3D tensor
+        acb = dnnl_acb,
+        /// permuted 3D tensor
+        bac = dnnl_bac,
+        /// permuted 3D tensor
+        bca = dnnl_bca,
+        /// permuted 3D tensor
+        cba = dnnl_cba,
+
+        /// plain 4D tensor
+        abcd = dnnl_abcd,
+        /// permuted 4D tensor
+        abdc = dnnl_abdc,
+        /// permuted 4D tensor
+        acbd = dnnl_acbd,
+        /// permuted 4D tensor
+        acdb = dnnl_acdb,
+        /// permuted 4D tensor
+        adbc = dnnl_adbc,
+        /// permuted 4D tensor
+        bacd = dnnl_bacd,
+        /// permuted 4D tensor
+        bcda = dnnl_bcda,
+        /// permuted 4D tensor
+        cdba = dnnl_cdba,
+        /// permuted 4D tensor
+        dcab = dnnl_dcab,
+
+        /// plain 5D tensor
+        abcde = dnnl_abcde,
+        /// permuted 5D tensor
+        abdec = dnnl_abdec,
+        /// permuted 5D tensor
+        acbde = dnnl_acbde,
+        /// permuted 5D tensor
+        acdeb = dnnl_acdeb,
+        /// permuted 5D tensor
+        bacde = dnnl_bacde,
+        /// permuted 5D tensor
+        bcdea = dnnl_bcdea,
+        /// permuted 5D tensor
+        cdeba = dnnl_cdeba,
+        /// permuted 5D tensor
+        decab = dnnl_decab,
+        /// permuted 5D tensor
+        abced = dnnl_abced,
+
+        /// plain 6D tensor
+        abcdef = dnnl_abcdef,
+        /// permuted 6D tensor
+        abdfce = dnnl_abdfce,
+        /// permuted 6D tensor
+        acbdef = dnnl_acbdef,
+        /// permuted 6D tensor
+        abdefc = dnnl_abdefc,
+        /// permuted 6D tensor
+        defcab = dnnl_defcab,
+        /// permuted 6D tensor
+        abcdfe = dnnl_abcdfe,
+
+        /// plain 7D tensor
+        abcdefg = dnnl_abcdefg,
+        /// permuted 7D tensor
+        abcdegf = dnnl_abcdegf,
+
+        /// plain 8D tensor
+        abcdefgh = dnnl_abcdefgh,
+        /// permuted 8D tensor
+        abcdefhg = dnnl_abcdefhg,
+
+        /// plain 9D tensor
+        abcdefghi = dnnl_abcdefghi,
+        /// permuted 9D tensor
+        abcdefgih = dnnl_abcdefgih,
+
+        /// plain 10D tensor
+        abcdefghij = dnnl_abcdefghij,
+        /// permuted 10D tensor
+        abcdefghji = dnnl_abcdefghji,
+
+        /// plain 11D tensor
+        abcdefghijk = dnnl_abcdefghijk,
+        /// permuted 11D tensor
+        abcdefghikj = dnnl_abcdefghikj,
+
+        /// plain 12D tensor
+        abcdefghijkl = dnnl_abcdefghijkl,
+        /// permuted 12D tensor
+        abcdefghijlk = dnnl_abcdefghijlk,
+
+        /// 1D tensor; an alias for #dnnl::memory::format_tag::a
+        x = a,
+        /// 2D CNN activations tensor; an alias for #dnnl::memory::format_tag::ab
+        nc = ab,
+        /// 2D CNN activations tensor; an alias for #dnnl::memory::format_tag::ba
+        cn = ba,
+        /// 2D RNN statistics tensor; an alias for #dnnl::memory::format_tag::ab
+        tn = ab,
+        /// 2D RNN statistics tensor; an alias for #dnnl::memory::format_tag::ba
+        nt = ba,
+        /// 3D CNN activations tensor; an alias for #dnnl::memory::format_tag::abc
+        ncw = abc,
+        /// 3D CNN activations tensor; an alias for #dnnl::memory::format_tag::acb
+        nwc = acb,
+        /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::abcd
+        nchw = abcd,
+        /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::acdb
+        nhwc = acdb,
+        /// 4D CNN activations tensor; an alias for #dnnl::memory::format_tag::bcda
+        chwn = bcda,
+        /// 5D CNN activations tensor; an alias for #dnnl::memory::format_tag::abcde
+        ncdhw = abcde,
+        /// 5D CNN activations tensor; an alias for #dnnl::memory::format_tag::acdeb
+        ndhwc = acdeb,
+
+        /// 2D CNN weights tensor; an alias for #dnnl::memory::format_tag::ab
+        oi = ab,
+        /// 2D CNN weights tensor; an alias for #dnnl::memory::format_tag::ba
+        io = ba,
+        /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::abc
+        oiw = abc,
+        /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::acb
+        owi = acb,
+        /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::cba
+        wio = cba,
+        /// 3D CNN weights tensor; an alias for #dnnl::memory::format_tag::bca
+        iwo = bca,
+        /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::abcd
+        oihw = abcd,
+        /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::cdba
+        hwio = cdba,
+        /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::acdb
+        ohwi = acdb,
+        /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::bcda
+        ihwo = bcda,
+        /// 4D CNN weights tensor; an alias for #dnnl::memory::format_tag::bacd
+        iohw = bacd,
+        /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::abcde
+        oidhw = abcde,
+        /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::cdeba
+        dhwio = cdeba,
+        /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::acdeb
+        odhwi = acdeb,
+        /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::bacde
+        iodhw = bacde,
+        /// 5D CNN weights tensor; an alias for #dnnl::memory::format_tag::bcdea
+        idhwo = bcdea,
+
+        /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcd
+        goiw = abcd,
+        /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdc
+        gowi = abdc,
+        /// 4D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::dcab
+        wigo = dcab,
+        /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdec
+        gohwi = abdec,
+        /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcde
+        goihw = abcde,
+        /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::decab
+        hwigo = decab,
+        /// 5D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::acbde
+        giohw = acbde,
+        /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abcdef
+        goidhw = abcdef,
+        /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::acbdef
+        giodhw = acbdef,
+        /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::abdefc
+        godhwi = abdefc,
+        /// 6D CNN weights tensor with groups; an alias for #dnnl::memory::format_tag::defcab
+        dhwigo = defcab,
+
+        /// 3D RNN data tensor in the format (seq_length, batch, input
+        /// channels); an alias for #dnnl::memory::format_tag::abc.
+        tnc = abc,
+        /// 3D RNN data tensor in the format (batch, seq_length, input
+        /// channels); an alias for #dnnl::memory::format_tag::bac.
+        ntc = bac,
+        /// 4D RNN states tensor in the format (num_layers, num_directions,
+        /// batch, state channels); an alias for #dnnl::memory::format_tag::abcd.
+        ldnc = abcd,
+        /// 5D RNN weights tensor in the format (num_layers, num_directions,
+        /// input_channels, num_gates, output_channels);
+        /// an alias for #dnnl::memory::format_tag::abcde.
+        ///
+        ///  - For LSTM cells, the gates order is input, forget, candidate
+        ///    and output gate.
+        ///  - For GRU cells, the gates order is update, reset and output gate.
+        ldigo = abcde,
+        /// 5D RNN weights tensor in the format (num_layers, num_directions,
+        /// num_gates, output_channels, input_channels);
+        /// an alias for #dnnl::memory::format_tag::abdec.
+        ///
+        ///  - For LSTM cells, the gates order is input, forget, candidate
+        ///    and output gate.
+        ///  - For GRU cells, the gates order is update, reset and output gate.
+        ldgoi = abdec,
+        /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+        /// num_channels_in_hidden_state, num_channels_in_recurrent_projection);
+        /// an alias for #dnnl::memory::format_tag::abcd.
+        ldio = abcd,
+        /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+        /// num_channels_in_recurrent_projection, num_channels_in_hidden_state);
+        /// an alias for #dnnl::memory::format_tag::abdc.
+        ldoi = abdc,
+        /// 4D RNN bias tensor in the format (num_layers, num_directions,
+        /// num_gates, output_channels);
+        /// an alias for #dnnl::memory::format_tag::abcd.
+        ///
+        ///  - For LSTM cells, the gates order is input, forget, candidate
+        ///    and output gate.
+        ///  - For GRU cells, the gates order is update, reset and output gate.
+        ldgo = abcd,
+
+        // Opaque blocked formats
+
+        AB16b16a = dnnl_AB16b16a,
+        AB16b32a = dnnl_AB16b32a,
+        AB16b48a = dnnl_AB16b48a,
+        AB16b64a = dnnl_AB16b64a,
+        AB8b16a2b = dnnl_AB8b16a2b,
+        AB8b32a2b = dnnl_AB8b32a2b,
+        AB8b64a2b = dnnl_AB8b64a2b,
+        AB4b16a4b = dnnl_AB4b16a4b,
+        AB4b32a4b = dnnl_AB4b32a4b,
+        AB4b64a4b = dnnl_AB4b64a4b,
+        AB16b16a4b = dnnl_AB16b16a4b,
+        AB16b32a4b = dnnl_AB16b32a4b,
+        AB16b48a4b = dnnl_AB16b48a4b,
+        AB16b64a4b = dnnl_AB16b64a4b,
+        AB16b16a2b = dnnl_AB16b16a2b,
+        AB16b32a2b = dnnl_AB16b32a2b,
+        AB16b48a2b = dnnl_AB16b48a2b,
+        AB16b64a2b = dnnl_AB16b64a2b,
+        Ab4a = dnnl_Ab4a,
+        Ab8a = dnnl_Ab8a,
+        Ab32a = dnnl_Ab32a,
+        Abc16a = dnnl_Abc16a,
+        ABc16a16b = dnnl_ABc16a16b,
+        ABc4a4b = dnnl_ABc4a4b,
+        aBc16b = dnnl_aBc16b,
+        aBc32b = dnnl_aBc32b,
+        ABc16b16a = dnnl_ABc16b16a,
+        AcB16b16a = dnnl_AcB16b16a,
+        ABc16b32a = dnnl_ABc16b32a,
+        AcB16b32a = dnnl_AcB16b32a,
+        ABc16b48a = dnnl_ABc16b48a,
+        AcB16b48a = dnnl_AcB16b48a,
+        ABc16b64a = dnnl_ABc16b64a,
+        AcB16b64a = dnnl_AcB16b64a,
+        Abc4a = dnnl_Abc4a,
+        aBc4b = dnnl_aBc4b,
+        ABc4b16a4b = dnnl_ABc4b16a4b,
+        AcB4b16a4b = dnnl_AcB4b16a4b,
+        ABc4b32a4b = dnnl_ABc4b32a4b,
+        AcB4b32a4b = dnnl_AcB4b32a4b,
+        ABc4b64a4b = dnnl_ABc4b64a4b,
+        AcB4b64a4b = dnnl_AcB4b64a4b,
+        ABc2b8a4b = dnnl_ABc2b8a4b,
+        ABc16a16b2a = dnnl_ABc16a16b2a,
+        ABc16b16a4b = dnnl_ABc16b16a4b,
+        ABc16b32a4b = dnnl_ABc16b32a4b,
+        ABc16b48a4b = dnnl_ABc16b48a4b,
+        ABc16b64a4b = dnnl_ABc16b64a4b,
+        ABc16b16a2b = dnnl_ABc16b16a2b,
+        ABc16b32a2b = dnnl_ABc16b32a2b,
+        ABc16b48a2b = dnnl_ABc16b48a2b,
+        ABc16b64a2b = dnnl_ABc16b64a2b,
+        ABc4b4a = dnnl_ABc4b4a,
+        ABc8a16b2a = dnnl_ABc8a16b2a,
+        ABc8a8b = dnnl_ABc8a8b,
+        ABc8a4b = dnnl_ABc8a4b,
+        aBc8b = dnnl_aBc8b,
+        ABc8b16a2b = dnnl_ABc8b16a2b,
+        AcB8b16a2b = dnnl_AcB8b16a2b,
+        ABc8b32a2b = dnnl_ABc8b32a2b,
+        AcB8b32a2b = dnnl_AcB8b32a2b,
+        ABc8b64a2b = dnnl_ABc8b64a2b,
+        AcB8b64a2b = dnnl_AcB8b64a2b,
+        ABc8b8a = dnnl_ABc8b8a,
+        AcB8b8a = dnnl_AcB8b8a,
+        Abcd8a = dnnl_Abcd8a,
+        Abcd16a = dnnl_Abcd16a,
+        Abcd32a = dnnl_Abcd32a,
+        ABcd16a16b = dnnl_ABcd16a16b,
+        aBcd16b = dnnl_aBcd16b,
+        aBcd32b = dnnl_aBcd32b,
+        ABcd16b16a = dnnl_ABcd16b16a,
+        AcdB16b16a = dnnl_AcdB16b16a,
+        ABcd16b32a = dnnl_ABcd16b32a,
+        AcdB16b32a = dnnl_AcdB16b32a,
+        ABcd16b48a = dnnl_ABcd16b48a,
+        AcdB16b48a = dnnl_AcdB16b48a,
+        ABcd16b64a = dnnl_ABcd16b64a,
+        AcdB16b64a = dnnl_AcdB16b64a,
+        aBCd16b16c = dnnl_aBCd16b16c,
+        aBCd16c16b = dnnl_aBCd16c16b,
+        Abcd4a = dnnl_Abcd4a,
+        aBcd4b = dnnl_aBcd4b,
+        ABcd4b16a4b = dnnl_ABcd4b16a4b,
+        AcdB4b16a4b = dnnl_AcdB4b16a4b,
+        ABcd4b32a4b = dnnl_ABcd4b32a4b,
+        AcdB4b32a4b = dnnl_AcdB4b32a4b,
+        ABcd4b64a4b = dnnl_ABcd4b64a4b,
+        AcdB4b64a4b = dnnl_AcdB4b64a4b,
+        ABcd2b8a4b = dnnl_ABcd2b8a4b,
+        ABcd4b4a = dnnl_ABcd4b4a,
+        ABcd4a4b = dnnl_ABcd4a4b,
+        aBCd4c16b4c = dnnl_aBCd4c16b4c,
+        aBCd2c8b4c = dnnl_aBCd2c8b4c,
+        ABcd16a16b2a = dnnl_ABcd16a16b2a,
+        ABcd16b16a4b = dnnl_ABcd16b16a4b,
+        ABcd16b32a4b = dnnl_ABcd16b32a4b,
+        ABcd16b48a4b = dnnl_ABcd16b48a4b,
+        ABcd16b64a4b = dnnl_ABcd16b64a4b,
+        ABcd16b16a2b = dnnl_ABcd16b16a2b,
+        ABcd16b32a2b = dnnl_ABcd16b32a2b,
+        ABcd16b48a2b = dnnl_ABcd16b48a2b,
+        ABcd16b64a2b = dnnl_ABcd16b64a2b,
+        aBCd16b16c2b = dnnl_aBCd16b16c2b,
+        aBCd16c16b4c = dnnl_aBCd16c16b4c,
+        aBCd16c16b2c = dnnl_aBCd16c16b2c,
+        aBCd4c4b = dnnl_aBCd4c4b,
+        aBCd4b4c = dnnl_aBCd4b4c,
+        ABcd8a16b2a = dnnl_ABcd8a16b2a,
+        ABcd8a8b = dnnl_ABcd8a8b,
+        ABcd8a4b = dnnl_ABcd8a4b,
+        ABcd8a2b = dnnl_ABcd8a2b,
+        /// 4D tensor blocked by 2nd dimension with block size 8
+        aBcd8b = dnnl_aBcd8b,
+        ABcd8b16a2b = dnnl_ABcd8b16a2b,
+        AcdB8b16a2b = dnnl_AcdB8b16a2b,
+        ABcd8b32a2b = dnnl_ABcd8b32a2b,
+        AcdB8b32a2b = dnnl_AcdB8b32a2b,
+        ABcd8b64a2b = dnnl_ABcd8b64a2b,
+        AcdB8b64a2b = dnnl_AcdB8b64a2b,
+        aBCd8b16c2b = dnnl_aBCd8b16c2b,
+        /// 4D tensor blocked by 1st and 2nd dimension with block size 8
+        ABcd8b8a = dnnl_ABcd8b8a,
+        AcdB8b8a = dnnl_AcdB8b8a,
+        aBCd8b8c = dnnl_aBCd8b8c,
+        aBCd8b4c = dnnl_aBCd8b4c,
+        aBCd8c16b2c = dnnl_aBCd8c16b2c,
+        aBCd8c8b = dnnl_aBCd8c8b,
+        Abcde16a = dnnl_Abcde16a,
+        Abcde32a = dnnl_Abcde32a,
+        ABcde16a16b = dnnl_ABcde16a16b,
+        aBcde16b = dnnl_aBcde16b,
+        aBcde32b = dnnl_aBcde32b,
+        ABcde16b16a = dnnl_ABcde16b16a,
+        AcdeB16b16a = dnnl_AcdeB16b16a,
+        ABcde16b32a = dnnl_ABcde16b32a,
+        AcdeB16b32a = dnnl_AcdeB16b32a,
+        ABcde16b48a = dnnl_ABcde16b48a,
+        AcdeB16b48a = dnnl_AcdeB16b48a,
+        ABcde16b64a = dnnl_ABcde16b64a,
+        AcdeB16b64a = dnnl_AcdeB16b64a,
+        aBCde16b16c = dnnl_aBCde16b16c,
+        aBCde16c16b = dnnl_aBCde16c16b,
+        aBCde2c8b4c = dnnl_aBCde2c8b4c,
+        Abcde4a = dnnl_Abcde4a,
+        aBcde4b = dnnl_aBcde4b,
+        ABcde4b4a = dnnl_ABcde4b4a,
+        ABcde4a4b = dnnl_ABcde4a4b,
+        aBCde4b4c = dnnl_aBCde4b4c,
+        aBCde4c16b4c = dnnl_aBCde4c16b4c,
+        aBCde16b16c2b = dnnl_aBCde16b16c2b,
+        aBCde16c16b4c = dnnl_aBCde16c16b4c,
+        aBCde16c16b2c = dnnl_aBCde16c16b2c,
+        aBCdef16c16b2c = dnnl_aBCdef16c16b2c,
+        aBCde4c4b = dnnl_aBCde4c4b,
+        Abcde8a = dnnl_Abcde8a,
+        ABcde8a8b = dnnl_ABcde8a8b,
+        ABcde8a4b = dnnl_ABcde8a4b,
+        aBcde8b = dnnl_aBcde8b,
+        ABcde8b16a2b = dnnl_ABcde8b16a2b,
+        AcdeB8b16a2b = dnnl_AcdeB8b16a2b,
+        ABcde8b32a2b = dnnl_ABcde8b32a2b,
+        AcdeB8b32a2b = dnnl_AcdeB8b32a2b,
+        ABcde8b64a2b = dnnl_ABcde8b64a2b,
+        AcdeB8b64a2b = dnnl_AcdeB8b64a2b,
+        ABcde4b16a4b = dnnl_ABcde4b16a4b,
+        AcdeB4b16a4b = dnnl_AcdeB4b16a4b,
+        ABcde4b32a4b = dnnl_ABcde4b32a4b,
+        AcdeB4b32a4b = dnnl_AcdeB4b32a4b,
+        ABcde4b64a4b = dnnl_ABcde4b64a4b,
+        AcdeB4b64a4b = dnnl_AcdeB4b64a4b,
+        ABcde16b16a4b = dnnl_ABcde16b16a4b,
+        ABcde16b32a4b = dnnl_ABcde16b32a4b,
+        ABcde16b48a4b = dnnl_ABcde16b48a4b,
+        ABcde16b64a4b = dnnl_ABcde16b64a4b,
+        ABcde16b16a2b = dnnl_ABcde16b16a2b,
+        ABcde16b32a2b = dnnl_ABcde16b32a2b,
+        ABcde16b48a2b = dnnl_ABcde16b48a2b,
+        ABcde16b64a2b = dnnl_ABcde16b64a2b,
+        ABcde2b8a4b = dnnl_ABcde2b8a4b,
+        aBCde8b16c2b = dnnl_aBCde8b16c2b,
+        ABcde8b8a = dnnl_ABcde8b8a,
+        AcdeB8b8a = dnnl_AcdeB8b8a,
+        aBCde8b8c = dnnl_aBCde8b8c,
+        aBCde8b4c = dnnl_aBCde8b4c,
+        ABcd4a8b8a4b = dnnl_ABcd4a8b8a4b,
+        ABcd2a8b8a2b = dnnl_ABcd2a8b8a2b,
+        aBCde4b8c8b4c = dnnl_aBCde4b8c8b4c,
+        aBCde2b8c8b2c = dnnl_aBCde2b8c8b2c,
+        aBCde8c16b2c = dnnl_aBCde8c16b2c,
+        aBCde8c8b = dnnl_aBCde8c8b,
+        aBcdef16b = dnnl_aBcdef16b,
+        aBCdef16b16c = dnnl_aBCdef16b16c,
+        aBCdef16c16b = dnnl_aBCdef16c16b,
+        aBcdef4b = dnnl_aBcdef4b,
+        aBCdef2c8b4c = dnnl_aBCdef2c8b4c,
+        aBCdef4c4b = dnnl_aBCdef4c4b,
+        aBCdef4b4c = dnnl_aBCdef4b4c,
+        aBCdef8b8c = dnnl_aBCdef8b8c,
+        aBCdef8b4c = dnnl_aBCdef8b4c,
+        aBCdef8c16b2c = dnnl_aBCdef8c16b2c,
+        aBCdef4c16b4c = dnnl_aBCdef4c16b4c,
+        aBCdef8c8b = dnnl_aBCdef8c8b,
+        aBdc16b = dnnl_aBdc16b,
+        aBdc4b = dnnl_aBdc4b,
+        aBdc8b = dnnl_aBdc8b,
+        aBdC8b2c = dnnl_aBdC8b2c,
+        aBdC8b4c = dnnl_aBdC8b4c,
+        aBdec16b = dnnl_aBdec16b,
+        aBdec4b = dnnl_aBdec4b,
+        aBdec8b = dnnl_aBdec8b,
+        aBdeC8b2c = dnnl_aBdeC8b2c,
+        aBdeC8b4c = dnnl_aBdeC8b4c,
+        aBdefc16b = dnnl_aBdefc16b,
+        aCBdef16c16b = dnnl_aCBdef16c16b,
+        aCBdef8b8c = dnnl_aCBdef8b8c,
+        aCBdef16b16c = dnnl_aCBdef16b16c,
+        aBdefc4b = dnnl_aBdefc4b,
+        aBdefc8b = dnnl_aBdefc8b,
+        aBdefC8b2c = dnnl_aBdefC8b2c,
+        aBdefC8b4c = dnnl_aBdefC8b4c,
+        Acb16a = dnnl_Acb16a,
+        Acb4a = dnnl_Acb4a,
+        Acb8a = dnnl_Acb8a,
+        AcB8a2b = dnnl_AcB8a2b,
+        AcB8a4b = dnnl_AcB8a4b,
+        aCBd8b8c = dnnl_aCBd8b8c,
+        aCBd16b16c = dnnl_aCBd16b16c,
+        aCBd16c16b = dnnl_aCBd16c16b,
+        aCBde8b8c = dnnl_aCBde8b8c,
+        aCBde16b16c = dnnl_aCBde16b16c,
+        aCBde16c16b = dnnl_aCBde16c16b,
+        Acdb16a = dnnl_Acdb16a,
+        Acdb4a = dnnl_Acdb4a,
+        Acdb8a = dnnl_Acdb8a,
+        AcdB8a2b = dnnl_AcdB8a2b,
+        AcdB8a4b = dnnl_AcdB8a4b,
+        Acdeb16a = dnnl_Acdeb16a,
+        Acdeb4a = dnnl_Acdeb4a,
+        Acdeb8a = dnnl_Acdeb8a,
+        AcdeB8a2b = dnnl_AcdeB8a2b,
+        AcdeB8a4b = dnnl_AcdeB8a4b,
+        BAc8a8b = dnnl_BAc8a8b,
+        BAc16a16b = dnnl_BAc16a16b,
+        BAc16b16a = dnnl_BAc16b16a,
+        BAcd8a8b = dnnl_BAcd8a8b,
+        BAcd16a16b = dnnl_BAcd16a16b,
+        BAcd16b16a = dnnl_BAcd16b16a,
+        ABcd32a32b = dnnl_ABcd32a32b,
+        BAcde16b16a = dnnl_BAcde16b16a,
+        BAcde8a8b = dnnl_BAcde8a8b,
+        BAcde16a16b = dnnl_BAcde16a16b,
+        aBdec32b = dnnl_aBdec32b,
+        Abcdef16a = dnnl_Abcdef16a,
+        Abcdef32a = dnnl_Abcdef32a,
+        Acdb32a = dnnl_Acdb32a,
+        aBCd2b4c2b = dnnl_aBCd2b4c2b,
+        aBCde2b4c2b = dnnl_aBCde2b4c2b,
+        aBCdef2b4c2b = dnnl_aBCdef2b4c2b,
+        aBCd2c4b2c = dnnl_aBCd2c4b2c,
+        aBCde2c4b2c = dnnl_aBCde2c4b2c,
+        aBCdef2c4b2c = dnnl_aBCdef2c4b2c,
+        aBCd4b8c2b = dnnl_aBCd4b8c2b,
+        aBCde4b8c2b = dnnl_aBCde4b8c2b,
+        aBCdef4b8c2b = dnnl_aBCdef4b8c2b,
+        aBCd4c8b2c = dnnl_aBCd4c8b2c,
+        aBCde4c8b2c = dnnl_aBCde4c8b2c,
+        aBCdef4c8b2c = dnnl_aBCdef4c8b2c,
+        AB32a32b8a4b = dnnl_AB32a32b8a4b,
+        AB32a32b8a2b = dnnl_AB32a32b8a2b,
+        AB8a4b = dnnl_AB8a4b,
+        AB8a2b = dnnl_AB8a2b,
+        abDc16d = dnnl_abDc16d,
+        abDc32d = dnnl_abDc32d,
+        abDC16d4c = dnnl_abDC16d4c,
+        abDC32d4c = dnnl_abDC32d4c,
+        abCd32c = dnnl_abCd32c,
+        abdEc16e = dnnl_abdEc16e,
+        abdEc32e = dnnl_abdEc32e,
+        abdEC16e4c = dnnl_abdEC16e4c,
+        abdEC32e2c = dnnl_abdEC32e2c,
+        abdEC32e4c = dnnl_abdEC32e4c,
+        abdCe16c = dnnl_abdCe16c,
+        abdCe32c = dnnl_abdCe32c,
+        abdCE32c2e = dnnl_abdCE32c2e,
+        aBCdef16c16b4c = dnnl_aBCdef16c16b4c,
+        aBdC16b4c = dnnl_aBdC16b4c,
+        aBdeC16b4c = dnnl_aBdeC16b4c,
+        AcB16a4b = dnnl_AcB16a4b,
+        AcdB16a2b = dnnl_AcdB16a2b,
+        aBdefC16b4c = dnnl_aBdefC16b4c,
+        AcdeB16a4b = dnnl_AcdeB16a4b,
+
+        Acb32a = dnnl_Acb32a,
+        AcB32a2b = dnnl_AcB32a2b,
+        AcB32a4b = dnnl_AcB32a4b,
+        Acb48a = dnnl_Acb48a,
+        AcB48a2b = dnnl_AcB48a2b,
+        AcB48a4b = dnnl_AcB48a4b,
+        Acb64a = dnnl_Acb64a,
+        AcB64a2b = dnnl_AcB64a2b,
+        AcB64a4b = dnnl_AcB64a4b,
+        cBa2b = dnnl_cBa2b,
+        cBa4b = dnnl_cBa4b,
+        aBdc32b = dnnl_aBdc32b,
+        aBdC32b2c = dnnl_aBdC32b2c,
+        aBdC32b4c = dnnl_aBdC32b4c,
+        aBdc48b = dnnl_aBdc48b,
+        aBdC48b2c = dnnl_aBdC48b2c,
+        aBdC48b4c = dnnl_aBdC48b4c,
+        aBdc64b = dnnl_aBdc64b,
+        aBdC64b2c = dnnl_aBdC64b2c,
+        aBdC64b4c = dnnl_aBdC64b4c,
+        adcb = dnnl_adcb,
+        adCb2c = dnnl_adCb2c,
+        adCb4c = dnnl_adCb4c,
+        AcdB32a2b = dnnl_AcdB32a2b,
+        AcdB32a4b = dnnl_AcdB32a4b,
+        Acdb48a = dnnl_Acdb48a,
+        AcdB48a2b = dnnl_AcdB48a2b,
+        AcdB48a4b = dnnl_AcdB48a4b,
+        Acdb64a = dnnl_Acdb64a,
+        AcdB64a2b = dnnl_AcdB64a2b,
+        AcdB64a4b = dnnl_AcdB64a4b,
+        cdBa2b = dnnl_cdBa2b,
+        cdBa4b = dnnl_cdBa4b,
+        aBdeC32b2c = dnnl_aBdeC32b2c,
+        aBdeC32b4c = dnnl_aBdeC32b4c,
+        aBdec48b = dnnl_aBdec48b,
+        aBdeC48b2c = dnnl_aBdeC48b2c,
+        aBdeC48b4c = dnnl_aBdeC48b4c,
+        aBdec64b = dnnl_aBdec64b,
+        aBdeC64b2c = dnnl_aBdeC64b2c,
+        aBdeC64b4c = dnnl_aBdeC64b4c,
+        adecb = dnnl_adecb,
+        adeCb2c = dnnl_adeCb2c,
+        adeCb4c = dnnl_adeCb4c,
+        Acdeb32a = dnnl_Acdeb32a,
+        AcdeB32a2b = dnnl_AcdeB32a2b,
+        AcdeB32a4b = dnnl_AcdeB32a4b,
+        Acdeb48a = dnnl_Acdeb48a,
+        AcdeB48a2b = dnnl_AcdeB48a2b,
+        AcdeB48a4b = dnnl_AcdeB48a4b,
+        Acdeb64a = dnnl_Acdeb64a,
+        AcdeB64a2b = dnnl_AcdeB64a2b,
+        AcdeB64a4b = dnnl_AcdeB64a4b,
+        cdeBa2b = dnnl_cdeBa2b,
+        cdeBa4b = dnnl_cdeBa4b,
+        aBdefc32b = dnnl_aBdefc32b,
+        aBdefC32b2c = dnnl_aBdefC32b2c,
+        aBdefC32b4c = dnnl_aBdefC32b4c,
+        aBdefc48b = dnnl_aBdefc48b,
+        aBdefC48b2c = dnnl_aBdefC48b2c,
+        aBdefC48b4c = dnnl_aBdefC48b4c,
+        aBdefc64b = dnnl_aBdefc64b,
+        aBdefC64b2c = dnnl_aBdefC64b2c,
+        aBdefC64b4c = dnnl_aBdefC64b4c,
+        adefcb = dnnl_adefcb,
+        adefCb2c = dnnl_adefCb2c,
+        adefCb4c = dnnl_adefCb4c,
+        ABc32a32b = dnnl_ABc32a32b,
+        BAc8a16b2a = dnnl_BAc8a16b2a,
+        BAcd8a16b2a = dnnl_BAcd8a16b2a,
+        ABcde8a16b2a = dnnl_ABcde8a16b2a,
+        aCBd8b16c2b = dnnl_aCBd8b16c2b,
+        BAcde8a16b2a = dnnl_BAcde8a16b2a,
+        aCBde8b16c2b = dnnl_aCBde8b16c2b,
+        ABcde32a32b = dnnl_ABcde32a32b,
+        ABc4a8b8a4b = dnnl_ABc4a8b8a4b,
+        ABcde4a8b8a4b = dnnl_ABcde4a8b8a4b,
+        BAc4b8a8b4a = dnnl_BAc4b8a8b4a,
+        BAcd4b8a8b4a = dnnl_BAcd4b8a8b4a,
+        BAcde4b8a8b4a = dnnl_BAcde4b8a8b4a,
+        aBCd4b8c8b4c = dnnl_aBCd4b8c8b4c,
+        aBCdef4b8c8b4c = dnnl_aBCdef4b8c8b4c,
+        aBCdef8b16c2b = dnnl_aBCdef8b16c2b,
+        aCBdef8b16c2b = dnnl_aCBdef8b16c2b,
+        aBdC16b2c = dnnl_aBdC16b2c,
+        aBdeC16b2c = dnnl_aBdeC16b2c,
+        aBdefC16b2c = dnnl_aBdefC16b2c,
+        aBedc16b = dnnl_aBedc16b,
+        AcB16a2b = dnnl_AcB16a2b,
+        AcdB16a4b = dnnl_AcdB16a4b,
+        AcdeB16a2b = dnnl_AcdeB16a2b,
+        Adcb16a = dnnl_Adcb16a,
+        aCBd4c8b8c4b = dnnl_aCBd4c8b8c4b,
+        aCBde4c8b8c4b = dnnl_aCBde4c8b8c4b,
+        aCBdef4c8b8c4b = dnnl_aCBdef4c8b8c4b,
+        ABc32a16b = dnnl_ABc32a16b,
+        ABcd16a32b = dnnl_ABcd16a32b,
+        ABcd32a16b = dnnl_ABcd32a16b,
+        ABcde32a16b = dnnl_ABcde32a16b,
+        AB48a16b = dnnl_AB48a16b,
+        AB48a32b = dnnl_AB48a32b,
+        ABc40a16b = dnnl_ABc40a16b,
+        ABc40a32b = dnnl_ABc40a32b,
+        aBC48b16c = dnnl_aBC48b16c,
+        aBC48b32c = dnnl_aBC48b32c,
+        ABcd40a16b = dnnl_ABcd40a16b,
+        ABcd40a32b = dnnl_ABcd40a32b,
+        BA16a16b = dnnl_BA16a16b,
+        BA16a32b = dnnl_BA16a32b,
+        BA16a48b = dnnl_BA16a48b,
+        BA16a64b = dnnl_BA16a64b,
+        BA16a16b2a = dnnl_BA16a16b2a,
+        BA16a32b2a = dnnl_BA16a32b2a,
+        BA16a48b2a = dnnl_BA16a48b2a,
+        BA16a64b2a = dnnl_BA16a64b2a,
+        BA16a16b4a = dnnl_BA16a16b4a,
+        BA16a32b4a = dnnl_BA16a32b4a,
+        BA16a48b4a = dnnl_BA16a48b4a,
+        BA16a64b4a = dnnl_BA16a64b4a,
+        decbA16a = dnnl_decbA16a,
+        decbA8a = dnnl_decbA8a,
+        defcbA16a = dnnl_defcbA16a,
+        defcbA8a = dnnl_defcbA8a,
+        aCB16b16c = dnnl_aCB16b16c,
+        aCB16b32c = dnnl_aCB16b32c,
+        aCB16b48c = dnnl_aCB16b48c,
+        aCB16b64c = dnnl_aCB16b64c,
+        aCB16b16c2b = dnnl_aCB16b16c2b,
+        aCB16b32c2b = dnnl_aCB16b32c2b,
+        aCB16b48c2b = dnnl_aCB16b48c2b,
+        aCB16b64c2b = dnnl_aCB16b64c2b,
+        aCB16b16c4b = dnnl_aCB16b16c4b,
+        aCB16b32c4b = dnnl_aCB16b32c4b,
+        aCB16b48c4b = dnnl_aCB16b48c4b,
+        aCB16b64c4b = dnnl_aCB16b64c4b,
+        Acb24a = dnnl_Acb24a,
+        Acdb24a = dnnl_Acdb24a,
+        Acdeb24a = dnnl_Acdeb24a,
+        aBdc24b = dnnl_aBdc24b,
+        aBdec24b = dnnl_aBdec24b,
+        aBdefc24b = dnnl_aBdefc24b,
+        AcB24a2b = dnnl_AcB24a2b,
+        AcdB24a2b = dnnl_AcdB24a2b,
+        AcdeB24a2b = dnnl_AcdeB24a2b,
+        aBdC24b2c = dnnl_aBdC24b2c,
+        aBdeC24b2c = dnnl_aBdeC24b2c,
+        aBdefC24b2c = dnnl_aBdefC24b2c,
+        AcB24a4b = dnnl_AcB24a4b,
+        AcdB24a4b = dnnl_AcdB24a4b,
+        AcdeB24a4b = dnnl_AcdeB24a4b,
+        aBdC24b4c = dnnl_aBdC24b4c,
+        aBdeC24b4c = dnnl_aBdeC24b4c,
+        aBdefC24b4c = dnnl_aBdefC24b4c,
+        AB8b32a = dnnl_AB8b32a,
+        ABc8b32a = dnnl_ABc8b32a,
+        AcB8b32a = dnnl_AcB8b32a,
+        ABcd8b32a = dnnl_ABcd8b32a,
+        AcdB8b32a = dnnl_AcdB8b32a,
+        ABcde8b32a = dnnl_ABcde8b32a,
+        AcdeB8b32a = dnnl_AcdeB8b32a,
+        AB8b24a = dnnl_AB8b24a,
+        ABc8b24a = dnnl_ABc8b24a,
+        AcB8b24a = dnnl_AcB8b24a,
+        ABcd8b24a = dnnl_ABcd8b24a,
+        AcdB8b24a = dnnl_AcdB8b24a,
+        ABcde8b24a = dnnl_ABcde8b24a,
+        AcdeB8b24a = dnnl_AcdeB8b24a,
+        AB8b16a = dnnl_AB8b16a,
+        ABc8b16a = dnnl_ABc8b16a,
+        AcB8b16a = dnnl_AcB8b16a,
+        ABcd8b16a = dnnl_ABcd8b16a,
+        AcdB8b16a = dnnl_AcdB8b16a,
+        ABcde8b16a = dnnl_ABcde8b16a,
+        AcdeB8b16a = dnnl_AcdeB8b16a,
+        AB8b8a = dnnl_AB8b8a,
+
+        format_tag_last = dnnl_format_tag_last,
+
+        nCdhw16c = dnnl_nCdhw16c,
+        nCdhw4c = dnnl_nCdhw4c,
+        nCdhw8c = dnnl_nCdhw8c,
+        nChw16c = dnnl_nChw16c,
+        nChw4c = dnnl_nChw4c,
+        nChw8c = dnnl_nChw8c,
+        nCw16c = dnnl_nCw16c,
+        nCw4c = dnnl_nCw4c,
+        nCw8c = dnnl_nCw8c,
+        NCw16n16c = dnnl_NCw16n16c,
+        NChw16n16c = dnnl_NChw16n16c,
+        NCdhw16n16c = dnnl_NCdhw16n16c,
+        NCdhw32n32c = dnnl_NCdhw32n32c,
+        NChw32n32c = dnnl_NChw32n32c,
+        IOhw16i16o = dnnl_IOhw16i16o,
+        OI16i16o = dnnl_OI16i16o,
+        OI16i32o = dnnl_OI16i32o,
+        OI16i48o = dnnl_OI16i48o,
+        OI16i64o = dnnl_OI16i64o,
+        OI8i16o2i = dnnl_OI8i16o2i,
+        OI8i32o2i = dnnl_OI8i32o2i,
+        OI8i64o2i = dnnl_OI8i64o2i,
+        OI4i8o4i = dnnl_OI4i8o4i,
+        OI4i16o4i = dnnl_OI4i16o4i,
+        OI4i24o4i = dnnl_OI4i24o4i,
+        OI4i32o4i = dnnl_OI4i32o4i,
+        OI4i64o4i = dnnl_OI4i64o4i,
+        Ohwi32o = dnnl_Ohwi32o,
+        IOdhw16i16o = dnnl_IOdhw16i16o,
+        gIOhw16i16o = dnnl_gIOhw16i16o,
+        gOhwi32o = dnnl_gOhwi32o,
+        Goidhw16g = dnnl_Goidhw16g,
+        IOw8o8i = dnnl_IOw8o8i,
+        IOw16o16i = dnnl_IOw16o16i,
+        OIw16i16o = dnnl_OIw16i16o,
+        OwI16i16o = dnnl_OwI16i16o,
+        OIw16i32o = dnnl_OIw16i32o,
+        OwI16i32o = dnnl_OwI16i32o,
+        OIw16i48o = dnnl_OIw16i48o,
+        OwI16i48o = dnnl_OwI16i48o,
+        OIw16i64o = dnnl_OIw16i64o,
+        OwI16i64o = dnnl_OwI16i64o,
+        IOw16i16o = dnnl_IOw16i16o,
+        gIOw16i16o = dnnl_gIOw16i16o,
+        OIw16o16i = dnnl_OIw16o16i,
+        Oiw16o = dnnl_Oiw16o,
+        OIw4i8o4i = dnnl_OIw4i8o4i,
+        OwI4i8o4i = dnnl_OwI4i8o4i,
+        OIw4i16o4i = dnnl_OIw4i16o4i,
+        OwI4i16o4i = dnnl_OwI4i16o4i,
+        OIw4i24o4i = dnnl_OIw4i24o4i,
+        OwI4i24o4i = dnnl_OwI4i24o4i,
+        OIw4i32o4i = dnnl_OIw4i32o4i,
+        OwI4i32o4i = dnnl_OwI4i32o4i,
+        OIw4i64o4i = dnnl_OIw4i64o4i,
+        OwI4i64o4i = dnnl_OwI4i64o4i,
+        OIw2i8o4i = dnnl_OIw2i8o4i,
+        OIw4i4o = dnnl_OIw4i4o,
+        OIw4o4i = dnnl_OIw4o4i,
+        Oiw4o = dnnl_Oiw4o,
+        OIw8i16o2i = dnnl_OIw8i16o2i,
+        OwI8i16o2i = dnnl_OwI8i16o2i,
+        OIw8i32o2i = dnnl_OIw8i32o2i,
+        OwI8i32o2i = dnnl_OwI8i32o2i,
+        OIw8i64o2i = dnnl_OIw8i64o2i,
+        OwI8i64o2i = dnnl_OwI8i64o2i,
+        OIw8i8o = dnnl_OIw8i8o,
+        OwI8i8o = dnnl_OwI8i8o,
+        OIw8o16i2o = dnnl_OIw8o16i2o,
+        OIw8o8i = dnnl_OIw8o8i,
+        OIw8o4i = dnnl_OIw8o4i,
+        OIw16i16o4i = dnnl_OIw16i16o4i,
+        OIw16i32o4i = dnnl_OIw16i32o4i,
+        OIw16i48o4i = dnnl_OIw16i48o4i,
+        OIw16i64o4i = dnnl_OIw16i64o4i,
+        OIw16i16o2i = dnnl_OIw16i16o2i,
+        OIw16i32o2i = dnnl_OIw16i32o2i,
+        OIw16i48o2i = dnnl_OIw16i48o2i,
+        OIw16i64o2i = dnnl_OIw16i64o2i,
+        OIw16o16i2o = dnnl_OIw16o16i2o,
+        Owi16o = dnnl_Owi16o,
+        OwI16o2i = dnnl_OwI16o2i,
+        Iwo16i = dnnl_Iwo16i,
+        IwO16i2o = dnnl_IwO16i2o,
+        IwO16i4o = dnnl_IwO16i4o,
+        Owi4o = dnnl_Owi4o,
+        Owi8o = dnnl_Owi8o,
+        OwI8o2i = dnnl_OwI8o2i,
+        OwI8o4i = dnnl_OwI8o4i,
+        IOhw8o8i = dnnl_IOhw8o8i,
+        IOhw16o16i = dnnl_IOhw16o16i,
+        Ohwi16o = dnnl_Ohwi16o,
+        OhwI16o2i = dnnl_OhwI16o2i,
+        Ihwo16i = dnnl_Ihwo16i,
+        IhwO16i2o = dnnl_IhwO16i2o,
+        IhwO16i4o = dnnl_IhwO16i4o,
+        Ohwi4o = dnnl_Ohwi4o,
+        Ohwi8o = dnnl_Ohwi8o,
+        OhwI8o2i = dnnl_OhwI8o2i,
+        OhwI8o4i = dnnl_OhwI8o4i,
+        OIhw16i16o = dnnl_OIhw16i16o,
+        OhwI16i16o = dnnl_OhwI16i16o,
+        OIhw16i32o = dnnl_OIhw16i32o,
+        OhwI16i32o = dnnl_OhwI16i32o,
+        OIhw16i48o = dnnl_OIhw16i48o,
+        OhwI16i48o = dnnl_OhwI16i48o,
+        OIhw16i64o = dnnl_OIhw16i64o,
+        OhwI16i64o = dnnl_OhwI16i64o,
+        OIhw16o16i = dnnl_OIhw16o16i,
+        Oihw16o = dnnl_Oihw16o,
+        OIhw4i8o4i = dnnl_OIhw4i8o4i,
+        OhwI4i8o4i = dnnl_OhwI4i8o4i,
+        OIhw4i16o4i = dnnl_OIhw4i16o4i,
+        OhwI4i16o4i = dnnl_OhwI4i16o4i,
+        OIhw4i24o4i = dnnl_OIhw4i24o4i,
+        OhwI4i24o4i = dnnl_OhwI4i24o4i,
+        OIhw4i32o4i = dnnl_OIhw4i32o4i,
+        OhwI4i32o4i = dnnl_OhwI4i32o4i,
+        OIhw4i64o4i = dnnl_OIhw4i64o4i,
+        OhwI4i64o4i = dnnl_OhwI4i64o4i,
+        OIhw4i4o = dnnl_OIhw4i4o,
+        OIhw4o4i = dnnl_OIhw4o4i,
+        Oihw4o = dnnl_Oihw4o,
+        OIhw8i16o2i = dnnl_OIhw8i16o2i,
+        OhwI8i16o2i = dnnl_OhwI8i16o2i,
+        OIhw8i32o2i = dnnl_OIhw8i32o2i,
+        OhwI8i32o2i = dnnl_OhwI8i32o2i,
+        OIhw8i64o2i = dnnl_OIhw8i64o2i,
+        OhwI8i64o2i = dnnl_OhwI8i64o2i,
+        OIhw8i8o = dnnl_OIhw8i8o,
+        OhwI8i8o = dnnl_OhwI8i8o,
+        OIhw8o16i2o = dnnl_OIhw8o16i2o,
+        OIhw8o8i = dnnl_OIhw8o8i,
+        OIhw8o4i = dnnl_OIhw8o4i,
+        OIhw2i8o4i = dnnl_OIhw2i8o4i,
+        IOdhw8o8i = dnnl_IOdhw8o8i,
+        IOdhw16o16i = dnnl_IOdhw16o16i,
+        Odhwi16o = dnnl_Odhwi16o,
+        OdhwI16o2i = dnnl_OdhwI16o2i,
+        Idhwo16i = dnnl_Idhwo16i,
+        IdhwO16i2o = dnnl_IdhwO16i2o,
+        IdhwO16i4o = dnnl_IdhwO16i4o,
+        Odhwi4o = dnnl_Odhwi4o,
+        Odhwi8o = dnnl_Odhwi8o,
+        OdhwI8o2i = dnnl_OdhwI8o2i,
+        OdhwI8o4i = dnnl_OdhwI8o4i,
+        OIdhw16i16o = dnnl_OIdhw16i16o,
+        OdhwI16i16o = dnnl_OdhwI16i16o,
+        OIdhw16i32o = dnnl_OIdhw16i32o,
+        OdhwI16i32o = dnnl_OdhwI16i32o,
+        OIdhw16i48o = dnnl_OIdhw16i48o,
+        OdhwI16i48o = dnnl_OdhwI16i48o,
+        OIdhw16i64o = dnnl_OIdhw16i64o,
+        OdhwI16i64o = dnnl_OdhwI16i64o,
+        OIdhw16o16i = dnnl_OIdhw16o16i,
+        OIdhw16o16i2o = dnnl_OIdhw16o16i2o,
+        Oidhw16o = dnnl_Oidhw16o,
+        OIdhw4i4o = dnnl_OIdhw4i4o,
+        OIdhw4o4i = dnnl_OIdhw4o4i,
+        Oidhw4o = dnnl_Oidhw4o,
+        OIdhw8i16o2i = dnnl_OIdhw8i16o2i,
+        OdhwI8i16o2i = dnnl_OdhwI8i16o2i,
+        OIdhw8i32o2i = dnnl_OIdhw8i32o2i,
+        OdhwI8i32o2i = dnnl_OdhwI8i32o2i,
+        OIdhw8i64o2i = dnnl_OIdhw8i64o2i,
+        OdhwI8i64o2i = dnnl_OdhwI8i64o2i,
+        OIdhw4i8o4i = dnnl_OIdhw4i8o4i,
+        OdhwI4i8o4i = dnnl_OdhwI4i8o4i,
+        OIdhw4i16o4i = dnnl_OIdhw4i16o4i,
+        OdhwI4i16o4i = dnnl_OdhwI4i16o4i,
+        OIdhw16i16o4i = dnnl_OIdhw16i16o4i,
+        OIdhw16i32o4i = dnnl_OIdhw16i32o4i,
+        OIdhw16i48o4i = dnnl_OIdhw16i48o4i,
+        OIdhw16i64o4i = dnnl_OIdhw16i64o4i,
+        OIdhw16i16o2i = dnnl_OIdhw16i16o2i,
+        OIdhw16i32o2i = dnnl_OIdhw16i32o2i,
+        OIdhw16i48o2i = dnnl_OIdhw16i48o2i,
+        OIdhw16i64o2i = dnnl_OIdhw16i64o2i,
+        OIdhw4i24o4i = dnnl_OIdhw4i24o4i,
+        OdhwI4i24o4i = dnnl_OdhwI4i24o4i,
+        OIdhw4i32o4i = dnnl_OIdhw4i32o4i,
+        OdhwI4i32o4i = dnnl_OdhwI4i32o4i,
+        OIdhw4i64o4i = dnnl_OIdhw4i64o4i,
+        OdhwI4i64o4i = dnnl_OdhwI4i64o4i,
+        OIdhw2i8o4i = dnnl_OIdhw2i8o4i,
+        OIdhw8i8o = dnnl_OIdhw8i8o,
+        OdhwI8i8o = dnnl_OdhwI8i8o,
+        OIdhw8o8i = dnnl_OIdhw8o8i,
+        OIdhw8o4i = dnnl_OIdhw8o4i,
+        gIOw8o8i = dnnl_gIOw8o8i,
+        gIOw16o16i = dnnl_gIOw16o16i,
+        gOIw16i16o = dnnl_gOIw16i16o,
+        gOIw16o16i = dnnl_gOIw16o16i,
+        gOiw16o = dnnl_gOiw16o,
+        gOIw4i16o4i = dnnl_gOIw4i16o4i,
+        gOIw2i8o4i = dnnl_gOIw2i8o4i,
+        gOIw4i4o = dnnl_gOIw4i4o,
+        gOIw4o4i = dnnl_gOIw4o4i,
+        gOiw4o = dnnl_gOiw4o,
+        gOIw8i16o2i = dnnl_gOIw8i16o2i,
+        gOIw8i8o = dnnl_gOIw8i8o,
+        gOIw8o16i2o = dnnl_gOIw8o16i2o,
+        gOIw8o8i = dnnl_gOIw8o8i,
+        gOIw8o4i = dnnl_gOIw8o4i,
+        gOIw16i16o4i = dnnl_gOIw16i16o4i,
+        gOIw16i16o2i = dnnl_gOIw16i16o2i,
+        gOIw16o16i2o = dnnl_gOIw16o16i2o,
+        gOwi16o = dnnl_gOwi16o,
+        gOwI16o2i = dnnl_gOwI16o2i,
+        gIwo16i = dnnl_gIwo16i,
+        gIwO16i2o = dnnl_gIwO16i2o,
+        gIwO16i4o = dnnl_gIwO16i4o,
+        gOwi4o = dnnl_gOwi4o,
+        gOwi8o = dnnl_gOwi8o,
+        gOwI8o2i = dnnl_gOwI8o2i,
+        gOwI8o4i = dnnl_gOwI8o4i,
+        Goiw8g = dnnl_Goiw8g,
+        Goiw16g = dnnl_Goiw16g,
+        gIOhw8o8i = dnnl_gIOhw8o8i,
+        gIOhw16o16i = dnnl_gIOhw16o16i,
+        gOhwi16o = dnnl_gOhwi16o,
+        gOhwI16o2i = dnnl_gOhwI16o2i,
+        gIhwo16i = dnnl_gIhwo16i,
+        gIhwO16i2o = dnnl_gIhwO16i2o,
+        gIhwO16i4o = dnnl_gIhwO16i4o,
+        gOhwi4o = dnnl_gOhwi4o,
+        gOhwi8o = dnnl_gOhwi8o,
+        gOhwI8o2i = dnnl_gOhwI8o2i,
+        gOhwI8o4i = dnnl_gOhwI8o4i,
+        Goihw16g = dnnl_Goihw16g,
+        gOIhw16i16o = dnnl_gOIhw16i16o,
+        gOIhw16o16i = dnnl_gOIhw16o16i,
+        gOihw16o = dnnl_gOihw16o,
+        gOIhw4i16o4i = dnnl_gOIhw4i16o4i,
+        gOIhw2i8o4i = dnnl_gOIhw2i8o4i,
+        gOIhw4i4o = dnnl_gOIhw4i4o,
+        gOIhw4o4i = dnnl_gOIhw4o4i,
+        gOihw4o = dnnl_gOihw4o,
+        Goihw8g = dnnl_Goihw8g,
+        gOIhw8i16o2i = dnnl_gOIhw8i16o2i,
+        gOIhw8i8o = dnnl_gOIhw8i8o,
+        gOIhw8o16i2o = dnnl_gOIhw8o16i2o,
+        OIw4o8i8o4i = dnnl_OIw4o8i8o4i,
+        OIdhw4o8i8o4i = dnnl_OIdhw4o8i8o4i,
+        OIhw4o8i8o4i = dnnl_OIhw4o8i8o4i,
+        OIhw2o8i8o2i = dnnl_OIhw2o8i8o2i,
+        gOIw4o8i8o4i = dnnl_gOIw4o8i8o4i,
+        gOIdhw4o8i8o4i = dnnl_gOIdhw4o8i8o4i,
+        gOIhw4o8i8o4i = dnnl_gOIhw4o8i8o4i,
+        gOIhw2o8i8o2i = dnnl_gOIhw2o8i8o2i,
+        OIhw16i16o4i = dnnl_OIhw16i16o4i,
+        OIhw16i32o4i = dnnl_OIhw16i32o4i,
+        OIhw16i48o4i = dnnl_OIhw16i48o4i,
+        OIhw16i64o4i = dnnl_OIhw16i64o4i,
+        OIhw16i16o2i = dnnl_OIhw16i16o2i,
+        OIhw16i32o2i = dnnl_OIhw16i32o2i,
+        OIhw16i48o2i = dnnl_OIhw16i48o2i,
+        OIhw16i64o2i = dnnl_OIhw16i64o2i,
+        OIhw16o16i2o = dnnl_OIhw16o16i2o,
+        gOIhw16i16o4i = dnnl_gOIhw16i16o4i,
+        gOIhw16i16o2i = dnnl_gOIhw16i16o2i,
+        gOIhw16o16i2o = dnnl_gOIhw16o16i2o,
+        gOIhw8o8i = dnnl_gOIhw8o8i,
+        gOIhw8o4i = dnnl_gOIhw8o4i,
+        gIOdhw16i16o = dnnl_gIOdhw16i16o,
+        gIOdhw8o8i = dnnl_gIOdhw8o8i,
+        gIOdhw16o16i = dnnl_gIOdhw16o16i,
+        gOdhwi16o = dnnl_gOdhwi16o,
+        gOdhwI16o2i = dnnl_gOdhwI16o2i,
+        gIdhwo16i = dnnl_gIdhwo16i,
+        gIdhwO16i2o = dnnl_gIdhwO16i2o,
+        gIdhwO16i4o = dnnl_gIdhwO16i4o,
+        gOdhwi4o = dnnl_gOdhwi4o,
+        gOdhwi8o = dnnl_gOdhwi8o,
+        gOdhwI8o2i = dnnl_gOdhwI8o2i,
+        gOdhwI8o4i = dnnl_gOdhwI8o4i,
+        gOIdhw16i16o = dnnl_gOIdhw16i16o,
+        gOIdhw16o16i = dnnl_gOIdhw16o16i,
+        gOIdhw16o16i2o = dnnl_gOIdhw16o16i2o,
+        gOidhw16o = dnnl_gOidhw16o,
+        gOIdhw4i4o = dnnl_gOIdhw4i4o,
+        gOIdhw4o4i = dnnl_gOIdhw4o4i,
+        gOidhw4o = dnnl_gOidhw4o,
+        gOIdhw8i16o2i = dnnl_gOIdhw8i16o2i,
+        gOIdhw4i16o4i = dnnl_gOIdhw4i16o4i,
+        gOIdhw16i16o4i = dnnl_gOIdhw16i16o4i,
+        gOIdhw16i16o2i = dnnl_gOIdhw16i16o2i,
+        gOIdhw2i8o4i = dnnl_gOIdhw2i8o4i,
+        gOIdhw8i8o = dnnl_gOIdhw8i8o,
+        gOIdhw8o8i = dnnl_gOIdhw8o8i,
+        gOIdhw8o4i = dnnl_gOIdhw8o4i,
+        gOIw2i4o2i = dnnl_gOIw2i4o2i,
+        gOIhw2i4o2i = dnnl_gOIhw2i4o2i,
+        gOIdhw2i4o2i = dnnl_gOIdhw2i4o2i,
+        gOIw2o4i2o = dnnl_gOIw2o4i2o,
+        gOIhw2o4i2o = dnnl_gOIhw2o4i2o,
+        gOIdhw2o4i2o = dnnl_gOIdhw2o4i2o,
+        gOIw4i8o2i = dnnl_gOIw4i8o2i,
+        gOIhw4i8o2i = dnnl_gOIhw4i8o2i,
+        gOIdhw4i8o2i = dnnl_gOIdhw4i8o2i,
+        gOIw4o8i2o = dnnl_gOIw4o8i2o,
+        gOIhw4o8i2o = dnnl_gOIhw4o8i2o,
+        gOIdhw4o8i2o = dnnl_gOIdhw4o8i2o,
+
+        ldOi16o = abDc16d,
+        ldOi32o = abDc32d,
+        ldOI16o4i = abDC16d4c,
+        ldOI32o4i = abDC32d4c,
+        ldgOi16o = abdEc16e,
+        ldgOI16o4i = abdEC16e4c,
+        ldgOi32o = abdEc32e,
+        ldgOI32o2i = abdEC32e2c,
+        ldgOI32o4i = abdEC32e4c,
+        OwI16o4i = dnnl_OwI16o4i,
+        OhwI16o4i = dnnl_OhwI16o4i,
+        gOwI16o4i = dnnl_gOwI16o4i,
+        gOhwI16o4i = dnnl_gOhwI16o4i,
+        OdhwI16o4i = dnnl_OdhwI16o4i,
+        gOdhwI16o4i = dnnl_gOdhwI16o4i,
+
+        Owi32o = dnnl_Owi32o,
+        OwI32o2i = dnnl_OwI32o2i,
+        OwI32o4i = dnnl_OwI32o4i,
+        Owi48o = dnnl_Owi48o,
+        OwI48o2i = dnnl_OwI48o2i,
+        OwI48o4i = dnnl_OwI48o4i,
+        Owi64o = dnnl_Owi64o,
+        OwI64o2i = dnnl_OwI64o2i,
+        OwI64o4i = dnnl_OwI64o4i,
+        Iwo32i = dnnl_Iwo32i,
+        IwO32i2o = dnnl_IwO32i2o,
+        IwO32i4o = dnnl_IwO32i4o,
+        Iwo48i = dnnl_Iwo48i,
+        IwO48i2o = dnnl_IwO48i2o,
+        IwO48i4o = dnnl_IwO48i4o,
+        Iwo64i = dnnl_Iwo64i,
+        IwO64i2o = dnnl_IwO64i2o,
+        IwO64i4o = dnnl_IwO64i4o,
+        wIo2i = dnnl_wIo2i,
+        wIo4i = dnnl_wIo4i,
+        gOwi32o = dnnl_gOwi32o,
+        gOwI32o2i = dnnl_gOwI32o2i,
+        gOwI32o4i = dnnl_gOwI32o4i,
+        gOwi48o = dnnl_gOwi48o,
+        gOwI48o2i = dnnl_gOwI48o2i,
+        gOwI48o4i = dnnl_gOwI48o4i,
+        gOwi64o = dnnl_gOwi64o,
+        gOwI64o2i = dnnl_gOwI64o2i,
+        gOwI64o4i = dnnl_gOwI64o4i,
+        gIwo32i = dnnl_gIwo32i,
+        gIwO32i2o = dnnl_gIwO32i2o,
+        gIwO32i4o = dnnl_gIwO32i4o,
+        gIwo48i = dnnl_gIwo48i,
+        gIwO48i2o = dnnl_gIwO48i2o,
+        gIwO48i4o = dnnl_gIwO48i4o,
+        gIwo64i = dnnl_gIwo64i,
+        gIwO64i2o = dnnl_gIwO64i2o,
+        gIwO64i4o = dnnl_gIwO64i4o,
+        gwio = dnnl_gwio,
+        gwIo2i = dnnl_gwIo2i,
+        gwIo4i = dnnl_gwIo4i,
+        OhwI32o = dnnl_OhwI32o,
+        OhwI32o2i = dnnl_OhwI32o2i,
+        OhwI32o4i = dnnl_OhwI32o4i,
+        Ohwi48o = dnnl_Ohwi48o,
+        OhwI48o2i = dnnl_OhwI48o2i,
+        OhwI48o4i = dnnl_OhwI48o4i,
+        Ohwi64o = dnnl_Ohwi64o,
+        OhwI64o2i = dnnl_OhwI64o2i,
+        OhwI64o4i = dnnl_OhwI64o4i,
+        Ihwo32i = dnnl_Ihwo32i,
+        IhwO32i2o = dnnl_IhwO32i2o,
+        IhwO32i4o = dnnl_IhwO32i4o,
+        Ihwo48i = dnnl_Ihwo48i,
+        IhwO48i2o = dnnl_IhwO48i2o,
+        IhwO48i4o = dnnl_IhwO48i4o,
+        Ihwo64i = dnnl_Ihwo64i,
+        IhwO64i2o = dnnl_IhwO64i2o,
+        IhwO64i4o = dnnl_IhwO64i4o,
+        hwIo2i = dnnl_hwIo2i,
+        hwIo4i = dnnl_hwIo4i,
+        gOhwI32o = dnnl_gOhwI32o,
+        gOhwI32o2i = dnnl_gOhwI32o2i,
+        gOhwI32o4i = dnnl_gOhwI32o4i,
+        gOhwi48o = dnnl_gOhwi48o,
+        gOhwI48o2i = dnnl_gOhwI48o2i,
+        gOhwI48o4i = dnnl_gOhwI48o4i,
+        gOhwi64o = dnnl_gOhwi64o,
+        gOhwI64o2i = dnnl_gOhwI64o2i,
+        gOhwI64o4i = dnnl_gOhwI64o4i,
+        gIhwo32i = dnnl_gIhwo32i,
+        gIhwO32i2o = dnnl_gIhwO32i2o,
+        gIhwO32i4o = dnnl_gIhwO32i4o,
+        gIhwo48i = dnnl_gIhwo48i,
+        gIhwO48i2o = dnnl_gIhwO48i2o,
+        gIhwO48i4o = dnnl_gIhwO48i4o,
+        gIhwo64i = dnnl_gIhwo64i,
+        gIhwO64i2o = dnnl_gIhwO64i2o,
+        gIhwO64i4o = dnnl_gIhwO64i4o,
+        ghwio = dnnl_ghwio,
+        ghwIo2i = dnnl_ghwIo2i,
+        ghwIo4i = dnnl_ghwIo4i,
+        Odhwi32o = dnnl_Odhwi32o,
+        OdhwI32o2i = dnnl_OdhwI32o2i,
+        OdhwI32o4i = dnnl_OdhwI32o4i,
+        Odhwi48o = dnnl_Odhwi48o,
+        OdhwI48o2i = dnnl_OdhwI48o2i,
+        OdhwI48o4i = dnnl_OdhwI48o4i,
+        Odhwi64o = dnnl_Odhwi64o,
+        OdhwI64o2i = dnnl_OdhwI64o2i,
+        OdhwI64o4i = dnnl_OdhwI64o4i,
+        Idhwo32i = dnnl_Idhwo32i,
+        IdhwO32i2o = dnnl_IdhwO32i2o,
+        IdhwO32i4o = dnnl_IdhwO32i4o,
+        Idhwo48i = dnnl_Idhwo48i,
+        IdhwO48i2o = dnnl_IdhwO48i2o,
+        IdhwO48i4o = dnnl_IdhwO48i4o,
+        Idhwo64i = dnnl_Idhwo64i,
+        IdhwO64i2o = dnnl_IdhwO64i2o,
+        IdhwO64i4o = dnnl_IdhwO64i4o,
+        dhwIo2i = dnnl_dhwIo2i,
+        dhwIo4i = dnnl_dhwIo4i,
+        gOdhwi32o = dnnl_gOdhwi32o,
+        gOdhwI32o2i = dnnl_gOdhwI32o2i,
+        gOdhwI32o4i = dnnl_gOdhwI32o4i,
+        gOdhwi48o = dnnl_gOdhwi48o,
+        gOdhwI48o2i = dnnl_gOdhwI48o2i,
+        gOdhwI48o4i = dnnl_gOdhwI48o4i,
+        gOdhwi64o = dnnl_gOdhwi64o,
+        gOdhwI64o2i = dnnl_gOdhwI64o2i,
+        gOdhwI64o4i = dnnl_gOdhwI64o4i,
+        gIdhwo32i = dnnl_gIdhwo32i,
+        gIdhwO32i2o = dnnl_gIdhwO32i2o,
+        gIdhwO32i4o = dnnl_gIdhwO32i4o,
+        gIdhwo48i = dnnl_gIdhwo48i,
+        gIdhwO48i2o = dnnl_gIdhwO48i2o,
+        gIdhwO48i4o = dnnl_gIdhwO48i4o,
+        gIdhwo64i = dnnl_gIdhwo64i,
+        gIdhwO64i2o = dnnl_gIdhwO64i2o,
+        gIdhwO64i4o = dnnl_gIdhwO64i4o,
+        gdhwio = dnnl_gdhwio,
+        gdhwIo2i = dnnl_gdhwIo2i,
+        gdhwIo4i = dnnl_gdhwIo4i,
+        ldIo32i = dnnl_ldIo32i,
+        ldgIo16i = dnnl_ldgIo16i,
+        ldgIo32i = dnnl_ldgIo32i,
+        ldgIO32i2o = dnnl_ldgIO32i2o,
+        nCdhw32c = dnnl_nCdhw32c,
+        nChw32c = dnnl_nChw32c,
+        nCw32c = dnnl_nCw32c,
+        NCw32n16c = dnnl_NCw32n16c,
+        NChw32n16c = dnnl_NChw32n16c,
+        NCdhw32n16c = dnnl_NCdhw32n16c,
+        NCw32n32c = dnnl_NCw32n32c,
+        OI16i16o4i = dnnl_OI16i16o4i,
+        IOw8o16i2o = dnnl_IOw8o16i2o,
+        IOhw8o16i2o = dnnl_IOhw8o16i2o,
+        Owhi16o = dnnl_Owhi16o,
+        OIdhw8o16i2o = dnnl_OIdhw8o16i2o,
+        IOdhw8o16i2o = dnnl_IOdhw8o16i2o,
+        Goiw4g = dnnl_Goiw4g,
+        gIOw8o16i2o = dnnl_gIOw8o16i2o,
+        Goiw32g = dnnl_Goiw32g,
+        Goihw4g = dnnl_Goihw4g,
+        gIOhw8o16i2o = dnnl_gIOhw8o16i2o,
+        Goihw32g = dnnl_Goihw32g,
+        gOwhi16o = dnnl_gOwhi16o,
+        IOw4i8o8i4o = dnnl_IOw4i8o8i4o,
+        IOhw4i8o8i4o = dnnl_IOhw4i8o8i4o,
+        IOdhw4i8o8i4o = dnnl_IOdhw4i8o8i4o,
+        gIOw4i8o8i4o = dnnl_gIOw4i8o8i4o,
+        gIOhw4i8o8i4o = dnnl_gIOhw4i8o8i4o,
+        gIOdhw4i8o8i4o = dnnl_gIOdhw4i8o8i4o,
+        gOIdhw8o16i2o = dnnl_gOIdhw8o16i2o,
+        gIOdhw8o16i2o = dnnl_gIOdhw8o16i2o,
+        Goidhw32g = dnnl_Goidhw32g,
+        OI16i32o4i = dnnl_OI16i32o4i,
+        OI16i48o4i = dnnl_OI16i48o4i,
+        OI16i64o4i = dnnl_OI16i64o4i,
+        OI16i16o2i = dnnl_OI16i16o2i,
+        OI16i32o2i = dnnl_OI16i32o2i,
+        OI16i48o2i = dnnl_OI16i48o2i,
+        OI16i64o2i = dnnl_OI16i64o2i,
+        aBdeC16c16b4c = dnnl_aBdeC16c16b4c,
+        AcB16b16a2b = dnnl_AcB16b16a2b,
+        aBdC16c16b2c = dnnl_aBdC16c16b2c,
+        AcB16b16a4b = dnnl_AcB16b16a4b,
+        aBdC16c16b4c = dnnl_aBdC16c16b4c,
+        AcdB16b16a2b = dnnl_AcdB16b16a2b,
+        aBdefC16c16b4c = dnnl_aBdefC16c16b4c,
+        AcdeB16b16a4b = dnnl_AcdeB16b16a4b,
+        AcB16b32a2b = dnnl_AcB16b32a2b,
+        AcB16b32a4b = dnnl_AcB16b32a4b,
+        AcB16b48a2b = dnnl_AcB16b48a2b,
+        AcB16b48a4b = dnnl_AcB16b48a4b,
+        AcB16b64a2b = dnnl_AcB16b64a2b,
+        AcB16b64a4b = dnnl_AcB16b64a4b,
+        aBdC16c32b2c = dnnl_aBdC16c32b2c,
+        aBdC16c32b4c = dnnl_aBdC16c32b4c,
+        aBdC16c48b2c = dnnl_aBdC16c48b2c,
+        aBdC16c48b4c = dnnl_aBdC16c48b4c,
+        aBdC16c64b2c = dnnl_aBdC16c64b2c,
+        aBdC16c64b4c = dnnl_aBdC16c64b4c,
+        AcdB16b32a2b = dnnl_AcdB16b32a2b,
+        AcdB16b32a4b = dnnl_AcdB16b32a4b,
+        AcdB16b48a2b = dnnl_AcdB16b48a2b,
+        AcdB16b48a4b = dnnl_AcdB16b48a4b,
+        AcdB16b64a2b = dnnl_AcdB16b64a2b,
+        AcdB16b64a4b = dnnl_AcdB16b64a4b,
+        aBdeC16c32b2c = dnnl_aBdeC16c32b2c,
+        aBdeC16c32b4c = dnnl_aBdeC16c32b4c,
+        aBdeC16c48b2c = dnnl_aBdeC16c48b2c,
+        aBdeC16c48b4c = dnnl_aBdeC16c48b4c,
+        aBdeC16c64b2c = dnnl_aBdeC16c64b2c,
+        aBdeC16c64b4c = dnnl_aBdeC16c64b4c,
+        AcdeB16b32a2b = dnnl_AcdeB16b32a2b,
+        AcdeB16b32a4b = dnnl_AcdeB16b32a4b,
+        AcdeB16b48a2b = dnnl_AcdeB16b48a2b,
+        AcdeB16b48a4b = dnnl_AcdeB16b48a4b,
+        AcdeB16b64a2b = dnnl_AcdeB16b64a2b,
+        AcdeB16b64a4b = dnnl_AcdeB16b64a4b,
+        aBdefC16c32b2c = dnnl_aBdefC16c32b2c,
+        aBdefC16c32b4c = dnnl_aBdefC16c32b4c,
+        aBdefC16c48b2c = dnnl_aBdefC16c48b2c,
+        aBdefC16c48b4c = dnnl_aBdefC16c48b4c,
+        aBdefC16c64b2c = dnnl_aBdefC16c64b2c,
+        aBdefC16c64b4c = dnnl_aBdefC16c64b4c,
+        OwI16i16o2i = dnnl_OwI16i16o2i,
+        gOwI16i16o2i = dnnl_gOwI16i16o2i,
+        OhwI16i16o2i = dnnl_OhwI16i16o2i,
+        gOhwI16i16o2i = dnnl_gOhwI16i16o2i,
+        OdhwI16i16o2i = dnnl_OdhwI16i16o2i,
+        gOdhwI16i16o2i = dnnl_gOdhwI16i16o2i,
+        OwI16i16o4i = dnnl_OwI16i16o4i,
+        gOwI16i16o4i = dnnl_gOwI16i16o4i,
+        OhwI16i16o4i = dnnl_OhwI16i16o4i,
+        gOhwI16i16o4i = dnnl_gOhwI16i16o4i,
+        OdhwI16i16o4i = dnnl_OdhwI16i16o4i,
+        gOdhwI16i16o4i = dnnl_gOdhwI16i16o4i,
+        OwI16i32o2i = dnnl_OwI16i32o2i,
+        OwI16i32o4i = dnnl_OwI16i32o4i,
+        OwI16i48o2i = dnnl_OwI16i48o2i,
+        OwI16i48o4i = dnnl_OwI16i48o4i,
+        OwI16i64o2i = dnnl_OwI16i64o2i,
+        OwI16i64o4i = dnnl_OwI16i64o4i,
+        gOwI16i32o2i = dnnl_gOwI16i32o2i,
+        gOwI16i32o4i = dnnl_gOwI16i32o4i,
+        gOwI16i48o2i = dnnl_gOwI16i48o2i,
+        gOwI16i48o4i = dnnl_gOwI16i48o4i,
+        gOwI16i64o2i = dnnl_gOwI16i64o2i,
+        gOwI16i64o4i = dnnl_gOwI16i64o4i,
+        OhwI16i32o2i = dnnl_OhwI16i32o2i,
+        OhwI16i32o4i = dnnl_OhwI16i32o4i,
+        OhwI16i48o2i = dnnl_OhwI16i48o2i,
+        OhwI16i48o4i = dnnl_OhwI16i48o4i,
+        OhwI16i64o2i = dnnl_OhwI16i64o2i,
+        OhwI16i64o4i = dnnl_OhwI16i64o4i,
+        gOhwI16i32o2i = dnnl_gOhwI16i32o2i,
+        gOhwI16i32o4i = dnnl_gOhwI16i32o4i,
+        gOhwI16i48o2i = dnnl_gOhwI16i48o2i,
+        gOhwI16i48o4i = dnnl_gOhwI16i48o4i,
+        gOhwI16i64o2i = dnnl_gOhwI16i64o2i,
+        gOhwI16i64o4i = dnnl_gOhwI16i64o4i,
+        OdhwI16i32o2i = dnnl_OdhwI16i32o2i,
+        OdhwI16i32o4i = dnnl_OdhwI16i32o4i,
+        OdhwI16i48o2i = dnnl_OdhwI16i48o2i,
+        OdhwI16i48o4i = dnnl_OdhwI16i48o4i,
+        OdhwI16i64o2i = dnnl_OdhwI16i64o2i,
+        OdhwI16i64o4i = dnnl_OdhwI16i64o4i,
+        IdhwO16o32i2o = dnnl_IdhwO16o32i2o,
+        IdhwO16o32i4o = dnnl_IdhwO16o32i4o,
+        IdhwO16o48i2o = dnnl_IdhwO16o48i2o,
+        IdhwO16o48i4o = dnnl_IdhwO16o48i4o,
+        IdhwO16o64i2o = dnnl_IdhwO16o64i2o,
+        IdhwO16o64i4o = dnnl_IdhwO16o64i4o,
+        gOdhwI16i32o2i = dnnl_gOdhwI16i32o2i,
+        gOdhwI16i32o4i = dnnl_gOdhwI16i32o4i,
+        gOdhwI16i48o2i = dnnl_gOdhwI16i48o2i,
+        gOdhwI16i48o4i = dnnl_gOdhwI16i48o4i,
+        gOdhwI16i64o2i = dnnl_gOdhwI16i64o2i,
+        gOdhwI16i64o4i = dnnl_gOdhwI16i64o4i,
+        gIdhwO16o32i2o = dnnl_gIdhwO16o32i2o,
+        gIdhwO16o32i4o = dnnl_gIdhwO16o32i4o,
+        gIdhwO16o48i2o = dnnl_gIdhwO16o48i2o,
+        gIdhwO16o48i4o = dnnl_gIdhwO16o48i4o,
+        gIdhwO16o64i2o = dnnl_gIdhwO16o64i2o,
+        gIdhwO16o64i4o = dnnl_gIdhwO16o64i4o,
+        IwO16o16i2o = dnnl_IwO16o16i2o,
+        IwO16o16i4o = dnnl_IwO16o16i4o,
+        IhwO16o16i2o = dnnl_IhwO16o16i2o,
+        IhwO16o16i4o = dnnl_IhwO16o16i4o,
+        IdhwO16o16i2o = dnnl_IdhwO16o16i2o,
+        IdhwO16o16i4o = dnnl_IdhwO16o16i4o,
+        gIwO16o16i2o = dnnl_gIwO16o16i2o,
+        gIwO16o16i4o = dnnl_gIwO16o16i4o,
+        gIhwO16o16i2o = dnnl_gIhwO16o16i2o,
+        gIhwO16o16i4o = dnnl_gIhwO16o16i4o,
+        gIdhwO16o16i2o = dnnl_gIdhwO16o16i2o,
+        gIdhwO16o16i4o = dnnl_gIdhwO16o16i4o,
+        IwO16o32i2o = dnnl_IwO16o32i2o,
+        IwO16o32i4o = dnnl_IwO16o32i4o,
+        IwO16o48i2o = dnnl_IwO16o48i2o,
+        IwO16o48i4o = dnnl_IwO16o48i4o,
+        IwO16o64i2o = dnnl_IwO16o64i2o,
+        IwO16o64i4o = dnnl_IwO16o64i4o,
+        gIwO16o32i2o = dnnl_gIwO16o32i2o,
+        gIwO16o32i4o = dnnl_gIwO16o32i4o,
+        gIwO16o48i2o = dnnl_gIwO16o48i2o,
+        gIwO16o48i4o = dnnl_gIwO16o48i4o,
+        gIwO16o64i2o = dnnl_gIwO16o64i2o,
+        gIwO16o64i4o = dnnl_gIwO16o64i4o,
+        IhwO16o32i2o = dnnl_IhwO16o32i2o,
+        IhwO16o32i4o = dnnl_IhwO16o32i4o,
+        IhwO16o48i2o = dnnl_IhwO16o48i2o,
+        IhwO16o48i4o = dnnl_IhwO16o48i4o,
+        IhwO16o64i2o = dnnl_IhwO16o64i2o,
+        IhwO16o64i4o = dnnl_IhwO16o64i4o,
+        gIhwO16o32i2o = dnnl_gIhwO16o32i2o,
+        gIhwO16o32i4o = dnnl_gIhwO16o32i4o,
+        gIhwO16o48i2o = dnnl_gIhwO16o48i2o,
+        gIhwO16o48i4o = dnnl_gIhwO16o48i4o,
+        gIhwO16o64i2o = dnnl_gIhwO16o64i2o,
+        gIhwO16o64i4o = dnnl_gIhwO16o64i4o,
+        aBdeC16c16b2c = dnnl_aBdeC16c16b2c,
+        aBdefC16c16b2c = dnnl_aBdefC16c16b2c,
+        AcdB16b16a4b = dnnl_AcdB16b16a4b,
+        AcdeB16b16a2b = dnnl_AcdeB16b16a2b,
+        hwioG16g = dnnl_hwioG16g,
+        hwioG8g = dnnl_hwioG8g,
+        dhwioG16g = dnnl_dhwioG16g,
+        dhwioG8g = dnnl_dhwioG8g,
+        ABc4a2b = dnnl_ABc4a2b,
+        ABc8a2b = dnnl_ABc8a2b,
+        ABcd4a2b = dnnl_ABcd4a2b,
+        ABcde4a2b = dnnl_ABcde4a2b,
+        ABcde8a2b = dnnl_ABcde8a2b,
+        ABcd4a8b8a2b = dnnl_ABcd4a8b8a2b,
+        NCdhw40n32c = dnnl_NCdhw40n32c,
+        NChw40n32c = dnnl_NChw40n32c,
+        NCw40n32c = dnnl_NCw40n32c,
+        OIdhw4o8i8o2i = dnnl_OIdhw4o8i8o2i,
+        OIhw4o8i8o2i = dnnl_OIhw4o8i8o2i,
+        OIw4o8i8o2i = dnnl_OIw4o8i8o2i,
+        gOIdhw4o8i8o2i = dnnl_gOIdhw4o8i8o2i,
+        gOIhw4o8i8o2i = dnnl_gOIhw4o8i8o2i,
+        gOIw4o8i8o2i = dnnl_gOIw4o8i8o2i,
+        IOdhw4i8o8i2o = dnnl_IOdhw4i8o8i2o,
+        IOhw4i8o8i2o = dnnl_IOhw4i8o8i2o,
+        IOw4i8o8i2o = dnnl_IOw4i8o8i2o,
+        gIOdhw4i8o8i2o = dnnl_gIOdhw4i8o8i2o,
+        gIOhw4i8o8i2o = dnnl_gIOhw4i8o8i2o,
+        gIOw4i8o8i2o = dnnl_gIOw4i8o8i2o,
+        aBCd8b2c = dnnl_aBCd8b2c,
+        ABcde40a16b = dnnl_ABcde40a16b,
+        ABcde40a32b = dnnl_ABcde40a32b,
+        aBCde8b2c = dnnl_aBCde8b2c,
+        ABcde4a8b8a2b = dnnl_ABcde4a8b8a2b,
+        ABc4a8b8a2b = dnnl_ABc4a8b8a2b,
+        aBCdef4b8c8b2c = dnnl_aBCdef4b8c8b2c,
+        aBCde4b8c8b2c = dnnl_aBCde4b8c8b2c,
+        aBCd4b8c8b2c = dnnl_aBCd4b8c8b2c,
+        BAcde4b8a8b2a = dnnl_BAcde4b8a8b2a,
+        BAcd4b8a8b2a = dnnl_BAcd4b8a8b2a,
+        BAc4b8a8b2a = dnnl_BAc4b8a8b2a,
+        aCBdef4c8b8c2b = dnnl_aCBdef4c8b8c2b,
+        aCBde4c8b8c2b = dnnl_aCBde4c8b8c2b,
+        aCBd4c8b8c2b = dnnl_aCBd4c8b8c2b,
+        aBCdef8b2c = dnnl_aBCdef8b2c,
+        AB32a16b = dnnl_AB32a16b,
+        AB32a32b = dnnl_AB32a32b,
+        BA4b8a8b2a = dnnl_BA4b8a8b2a,
+        BA4b8a8b4a = dnnl_BA4b8a8b4a,
+        aBC32b16c = dnnl_aBC32b16c,
+        aBC32b32c = dnnl_aBC32b32c,
+        aCB4c8b8c2b = dnnl_aCB4c8b8c2b,
+        aCB4c8b8c4b = dnnl_aCB4c8b8c4b,
+        ABc2b8a16b4a = dnnl_ABc2b8a16b4a,
+        ABcd2b8a16b4a = dnnl_ABcd2b8a16b4a,
+        ABcde2b8a16b4a = dnnl_ABcde2b8a16b4a,
+        ABc2a8b16a4b = dnnl_ABc2a8b16a4b,
+        ABc2a8b16a2b = dnnl_ABc2a8b16a2b,
+        ABc2b32a8b = dnnl_ABc2b32a8b,
+        ABcd2a8b16a4b = dnnl_ABcd2a8b16a4b,
+        ABcd2a8b16a2b = dnnl_ABcd2a8b16a2b,
+        aCBd2c8b16c2b = dnnl_aCBd2c8b16c2b,
+        ABcd2b32a8b = dnnl_ABcd2b32a8b,
+        aBCd2c8b16c2b = dnnl_aBCd2c8b16c2b,
+        ABcde2a8b16a4b = dnnl_ABcde2a8b16a4b,
+        ABcde2a8b16a2b = dnnl_ABcde2a8b16a2b,
+        aCBde2c8b16c2b = dnnl_aCBde2c8b16c2b,
+        ABcde2b32a8b = dnnl_ABcde2b32a8b,
+        aBC2b8c16b2c = dnnl_aBC2b8c16b2c,
+        aBCd2b8c16b2c = dnnl_aBCd2b8c16b2c,
+        aBCde2b8c16b2c = dnnl_aBCde2b8c16b2c,
+        aBCdef2b8c16b2c = dnnl_aBCdef2b8c16b2c,
+        BAcde2b8a16b4a = dnnl_BAcde2b8a16b4a,
+        BAcd2b8a16b4a = dnnl_BAcd2b8a16b4a,
+        BAc2b8a16b4a = dnnl_BAc2b8a16b4a,
+        BAcde2b8a16b2a = dnnl_BAcde2b8a16b2a,
+        BAcd2b8a16b2a = dnnl_BAcd2b8a16b2a,
+        BAc2b8a16b2a = dnnl_BAc2b8a16b2a,
+        aBCde2c8b16c2b = dnnl_aBCde2c8b16c2b,
+        aBCdef2c8b16c2b = dnnl_aBCdef2c8b16c2b,
+        aCBdef2c8b16c2b = dnnl_aCBdef2c8b16c2b,
+        aBCd2b8c16b4c = dnnl_aBCd2b8c16b4c,
+        aBCde2b8c16b4c = dnnl_aBCde2b8c16b4c,
+        NCdhw40n16c = dnnl_NCdhw40n16c,
+        NCw40n16c = dnnl_NCw40n16c,
+        NChw40n16c = dnnl_NChw40n16c,
+        NCw2c32n8c = dnnl_NCw2c32n8c,
+        NChw2c32n8c = dnnl_NChw2c32n8c,
+        NCdhw2c32n8c = dnnl_NCdhw2c32n8c,
+        OIw2i8o16i4o = dnnl_OIw2i8o16i4o,
+        OIhw2i8o16i4o = dnnl_OIhw2i8o16i4o,
+        OIdhw2i8o16i4o = dnnl_OIdhw2i8o16i4o,
+        OIw2o8i16o4i = dnnl_OIw2o8i16o4i,
+        OIw2o8i16o2i = dnnl_OIw2o8i16o2i,
+        IOw2i8o16i4o = dnnl_IOw2i8o16i4o,
+        IOw2i8o16i2o = dnnl_IOw2i8o16i2o,
+        OIhw2o8i16o4i = dnnl_OIhw2o8i16o4i,
+        OIhw2o8i16o2i = dnnl_OIhw2o8i16o2i,
+        IOhw2i8o16i4o = dnnl_IOhw2i8o16i4o,
+        IOhw2i8o16i2o = dnnl_IOhw2i8o16i2o,
+        OIdhw2o8i16o4i = dnnl_OIdhw2o8i16o4i,
+        OIdhw2o8i16o2i = dnnl_OIdhw2o8i16o2i,
+        IOdhw2i8o16i4o = dnnl_IOdhw2i8o16i4o,
+        IOdhw2i8o16i2o = dnnl_IOdhw2i8o16i2o,
+        gOIw2o8i16o2i = dnnl_gOIw2o8i16o2i,
+        gIOw2i8o16i2o = dnnl_gIOw2i8o16i2o,
+        gIOhw2i8o16i2o = dnnl_gIOhw2i8o16i2o,
+        gIOdhw2i8o16i2o = dnnl_gIOdhw2i8o16i2o,
+        gOIhw2o8i16o2i = dnnl_gOIhw2o8i16o2i,
+        gOIdhw2o8i16o2i = dnnl_gOIdhw2o8i16o2i,
+        gOIw2o8i16o4i = dnnl_gOIw2o8i16o4i,
+        gOIhw2o8i16o4i = dnnl_gOIhw2o8i16o4i,
+        BA4b8a16b2a = dnnl_BA4b8a16b2a,
+        BA4b8a16b4a = dnnl_BA4b8a16b4a,
+        aCB4c8b16c2b = dnnl_aCB4c8b16c2b,
+        aCB4c8b16c4b = dnnl_aCB4c8b16c4b,
+        aCB16c2b = dnnl_aCB16c2b,
+        aCB16c4b = dnnl_aCB16c4b,
+        BA16b2a = dnnl_BA16b2a,
+        BA16b4a = dnnl_BA16b4a,
+        BA4b4a = dnnl_BA4b4a,
+        BA8b4a = dnnl_BA8b4a,
+        aBC16b16c = dnnl_aBC16b16c,
+        aBC16b32c = dnnl_aBC16b32c,
+        AB16a16b = dnnl_AB16a16b,
+        AB16a32b = dnnl_AB16a32b,
+        ABcde16a16b2a = dnnl_ABcde16a16b2a,
+        aBCdef16b16c2b = dnnl_aBCdef16b16c2b,
+        Acedb16a = dnnl_Acedb16a,
+        aBdfec16b = dnnl_aBdfec16b,
+        Odwhi16o = dnnl_Odwhi16o,
+        gOdwhi16o = dnnl_gOdwhi16o,
+        abdEC64e2c = dnnl_abdEC64e2c,
+        abdEC64e4c = dnnl_abdEC64e4c,
+        ldgOI64o2i = abdEC64e2c,
+        ldgOI64o4i = abdEC64e4c,
+        abCd4c = dnnl_abCd4c,
+        abCde4c = dnnl_abCde4c,
+        abCdef4c = dnnl_abCdef4c,
+        abCde32c = dnnl_abCde32c,
+        abCdef32c = dnnl_abCdef32c,
+        aCdefB16b32c2b = dnnl_aCdefB16b32c2b,
+        aCdefB16b32c4b = dnnl_aCdefB16b32c4b,
+        aCdefB16b48c2b = dnnl_aCdefB16b48c2b,
+        aCdefB16b48c4b = dnnl_aCdefB16b48c4b,
+        aCdefB16b64c2b = dnnl_aCdefB16b64c2b,
+        aCdefB16b64c4b = dnnl_aCdefB16b64c4b,
+        BcdeA16a32b2a = dnnl_BcdeA16a32b2a,
+        BcdeA16a32b4a = dnnl_BcdeA16a32b4a,
+        BcdeA16a48b2a = dnnl_BcdeA16a48b2a,
+        BcdeA16a48b4a = dnnl_BcdeA16a48b4a,
+        BcdeA16a64b2a = dnnl_BcdeA16a64b2a,
+        BcdeA16a64b4a = dnnl_BcdeA16a64b4a,
+        aCdefb32c = dnnl_aCdefb32c,
+        aCdefB32c2b = dnnl_aCdefB32c2b,
+        aCdefB32c4b = dnnl_aCdefB32c4b,
+        aCdefb48c = dnnl_aCdefb48c,
+        aCdefB48c2b = dnnl_aCdefB48c2b,
+        aCdefB48c4b = dnnl_aCdefB48c4b,
+        aCdefb64c = dnnl_aCdefb64c,
+        aCdefB64c2b = dnnl_aCdefB64c2b,
+        aCdefB64c4b = dnnl_aCdefB64c4b,
+        Bcdea32b = dnnl_Bcdea32b,
+        BcdeA32b2a = dnnl_BcdeA32b2a,
+        BcdeA32b4a = dnnl_BcdeA32b4a,
+        Bcdea48b = dnnl_Bcdea48b,
+        BcdeA48b2a = dnnl_BcdeA48b2a,
+        BcdeA48b4a = dnnl_BcdeA48b4a,
+        Bcdea64b = dnnl_Bcdea64b,
+        BcdeA64b2a = dnnl_BcdeA64b2a,
+        BcdeA64b4a = dnnl_BcdeA64b4a,
+        Bca32b = dnnl_Bca32b,
+        BcA32b2a = dnnl_BcA32b2a,
+        BcA32b4a = dnnl_BcA32b4a,
+        Bca48b = dnnl_Bca48b,
+        BcA48b2a = dnnl_BcA48b2a,
+        BcA48b4a = dnnl_BcA48b4a,
+        Bca64b = dnnl_Bca64b,
+        BcA64b2a = dnnl_BcA64b2a,
+        BcA64b4a = dnnl_BcA64b4a,
+        aCdb32c = dnnl_aCdb32c,
+        aCdB32c2b = dnnl_aCdB32c2b,
+        aCdB32c4b = dnnl_aCdB32c4b,
+        aCdb48c = dnnl_aCdb48c,
+        aCdB48c2b = dnnl_aCdB48c2b,
+        aCdB48c4b = dnnl_aCdB48c4b,
+        aCdb64c = dnnl_aCdb64c,
+        aCdB64c2b = dnnl_aCdB64c2b,
+        aCdB64c4b = dnnl_aCdB64c4b,
+        BcA16a16b2a = dnnl_BcA16a16b2a,
+        BcA16a16b4a = dnnl_BcA16a16b4a,
+        BcdA16a16b2a = dnnl_BcdA16a16b2a,
+        BcdA16a16b4a = dnnl_BcdA16a16b4a,
+        BcdeA16a16b2a = dnnl_BcdeA16a16b2a,
+        BcdeA16a16b4a = dnnl_BcdeA16a16b4a,
+        aCdB16b16c2b = dnnl_aCdB16b16c2b,
+        aCdB16b16c4b = dnnl_aCdB16b16c4b,
+        aCdeB16b16c2b = dnnl_aCdeB16b16c2b,
+        aCdeB16b16c4b = dnnl_aCdeB16b16c4b,
+        aCdefB16b16c2b = dnnl_aCdefB16b16c2b,
+        aCdefB16b16c4b = dnnl_aCdefB16b16c4b,
+        BcA16a32b2a = dnnl_BcA16a32b2a,
+        BcA16a32b4a = dnnl_BcA16a32b4a,
+        BcA16a48b2a = dnnl_BcA16a48b2a,
+        BcA16a48b4a = dnnl_BcA16a48b4a,
+        BcA16a64b2a = dnnl_BcA16a64b2a,
+        BcA16a64b4a = dnnl_BcA16a64b4a,
+        aCdB16b32c2b = dnnl_aCdB16b32c2b,
+        aCdB16b32c4b = dnnl_aCdB16b32c4b,
+        aCdB16b48c2b = dnnl_aCdB16b48c2b,
+        aCdB16b48c4b = dnnl_aCdB16b48c4b,
+        aCdB16b64c2b = dnnl_aCdB16b64c2b,
+        aCdB16b64c4b = dnnl_aCdB16b64c4b,
+        BcdA16a32b2a = dnnl_BcdA16a32b2a,
+        BcdA16a32b4a = dnnl_BcdA16a32b4a,
+        BcdA16a48b2a = dnnl_BcdA16a48b2a,
+        BcdA16a48b4a = dnnl_BcdA16a48b4a,
+        BcdA16a64b2a = dnnl_BcdA16a64b2a,
+        BcdA16a64b4a = dnnl_BcdA16a64b4a,
+        aCdeB16b32c2b = dnnl_aCdeB16b32c2b,
+        aCdeB16b32c4b = dnnl_aCdeB16b32c4b,
+        aCdeB16b48c2b = dnnl_aCdeB16b48c2b,
+        aCdeB16b48c4b = dnnl_aCdeB16b48c4b,
+        aCdeB16b64c2b = dnnl_aCdeB16b64c2b,
+        aCdeB16b64c4b = dnnl_aCdeB16b64c4b,
+        Bca16b = dnnl_Bca16b,
+        BcA16b2a = dnnl_BcA16b2a,
+        BcA16b4a = dnnl_BcA16b4a,
+        Bcda16b = dnnl_Bcda16b,
+        BcdA16b2a = dnnl_BcdA16b2a,
+        BcdA16b4a = dnnl_BcdA16b4a,
+        Bcdea16b = dnnl_Bcdea16b,
+        BcdeA16b2a = dnnl_BcdeA16b2a,
+        BcdeA16b4a = dnnl_BcdeA16b4a,
+        aCdb16c = dnnl_aCdb16c,
+        aCdB16c2b = dnnl_aCdB16c2b,
+        aCdB16c4b = dnnl_aCdB16c4b,
+        aCdeb16c = dnnl_aCdeb16c,
+        aCdeB16c2b = dnnl_aCdeB16c2b,
+        aCdeB16c4b = dnnl_aCdeB16c4b,
+        aCdefb16c = dnnl_aCdefb16c,
+        aCdefB16c2b = dnnl_aCdefB16c2b,
+        aCdefB16c4b = dnnl_aCdefB16c4b,
+        Bcda32b = dnnl_Bcda32b,
+        BcdA32b2a = dnnl_BcdA32b2a,
+        BcdA32b4a = dnnl_BcdA32b4a,
+        Bcda48b = dnnl_Bcda48b,
+        BcdA48b2a = dnnl_BcdA48b2a,
+        BcdA48b4a = dnnl_BcdA48b4a,
+        Bcda64b = dnnl_Bcda64b,
+        BcdA64b2a = dnnl_BcdA64b2a,
+        BcdA64b4a = dnnl_BcdA64b4a,
+        aCdeb32c = dnnl_aCdeb32c,
+        aCdeB32c2b = dnnl_aCdeB32c2b,
+        aCdeB32c4b = dnnl_aCdeB32c4b,
+        aCdeb48c = dnnl_aCdeb48c,
+        aCdeB48c2b = dnnl_aCdeB48c2b,
+        aCdeB48c4b = dnnl_aCdeB48c4b,
+        aCdeb64c = dnnl_aCdeb64c,
+        aCdeB64c2b = dnnl_aCdeB64c2b,
+        aCdeB64c4b = dnnl_aCdeB64c4b,
+        NChw16n32c = dnnl_NChw16n32c,
+        goIw4i = dnnl_goIw4i,
+        goIw32i = dnnl_goIw32i,
+        goIhw4i = dnnl_goIhw4i,
+        goIhw32i = dnnl_goIhw32i,
+        goIdhw4i = dnnl_goIdhw4i,
+        goIdhw32i = dnnl_goIdhw32i,
+        cab = dnnl_cab,
+        cdab = dnnl_cdab,
+        cdeab = dnnl_cdeab,
+        woi = dnnl_woi,
+        hwoi = dnnl_hwoi,
+        dhwoi = dnnl_dhwoi,
+        Owi24o = dnnl_Owi24o,
+        Ohwi24o = dnnl_Ohwi24o,
+        Odhwi24o = dnnl_Odhwi24o,
+        gOwi24o = dnnl_gOwi24o,
+        gOhwi24o = dnnl_gOhwi24o,
+        gOdhwi24o = dnnl_gOdhwi24o,
+        OwI24o2i = dnnl_OwI24o2i,
+        OhwI24o2i = dnnl_OhwI24o2i,
+        OdhwI24o2i = dnnl_OdhwI24o2i,
+        gOwI24o2i = dnnl_gOwI24o2i,
+        gOhwI24o2i = dnnl_gOhwI24o2i,
+        gOdhwI24o2i = dnnl_gOdhwI24o2i,
+        OwI24o4i = dnnl_OwI24o4i,
+        OhwI24o4i = dnnl_OhwI24o4i,
+        OdhwI24o4i = dnnl_OdhwI24o4i,
+        gOwI24o4i = dnnl_gOwI24o4i,
+        gOhwI24o4i = dnnl_gOhwI24o4i,
+        gOdhwI24o4i = dnnl_gOdhwI24o4i,
+        OI8i32o = dnnl_OI8i32o,
+        OIw8i32o = dnnl_OIw8i32o,
+        OwI8i32o = dnnl_OwI8i32o,
+        OIhw8i32o = dnnl_OIhw8i32o,
+        OhwI8i32o = dnnl_OhwI8i32o,
+        OIdhw8i32o = dnnl_OIdhw8i32o,
+        OdhwI8i32o = dnnl_OdhwI8i32o,
+        OI8i24o = dnnl_OI8i24o,
+        OIw8i24o = dnnl_OIw8i24o,
+        OwI8i24o = dnnl_OwI8i24o,
+        OIhw8i24o = dnnl_OIhw8i24o,
+        OhwI8i24o = dnnl_OhwI8i24o,
+        OIdhw8i24o = dnnl_OIdhw8i24o,
+        OdhwI8i24o = dnnl_OdhwI8i24o,
+        OI8i16o = dnnl_OI8i16o,
+        OIw8i16o = dnnl_OIw8i16o,
+        OwI8i16o = dnnl_OwI8i16o,
+        OIhw8i16o = dnnl_OIhw8i16o,
+        OhwI8i16o = dnnl_OhwI8i16o,
+        OIdhw8i16o = dnnl_OIdhw8i16o,
+        OdhwI8i16o = dnnl_OdhwI8i16o,
+        OI8i8o = dnnl_OI8i8o,
+        AB4b8a4b = dnnl_AB4b8a4b,
+        AB4b24a4b = dnnl_AB4b24a4b,
+        ABc4b8a4b = dnnl_ABc4b8a4b,
+        AcB4b8a4b = dnnl_AcB4b8a4b,
+        ABc4b24a4b = dnnl_ABc4b24a4b,
+        AcB4b24a4b = dnnl_AcB4b24a4b,
+        ABcd4b8a4b = dnnl_ABcd4b8a4b,
+        AcdB4b8a4b = dnnl_AcdB4b8a4b,
+        ABcd4b24a4b = dnnl_ABcd4b24a4b,
+        AcdB4b24a4b = dnnl_AcdB4b24a4b,
+        ABcde4b8a4b = dnnl_ABcde4b8a4b,
+        AcdeB4b8a4b = dnnl_AcdeB4b8a4b,
+        ABcde4b24a4b = dnnl_ABcde4b24a4b,
+        AcdeB4b24a4b = dnnl_AcdeB4b24a4b,
+        Bca8b = dnnl_Bca8b,
+        BcA8b2a = dnnl_BcA8b2a,
+        Bcda8b = dnnl_Bcda8b,
+        BcdA8b2a = dnnl_BcdA8b2a,
+        Bcdea8b = dnnl_Bcdea8b,
+        BcdeA8b2a = dnnl_BcdeA8b2a,
+        aCdb8c = dnnl_aCdb8c,
+        aCdB8c2b = dnnl_aCdB8c2b,
+        aCdeb8c = dnnl_aCdeb8c,
+        aCdeB8c2b = dnnl_aCdeB8c2b,
+        aCdefb8c = dnnl_aCdefb8c,
+        aCdefB8c2b = dnnl_aCdefB8c2b,
+        Bca24b = dnnl_Bca24b,
+        BcA24b2a = dnnl_BcA24b2a,
+        Bcda24b = dnnl_Bcda24b,
+        BcdA24b2a = dnnl_BcdA24b2a,
+        Bcdea24b = dnnl_Bcdea24b,
+        BcdeA24b2a = dnnl_BcdeA24b2a,
+        aCdb24c = dnnl_aCdb24c,
+        aCdB24c2b = dnnl_aCdB24c2b,
+        aCdeb24c = dnnl_aCdeb24c,
+        aCdeB24c2b = dnnl_aCdeB24c2b,
+        aCdefb24c = dnnl_aCdefb24c,
+        aCdefB24c2b = dnnl_aCdefB24c2b,
+        Iwo8i = dnnl_Iwo8i,
+        IwO8i2o = dnnl_IwO8i2o,
+        Iwo24i = dnnl_Iwo24i,
+        IwO24i2o = dnnl_IwO24i2o,
+        Ihwo8i = dnnl_Ihwo8i,
+        IhwO8i2o = dnnl_IhwO8i2o,
+        Ihwo24i = dnnl_Ihwo24i,
+        IhwO24i2o = dnnl_IhwO24i2o,
+        Idhwo8i = dnnl_Idhwo8i,
+        IdhwO8i2o = dnnl_IdhwO8i2o,
+        Idhwo24i = dnnl_Idhwo24i,
+        IdhwO24i2o = dnnl_IdhwO24i2o,
+        gIwo8i = dnnl_gIwo8i,
+        gIwO8i2o = dnnl_gIwO8i2o,
+        gIwo24i = dnnl_gIwo24i,
+        gIwO24i2o = dnnl_gIwO24i2o,
+        gIhwo8i = dnnl_gIhwo8i,
+        gIhwO8i2o = dnnl_gIhwO8i2o,
+        gIhwo24i = dnnl_gIhwo24i,
+        gIhwO24i2o = dnnl_gIhwO24i2o,
+        gIdhwo8i = dnnl_gIdhwo8i,
+        gIdhwO8i2o = dnnl_gIdhwO8i2o,
+        gIdhwo24i = dnnl_gIdhwo24i,
+        gIdhwO24i2o = dnnl_gIdhwO24i2o,
+        OhwI24o = dnnl_OhwI24o,
+        gOhwI24o = dnnl_gOhwI24o,
+        AB8b24a2b = dnnl_AB8b24a2b,
+        ABc8b24a2b = dnnl_ABc8b24a2b,
+        AcB8b24a2b = dnnl_AcB8b24a2b,
+        ABcd8b24a2b = dnnl_ABcd8b24a2b,
+        AcdB8b24a2b = dnnl_AcdB8b24a2b,
+        ABcde8b24a2b = dnnl_ABcde8b24a2b,
+        AcdeB8b24a2b = dnnl_AcdeB8b24a2b,
+        AB8b8a2b = dnnl_AB8b8a2b,
+        ABc8b8a2b = dnnl_ABc8b8a2b,
+        AcB8b8a2b = dnnl_AcB8b8a2b,
+        ABcd8b8a2b = dnnl_ABcd8b8a2b,
+        AcdB8b8a2b = dnnl_AcdB8b8a2b,
+        ABcde8b8a2b = dnnl_ABcde8b8a2b,
+        AcdeB8b8a2b = dnnl_AcdeB8b8a2b,
+        OI8i8o2i = dnnl_OI8i8o2i,
+        OI8i24o2i = dnnl_OI8i24o2i,
+        OIw8i8o2i = dnnl_OIw8i8o2i,
+        OwI8i8o2i = dnnl_OwI8i8o2i,
+        OIw8i24o2i = dnnl_OIw8i24o2i,
+        OwI8i24o2i = dnnl_OwI8i24o2i,
+        OIhw8i8o2i = dnnl_OIhw8i8o2i,
+        OhwI8i8o2i = dnnl_OhwI8i8o2i,
+        OIhw8i24o2i = dnnl_OIhw8i24o2i,
+        OhwI8i24o2i = dnnl_OhwI8i24o2i,
+        OIdhw8i8o2i = dnnl_OIdhw8i8o2i,
+        OdhwI8i8o2i = dnnl_OdhwI8i8o2i,
+        OIdhw8i24o2i = dnnl_OIdhw8i24o2i,
+        OdhwI8i24o2i = dnnl_OdhwI8i24o2i,
+        BcA8b4a = dnnl_BcA8b4a,
+        BcdA8b4a = dnnl_BcdA8b4a,
+        BcdeA8b4a = dnnl_BcdeA8b4a,
+        aCdB8c4b = dnnl_aCdB8c4b,
+        aCdeB8c4b = dnnl_aCdeB8c4b,
+        aCdefB8c4b = dnnl_aCdefB8c4b,
+        BcA24b4a = dnnl_BcA24b4a,
+        BcdA24b4a = dnnl_BcdA24b4a,
+        BcdeA24b4a = dnnl_BcdeA24b4a,
+        aCdB24c4b = dnnl_aCdB24c4b,
+        aCdeB24c4b = dnnl_aCdeB24c4b,
+        aCdefB24c4b = dnnl_aCdefB24c4b,
+        ABc16a4b = dnnl_ABc16a4b,
+        ABcd16a4b = dnnl_ABcd16a4b,
+        ABcde16a4b = dnnl_ABcde16a4b,
+        IwO8i4o = dnnl_IwO8i4o,
+        IwO24i4o = dnnl_IwO24i4o,
+        IhwO8i4o = dnnl_IhwO8i4o,
+        IhwO24i4o = dnnl_IhwO24i4o,
+        IdhwO8i4o = dnnl_IdhwO8i4o,
+        IdhwO24i4o = dnnl_IdhwO24i4o,
+        gIwO8i4o = dnnl_gIwO8i4o,
+        gIwO24i4o = dnnl_gIwO24i4o,
+        gIhwO8i4o = dnnl_gIhwO8i4o,
+        gIhwO24i4o = dnnl_gIhwO24i4o,
+        gIdhwO8i4o = dnnl_gIdhwO8i4o,
+        gIdhwO24i4o = dnnl_gIdhwO24i4o,
+        BA2a24b = dnnl_BA2a24b,
+        aCB2b24c = dnnl_aCB2b24c,
+        BA2a8b = dnnl_BA2a8b,
+        aCB2b8c = dnnl_aCB2b8c,
+        BA8a24b = dnnl_BA8a24b,
+        aCB8b24c = dnnl_aCB8b24c,
+        BA8a16b = dnnl_BA8a16b,
+        aCB8b16c = dnnl_aCB8b16c,
+        BA8a8b = dnnl_BA8a8b,
+        aCB8b8c = dnnl_aCB8b8c,
+        bcad = dnnl_bcad,
+        cabd = dnnl_cabd,
+        dabc = dnnl_dabc,
+    };
+
+    /// A memory descriptor.
+    struct desc : public handle<dnnl_memory_desc_t> {
+        using handle<dnnl_memory_desc_t>::handle;
+
+        friend struct memory;
+
+        /// Default constructor: builds a zero (empty) memory descriptor.
+        ///
+        /// The descriptor is requested from the C API with zero dimensions,
+        /// no dimensions array, an undefined data type and an undefined
+        /// format tag. Such a memory descriptor can be used to indicate
+        /// absence of an argument. The status returned by the C API is
+        /// handed to error::wrap_c_api() together with a diagnostic message.
+        desc() {
+            dnnl_memory_desc_t empty_md = nullptr;
+            const dnnl_status_t st = dnnl_memory_desc_create_with_tag(
+                    &empty_md, 0, nullptr, dnnl_data_type_undef,
+                    dnnl_format_tag_undef);
+            error::wrap_c_api(st, "could not create a zero memory descriptor");
+            reset(empty_md);
+        }
+
+        /// Constructs a memory descriptor from dimensions, a data type and
+        /// a format tag.
+        ///
+        /// @note
+        ///     The logical order of dimensions corresponds to the `abc...`
+        ///     format tag, and the physical meaning of the dimensions depends
+        ///     both on the primitive that would operate on this memory and
+        ///     the operation context.
+        ///
+        /// @param adims Tensor dimensions; passed through validate_dims()
+        ///     before the C API is called.
+        /// @param adata_type Data precision/type.
+        /// @param aformat_tag Memory format tag.
+        /// @param allow_empty When true, the status returned by the C API is
+        ///     not passed to error::wrap_c_api(), and the handle is reset
+        ///     with whatever the call left in the descriptor pointer (which
+        ///     starts out as nullptr). This flag is optional and defaults to
+        ///     false.
+        desc(const dims &adims, data_type adata_type, format_tag aformat_tag,
+                bool allow_empty = false) {
+            validate_dims(adims);
+            const int dim_count = static_cast<int>(adims.size());
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t st = dnnl_memory_desc_create_with_tag(&c_md,
+                    dim_count, adims.data(), convert_to_c(adata_type),
+                    convert_to_c(aformat_tag));
+            if (!allow_empty) {
+                error::wrap_c_api(st,
+                        "could not construct a memory descriptor using a "
+                        "format tag");
+            }
+            reset(c_md);
+        }
+
+        /// Constructs a memory descriptor from dimensions, a data type and
+        /// explicit strides.
+        ///
+        /// @note
+        ///     The logical order of dimensions corresponds to the `abc...`
+        ///     format tag, and the physical meaning of the dimensions depends
+        ///     both on the primitive that would operate on this memory and
+        ///     the operation context.
+        ///
+        /// @param adims Tensor dimensions; passed through validate_dims()
+        ///     before the C API is called.
+        /// @param adata_type Data precision/type.
+        /// @param strides Strides for each dimension. When non-empty they are
+        ///     validated against the number of dimensions in @p adims; when
+        ///     empty, a null strides pointer is passed to the C API.
+        /// @param allow_empty When true, the status returned by the C API is
+        ///     not passed to error::wrap_c_api(), and the handle is reset
+        ///     with whatever the call left in the descriptor pointer (which
+        ///     starts out as nullptr). This flag is optional and defaults to
+        ///     false.
+        desc(const dims &adims, data_type adata_type, const dims &strides,
+                bool allow_empty = false) {
+            validate_dims(adims);
+            const int dim_count = static_cast<int>(adims.size());
+            const bool has_strides = !strides.empty();
+            if (has_strides) { validate_dims(strides, dim_count); }
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t st = dnnl_memory_desc_create_with_strides(
+                    &c_md, dim_count, adims.data(), convert_to_c(adata_type),
+                    has_strides ? strides.data() : nullptr);
+            if (!allow_empty) {
+                error::wrap_c_api(st,
+                        "could not construct a memory descriptor using "
+                        "strides");
+            }
+            reset(c_md);
+        }
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+        /// Function for creating a memory descriptor for CSR sparse encoding.
+        ///
+        /// The created memory descriptor will describe a memory object that
+        /// contains 3 buffers. The buffers have the following meaning and
+        /// assigned numbers (index):
+        ///  - 0: values
+        ///  - 1: indices
+        ///  - 2: pointers
+        ///
+        /// @param adims Tensor dimensions.
+        /// @param adata_type Data precision/type.
+        /// @param nnz Number of non-zero entries.
+        /// @param index_dt Data type of indices.
+        /// @param pointer_dt Data type of pointers.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be constructed. This flag is
+        ///     optional and defaults to false.
+        static desc csr(const dims &adims, data_type adata_type, dim nnz,
+                data_type index_dt, data_type pointer_dt,
+                bool allow_empty = false) {
+            validate_dims(adims);
+            const int ndims = static_cast<int>(adims.size());
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t rc = dnnl_memory_desc_create_with_csr_encoding(
+                    &c_md, ndims, adims.data(), convert_to_c(adata_type), nnz,
+                    convert_to_c(index_dt), convert_to_c(pointer_dt));
+            // The status is only enforced when the caller does not accept an
+            // empty result.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a memory descriptor for CSR sparse "
+                        "encoding");
+            }
+            return desc(c_md);
+        }
+
+        /// Function for creating a memory descriptor for COO sparse encodings.
+        ///
+        /// The created memory descriptor will describe a memory object that
+        /// contains n+1 buffers for an n-dimensional tensor.
+        /// The buffers have the following meaning and assigned numbers (index):
+        ///  - 0: values
+        ///  - 1: indices for dimension 0
+        ///  - 2: indices for dimension 1 ...
+        ///  - n: indices for dimension n-1
+        ///
+        /// @param adims Tensor dimensions.
+        /// @param adata_type Data precision/type.
+        /// @param nnz Number of non-zero entries.
+        /// @param index_dt Data type of indices.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be constructed. This flag is
+        ///     optional and defaults to false.
+        static desc coo(const dims &adims, data_type adata_type, dim nnz,
+                data_type index_dt, bool allow_empty = false) {
+            validate_dims(adims);
+            const int ndims = static_cast<int>(adims.size());
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t rc = dnnl_memory_desc_create_with_coo_encoding(
+                    &c_md, ndims, adims.data(), convert_to_c(adata_type), nnz,
+                    convert_to_c(index_dt));
+            // The status is only enforced when the caller does not accept an
+            // empty result.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a memory descriptor for COO sparse "
+                        "encoding");
+            }
+            return desc(c_md);
+        }
+
+        /// Function for creating a memory descriptor for packed sparse
+        /// encoding.
+        ///
+        /// The created memory descriptor cannot be used to create a memory
+        /// object. It can only be used to create a primitive descriptor to
+        /// query the actual memory descriptor (similar to the format tag
+        /// `any`).
+        ///
+        /// @warning
+        ///     The meaning and content of the handles of the memory object that
+        ///     is created using the queried memory descriptor are unspecified
+        ///     therefore using the content is an undefined behavior.
+        ///
+        /// @param adims Tensor dimensions.
+        /// @param adata_type Data precision/type.
+        /// @param nnz Number of non-zero entries.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be constructed. This flag is
+        ///     optional and defaults to false.
+        static desc packed(const dims &adims, data_type adata_type, dim nnz,
+                bool allow_empty = false) {
+            validate_dims(adims);
+            const int ndims = static_cast<int>(adims.size());
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_memory_desc_create_with_packed_encoding(&c_md,
+                            ndims, adims.data(), convert_to_c(adata_type),
+                            nnz);
+            // The status is only enforced when the caller does not accept an
+            // empty result.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a memory descriptor for packed "
+                        "sparse encoding");
+            }
+            return desc(c_md);
+        }
+#endif
+        /// Construct a memory descriptor from a C API ::dnnl_memory_desc_t
+        /// handle. The resulting handle is not weak and the C handle will be
+        /// destroyed during the destruction of the C++ object.
+        ///
+        /// @param md The C API memory descriptor.
+        // Not marked explicit, so a raw C handle converts implicitly to desc.
+        desc(dnnl_memory_desc_t md) : handle<dnnl_memory_desc_t>(md) {}
+
+        /// Construct a memory descriptor from a binary blob.
+        ///
+        /// @param blob A binary blob previously queried from a memory descriptor.
+        desc(const std::vector<uint8_t> &blob) {
+            dnnl_memory_desc_t c_md = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_memory_desc_create_with_blob(&c_md, blob.data());
+            // Unlike the other constructors there is no allow_empty escape
+            // hatch here: a non-success status is always reported.
+            error::wrap_c_api(
+                    rc, "could not create a memory descriptor from blob");
+            reset(c_md);
+        }
+
+        /// Constructs a memory descriptor for a region inside an area
+        /// described by this memory descriptor.
+        ///
+        /// @param adims Sizes of the region.
+        /// @param offsets Offsets to the region from the encompassing
+        ///     memory object in each dimension.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be returned. This flag is optional
+        ///     and defaults to false.
+        /// @returns A memory descriptor for the region.
+        desc submemory_desc(const dims &adims, const dims &offsets,
+                bool allow_empty = false) const {
+            // Sizes and offsets are each validated against this descriptor's
+            // number of dimensions.
+            const int ndims = get_ndims();
+            validate_dims(adims, ndims);
+            validate_dims(offsets, ndims);
+            dnnl_memory_desc_t c_sub = nullptr;
+            const dnnl_status_t rc = dnnl_memory_desc_create_submemory(
+                    &c_sub, get(), adims.data(), offsets.data());
+            if (!allow_empty) {
+                error::wrap_c_api(rc, "could not construct a sub-memory");
+            }
+            return desc(c_sub);
+        }
+
+        /// Constructs a memory descriptor by reshaping an existing one. The
+        /// new memory descriptor inherits the data type. This operation is
+        /// valid only for memory descriptors that have format_kind set to
+        /// #dnnl::memory::format_kind::blocked or
+        /// #dnnl::memory::format_kind::any.
+        ///
+        /// The operation ensures that the transformation of the physical memory
+        /// format corresponds to the transformation of the logical dimensions.
+        /// If such transformation is impossible, the function either throws an
+        /// exception (default) or returns a zero memory descriptor depending on
+        /// the `allow_empty` flag.
+        ///
+        /// The reshape operation can be described as a combination of the
+        /// following basic operations:
+        /// 1. Add a dimension of size `1`. This is always possible.
+        /// 2. Remove a dimension of size `1`. This is possible only if the
+        ///    dimension has no padding (i.e.
+        ///    `padded_dims[dim] == dims[dim] && dims[dim] == 1`).
+        /// 3. Split a dimension into multiple ones. This is possible only if
+        ///    the product of all tensor dimensions stays constant and the
+        ///    dimension being split does not have padding (i.e.
+        ///    `padded_dims[dim] == dims[dim]`).
+        /// 4. Join multiple consecutive dimensions into a single one. As in
+        ///    the cases above, this requires that the dimensions do not have
+        ///    padding and that the memory format is such that in physical
+        ///    memory these dimensions are dense and have the same order as
+        ///    their logical counterparts. This also assumes that these
+        ///    dimensions are not blocked.
+        ///    - Here, 'dense' means:
+        ///      `stride for dim[i] == (stride for dim[i + 1]) * dim[i + 1]`;
+        ///    - And 'same order' means:
+        ///      `i < j` if and only if `stride for dim[j] <= stride for dim[i]`.
+        ///
+        /// @warning
+        ///     Some combinations of physical memory layout and/or offsets or
+        ///     dimensions may result in a failure to make a reshape.
+        ///
+        /// @param adims New dimensions. The product of dimensions must
+        ///     remain constant.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be returned. This flag is optional
+        ///     and defaults to false.
+        /// @returns A new memory descriptor with new dimensions.
+        desc reshape(const dims &adims, bool allow_empty = false) const {
+            // The new dimensions are validated only when this descriptor
+            // reports a non-zero number of dimensions.
+            const bool has_dims = get_ndims() != 0;
+            if (has_dims) validate_dims(adims, 1);
+            const int new_ndims = static_cast<int>(adims.size());
+            dnnl_memory_desc_t c_reshaped = nullptr;
+            const dnnl_status_t rc = dnnl_memory_desc_reshape(
+                    &c_reshaped, get(), new_ndims, adims.data());
+            if (!allow_empty) {
+                error::wrap_c_api(rc, "could not reshape a memory descriptor");
+            }
+            return desc(c_reshaped);
+        }
+
+        /// Constructs a memory descriptor by permuting axes in an existing
+        /// one.
+        ///
+        /// The physical memory layout representation is adjusted accordingly
+        /// to maintain the consistency between the logical and physical parts
+        /// of the memory descriptor. The new memory descriptor inherits the
+        /// data type.
+        ///
+        /// This operation is valid only for memory descriptors that have
+        /// format_kind set to #dnnl::memory::format_kind::blocked or
+        /// #dnnl::memory::format_kind::any.
+        ///
+        /// The logical axes will be permuted in the following manner:
+        /// @code
+        /// for (i = 0; i < get_ndims(); i++)
+        ///     new_desc.dims()[permutation[i]] = dims()[i];
+        /// @endcode
+        ///
+        /// Example:
+        /// @code
+        ///     std::vector<int> permutation = {1, 0}; // swap the first and
+        ///                                            // the second axes
+        ///     dnnl::memory::desc in_md(
+        ///             {2, 3}, data_type, memory::format_tag::ab);
+        ///     dnnl::memory::desc expect_out_md(
+        ///             {3, 2}, data_type, memory::format_tag::ba);
+        ///
+        ///     assert(in_md.permute_axes(permutation) == expect_out_md);
+        /// @endcode
+        ///
+        /// @param permutation Axes permutation.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case a
+        ///     zero memory descriptor will be returned. This flag is optional
+        ///     and defaults to false.
+        /// @returns A new memory descriptor with new dimensions.
+        desc permute_axes(const std::vector<int> &permutation,
+                bool allow_empty = false) const {
+            // The permutation is validated against this descriptor's number
+            // of dimensions before it reaches the C API.
+            const int ndims = get_ndims();
+            validate_dims(permutation, ndims);
+            dnnl_memory_desc_t c_permuted = nullptr;
+            const dnnl_status_t rc = dnnl_memory_desc_permute_axes(
+                    &c_permuted, get(), permutation.data());
+            if (!allow_empty) {
+                error::wrap_c_api(
+                        rc, "could not permute axes of a memory descriptor");
+            }
+            return desc(c_permuted);
+        }
+
+        /// Returns a number of dimensions of the memory descriptor.
+        ///
+        /// @returns A number of dimensions.
+        int get_ndims() const {
+            // query_s32 yields 0 when the underlying query fails.
+            const int ndims = query_s32(query::ndims_s32);
+            return ndims;
+        }
+
+        /// Returns padded dimensions of the memory descriptor.
+        ///
+        /// @returns A copy of the padded dimensions vector.
+        memory::dims get_padded_dims() const {
+            // Copied out of the descriptor through the shared dims query.
+            memory::dims padded = query_dims(query::padded_dims);
+            return padded;
+        }
+
+        /// Returns padded offsets of the memory descriptor.
+        ///
+        /// @returns A copy of the padded offsets vector.
+        memory::dims get_padded_offsets() const {
+            // Copied out of the descriptor through the shared dims query.
+            memory::dims offsets = query_dims(query::padded_offsets);
+            return offsets;
+        }
+
+        /// Returns a submemory offset of the memory descriptor.
+        ///
+        /// @returns A submemory offset.
+        memory::dim get_submemory_offset() const {
+            dnnl_dim_t offset;
+            const dnnl_status_t rc = dnnl_memory_desc_query(
+                    get(), dnnl_query_submemory_offset_s64, &offset);
+            // A failed query is reported as a zero offset rather than an
+            // exception.
+            if (rc != dnnl_success) return 0;
+            return offset;
+        }
+
+        /// Returns strides of the memory descriptor.
+        ///
+        /// @note
+        ///     This API is only applicable to memory descriptors with format
+        ///     kind #dnnl_blocked.
+        ///
+        /// @returns A copy of the strides vector.
+        /// @returns An empty #dnnl::memory::dims if the memory descriptor
+        ///     does not have strides.
+        memory::dims get_strides() const {
+            // Empty when the strides query does not succeed.
+            memory::dims strides = query_dims(query::strides);
+            return strides;
+        }
+
+        /// Returns a number of inner blocks of the memory descriptor.
+        ///
+        /// @note
+        ///     This API is only applicable to memory descriptors with format
+        ///     kind #dnnl_blocked.
+        ///
+        /// @returns A number of inner blocks.
+        int get_inner_nblks() const {
+            // query_s32 yields 0 when the underlying query fails.
+            const int nblks = query_s32(query::inner_nblks_s32);
+            return nblks;
+        }
+
+        /// Returns inner blocks of the memory descriptor.
+        ///
+        /// @note
+        ///     This API is only applicable to memory descriptors with format
+        ///     kind #dnnl_blocked.
+        ///
+        /// @returns A copy of the inner blocks vector.
+        /// @returns An empty #dnnl::memory::dims if the memory descriptor
+        ///     does not have inner blocks.
+        memory::dims get_inner_blks() const {
+            // query_dims sizes this result by get_inner_nblks().
+            memory::dims blks = query_dims(query::inner_blks);
+            return blks;
+        }
+
+        /// Returns inner indices of the memory descriptor.
+        ///
+        /// @note
+        ///     This API is only applicable to memory descriptors with format
+        ///     kind #dnnl_blocked.
+        ///
+        /// @returns A copy of the inner indices vector.
+        /// @returns An empty #dnnl::memory::dims if the memory descriptor
+        ///     does not have inner indices.
+        memory::dims get_inner_idxs() const {
+            // query_dims sizes this result by get_inner_nblks().
+            memory::dims idxs = query_dims(query::inner_idxs);
+            return idxs;
+        }
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+        /// Returns number of handles.
+        ///
+        /// @returns A number of handles.
+        int get_num_handles() const {
+            int count;
+            const dnnl_status_t rc = dnnl_memory_desc_query_v2(
+                    get(), dnnl_query_num_handles_s32, 0, &count);
+            // Zero is reported when the query does not succeed.
+            if (rc != dnnl_success) return 0;
+            return count;
+        }
+
+        /// Returns a number of non-zero entries of the memory descriptor.
+        ///
+        /// @returns A number of non-zero entries.
+        dim get_nnz() const {
+            dnnl_dim_t count;
+            const dnnl_status_t rc = dnnl_memory_desc_query_v2(
+                    get(), dnnl_query_nnz_s64, 0, &count);
+            // Zero is reported when the query does not succeed.
+            if (rc != dnnl_success) return 0;
+            return count;
+        }
+
+        /// Returns the sparse encoding of the memory descriptor.
+        ///
+        /// @returns the sparse encoding kind.
+        memory::sparse_encoding get_sparse_encoding() const {
+            dnnl_sparse_encoding_t c_encoding;
+            const dnnl_status_t rc = dnnl_memory_desc_query_v2(
+                    get(), dnnl_query_sparse_encoding, 0, &c_encoding);
+            // A failed query maps to the undef encoding.
+            if (rc != dnnl_success)
+                return dnnl::memory::sparse_encoding::undef;
+            return static_cast<dnnl::memory::sparse_encoding>(c_encoding);
+        }
+
+        /// Returns the data type of the memory descriptor.
+        ///
+        /// @param index Data index. Defaults to 0.
+        /// @returns The data type.
+        memory::data_type get_data_type(int index = 0) const {
+            // Resolved through the indexed data-type query helper.
+            const memory::data_type dt
+                    = query_data_type(query::data_type, index);
+            return dt;
+        }
+#else
+        /// Returns the data type of the memory descriptor.
+        ///
+        /// @returns The data type.
+        memory::data_type get_data_type() const {
+            // Resolved through the data-type query helper.
+            const memory::data_type dt = query_data_type(query::data_type);
+            return dt;
+        }
+#endif
+
+        /// Returns the format kind of the memory descriptor.
+        ///
+        /// @returns the format kind.
+        memory::format_kind get_format_kind() const {
+            dnnl_format_kind_t c_kind;
+            const dnnl_status_t rc = dnnl_memory_desc_query(
+                    get(), dnnl_query_format_kind, &c_kind);
+            // A failed query maps to the undef format kind.
+            if (rc != dnnl_success) return dnnl::memory::format_kind::undef;
+            return static_cast<dnnl::memory::format_kind>(c_kind);
+        }
+
+        /// Returns dimensions of the memory descriptor.
+        ///
+        /// Potentially expensive due to the data copy involved.
+        /// @returns A copy of the dimensions vector.
+        memory::dims get_dims() const {
+            // Returns a fresh vector; query_dims copies the values out.
+            memory::dims result = query_dims(query::dims);
+            return result;
+        }
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+        /// Returns size of the memory descriptor in bytes.
+        /// @param index Data index. Defaults to 0.
+        /// @returns The number of bytes required to allocate a memory buffer
+        ///     for data with a particular @p index described by this memory
+        ///     descriptor including the padding area.
+        size_t get_size(int index = 0) const {
+            // The C API result is forwarded unchanged.
+            const size_t nbytes = dnnl_memory_desc_get_size_v2(get(), index);
+            return nbytes;
+        }
+#else
+        /// Returns size of the memory descriptor in bytes.
+        /// @returns The number of bytes required to allocate a memory buffer
+        ///     for the memory object described by this memory descriptor
+        ///     including the padding area.
+        size_t get_size() const {
+            // The C API result is forwarded unchanged.
+            const size_t nbytes = dnnl_memory_desc_get_size(get());
+            return nbytes;
+        }
+#endif
+
+        /// Returns a binary blob associated with the given memory descriptor.
+        /// @returns The binary blob associated with this memory descriptor.
+        // Serializes this descriptor into a byte vector. Declared const,
+        // like the other getters, because it only reads the handle.
+        // Throws via error::wrap_c_api if either C API call fails.
+        std::vector<uint8_t> get_blob() const {
+            // First call: a null output buffer requests only the blob size.
+            size_t size = 0;
+            error::wrap_c_api(dnnl_memory_desc_get_blob(nullptr, &size, get()),
+                    "could not get memory descriptor blob size");
+
+            // Second call: fill a buffer of exactly the reported size.
+            std::vector<uint8_t> out_blob(size);
+            error::wrap_c_api(
+                    dnnl_memory_desc_get_blob(out_blob.data(), &size, get()),
+                    "could not get memory descriptor blob");
+            return out_blob;
+        }
+
+        /// Checks whether the memory descriptor is zero (empty).
+        /// @returns @c true if the memory descriptor describes an empty
+        ///     memory and @c false otherwise.
+        bool is_zero() const {
+            // "Zero" here means the descriptor reports no dimensions.
+            const int ndims = get_ndims();
+            return ndims == 0;
+        }
+
+        /// An equality operator.
+        /// @param other Another memory descriptor.
+        /// @returns Whether this and the other memory descriptors have
+        ///     the same format tag, dimensions, strides, blocking, etc.
+        bool operator==(const desc &other) const {
+            // The C API reports equality as a non-zero integer.
+            const auto same = dnnl_memory_desc_equal(get(), other.get());
+            return same != 0;
+        }
+
+        /// An inequality operator.
+        /// @param other Another memory descriptor.
+        /// @returns Whether this and the other memory descriptors describe
+        ///     different memory.
+        bool operator!=(const desc &other) const {
+            // Defined strictly as the negation of operator==.
+            const bool equal = operator==(other);
+            return !equal;
+        }
+
+    private:
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+        memory::data_type query_data_type(query what, int index) const {
+            dnnl_data_type_t c_type;
+            const dnnl_status_t rc = dnnl_memory_desc_query_v2(
+                    get(), dnnl::convert_to_c(what), index, &c_type);
+            // A failed query maps to the undef data type.
+            if (rc != dnnl_success) return dnnl::memory::data_type::undef;
+            return static_cast<dnnl::memory::data_type>(c_type);
+        }
+#else
+        memory::data_type query_data_type(query what) const {
+            dnnl_data_type_t c_type;
+            const dnnl_status_t rc = dnnl_memory_desc_query(
+                    get(), dnnl::convert_to_c(what), &c_type);
+            // A failed query maps to the undef data type.
+            if (rc != dnnl_success) return dnnl::memory::data_type::undef;
+            return static_cast<dnnl::memory::data_type>(c_type);
+        }
+#endif
+
+        int query_s32(query what) const {
+            int value;
+            const dnnl_status_t rc = dnnl_memory_desc_query(
+                    get(), dnnl::convert_to_c(what), &value);
+            // Failures are folded into 0 instead of being thrown.
+            if (rc != dnnl_success) return 0;
+            return value;
+        }
+
+        // Copies a dims-valued property of the descriptor into a vector.
+        // Returns an empty vector when the query fails, yields no data, or
+        // the element count is not positive.
+        memory::dims query_dims(query what) const {
+            dnnl_dims_t *c_dims = nullptr;
+            const dnnl_status_t status = dnnl_memory_desc_query(
+                    get(), dnnl::convert_to_c(what), &c_dims);
+            // Bail out before the second query, and never dereference a
+            // pointer the C API did not set.
+            if (status != dnnl_success || c_dims == nullptr)
+                return memory::dims {};
+
+            // Inner-block properties have one entry per inner block; all
+            // other dims-valued properties have one entry per dimension.
+            const bool per_inner_block
+                    = what == query::inner_idxs || what == query::inner_blks;
+            const int count = per_inner_block ? get_inner_nblks() : get_ndims();
+            if (count <= 0) return memory::dims {};
+
+            return memory::dims(*c_dims, *c_dims + count);
+        }
+    };
+
+    /// Default constructor.
+    ///
+    /// Constructs an empty memory object, which can be used to indicate
+    /// absence of a parameter.
+    memory() = default; // compiler-generated default constructor
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    /// Constructs a memory object.
+    ///
+    /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory
+    /// object will have the underlying buffer set. In this case, the buffer
+    /// will be initialized as if #dnnl::memory::set_data_handle() had been
+    /// called.
+    ///
+    /// @sa memory::set_data_handle()
+    ///
+    /// @param md Memory descriptor.
+    /// @param aengine Engine to store the data on.
+    /// @param handle Handle of the memory buffer to use.
+    ///     - A pointer to the user-allocated buffer. In this case the library
+    ///       doesn't own the buffer.
+    ///     - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+    ///       allocate the buffer for the memory object. In this case the
+    ///       library owns the buffer.
+    ///     - #DNNL_MEMORY_NONE to create dnnl::memory without an underlying
+    ///       buffer.
+    memory(const desc &md, const engine &aengine, void *handle)
+        // Delegates to the multi-handle constructor with a one-element
+        // vector holding the given handle.
+        : memory(md, aengine, std::vector<void *> {handle}) {}
+
+    /// Constructs a memory object with multiple handles.
+    ///
+    /// Unless an element of @p handles is equal to #DNNL_MEMORY_NONE, the
+    /// constructed memory object will have the corresponding underlying
+    /// buffer set. In this case, the buffer will be initialized as if
+    /// #dnnl::memory::set_data_handle() had been called.
+    ///
+    /// @sa memory::set_data_handle()
+    ///
+    /// @param md Memory descriptor.
+    /// @param aengine Engine to store the data on.
+    /// @param handles Handles of the memory buffers to use.
+    ///     For each element of the @p handles vector the following applies:
+    ///     - A pointer to the user-allocated buffer. In this case the library
+    ///       doesn't own the buffer.
+    ///     - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+    ///       allocate the buffer for the memory object. In this case the
+    ///       library owns the buffer.
+    ///     - #DNNL_MEMORY_NONE Instructs the library to skip allocation of the
+    ///       memory buffer.
+    memory(const desc &md, const engine &aengine, std::vector<void *> handles) {
+        // The handle vector is forwarded as a count plus a raw array.
+        const int nhandles = static_cast<int>(handles.size());
+        dnnl_memory_t c_mem;
+        const dnnl_status_t rc = dnnl_memory_create_v2(
+                &c_mem, md.get(), aengine.get(), nhandles, handles.data());
+        error::wrap_c_api(rc, "could not create a memory object");
+        reset(c_mem);
+    }
+
+    /// Constructs a memory object.
+    ///
+    /// The underlying buffer(s) for the memory will be allocated by the
+    /// library.
+    /// @param md Memory descriptor.
+    /// @param aengine Engine to store the data on.
+    memory(const desc &md, const engine &aengine) {
+        // One DNNL_MEMORY_ALLOCATE entry per handle reported by the
+        // descriptor, so the library allocates every buffer.
+        std::vector<void *> buffers(
+                md.get_num_handles(), DNNL_MEMORY_ALLOCATE);
+        const int nbuffers = static_cast<int>(buffers.size());
+        dnnl_memory_t c_mem;
+        const dnnl_status_t rc = dnnl_memory_create_v2(
+                &c_mem, md.get(), aengine.get(), nbuffers, buffers.data());
+        error::wrap_c_api(rc, "could not create a memory object");
+        reset(c_mem);
+    }
+#else
+    /// Constructs a memory object.
+    ///
+    /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory
+    /// object will have the underlying buffer set. In this case, the buffer
+    /// will be initialized as if #dnnl::memory::set_data_handle() had been
+    /// called.
+    ///
+    /// @sa memory::set_data_handle()
+    ///
+    /// @param md Memory descriptor.
+    /// @param aengine Engine to store the data on.
+    /// @param handle Handle of the memory buffer to use.
+    ///     - A pointer to the user-allocated buffer. In this case the library
+    ///       doesn't own the buffer.
+    ///     - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+    ///       allocate the buffer for the memory object. In this case the
+    ///       library owns the buffer.
+    ///     - #DNNL_MEMORY_NONE to create dnnl::memory without an underlying
+    ///       buffer.
+    memory(const desc &md, const engine &aengine, void *handle) {
+        dnnl_memory_t c_mem;
+        const dnnl_status_t rc = dnnl_memory_create(
+                &c_mem, md.get(), aengine.get(), handle);
+        // A non-success status is reported before the handle is adopted.
+        error::wrap_c_api(rc, "could not create a memory object");
+        reset(c_mem);
+    }
+
+    /// Constructs a memory object.
+    ///
+    /// The underlying buffer for the memory will be allocated by the library.
+    ///
+    /// @param md Memory descriptor.
+    /// @param aengine Engine to store the data on.
+    memory(const desc &md, const engine &aengine)
+        // Delegates to the three-argument constructor, passing
+        // DNNL_MEMORY_ALLOCATE as the handle.
+        : memory(md, aengine, DNNL_MEMORY_ALLOCATE) {}
+#endif
+
+    /// Returns the associated memory descriptor.
+    desc get_desc() const {
+        // The memory object hands back a const descriptor handle; it is
+        // cloned so the returned wrapper holds its own descriptor.
+        const_dnnl_memory_desc_t borrowed;
+        dnnl_status_t rc = dnnl_memory_get_memory_desc(get(), &borrowed);
+        error::wrap_c_api(
+                rc, "could not get a memory descriptor from a memory object");
+        dnnl_memory_desc_t copy = nullptr;
+        rc = dnnl_memory_desc_clone(&copy, borrowed);
+        error::wrap_c_api(rc, "could not clone a memory descriptor");
+        return desc(copy);
+    }
+
+    /// Returns the associated engine.
+    engine get_engine() const {
+        dnnl_engine_t raw_engine;
+        const dnnl_status_t rc = dnnl_memory_get_engine(get(), &raw_engine);
+        error::wrap_c_api(rc, "could not get an engine from a memory object");
+        // NOTE(review): the second argument is presumably the "weak handle"
+        // flag of the engine wrapper; confirm against engine's constructor.
+        return engine(raw_engine, true);
+    }
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    /// Returns an underlying memory buffer that corresponds to the given index.
+    ///
+    /// On the CPU engine, or when using USM, this is a pointer to the
+    /// allocated memory.
+    void *get_data_handle(int index = 0) const {
+        void *buffer;
+        const dnnl_status_t rc
+                = dnnl_memory_get_data_handle_v2(get(), &buffer, index);
+        error::wrap_c_api(
+                rc, "could not get a native handle from a memory object");
+        return buffer;
+    }
+
+    /// Sets an underlying memory buffer that corresponds to the given index.
+    ///
+    /// @param handle Memory buffer to use. On the CPU engine or when USM is
+    ///     used, the memory buffer is a pointer to the actual data. For OpenCL
+    ///     it is a cl_mem. It must have at least
+    ///     #dnnl::memory::desc::get_size() bytes allocated.
+    /// @param index Memory index to attach the buffer. Defaults to 0.
+    void set_data_handle(void *handle, int index = 0) const {
+        // Const member: only the underlying C object is modified.
+        const dnnl_status_t rc
+                = dnnl_memory_set_data_handle_v2(get(), handle, index);
+        error::wrap_c_api(
+                rc, "could not set native handle of a memory object");
+    }
+
+    /// Maps a memory object and returns a host-side pointer to a memory
+    /// buffer with a copy of its contents. The memory buffer corresponds to
+    /// the given index.
+    ///
+    /// Mapping enables read/write directly from/to the memory contents for
+    /// engines that do not support direct memory access.
+    ///
+    /// Mapping is an exclusive operation - a memory object cannot be used in
+    /// other operations until it is unmapped via #dnnl::memory::unmap_data()
+    /// call.
+    ///
+    /// @note
+    ///     Any primitives working with the memory should be completed before
+    ///     the memory is mapped. Use #dnnl::stream::wait() to synchronize the
+    ///     corresponding execution stream.
+    ///
+    /// @note
+    ///     The map_data and unmap_data functions are provided mainly for
+    ///     debug and testing purposes and their performance may be suboptimal.
+    ///
+    /// @tparam T Data type to return a pointer to.
+    /// @param index Index of the buffer. Defaults to 0.
+    /// @returns Pointer to the mapped memory.
+    template <typename T = void>
+    T *map_data(int index = 0) const {
+        void *raw = nullptr;
+        // raw is only read after the status check below succeeds.
+        const dnnl_status_t rc = dnnl_memory_map_data_v2(get(), &raw, index);
+        error::wrap_c_api(rc, "could not map memory object data");
+        T *typed = static_cast<T *>(raw);
+        return typed;
+    }
+
+    /// Unmaps a memory object and writes back any changes made to the
+    /// previously mapped memory buffer. The memory buffer corresponds to
+    /// the given index.
+    ///
+    /// @note
+    ///     The map_data and unmap_data functions are provided mainly for
+    ///     debug and testing purposes and their performance may be
+    ///     suboptimal.
+    ///
+    /// @param mapped_ptr A pointer previously returned by
+    ///     #dnnl::memory::map_data().
+    /// @param index Index of the buffer. Defaults to 0.
+    void unmap_data(void *mapped_ptr, int index = 0) const {
+        const dnnl_status_t rc
+                = dnnl_memory_unmap_data_v2(get(), mapped_ptr, index);
+        error::wrap_c_api(rc, "could not unmap memory object data");
+    }
+#else
+    /// Returns the underlying memory buffer.
+    ///
+    /// On the CPU engine, or when using USM, this is a pointer to the
+    /// allocated memory.
+    void *get_data_handle() const {
+        void *buffer;
+        const dnnl_status_t rc = dnnl_memory_get_data_handle(get(), &buffer);
+        error::wrap_c_api(
+                rc, "could not get a native handle from a memory object");
+        return buffer;
+    }
+
+    /// Sets the underlying memory buffer.
+    ///
+    /// @param handle Memory buffer to use. On the CPU engine or when USM is
+    ///     used, the memory buffer is a pointer to the actual data. For OpenCL
+    ///     it is a cl_mem. It must have at least
+    ///     #dnnl::memory::desc::get_size() bytes allocated.
+    void set_data_handle(void *handle) const {
+        // Const member: only the underlying C object is modified.
+        const dnnl_status_t rc = dnnl_memory_set_data_handle(get(), handle);
+        error::wrap_c_api(
+                rc, "could not set native handle of a memory object");
+    }
+
+    /// Maps a memory object and returns a host-side pointer to a memory
+    /// buffer with a copy of its contents.
+    ///
+    /// Mapping enables read/write directly from/to the memory contents for
+    /// engines that do not support direct memory access.
+    ///
+    /// Mapping is an exclusive operation - a memory object cannot be used in
+    /// other operations until it is unmapped via #dnnl::memory::unmap_data()
+    /// call.
+    ///
+    /// @note
+    ///     Any primitives working with the memory should be completed before
+    ///     the memory is mapped. Use #dnnl::stream::wait() to synchronize the
+    ///     corresponding execution stream.
+    ///
+    /// @note
+    ///     The map_data and unmap_data functions are provided mainly for
+    ///     debug and testing purposes and their performance may be suboptimal.
+    ///
+    /// @tparam T Data type to return a pointer to.
+    /// @returns Pointer to the mapped memory.
+    template <typename T = void>
+    T *map_data() const {
+        // The C API reports the mapping as an untyped pointer.
+        void *raw_mapping;
+        const auto status = dnnl_memory_map_data(get(), &raw_mapping);
+        error::wrap_c_api(status, "could not map memory object data");
+
+        // Reinterpret the untyped mapping as the element type requested by
+        // the caller.
+        T *typed_mapping = static_cast<T *>(raw_mapping);
+        return typed_mapping;
+    }
+
+    /// Unmaps a memory object and writes back any changes made to the
+    /// previously mapped memory buffer.
+    ///
+    /// @note
+    ///     The map_data and unmap_data functions are provided mainly for
+    ///     debug and testing purposes and their performance may be
+    ///     suboptimal.
+    ///
+    /// @param mapped_ptr A pointer previously returned by
+    ///     #dnnl::memory::map_data().
+    void unmap_data(void *mapped_ptr) const {
+        // Pass the previously mapped pointer to the C API, then turn a
+        // failing status into a C++ error.
+        const auto status = dnnl_memory_unmap_data(get(), mapped_ptr);
+        error::wrap_c_api(status, "could not unmap memory object data");
+    }
+#endif
+
+    static dnnl_data_type_t convert_to_c(data_type adata_type) {
+        // Convert the C++ enumerator to the C API enum type.
+        const dnnl_data_type_t c_value
+                = static_cast<dnnl_data_type_t>(adata_type);
+        return c_value;
+    }
+    static dnnl_format_tag_t convert_to_c(format_tag format) {
+        // Convert the C++ enumerator to the C API enum type.
+        const dnnl_format_tag_t c_value
+                = static_cast<dnnl_format_tag_t>(format);
+        return c_value;
+    }
+};
+
+inline bool operator==(dnnl_data_type_t a, memory::data_type b) {
+    // Bring the C++ value into the C enum type, then compare like with like.
+    const dnnl_data_type_t b_as_c = memory::convert_to_c(b);
+    return a == b_as_c;
+}
+inline bool operator!=(dnnl_data_type_t a, memory::data_type b) {
+    // Defined as the negation of the mixed-type equality operator.
+    const bool same = (a == b);
+    return !same;
+}
+inline bool operator==(memory::data_type a, dnnl_data_type_t b) {
+    // Delegate to the overload that takes the C enum on the left.
+    const bool same = (b == a);
+    return same;
+}
+inline bool operator!=(memory::data_type a, dnnl_data_type_t b) {
+    // Defined as the negation of the mixed-type equality operator.
+    const bool same = (a == b);
+    return !same;
+}
+
+inline bool operator==(dnnl_format_tag_t a, memory::format_tag b) {
+    // Bring the C++ value into the C enum type, then compare like with like.
+    const dnnl_format_tag_t b_as_c = memory::convert_to_c(b);
+    return a == b_as_c;
+}
+inline bool operator!=(dnnl_format_tag_t a, memory::format_tag b) {
+    // Defined as the negation of the mixed-type equality operator.
+    const bool same = (a == b);
+    return !same;
+}
+inline bool operator==(memory::format_tag a, dnnl_format_tag_t b) {
+    // Delegate to the overload that takes the C enum on the left.
+    const bool same = (b == a);
+    return same;
+}
+inline bool operator!=(memory::format_tag a, dnnl_format_tag_t b) {
+    // Defined as the negation of the mixed-type equality operator.
+    const bool same = (a == b);
+    return !same;
+}
+
+/// @} dnnl_api_memory
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+/// @addtogroup dnnl_api_attributes Attributes
+///
+/// A container for parameters that extend primitives behavior.
+///
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+template <>
+struct handle_traits<dnnl_post_ops_t> {
+    /// Destroys a C API post-ops object and returns the resulting status.
+    static dnnl_status_t destructor(dnnl_post_ops_t p) {
+        const dnnl_status_t status = dnnl_post_ops_destroy(p);
+        return status;
+    }
+};
+/// @endcond
+
+/// Post-ops.
+///
+/// Post-ops are computations executed after the main primitive computations
+/// and are attached to the primitive via primitive attributes.
+///
+/// @sa @ref dev_guide_attributes_post_ops
+///
+struct post_ops : public handle<dnnl_post_ops_t> {
+    using handle<dnnl_post_ops_t>::handle;
+
+    /// Constructs an empty sequence of post-ops.
+    post_ops() {
+        dnnl_post_ops_t result;
+        error::wrap_c_api(
+                dnnl_post_ops_create(&result), "could not create post-ops");
+        reset(result);
+    }
+
+    /// Creates post-ops primitive attribute from a C API ::dnnl_post_ops_t
+    /// handle. The resulting handle is not weak and the C handle will be
+    /// destroyed during the destruction of the C++ object.
+    ///
+    /// @param post_ops The C API post-ops primitive attribute.
+    post_ops(dnnl_post_ops_t post_ops) : handle<dnnl_post_ops_t>(post_ops) {}
+
+    /// Returns the number of post-ops entries.
+    int len() const { return dnnl_post_ops_len(get()); }
+
+    /// Returns the primitive kind of post-op at entry with a certain index.
+    ///
+    /// An error is reported when @p index is negative or is not less than
+    /// len().
+    ///
+    /// @param index Index of the post-op to return the kind for.
+    /// @returns Primitive kind of the post-op at the specified index.
+    primitive::kind kind(int index) const {
+        // Reject negative indices as well as indices past the last entry;
+        // checking only the upper bound would let negative values through.
+        const bool in_range = index >= 0 && index < len();
+        error::wrap_c_api(in_range ? dnnl_success : dnnl_invalid_arguments,
+                "post-ops index is out of range");
+        return static_cast<primitive::kind>(
+                dnnl_post_ops_get_kind(get(), index));
+    }
+
+    /// Appends an accumulation (sum) post-op. Prior to accumulating the
+    /// result, the previous value will be reduced by zero point
+    /// @p zero_point and multiplied by a scaling factor @p scale.
+    ///
+    /// The kind of this post-op is #dnnl::primitive::kind::sum.
+    ///
+    /// This feature may improve performance for cases like dequantize the
+    /// asymmetrically quantized sum's src1 tensor to f32 domain before
+    /// performing the sum operation by subtracting @p zero_point before the
+    /// scaling.
+    ///
+    /// In the simplest case when the accumulation is the only post-op,
+    /// the computations will be `dst[:] := scale * (dst[:] - zero_point) +
+    /// op(...)` instead of `dst[:] := op(...)`.
+    ///
+    /// If @p data_type is specified, the original dst tensor will be
+    /// reinterpreted as a tensor with the provided data type. Because it is a
+    /// reinterpretation, data_type and dst data type should have the same size.
+    /// As a result, computations will be `dst[:] <- scale *
+    /// (as_data_type(dst[:]) - zero_point) + op(...)` instead of
+    /// `dst[:] <- op(...)`.
+    ///
+    /// @note
+    ///     This post-op executes in-place and does not change the
+    ///     destination layout.
+    ///
+    /// @param scale Scaling factor.
+    /// @param zero_point Zero point.
+    /// @param data_type Data type.
+    void append_sum(float scale = 1.f, int32_t zero_point = 0,
+            memory::data_type data_type = memory::data_type::undef) {
+        error::wrap_c_api(dnnl_post_ops_append_sum(get(), scale, zero_point,
+                                  memory::convert_to_c(data_type)),
+                "could not append a sum post-op");
+    }
+
+    /// Returns the parameters of an accumulation (sum) post-op.
+    ///
+    /// @param index Index of the sum post-op.
+    /// @param scale Scaling factor of the sum post-op.
+    void get_params_sum(int index, float &scale) const {
+        error::wrap_c_api(dnnl_post_ops_get_params_sum(
+                                  get(), index, &scale, nullptr, nullptr),
+                "could not get parameters of a sum post-op");
+    }
+
+    /// Returns the parameters of an accumulation (sum) post-op.
+    ///
+    /// @param index Index of the sum post-op.
+    /// @param scale Scaling factor of the sum post-op.
+    /// @param data_type Data type of the sum post-op.
+    void get_params_sum(
+            int index, float &scale, memory::data_type &data_type) const {
+        dnnl_data_type_t c_data_type;
+        error::wrap_c_api(dnnl_post_ops_get_params_sum(
+                                  get(), index, &scale, nullptr, &c_data_type),
+                "could not get parameters of a sum post-op");
+        data_type = static_cast<memory::data_type>(c_data_type);
+    }
+
+    /// Returns the parameters of an accumulation (sum) post-op.
+    ///
+    /// @param index Index of the sum post-op.
+    /// @param scale Scaling factor of the sum post-op.
+    /// @param zero_point Single scalar int32_t value of zeropoint.
+    /// @param data_type Data type of the sum post-op.
+    void get_params_sum(int index, float &scale, int32_t &zero_point,
+            memory::data_type &data_type) const {
+        dnnl_data_type_t c_data_type;
+        error::wrap_c_api(dnnl_post_ops_get_params_sum(get(), index, &scale,
+                                  &zero_point, &c_data_type),
+                "could not get parameters of a sum post-op");
+        data_type = static_cast<memory::data_type>(c_data_type);
+    }
+
+    /// Appends an elementwise post-op.
+    ///
+    /// The kind of this post-op is #dnnl::primitive::kind::eltwise.
+    ///
+    /// In the simplest case when the elementwise is the only post-op, the
+    /// computations would be `dst[:] := eltwise_op (op(...))` instead
+    /// of `dst[:] <- op(...)`, where eltwise_op is configured with the given
+    /// parameters.
+    ///
+    /// @param aalgorithm Elementwise algorithm.
+    /// @param alpha Alpha parameter for the elementwise algorithm.
+    /// @param beta Beta parameter for the elementwise algorithm.
+    void append_eltwise(algorithm aalgorithm, float alpha, float beta) {
+        error::wrap_c_api(dnnl_post_ops_append_eltwise(
+                                  get(), convert_to_c(aalgorithm), alpha, beta),
+                "could not append an elementwise post-op");
+    }
+
+    /// Returns parameters of an elementwise post-op.
+    ///
+    /// @param index Index of the post-op.
+    /// @param aalgorithm Output elementwise algorithm kind.
+    /// @param alpha Output alpha parameter for the elementwise algorithm.
+    /// @param beta Output beta parameter for the elementwise algorithm.
+    void get_params_eltwise(
+            int index, algorithm &aalgorithm, float &alpha, float &beta) const {
+        dnnl_alg_kind_t c_alg;
+        error::wrap_c_api(dnnl_post_ops_get_params_eltwise(
+                                  get(), index, &c_alg, &alpha, &beta),
+                "could not get parameters of an elementwise post-op");
+        aalgorithm = static_cast<dnnl::algorithm>(c_alg);
+    }
+
+    /// Appends a depthwise post-op convolution.
+    ///
+    /// This post-op can only be fused with a 2D 1x1 convolution (convolution
+    /// with weights spatial dimension equal to 1 i.e., kh=kw=1).
+    ///
+    /// The kind of this post-op is #dnnl_convolution.
+    ///
+    /// The number of outputs for primitive remain same as before. The output
+    /// spatial size can be derived as below:
+    ///
+    /// output_height = ceil(output_height_1x1_convolution, stride)
+    /// output_width = ceil(output_width_1x1_convolution, stride)
+    ///
+    /// See @ref dev_guide_attributes_post_ops_depthwise and
+    /// @ref dev_guide_attributes_post_ops_depthwise_fusion for more info.
+    ///
+    /// @param weights_data_type Weights data type of depthwise post-op
+    /// @param bias_data_type Bias data type of depthwise post-op
+    /// @param dst_data_type Output data type of depthwise post-op
+    /// @param kernel_size Size of kernel of depthwise post-op
+    /// @param stride_size Size of stride of depthwise post-op
+    /// @param padding_l_size Size of left and top paddings of depthwise post-op
+    void append_dw(memory::data_type weights_data_type,
+            memory::data_type bias_data_type, memory::data_type dst_data_type,
+            memory::dim kernel_size, memory::dim stride_size,
+            memory::dim padding_l_size) {
+
+        error::wrap_c_api(dnnl_post_ops_append_dw(get(),
+                                  memory::convert_to_c(weights_data_type),
+                                  memory::convert_to_c(bias_data_type),
+                                  memory::convert_to_c(dst_data_type),
+                                  kernel_size, stride_size, padding_l_size),
+                "could not append depthwise post-op");
+    }
+
+    /// Returns the parameters of a depthwise post-op.
+    ///
+    /// @param index Index of the depthwise post-op.
+    /// @param weights_data_type Weights data type of depthwise post-op
+    /// @param bias_data_type Bias data type of depthwise post-op
+    /// @param dst_data_type Output data type of depthwise post-op
+    /// @param kernel_size Size of kernel of depthwise post-op
+    /// @param stride_size Size of stride of depthwise post-op
+    /// @param padding_l_size Size of left and top paddings of depthwise post-op
+    void get_params_dw(int index, memory::data_type &weights_data_type,
+            memory::data_type &bias_data_type, memory::data_type &dst_data_type,
+            memory::dim &kernel_size, memory::dim &stride_size,
+            memory::dim &padding_l_size) const {
+
+        dnnl_data_type_t c_weights_data_type;
+        dnnl_data_type_t c_bias_data_type;
+        dnnl_data_type_t c_dst_data_type;
+        dnnl_dim_t c_kernel_size;
+        dnnl_dim_t c_stride_size;
+        dnnl_dim_t c_padding_l_size;
+        error::wrap_c_api(
+                dnnl_post_ops_get_params_dw(get(), index, &c_weights_data_type,
+                        &c_bias_data_type, &c_dst_data_type, &c_kernel_size,
+                        &c_stride_size, &c_padding_l_size),
+                "could not get parameters of depthwise post-op");
+
+        // Copy the C values into the caller's outputs only after the query
+        // has succeeded.
+        weights_data_type = static_cast<memory::data_type>(c_weights_data_type);
+        bias_data_type = static_cast<memory::data_type>(c_bias_data_type);
+        dst_data_type = static_cast<memory::data_type>(c_dst_data_type);
+        kernel_size = c_kernel_size;
+        stride_size = c_stride_size;
+        padding_l_size = c_padding_l_size;
+    }
+
+    /// Appends a binary post-op.
+    ///
+    /// The kind of this post operation is #dnnl_binary.
+    ///
+    /// In the simplest case when the binary is the only post operation, the
+    /// computations would be:
+    ///
+    ///     dst[:] <- binary_op (dst[:], another_input[:])
+    ///
+    /// where binary_op is configured with the given parameters. binary_op
+    /// supports broadcast semantics for a second operand.
+    ///
+    /// @param aalgorithm Binary algorithm for the post-op.
+    /// @param src1_desc Memory descriptor of a second operand.
+    void append_binary(algorithm aalgorithm, const memory::desc &src1_desc) {
+        error::wrap_c_api(dnnl_post_ops_append_binary(get(),
+                                  convert_to_c(aalgorithm), src1_desc.get()),
+                "could not append a binary post-op");
+    }
+
+    /// Returns the parameters of a binary post-op.
+    ///
+    /// @param index Index of the binary post-op.
+    /// @param aalgorithm Output binary algorithm kind.
+    /// @param src1_desc Output memory descriptor of a second operand.
+    void get_params_binary(
+            int index, algorithm &aalgorithm, memory::desc &src1_desc) const {
+        dnnl_alg_kind_t c_alg;
+        const_dnnl_memory_desc_t cdesc;
+        error::wrap_c_api(
+                dnnl_post_ops_get_params_binary(get(), index, &c_alg, &cdesc),
+                "could not get parameters of a binary post-op");
+        aalgorithm = static_cast<dnnl::algorithm>(c_alg);
+        // The query yields a const descriptor handle; clone it so the
+        // returned memory::desc wraps its own copy.
+        dnnl_memory_desc_t cloned_md = nullptr;
+        error::wrap_c_api(dnnl_memory_desc_clone(&cloned_md, cdesc),
+                "could not clone a memory descriptor");
+        src1_desc = memory::desc(cloned_md);
+    }
+
+    /// Appends a prelu forward post-op.
+    ///
+    /// The kind of this post-op is #dnnl::primitive::kind::prelu.
+    ///
+    /// The post-op can be defined as:
+    ///
+    ///      dst[:] <- prelu(dst[:], weights[:])
+    ///      prelu:
+    ///      dst[:] <- dst[:] if dst[:] > 0
+    ///      dst[:] <- dst[:] * weights[:] if dst[:] <= 0
+    ///
+    ///
+    /// Example usage:
+    /// @code
+    ///     int mb = 32, oc = 32,
+    ///         oh = 14, ow = 14; // convolution output params
+    ///     // unique weights per output channel
+    ///     vector<float> weights = { ... };
+    ///     int oc_dim = 1; // mb_dim = 0, channel_dim = 1, height_dim = 2, ...
+    ///
+    ///     // construct a convolution descriptor
+    ///     dnnl::convolution::desc conv_d;
+    ///
+    ///     dnnl::primitive_attr attr;
+    ///     attr.append_prelu(1 << oc_dim);
+    ///
+    ///     dnnl::primitive_desc conv_pd(conv_d, attr, engine);
+    ///     memory prelu_weights({{1}, dt::f32, {1}}, eng, weights.data());
+    ///
+    ///     std::unordered_map<int, memory> conv_args;
+    ///
+    ///     conv_args.insert(
+    ///      {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_WEIGHTS, prelu_weights})
+    /// @endcode
+    ///
+    /// @note
+    ///     The order of dimensions does not depend on how elements are laid
+    ///     out in memory. For example:
+    ///     - for a 2D CNN activations tensor the order is always (n, c)
+    ///     - for a 4D CNN activations tensor the order is always (n, c, h, w)
+    ///     - for a 5D CNN weights tensor the order is always
+    ///        (g, oc, ic, kh, kw)
+    ///
+    ///    Prelu weights tensor is passed in runtime execution phase. Prelu
+    ///    weights tensor data type is implicitly assumed as f32 using plain
+    ///    layout (a, ab, acb, acdb, acdeb).
+    ///
+    /// @param mask Defines the correspondence between the output tensor
+    ///     dimensions and the prelu weights tensor. The set i-th bit indicates
+    ///     that a dedicated weights value is used for each index along that
+    ///     dimension. Set the mask to 0 to use a common weights value
+    ///     for the whole output tensor.
+    void append_prelu(int mask) {
+        error::wrap_c_api(dnnl_post_ops_append_prelu(get(), mask),
+                "could not append a prelu post-op");
+    }
+
+    /// Returns the parameters of a prelu post-op.
+    ///
+    /// @param index Index of the prelu post-op.
+    /// @param mask Weights mask of prelu post-op.
+    void get_params_prelu(int index, int &mask) const {
+        error::wrap_c_api(dnnl_post_ops_get_params_prelu(get(), index, &mask),
+                "could not get parameters of a prelu post-op");
+    }
+};
+
+/// @cond DO_NOT_DOCUMENT_THIS
+template <>
+struct handle_traits<dnnl_primitive_attr_t> {
+    /// Destroys a C API primitive attributes object and returns the
+    /// resulting status.
+    static dnnl_status_t destructor(dnnl_primitive_attr_t p) {
+        const dnnl_status_t status = dnnl_primitive_attr_destroy(p);
+        return status;
+    }
+};
+/// @endcond
+
+/// Primitive attributes.
+///
+/// @sa @ref dev_guide_attributes
+struct primitive_attr : public handle<dnnl_primitive_attr_t> {
+    using handle<dnnl_primitive_attr_t>::handle;
+
+    /// Constructs default (empty) primitive attributes.
+    primitive_attr() {
+        // Create the C object first, then store its handle in this wrapper.
+        dnnl_primitive_attr_t c_attr;
+        const auto status = dnnl_primitive_attr_create(&c_attr);
+        error::wrap_c_api(status, "could not create primitive attribute");
+        reset(c_attr);
+    }
+
+    /// Creates primitive attributes from a C API ::dnnl_primitive_attr_t
+    /// handle. The resulting handle is not weak and the C handle will be
+    /// destroyed during the destruction of the C++ object.
+    ///
+    /// @param attr The C API primitive attributes.
+    primitive_attr(dnnl_primitive_attr_t attr)
+        : handle<dnnl_primitive_attr_t>(attr) {
+        // Nothing else to do: the base-class constructor receives the handle.
+    }
+
+    /// Returns the parameters of a dropout attribute.
+    ///
+    /// @param mask_desc Output memory descriptor of a dropout mask.
+    void get_dropout(memory::desc &mask_desc) const {
+        // Step 1: query the const descriptor handle from the attribute.
+        const_dnnl_memory_desc_t queried_md;
+        const auto query_status
+                = dnnl_primitive_attr_get_dropout(get(), &queried_md);
+        error::wrap_c_api(query_status,
+                "could not get parameters of a dropout attribute");
+
+        // Step 2: clone it and wrap the clone for the caller.
+        dnnl_memory_desc_t md_copy = nullptr;
+        const auto clone_status = dnnl_memory_desc_clone(&md_copy, queried_md);
+        error::wrap_c_api(clone_status, "could not clone a memory descriptor");
+        mask_desc = memory::desc(md_copy);
+    }
+
+    /// Sets the dropout primitive attribute.
+    ///
+    /// @param mask_desc Output memory descriptor of a dropout mask.
+    void set_dropout(const memory::desc &mask_desc) {
+        // Hand the mask descriptor's C handle to the attribute.
+        const auto status
+                = dnnl_primitive_attr_set_dropout(get(), mask_desc.get());
+        error::wrap_c_api(status, "could not set dropout primitive attribute");
+    }
+
+    /// Returns the fpmath mode
+    fpmath_mode get_fpmath_mode() const {
+        dnnl_fpmath_mode_t c_mode;
+        const auto status = dnnl_primitive_attr_get_fpmath_mode(get(), &c_mode);
+        error::wrap_c_api(
+                status, "could not get fpmath mode primitive attribute");
+        // Convert the C enum value to its C++ counterpart.
+        return static_cast<fpmath_mode>(c_mode);
+    }
+
+    /// Returns the fpmath mode
+    ///
+    /// @param mode Specified fpmath mode.
+    /// @param apply_to_int Use floating-point arithmetic for integer primitives.
+    void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const {
+        dnnl_fpmath_mode_t queried_mode;
+        int queried_flag;
+        const auto status = dnnl_primitive_attr_get_fpmath_mode_v2(
+                get(), &queried_mode, &queried_flag);
+        error::wrap_c_api(
+                status, "could not get fpmath mode primitive attribute");
+
+        // Outputs are written only after the query has succeeded; the C API
+        // reports the flag as an int, where any non-zero value means true.
+        mode = static_cast<fpmath_mode>(queried_mode);
+        apply_to_int = (queried_flag != 0);
+    }
+
+    /// Sets fpmath mode.
+    ///
+    /// @param mode Specified fpmath mode.
+    /// @param apply_to_int Boolean. Use of floating-point arithmetic for integer primitives.
+    void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) {
+        const auto c_mode = dnnl::convert_to_c(mode);
+        const auto status = dnnl_primitive_attr_set_fpmath_mode_v2(
+                get(), c_mode, apply_to_int);
+        error::wrap_c_api(
+                status, "could not set fpmath mode primitive attribute");
+    }
+
+    /// Returns the accumulation mode
+    accumulation_mode get_accumulation_mode() const {
+        dnnl_accumulation_mode_t c_mode;
+        const auto status
+                = dnnl_primitive_attr_get_accumulation_mode(get(), &c_mode);
+        error::wrap_c_api(
+                status, "could not get accumulation mode primitive attribute");
+        // Convert the C enum value to its C++ counterpart.
+        return static_cast<accumulation_mode>(c_mode);
+    }
+
+    /// Sets accumulation mode.
+    ///
+    /// @param mode Specified accumulation mode.
+    void set_accumulation_mode(accumulation_mode mode) {
+        const auto c_mode = dnnl::convert_to_c(mode);
+        const auto status
+                = dnnl_primitive_attr_set_accumulation_mode(get(), c_mode);
+        error::wrap_c_api(
+                status, "could not set accumulation mode primitive attribute");
+    }
+
+    /// Returns the deterministic attribute value
+    bool get_deterministic() const {
+        // The C API reports the flag as an int; non-zero means true.
+        int c_flag;
+        const auto status
+                = dnnl_primitive_attr_get_deterministic(get(), &c_flag);
+        error::wrap_c_api(
+                status, "could not get deterministic primitive attribute");
+        return c_flag != 0;
+    }
+
+    /// Sets deterministic attribute value
+    ///
+    /// @param value Specified deterministic mode.
+    void set_deterministic(bool value) {
+        // The C API takes the flag as an int (1 for true, 0 for false).
+        const int c_flag = value ? 1 : 0;
+        const auto status
+                = dnnl_primitive_attr_set_deterministic(get(), c_flag);
+        error::wrap_c_api(
+                status, "could not set deterministic primitive attribute");
+    }
+
+    /// Returns the rounding mode attribute value
+    ///
+    /// @param arg Argument for which rounding mode query applies.
+    /// @returns The rounding mode applied to the specified argument.
+    rounding_mode get_rounding_mode(int arg) const {
+        dnnl_rounding_mode_t c_mode;
+        const auto status
+                = dnnl_primitive_attr_get_rounding(get(), arg, &c_mode);
+        error::wrap_c_api(
+                status, "could not get rounding mode primitive attribute");
+        // Convert the C enum value to its C++ counterpart.
+        return static_cast<rounding_mode>(c_mode);
+    }
+
+    /// Sets the rounding mode attribute value for a given argument
+    ///
+    /// @param arg Argument for which to set rounding mode.
+    /// @param mode Rounding mode to apply.
+    void set_rounding_mode(int arg, rounding_mode mode) {
+        const auto c_mode = convert_to_c(mode);
+        const auto status
+                = dnnl_primitive_attr_set_rounding(get(), arg, c_mode);
+        error::wrap_c_api(
+                status, "could not set rounding mode primitive attribute");
+    }
+
+    /// Returns the scratchpad mode.
+    scratchpad_mode get_scratchpad_mode() const {
+        dnnl_scratchpad_mode_t c_mode;
+        const auto status
+                = dnnl_primitive_attr_get_scratchpad_mode(get(), &c_mode);
+        error::wrap_c_api(
+                status, "could not get scratchpad mode primitive attribute");
+        // Convert the C enum value to its C++ counterpart.
+        return static_cast<scratchpad_mode>(c_mode);
+    }
+
+    /// Sets scratchpad mode.
+    ///
+    /// @param mode Specified scratchpad mode.
+    void set_scratchpad_mode(scratchpad_mode mode) {
+        const auto c_mode = dnnl::convert_to_c(mode);
+        const auto status
+                = dnnl_primitive_attr_set_scratchpad_mode(get(), c_mode);
+        error::wrap_c_api(
+                status, "could not set scratchpad mode primitive attribute");
+    }
+
+    /// Sets scaling factors for primitive operations for a given memory
+    /// argument. The scaling factors must be passed at execution time
+    /// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_scales_mask
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p scales
+    ///     vector. The set i-th bit indicates that a dedicated scaling factor
+    ///     is used for each index along that dimension. Set the mask to 0 to
+    ///     use a common scaling factor for the whole output tensor.
+    void set_scales_mask(int arg, int mask) {
+        // Forward the argument index and mask unchanged to the C API.
+        const auto status
+                = dnnl_primitive_attr_set_scales_mask(get(), arg, mask);
+        error::wrap_c_api(status, "could not set scales primitive attribute");
+    }
+
+    /// Sets scaling factors for primitive operations for a given memory
+    /// argument. The scaling factors must be passed at execution time
+    /// as an argument with index #DNNL_ARG_ATTR_SCALES | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_scales
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Scales correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated
+    ///     scale is used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scale for the whole output tensor.
+    /// @param groups Scaling factors correspondence groups that define the
+    ///     correspondence between the tensor dimensions and the scales array.
+    ///     The set i-th dimension indicates a number of groups of scaling
+    ///     factors used for that logical dimension in a memory indicated by @p arg.
+    /// @param data_type Scaling factors data_type.
+    void set_scales(int arg, int mask, const memory::dims &groups,
+            memory::data_type data_type = memory::data_type::f32) {
+        // The C API takes the groups as a (count, pointer) pair.
+        const int group_ndims = static_cast<int>(groups.size());
+        const auto c_data_type = memory::convert_to_c(data_type);
+        const auto status = dnnl_primitive_attr_set_scales(
+                get(), arg, mask, group_ndims, groups.data(), c_data_type);
+        error::wrap_c_api(status, "could not set scales primitive attribute");
+    }
+
+    /// Sets zero points for primitive operations for a given memory argument.
+    /// The zero points must be passed at execution time as an argument with
+    /// index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_zero_points_mask
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Zero point correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     zero_points vector. The set i-th bit indicates that a dedicated
+    ///     zero point is used for each index along that dimension. Set the
+    ///     mask to 0 to use a common zero point for the whole output tensor.
+    void set_zero_points_mask(int arg, int mask) {
+        // Forward the argument index and mask unchanged to the C API.
+        const auto status
+                = dnnl_primitive_attr_set_zero_points_mask(get(), arg, mask);
+        error::wrap_c_api(
+                status, "could not set zero points primitive attribute");
+    }
+
+    /// Sets zero points for primitive operations for a given memory argument.
+    /// The zero points must be passed at execution time as an argument with
+    /// index #DNNL_ARG_ATTR_ZERO_POINTS | arg.
+    ///
+    /// @sa dnnl_primitive_attr_set_zero_points
+    ///
+    /// @param arg Parameter argument index as passed to the
+    ///     primitive::execute() call.
+    /// @param mask Zero point correspondence mask that defines the
+    ///     correspondence between the tensor dimensions and the @p
+    ///     zero_points vector. The set i-th bit indicates that a dedicated
+    ///     zero point is used for each index along that dimension. Set the
+    ///     mask to 0 to use a common zero point for the whole output tensor.
+    /// @param groups Zero point factors correspondence groups that define the
+    ///     correspondence between the tensor dimensions and the zero_points array.
+    ///     The set i-th dimension indicates a number of groups of zero point
+    ///     factors used for that logical dimension in a memory indicated by @p arg.
+    /// @param data_type Zero point factors data_type.
+    void set_zero_points(int arg, int mask, const memory::dims &groups,
+            memory::data_type data_type = memory::data_type::s32) {
+        // The C API takes the groups as a (count, pointer) pair.
+        const int group_ndims = static_cast<int>(groups.size());
+        const auto c_data_type = memory::convert_to_c(data_type);
+        const auto status = dnnl_primitive_attr_set_zero_points(
+                get(), arg, mask, group_ndims, groups.data(), c_data_type);
+        error::wrap_c_api(
+                status, "could not set zero points primitive attribute");
+    }
+
+    /// Returns post-ops previously set via set_post_ops().
+    ///
+    /// @returns Post-ops.
+    const post_ops get_post_ops() const {
+        // Step 1: query the const post-ops handle from the attribute.
+        const_dnnl_post_ops_t queried_ops;
+        const auto query_status
+                = dnnl_primitive_attr_get_post_ops(get(), &queried_ops);
+        error::wrap_c_api(
+                query_status, "could not get post-ops primitive attribute");
+
+        // Step 2: clone it and return the clone wrapped in a post_ops object.
+        dnnl_post_ops_t ops_copy;
+        const auto clone_status = dnnl_post_ops_clone(&ops_copy, queried_ops);
+        error::wrap_c_api(
+                clone_status, "could not clone post-ops primitive attribute");
+        return post_ops(ops_copy);
+    }
+
+    /// Sets post-ops.
+    ///
+    /// @note
+    ///     There is no way to check whether the post-ops would be supported
+    ///     by the target primitive. Any error will be reported
+    ///     by the respective primitive descriptor constructor.
+    ///
+    /// @param ops Post-ops object to copy post-ops from.
+    void set_post_ops(const post_ops ops) {
+        // Pass the C handle of the given post-ops object to the attribute.
+        const auto status = dnnl_primitive_attr_set_post_ops(get(), ops.get());
+        error::wrap_c_api(
+                status, "could not set post-ops primitive attribute");
+    }
+
+    /// Sets quantization scale and shift parameters for RNN data tensors.
+    ///
+    /// For performance reasons, the low-precision configuration of the RNN
+    /// primitives expect input activations to have the unsigned 8-bit integer
+    /// data type. The scale and shift parameters are used to quantize
+    /// floating-point data to unsigned integer and must be passed to the RNN
+    /// primitive using attributes.
+    ///
+    /// The quantization formula is `scale * data + shift`.
+    ///
+    /// Example usage:
+    /// @code
+    ///     // RNN parameters
+    ///     int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32;
+    ///     // Activations quantization parameters
+    ///     float scale = 63.f, shift = 64.f;
+    ///
+    ///     primitive_attr attr;
+    ///
+    ///     // Set scale and shift for int8 quantization of activation
+    ///     attr.set_rnn_data_qparams(scale, shift);
+    ///
+    ///     // Create an RNN primitive descriptor.
+    ///     vanilla_rnn_forward::primitive_desc rnn_d(
+    ///             engine, /* arguments */, attr);
+    /// @endcode
+    ///
+    /// @note
+    ///     Quantization scale and shift are common for src_layer, src_iter,
+    ///     dst_iter, and dst_layer.
+    ///
+    /// @param scale The value to scale the data by.
+    /// @param shift The value to shift the data by.
+    void set_rnn_data_qparams(float scale, float shift) {
+        // Forward scale and shift unchanged to the C API.
+        const auto status
+                = dnnl_primitive_attr_set_rnn_data_qparams(get(), scale, shift);
+        error::wrap_c_api(status,
+                "could not set RNN data quantization parameters primitive "
+                "attribute");
+    }
+
+    /// Returns the quantization scale and shift parameters for RNN data
+    /// tensors.
+    ///
+    /// @note
+    ///     Quantization scale and shift are common for src_layer, src_iter,
+    ///     dst_iter, and dst_layer.
+    ///
+    /// @param scale The value to scale the data by.
+    /// @param shift The value to shift the data by.
+    void get_rnn_data_qparams(float &scale, float &shift) {
+        // Query into temporaries so that the output arguments are written
+        // only after error::wrap_c_api has checked the returned status.
+        float c_scale, c_shift;
+        // This is a getter, so the failure message must say "get" (it used to
+        // say "set", copied from set_rnn_data_qparams()).
+        error::wrap_c_api(dnnl_primitive_attr_get_rnn_data_qparams(
+                                  get(), &c_scale, &c_shift),
+                "could not get RNN data quantization parameters primitive "
+                "attribute");
+        scale = c_scale;
+        shift = c_shift;
+    }
+
+    /// Sets quantization scaling factors for RNN weights tensors. The
+    /// low-precision configuration of the RNN primitives expects input weights
+    /// to use the signed 8-bit integer data type. The scaling factors are
+    /// used to quantize floating-point data to signed integer and must be
+    /// passed to RNN primitives using attributes.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @note
+    ///     Quantization scales are common for weights_layer and
+    ///     weights_iteration
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void set_rnn_weights_qparams(int mask, const std::vector<float> &scales) {
+        // The element count travels next to the vector's raw data pointer.
+        const int count = static_cast<int>(scales.size());
+        const auto status = dnnl_primitive_attr_set_rnn_weights_qparams(
+                get(), count, mask, scales.data());
+        error::wrap_c_api(status,
+                "could not set RNN weights quantization parameters primitive "
+                "attribute");
+    }
+
+    /// Returns the quantization scaling factors for RNN weights
+    /// tensors.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void get_rnn_weights_qparams(int &mask, std::vector<float> &scales) {
+        dnnl_dim_t n_scales;
+        int queried_mask;
+        const float *queried_scales;
+        error::wrap_c_api(dnnl_primitive_attr_get_rnn_weights_qparams(get(),
+                                  &n_scales, &queried_mask, &queried_scales),
+                "could not get primitive RNN weights quantization "
+                "parameters attributes");
+
+        // Copy the returned array into the caller's vector, replacing its
+        // previous contents, then report the mask.
+        scales.assign(queried_scales, queried_scales + n_scales);
+        mask = queried_mask;
+    }
+
+    /// Sets quantization scaling factors for RNN projection weights tensors.
+    /// The low-precision configuration of the RNN primitives expects input
+    /// weights to use the signed 8-bit integer data type. The scaling factors
+    /// are used to quantize floating-point data to signed integer and must be
+    /// passed to RNN primitives using attributes.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @note
+    ///     Quantization scales are common for weights_layer and
+    ///     weights_iteration
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void set_rnn_weights_projection_qparams(
+            int mask, const std::vector<float> &scales) {
+        // The element count travels next to the vector's raw data pointer.
+        const int count = static_cast<int>(scales.size());
+        const auto status
+                = dnnl_primitive_attr_set_rnn_weights_projection_qparams(
+                        get(), count, mask, scales.data());
+        error::wrap_c_api(status,
+                "could not set primitive RNN weights projection quantization "
+                "parameters attributes");
+    }
+
+    /// Returns the quantization scaling factors for RNN projection weights
+    /// tensors.
+    ///
+    /// @note
+    ///     The dimension order is always native and does not depend on the
+    ///     actual layout used. For example, five-dimensional weights always
+    ///     have (l, d, i, g, o) logical dimension ordering.
+    ///
+    /// @param mask Scaling factors correspondence mask that defines the
+    ///     correspondence between the output tensor dimensions and the @p
+    ///     scales vector. The set i-th bit indicates that a dedicated scaling
+    ///     factor should be used for each index along that dimension. Set the
+    ///     mask to 0 to use a common scaling factor for the whole output
+    ///     tensor.
+    /// @param scales Constant vector of output scaling factors. The following
+    ///     equality must hold:
+    ///     \f$scales.size() = \prod\limits_{d \in mask} weights.dims[d].\f$
+    ///     Violations can only be detected when the attributes are used to
+    ///     create a primitive descriptor.
+    void get_rnn_weights_projection_qparams(
+            int &mask, std::vector<float> &scales) {
+        dnnl_dim_t n_scales;
+        int queried_mask;
+        const float *queried_scales;
+        error::wrap_c_api(
+                dnnl_primitive_attr_get_rnn_weights_projection_qparams(
+                        get(), &n_scales, &queried_mask, &queried_scales),
+                "could not get primitive RNN weights projection quantization "
+                "parameters attributes");
+
+        // Copy the returned array into the caller's vector, replacing its
+        // previous contents, then report the mask.
+        scales.assign(queried_scales, queried_scales + n_scales);
+        mask = queried_mask;
+    }
+};
+
+/// @} dnnl_api_attributes
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Base class for all primitive descriptors.
+struct primitive_desc_base : public handle<dnnl_primitive_desc_t> {
+    using handle<dnnl_primitive_desc_t>::handle;
+
+    /// Default constructor. Produces an empty object.
+    primitive_desc_base() = default;
+
+    /// Returns the engine of the primitive descriptor.
+    /// @returns The engine of the primitive descriptor.
+    engine get_engine() const {
+        // Served by the generic engine query helper.
+        return query_engine(query::engine);
+    }
+
+    /// Returns implementation name.
+    /// @returns The implementation name.
+    const char *impl_info_str() const {
+        const char *info_str;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_impl_info_str, 0, &info_str);
+        error::wrap_c_api(status,
+                "could not retrieve implementation info string from a "
+                "primitive descriptor");
+        // The pointer is returned exactly as the query produced it.
+        return info_str;
+    }
+
+    /// Returns a memory::dim value (same as int64_t).
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    memory::dim query_s64(query what) const {
+        memory::dim value;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl::convert_to_c(what), 0, &value);
+        // A query that does not succeed is reported as zero, not as an error.
+        if (status != dnnl_success) return 0;
+        return value;
+    }
+
+    /// Returns strides.
+    /// @returns Strides.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a strides parameter.
+    memory::dims get_strides() const {
+        // Forwarded to the generic dims query.
+        return query_dims(query::strides);
+    }
+
+    /// Returns dilations.
+    /// @returns Dilations.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a dilations parameter.
+    memory::dims get_dilations() const {
+        // Forwarded to the generic dims query.
+        return query_dims(query::dilations);
+    }
+
+    /// Returns a left padding.
+    /// @returns A left padding.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a left padding parameter.
+    memory::dims get_padding_l() const {
+        // Forwarded to the generic dims query.
+        return query_dims(query::padding_l);
+    }
+
+    /// Returns a right padding.
+    /// @returns A right padding.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a right padding parameter.
+    memory::dims get_padding_r() const {
+        // Forwarded to the generic dims query.
+        return query_dims(query::padding_r);
+    }
+
+    /// Returns an epsilon.
+    /// @returns An epsilon.
+    /// @returns Zero if the primitive does not have an epsilon parameter.
+    float get_epsilon() const {
+        // Forwarded to the generic float query.
+        return query_f32(query::epsilon_f32);
+    }
+
+    /// Returns flags.
+    /// @tparam T Flags enumeration type.
+    /// @returns Flags.
+    /// @returns Zero if the primitive does not have a flags parameter.
+    template <typename T = unsigned>
+    T get_flags() const {
+        unsigned raw_flags;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_flags, 0, &raw_flags);
+        // Fall back to an all-zero value when the query does not succeed.
+        if (status != dnnl_success) return static_cast<T>(0x0U);
+        return static_cast<T>(raw_flags);
+    }
+
+    /// Returns an algorithm kind.
+    /// @returns An algorithm kind.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     algorithm parameter.
+    dnnl::algorithm get_algorithm() const {
+        // Forwarded to the generic algorithm query.
+        return query_alg(query::alg_kind);
+    }
+
+    /// Returns an alpha.
+    /// @returns An alpha.
+    /// @returns Zero if the primitive does not have an alpha parameter.
+    float get_alpha() const {
+        // Forwarded to the generic float query.
+        return query_f32(query::alpha_f32);
+    }
+
+    /// Returns a beta.
+    /// @returns A beta.
+    /// @returns Zero if the primitive does not have a beta parameter.
+    float get_beta() const {
+        // Forwarded to the generic float query.
+        return query_f32(query::beta_f32);
+    }
+
+    /// Returns an axis.
+    /// @returns An axis.
+    /// @returns A negative number if the primitive does not have an axis
+    ///     parameter.
+    int get_axis() const {
+        int axis;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_axis_s32, 0, &axis);
+        // -1 marks a query that did not succeed.
+        if (status != dnnl_success) return -1;
+        return axis;
+    }
+
+    /// Returns an LRN local size parameter.
+    /// @returns An LRN local size parameter.
+    /// @returns Zero if the primitive does not have an LRN local size
+    ///     parameter.
+    memory::dim get_local_size() const {
+        // Forwarded to the generic 64-bit integer query.
+        const memory::dim local_size = query_s64(query::local_size_s64);
+        return local_size;
+    }
+
+    /// Returns an LRN K parameter.
+    /// @returns An LRN K parameter.
+    /// @returns Zero if the primitive does not have an LRN K parameter.
+    float get_k() const {
+        // Forwarded to the generic float query.
+        return query_f32(query::k_f32);
+    }
+
+    /// Returns a reduction P parameter.
+    /// @returns A reduction P parameter.
+    /// @returns Zero if the primitive does not have a reduction P parameter.
+    float get_p() const {
+        // Forwarded to the generic float query.
+        return query_f32(query::p_f32);
+    }
+
+    /// Returns the resampling factors parameter.
+    /// @returns A vector of factors.
+    /// @returns An empty vector if the primitive does not have a resampling
+    ///     factors parameter.
+    std::vector<float> get_factors() const {
+        float *factors = nullptr;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_factors, 0, &factors);
+
+        // No factors parameter: return the documented empty vector right
+        // away. Previously the code went on to query ndims from a memory
+        // descriptor that may be null here, turning the "not available" case
+        // into an error. query_dims() guards the same lookup with `if (md)`.
+        if (status != dnnl_success) return std::vector<float> {};
+
+        // Destination descriptor for forward propagation, diff destination
+        // otherwise; it supplies the tensor rank.
+        const dnnl::prop_kind pkind = get_prop_kind();
+        const bool is_backward = pkind != prop_kind::forward_training
+                && pkind != prop_kind::forward_inference;
+        const_dnnl_memory_desc_t md = dnnl_primitive_desc_query_md(get(),
+                is_backward ? dnnl_query_diff_dst_md : dnnl_query_dst_md, 0);
+
+        int ndims;
+        error::wrap_c_api(
+                dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &ndims),
+                "could not query ndims from a memory descriptor");
+
+        // (ndims - 2) factors are copied out of the queried array.
+        return std::vector<float>(factors, factors + (ndims - 2));
+    }
+
+    /// Returns an RNN cell kind parameter.
+    /// @returns An RNN cell kind parameter.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     RNN cell kind parameter.
+    dnnl::algorithm get_cell_kind() const {
+        // Forwarded to the generic algorithm query.
+        const dnnl::algorithm cell_kind = query_alg(query::cell_kind);
+        return cell_kind;
+    }
+
+    /// Returns an RNN direction parameter.
+    /// @returns An RNN direction parameter.
+    /// @returns #dnnl::rnn_direction::undef if the primitive does not have
+    ///     an RNN direction parameter.
+    dnnl::rnn_direction get_direction() const {
+        dnnl_rnn_direction_t c_direction;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_direction, 0, &c_direction);
+        // An unsuccessful query maps to the undefined direction.
+        if (status != dnnl_success) return dnnl::rnn_direction::undef;
+        return static_cast<dnnl::rnn_direction>(c_direction);
+    }
+
+    /// Returns an RNN activation kind parameter.
+    /// @returns An RNN activation kind parameter.
+    /// @returns #dnnl::algorithm::undef if the primitive does not have an
+    ///     RNN activation kind parameter.
+    dnnl::algorithm get_activation_kind() const {
+        // Forwarded to the generic algorithm query.
+        const dnnl::algorithm activation = query_alg(query::activation_kind);
+        return activation;
+    }
+
+    /// Returns a pooling kernel parameter.
+    /// @returns A pooling kernel parameter.
+    /// @returns An empty #dnnl::memory::dims if the primitive does not have
+    ///     a pooling kernel parameter.
+    memory::dims get_kernel() const {
+        // Forwarded to the generic dims query.
+        return query_dims(query::kernel);
+    }
+
+    /// Returns a group size parameter.
+    /// @returns A group size parameter.
+    /// @returns Zero if the primitive does not have a group size
+    ///     parameter.
+    memory::dim get_group_size() const {
+        // Forwarded to the generic 64-bit integer query.
+        const memory::dim group_size = query_s64(query::group_size_s64);
+        return group_size;
+    }
+
+    /// Returns a propagation kind.
+    /// @returns A propagation kind.
+    /// @returns #dnnl::prop_kind::undef if the primitive does not have
+    ///     a propagation parameter.
+    dnnl::prop_kind get_prop_kind() const {
+        dnnl_prop_kind_t c_kind;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_prop_kind, 0, &c_kind);
+        // An unsuccessful query maps to the undefined propagation kind.
+        if (status != dnnl_success) return dnnl::prop_kind::undef;
+        return static_cast<dnnl::prop_kind>(c_kind);
+    }
+
+    /// Returns a memory descriptor.
+    ///
+    /// @note
+    ///     There are also convenience methods
+    ///     #dnnl::primitive_desc_base::src_desc(),
+    ///     #dnnl::primitive_desc_base::dst_desc(), and others.
+    ///
+    /// @param what The kind of parameter to query; can be
+    ///     #dnnl::query::src_md, #dnnl::query::dst_md, etc.
+    /// @param idx Index of the parameter. For example, convolution bias can
+    ///     be queried with what = #dnnl::query::weights_md and idx = 1.
+    /// @returns The requested memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     parameter of the specified kind or index.
+    memory::desc query_md(query what, int idx = 0) const {
+        // Only the memory-descriptor query kinds listed here are accepted.
+        const bool is_md_query = what == query::src_md
+                || what == query::diff_src_md || what == query::weights_md
+                || what == query::diff_weights_md || what == query::dst_md
+                || what == query::diff_dst_md || what == query::workspace_md
+                || what == query::scratchpad_md || what == query::exec_arg_md;
+        if (!is_md_query)
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "memory descriptor query is invalid");
+
+        // A null result yields a default-constructed descriptor.
+        const_dnnl_memory_desc_t queried = dnnl_primitive_desc_query_md(
+                get(), dnnl::convert_to_c(what), idx);
+        if (queried == nullptr) return memory::desc();
+
+        // The returned object wraps a clone of the queried descriptor.
+        dnnl_memory_desc_t md_copy = nullptr;
+        const auto clone_status = dnnl_memory_desc_clone(&md_copy, queried);
+        error::wrap_c_api(clone_status, "could not clone a memory descriptor");
+
+        return memory::desc(md_copy);
+    }
+
+    /// Returns a source memory descriptor.
+    /// @param idx Source index.
+    /// @returns Source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     source parameter with index @p idx.
+    memory::desc src_desc(int idx) const {
+        const query kind = query::src_md;
+        return query_md(kind, idx);
+    }
+
+    /// Returns a destination memory descriptor.
+    /// @param idx Destination index.
+    /// @returns Destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     destination parameter with index @p idx.
+    memory::desc dst_desc(int idx) const {
+        const query kind = query::dst_md;
+        return query_md(kind, idx);
+    }
+
+    /// Returns a weights memory descriptor.
+    /// @param idx Weights index.
+    /// @returns Weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     weights parameter with index @p idx.
+    memory::desc weights_desc(int idx) const {
+        const query kind = query::weights_md;
+        return query_md(kind, idx);
+    }
+
+    /// Returns a diff source memory descriptor.
+    /// @param idx Diff source index.
+    /// @returns Diff source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff source parameter with index @p idx.
+    memory::desc diff_src_desc(int idx) const {
+        const query kind = query::diff_src_md;
+        return query_md(kind, idx);
+    }
+
+    /// Returns a diff destination memory descriptor.
+    /// @param idx Diff destination index.
+    /// @returns Diff destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff destination parameter with index @p idx.
+    memory::desc diff_dst_desc(int idx) const {
+        const query kind = query::diff_dst_md;
+        return query_md(kind, idx);
+    }
+
+    /// Returns a diff weights memory descriptor.
+    /// @param idx Diff weights index.
+    /// @returns Diff weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff weights parameter with index @p idx.
+    memory::desc diff_weights_desc(int idx) const {
+        const query kind = query::diff_weights_md;
+        return query_md(kind, idx);
+    }
+
+    // Separate versions without the index argument for documentation
+    // purposes.
+
+    /// Returns a source memory descriptor.
+    /// @returns Source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     source parameter.
+    memory::desc src_desc() const {
+        // Index-free convenience form: uses index 0.
+        return src_desc(0);
+    }
+
+    /// Returns a destination memory descriptor.
+    /// @returns Destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     destination parameter.
+    memory::desc dst_desc() const {
+        // Index-free convenience form: uses index 0.
+        return dst_desc(0);
+    }
+
+    /// Returns a weights memory descriptor.
+    /// @returns Weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     weights parameter.
+    memory::desc weights_desc() const {
+        // Index-free convenience form: uses index 0.
+        return weights_desc(0);
+    }
+
+    /// Returns a diff source memory descriptor.
+    /// @returns Diff source memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff source parameter.
+    memory::desc diff_src_desc() const {
+        // Index-free convenience form: uses index 0.
+        return diff_src_desc(0);
+    }
+
+    /// Returns a diff destination memory descriptor.
+    /// @returns Diff destination memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff destination parameter.
+    memory::desc diff_dst_desc() const {
+        // Index-free convenience form: uses index 0.
+        return diff_dst_desc(0);
+    }
+
+    /// Returns a diff weights memory descriptor.
+    /// @returns Diff weights memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///     diff weights parameter.
+    memory::desc diff_weights_desc() const {
+        // Index-free convenience form: uses index 0.
+        return diff_weights_desc(0);
+    }
+
+    /// Returns the workspace memory descriptor.
+    /// @returns Workspace memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not require
+    ///     workspace parameter.
+    memory::desc workspace_desc() const {
+        const query kind = query::workspace_md;
+        return query_md(kind, 0);
+    }
+
+    /// Returns the scratchpad memory descriptor.
+    /// @returns scratchpad memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not require
+    ///     scratchpad parameter.
+    /// @sa @ref dev_guide_attributes_scratchpad
+    memory::desc scratchpad_desc() const {
+        const query kind = query::scratchpad_md;
+        return query_md(kind, 0);
+    }
+
+    /// Returns the engine on which the scratchpad memory is located.
+    /// @returns The engine on which the scratchpad memory is located.
+    engine scratchpad_engine() const {
+        dnnl_engine_t c_scratchpad_engine;
+        const dnnl_status_t status = dnnl_primitive_desc_query(get(),
+                dnnl::convert_to_c(query::scratchpad_engine), 0,
+                &c_scratchpad_engine);
+        error::wrap_c_api(status,
+                "could not retrieve scratchpad engine from a primitive "
+                "descriptor");
+        // NOTE(review): the `true` argument presumably requests a weak
+        // (non-owning) handle, as query_engine() documents - confirm against
+        // the engine constructor.
+        return engine(c_scratchpad_engine, true);
+    }
+
+    /// Returns the primitive attributes.
+    /// @returns The primitive attributes.
+    primitive_attr get_primitive_attr() const {
+        // Step 1: fetch the descriptor's attributes as a const C handle.
+        const_dnnl_primitive_attr_t queried_attr;
+        const auto query_status
+                = dnnl_primitive_desc_get_attr(get(), &queried_attr);
+        error::wrap_c_api(query_status,
+                "could not get attributes from a primitive descriptor");
+
+        // Step 2: clone them and wrap the clone in the returned object.
+        dnnl_primitive_attr_t cloned_attr;
+        const auto clone_status
+                = dnnl_primitive_attr_clone(&cloned_attr, queried_attr);
+        error::wrap_c_api(
+                clone_status, "could not clone primitive attributes");
+        return primitive_attr(cloned_attr);
+    }
+
+    /// Returns the kind of the primitive descriptor.
+    /// @returns The kind of the primitive descriptor.
+    dnnl::primitive::kind get_kind() const {
+        dnnl_primitive_kind_t c_kind;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl_query_primitive_kind, 0, (void *)&c_kind);
+        error::wrap_c_api(status,
+                "could not get primitive kind from a primitive descriptor");
+        // Convert the C enumeration value to its C++ counterpart.
+        return static_cast<dnnl::primitive::kind>(c_kind);
+    }
+
+    /// Returns the cache blob ID of the primitive descriptor.
+    /// @returns The cache blob ID of the primitive descriptor.
+    std::vector<uint8_t> get_cache_blob_id() const {
+        // First query: number of uint8_t elements in the ID.
+        dnnl_dim_t id_size;
+        const dnnl_status_t size_status = dnnl_primitive_desc_query(get(),
+                dnnl::convert_to_c(query::cache_blob_id_size_s64), 0,
+                (void *)&id_size);
+        error::wrap_c_api(size_status,
+                "could not get size of cache blob ID from a primitive "
+                "descriptor");
+
+        // Second query: pointer to the ID data itself.
+        const uint8_t *id_data;
+        const dnnl_status_t data_status = dnnl_primitive_desc_query(get(),
+                dnnl::convert_to_c(query::cache_blob_id), 0,
+                (void **)&id_data);
+        error::wrap_c_api(data_status,
+                "could not get cache blob ID from a primitive descriptor");
+
+        // Return a copy of the queried bytes.
+        return std::vector<uint8_t>(id_data, id_data + id_size);
+    }
+
+protected:
+    /// Returns a float value.
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    /// @returns Zero if the primitive doesn't support the query.
+    float query_f32(query what) const {
+        float value;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl::convert_to_c(what), 0, &value);
+        // A query that does not succeed is reported as zero, not as an error.
+        if (status != dnnl_success) return 0.0f;
+        return value;
+    }
+
+    /// Returns an #dnnl::algorithm value.
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    /// @returns #dnnl::algorithm::undef if the primitive doesn't support
+    ///     the query.
+    algorithm query_alg(query what) const {
+        dnnl_alg_kind_t c_alg;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl::convert_to_c(what), 0, &c_alg);
+        // An unsuccessful query maps to the undefined algorithm.
+        if (status != dnnl_success) return algorithm::undef;
+        return static_cast<dnnl::algorithm>(c_alg);
+    }
+
+    /// Returns a memory::dims value.
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    /// @returns An empty #dnnl::memory::dims if the primitive doesn't support
+    ///     the query.
+    memory::dims query_dims(query what) const {
+        // The destination descriptor (diff destination unless the propagation
+        // kind is one of the two forward kinds) supplies the tensor rank.
+        const dnnl::prop_kind pkind = get_prop_kind();
+        const bool is_forward = pkind == prop_kind::forward_training
+                || pkind == prop_kind::forward_inference;
+        const_dnnl_memory_desc_t md = dnnl_primitive_desc_query_md(get(),
+                is_forward ? dnnl_query_dst_md : dnnl_query_diff_dst_md, 0);
+
+        // Number of entries to copy: rank minus two, or zero when no
+        // descriptor is available.
+        int n_entries = 0;
+        if (md != nullptr) {
+            int rank;
+            error::wrap_c_api(
+                    dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &rank),
+                    "could not query ndims from a memory descriptor");
+            n_entries = rank - 2;
+        }
+
+        dnnl_dims_t *queried_dims;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl::convert_to_c(what), 0, &queried_dims);
+        // An unsuccessful query yields an empty dims vector.
+        if (status != dnnl_success) return memory::dims {};
+        return memory::dims(*queried_dims, *queried_dims + n_entries);
+    }
+
+    /// Returns an #dnnl::engine value.
+    /// @param what The value to query.
+    /// @returns The result of the query.
+    /// @returns A weak handle to the engine that the primitive descriptor was
+    ///     created with.
+    engine query_engine(query what) const {
+        dnnl_engine_t queried_engine;
+        const dnnl_status_t status = dnnl_primitive_desc_query(
+                get(), dnnl::convert_to_c(what), 0, &queried_engine);
+        error::wrap_c_api(
+                status, "could not get an engine from a primitive_desc");
+        // Wrap the C handle; `true` is the second constructor argument.
+        return engine(queried_engine, true);
+    }
+
+    /// Resets the value of the handle to a clone of a C API primitive
+    /// descriptor.
+    /// @param pd A C API primitive descriptor to clone.
+    void reset_with_clone(const_dnnl_primitive_desc_t pd) {
+        // Clone first; the handle is only reset after the status was checked.
+        dnnl_primitive_desc_t cloned_pd;
+        const auto status = dnnl_primitive_desc_clone(&cloned_pd, pd);
+        error::wrap_c_api(status, "could not clone a primitive descriptor");
+        reset(cloned_pd);
+    }
+
+    /// Constructs a primitive descriptor base object from a clone of a C API
+    /// primitive descriptor after verifying that it is what the caller
+    /// expects.
+    ///
+    /// @note
+    ///     The @p prim_kind should map to a primitive that does not have
+    ///     different values of propagation kind (e.g. #dnnl::binary).
+    /// @note
+    ///     Primitive descriptor base constructed this way does not support
+    ///     next_impl() (will throw).
+    ///
+    /// @param pd C API primitive descriptor to clone.
+    /// @param prim_kind Expected primitive kind.
+    primitive_desc_base(
+            dnnl_primitive_desc_t pd, dnnl::primitive::kind prim_kind)
+        // Delegates to the three-argument constructor with
+        // dnnl::prop_kind::undef as the expected propagation kind.
+        : primitive_desc_base(pd, prim_kind, dnnl::prop_kind::undef) {}
+
+    /// Constructs a primitive descriptor base object from a clone of a C API
+    /// primitive descriptor after verifying that it is what the caller
+    /// expects.
+    ///
+    /// @note
+    ///     Primitive descriptor base constructed this way does not support
+    ///     next_impl() (will throw).
+    ///
+    /// @param pd C API primitive descriptor to clone.
+    /// @param prim_kind Expected primitive kind.
+    /// @param aprop_kind Expected propagation kind.
+    primitive_desc_base(dnnl_primitive_desc_t pd,
+            dnnl::primitive::kind prim_kind, dnnl::prop_kind aprop_kind)
+        // Delegates to the four-argument constructor, passing aprop_kind as
+        // both accepted propagation kinds.
+        : primitive_desc_base(pd, prim_kind, aprop_kind, aprop_kind) {}
+
+    /// Constructs a primitive descriptor base object from a clone of a C API
+    /// primitive descriptor after verifying that it is what the caller
+    /// expects.
+    ///
+    /// @note
+    ///     Primitive descriptor base constructed this way does not support
+    ///     next_impl() (will throw).
+    ///
+    /// @param pd C API primitive descriptor to clone.
+    /// @param prim_kind Expected primitive kind.
+    /// @param prop_kind1 Expected propagation kind (option 1).
+    /// @param prop_kind2 Expected propagation kind (option 2). This value is
+    ///     checked if the check with @p prop_kind1 fails.
+    primitive_desc_base(dnnl_primitive_desc_t pd,
+            dnnl::primitive::kind prim_kind, dnnl::prop_kind prop_kind1,
+            dnnl::prop_kind prop_kind2) {
+        // It is OK to pass an empty primitive descriptor
+        if (pd == nullptr) return;
+
+        // C counterparts of the expected kinds.
+        const dnnl_primitive_kind_t expected_kind = convert_to_c(prim_kind);
+        const dnnl_prop_kind_t expected_prop1 = convert_to_c(prop_kind1);
+        const dnnl_prop_kind_t expected_prop2 = convert_to_c(prop_kind2);
+
+        // Step 1: the primitive kind must be queryable and must match.
+        dnnl_primitive_kind_t actual_kind;
+        const dnnl_status_t kind_status = dnnl_primitive_desc_query(
+                pd, dnnl_query_primitive_kind, 0, (void *)&actual_kind);
+        error::wrap_c_api(kind_status,
+                "could not get primitive kind from a primitive descriptor");
+        if (actual_kind != expected_kind)
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "primitive descriptor operation kind mismatch");
+
+        // Step 2: query the propagation kind. Only "success" and
+        // "unimplemented" are acceptable outcomes of the query itself.
+        dnnl_prop_kind_t actual_prop;
+        const dnnl_status_t prop_status = dnnl_primitive_desc_query(
+                pd, dnnl_query_prop_kind, 0, (void *)&actual_prop);
+        const bool prop_known = prop_status == dnnl_success;
+        const bool prop_unsupported = prop_status == dnnl_unimplemented;
+        if (!prop_known && !prop_unsupported)
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "could not get propagation kind from the primitive "
+                    "descriptor");
+
+        // Step 3: accept when the query is unimplemented and the caller
+        // expected an undefined kind, or when the queried kind equals either
+        // expected option. actual_prop is only read if the query succeeded.
+        const bool accepted
+                = (prop_unsupported && expected_prop1 == dnnl_prop_kind_undef)
+                || (prop_known
+                        && (actual_prop == expected_prop1
+                                || actual_prop == expected_prop2));
+        if (accepted) {
+            reset_with_clone(pd);
+            return;
+        }
+
+        // We could get the propagation kind but there is a mismatch
+        DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                "primitive descriptor propagation kind mismatch");
+    }
+
+    /// Returns a constant reference to a static instance of default constructed
+    /// primitive attributes
+    static const primitive_attr &default_attr() {
+        // Function-local static: every call returns a reference to the same
+        // default-constructed object.
+        static const primitive_attr shared_default_attr;
+        return shared_default_attr;
+    }
+
+    const_dnnl_memory_desc_t optional_arg(const memory::desc *md) {
+        // A null pointer stands for "argument not provided".
+        if (md == nullptr) return nullptr;
+        return md->get();
+    }
+
+    const dnnl_dim_t *optional_arg(const memory::dims *dims) {
+        return dims ? dims->data() : nullptr;
+    }
+
+    const float *optional_arg(const std::vector<float> *arg) {
+        return arg ? arg->data() : nullptr;
+    }
+
+    using base = primitive_desc_base;
+};
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_reorder Reorder
+///
+/// A primitive to copy data between two memory objects. This primitive is
+/// typically used to change the way the data is laid out in memory.
+///
+/// @sa @ref dev_guide_reorder in developer guide
+///
+/// @{
+
+/// Reorder primitive: copies data from a source memory object to a
+/// destination memory object.
+struct reorder : public primitive {
+    /// Primitive descriptor for a reorder primitive.
+    struct primitive_desc : public primitive_desc_base {
+        using primitive_desc_base::primitive_desc_base;
+
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a reorder primitive descriptor from explicitly given
+        /// engines and memory descriptors.
+        ///
+        /// @note
+        ///     When @p allow_empty is true, a failure to create the
+        ///     primitive descriptor is not reported through
+        ///     error::wrap_c_api; the object is left empty instead.
+        ///
+        /// @param src_engine Engine on which the source memory object will be
+        ///     located.
+        /// @param src_md Source memory descriptor.
+        /// @param dst_engine Engine on which the destination memory object
+        ///     will be located.
+        /// @param dst_md Destination memory descriptor.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const engine &src_engine, const memory::desc &src_md,
+                const engine &dst_engine, const memory::desc &dst_md,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_reorder_primitive_desc_create(&c_pd,
+                    src_md.get(), src_engine.get(), dst_md.get(),
+                    dst_engine.get(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the reorder primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Creates a reorder primitive descriptor from two memory objects.
+        ///
+        /// @param src Source memory object. Its memory descriptor and engine
+        ///     are used for the source side of the reorder.
+        /// @param dst Destination memory object. Its memory descriptor and
+        ///     engine are used for the destination side of the reorder.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const memory &src, const memory &dst,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            // Keep the descriptors alive in locals while their handles are
+            // handed to the C API.
+            const auto from_md = src.get_desc();
+            const auto to_md = dst.get_desc();
+
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_reorder_primitive_desc_create(&c_pd,
+                    from_md.get(), src.get_engine().get(), to_md.get(),
+                    dst.get_engine().get(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the reorder primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Wraps a C API primitive descriptor, which must be of the reorder
+        /// kind.
+        ///
+        /// @param pd C API primitive descriptor for reorder primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : primitive_desc_base(pd, dnnl::primitive::kind::reorder) {}
+
+        /// Queries the source-side engine of the reorder.
+        /// @returns The engine on which the source memory is allocated.
+        engine get_src_engine() const {
+            return query_engine(dnnl::query::reorder_src_engine);
+        }
+
+        /// Queries the destination-side engine of the reorder.
+        /// @returns The engine on which the destination memory is allocated.
+        engine get_dst_engine() const {
+            return query_engine(dnnl::query::reorder_dst_engine);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    reorder() = default;
+
+    /// Creates a reorder primitive from its primitive descriptor.
+    /// @param pd Primitive descriptor for reorder primitive.
+    reorder(const primitive_desc &pd) : primitive(pd.get()) {}
+
+    /// Creates a reorder primitive from its primitive descriptor and a cache
+    /// blob.
+    /// @param pd Primitive descriptor for reorder primitive.
+    /// @param cache_blob Cache blob.
+    reorder(const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd.get(), cache_blob) {}
+
+    /// Creates a reorder primitive directly from two memory objects by
+    /// building a temporary primitive descriptor from @p src, @p dst and
+    /// @p attr.
+    ///
+    /// @param src Source memory object.
+    /// @param dst Destination memory object.
+    /// @param attr Primitive attributes to use (optional).
+    reorder(const memory &src, const memory &dst,
+            const primitive_attr &attr = primitive_attr())
+        : primitive(primitive_desc(src, dst, attr).get()) {}
+
+    using primitive::execute;
+
+    /// Runs the reorder, passing @p src as the DNNL_ARG_FROM argument and
+    /// @p dst as the DNNL_ARG_TO argument.
+    ///
+    /// @param astream Stream object. The stream must belong to the same engine
+    ///     as the primitive.
+    /// @param src Source memory object.
+    /// @param dst Destination memory object.
+    void execute(const stream &astream, memory &src, memory &dst) const {
+        primitive::execute(
+                astream, {{DNNL_ARG_FROM, src}, {DNNL_ARG_TO, dst}});
+    }
+};
+
+/// @} dnnl_api_reorder
+
+/// @addtogroup dnnl_api_concat Concat
+///
+/// A primitive to concatenate data by arbitrary dimension.
+///
+/// @sa @ref dev_guide_concat in developer guide
+///
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Collects the C API handle of every memory descriptor in @p mds into a new
+// vector, preserving the order of the input.
+inline std::vector<const_dnnl_memory_desc_t> convert_to_c(
+        const std::vector<memory::desc> &mds) {
+    std::vector<const_dnnl_memory_desc_t> handles;
+    handles.reserve(mds.size());
+    for (auto it = mds.begin(); it != mds.end(); ++it)
+        handles.push_back(it->get());
+    return handles;
+}
+/// @endcond
+
+/// Tensor concatenation (concat) primitive.
+struct concat : public primitive {
+    /// Primitive descriptor for a concat primitive.
+    struct primitive_desc : public primitive_desc_base {
+        using primitive_desc_base::primitive_desc_base;
+
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for an out-of-place concatenation
+        /// primitive with an explicitly given destination descriptor.
+        ///
+        /// @param aengine Engine to perform the operation on.
+        /// @param dst Destination memory descriptor.
+        /// @param concat_dimension Index of the dimension over which the
+        ///     source tensors are concatenated. Note that order of
+        ///     dimensions does not depend on memory format.
+        /// @param srcs Vector of source memory descriptors.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &dst,
+                int concat_dimension, const std::vector<memory::desc> &srcs,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            const auto src_handles = convert_to_c(srcs);
+            const int n_srcs = static_cast<int>(src_handles.size());
+
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_concat_primitive_desc_create(&c_pd,
+                    aengine.get(), dst.get(), n_srcs, concat_dimension,
+                    src_handles.data(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the concat primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Creates a primitive descriptor for an out-of-place concatenation
+        /// primitive without an explicit destination descriptor.
+        ///
+        /// No destination memory descriptor is passed to the C API (a null
+        /// pointer is given), so the destination descriptor is derived
+        /// automatically.
+        ///
+        /// @param aengine Engine to perform the operation on.
+        /// @param concat_dimension Index of the dimension over which the
+        ///     source tensors are concatenated. Note that order of
+        ///     dimensions does not depend on memory format.
+        /// @param srcs Vector of source memory descriptors.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const engine &aengine, int concat_dimension,
+                const std::vector<memory::desc> &srcs,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            const auto src_handles = convert_to_c(srcs);
+            const int n_srcs = static_cast<int>(src_handles.size());
+
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_concat_primitive_desc_create(&c_pd,
+                    aengine.get(), nullptr, n_srcs, concat_dimension,
+                    src_handles.data(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the concat primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Wraps a C API primitive descriptor, which must be of the concat
+        /// kind.
+        ///
+        /// @param pd C API primitive descriptor for concat primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : primitive_desc_base(pd, dnnl::primitive::kind::concat) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc(int)const
+        memory::desc src_desc(int idx = 0) const { return base::src_desc(idx); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    concat() = default;
+
+    /// Creates a concatenation primitive from its primitive descriptor.
+    /// @param pd Primitive descriptor for concatenation primitive.
+    concat(const primitive_desc &pd) : primitive(pd.get()) {}
+
+    /// Creates a concatenation primitive from its primitive descriptor and a
+    /// cache blob.
+    /// @param pd Primitive descriptor for concatenation primitive.
+    /// @param cache_blob Cache blob.
+    concat(const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd.get(), cache_blob) {}
+};
+
+/// @} dnnl_api_concat
+
+/// @addtogroup dnnl_api_sum Sum
+///
+/// A primitive to sum multiple tensors.
+///
+/// @sa @ref dev_guide_sum in developer guide
+///
+/// @{
+
+/// Out-of-place summation (sum) primitive.
+struct sum : public primitive {
+    /// Primitive descriptor for a sum primitive.
+    struct primitive_desc : public primitive_desc_base {
+        using primitive_desc_base::primitive_desc_base;
+
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a sum primitive with an
+        /// explicitly given destination descriptor.
+        ///
+        /// The size of @p scales is validated against the number of entries
+        /// in @p srcs before the C API is called.
+        ///
+        /// @param aengine Engine to perform the operation on.
+        /// @param dst Destination memory descriptor.
+        /// @param scales Vector of scales to multiply data in each source
+        ///     memory by.
+        /// @param srcs Vector of source memory descriptors.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &dst,
+                const std::vector<float> &scales,
+                const std::vector<memory::desc> &srcs,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            // The same bound is used as both the minimum and the maximum
+            // size, i.e. an exact match is required.
+            const int expected_scales = static_cast<int>(srcs.size());
+            validate_container_size(scales,
+                    "counts of scales and sources are not equal",
+                    expected_scales, expected_scales);
+
+            const auto src_handles = convert_to_c(srcs);
+
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_sum_primitive_desc_create(&c_pd,
+                    aengine.get(), dst.get(),
+                    static_cast<int>(src_handles.size()), scales.data(),
+                    src_handles.data(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the sum primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Creates a primitive descriptor for a sum primitive without an
+        /// explicit destination descriptor.
+        ///
+        /// No destination memory descriptor is passed to the C API (a null
+        /// pointer is given), so the destination descriptor is derived
+        /// automatically. The size of @p scales is validated against the
+        /// number of entries in @p srcs before the C API is called.
+        ///
+        /// @param aengine Engine on which to perform the operation.
+        /// @param scales Vector of scales by which to multiply data in each
+        ///     source memory object.
+        /// @param srcs Vector of source memory descriptors.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     the shared default attributes.
+        /// @param allow_empty Whether construction may fail silently and
+        ///     yield an empty object. Optional; defaults to false.
+        primitive_desc(const engine &aengine, const std::vector<float> &scales,
+                const std::vector<memory::desc> &srcs,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            // The same bound is used as both the minimum and the maximum
+            // size, i.e. an exact match is required.
+            const int expected_scales = static_cast<int>(srcs.size());
+            validate_container_size(scales,
+                    "counts of scales and sources are not equal",
+                    expected_scales, expected_scales);
+
+            const auto src_handles = convert_to_c(srcs);
+
+            dnnl_primitive_desc_t c_pd;
+            const dnnl_status_t rc = dnnl_sum_primitive_desc_create(&c_pd,
+                    aengine.get(), nullptr,
+                    static_cast<int>(src_handles.size()), scales.data(),
+                    src_handles.data(), attr.get());
+            if (rc == dnnl_success) {
+                reset(c_pd);
+                return;
+            }
+            // Creation failed: report it unless the caller asked for an
+            // empty descriptor instead.
+            if (!allow_empty)
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the sum primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(dnnl_primitive_desc_t());
+        }
+
+        /// Wraps a C API primitive descriptor, which must be of the sum
+        /// kind.
+        ///
+        /// @param pd C API primitive descriptor for sum primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : primitive_desc_base(pd, dnnl::primitive::kind::sum) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc(int)const
+        memory::desc src_desc(int idx = 0) const { return base::src_desc(idx); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    sum() = default;
+
+    /// Creates a sum primitive from its primitive descriptor.
+    /// @param pd Primitive descriptor for sum primitive.
+    sum(const primitive_desc &pd) : primitive(pd.get()) {}
+
+    /// Creates a sum primitive from its primitive descriptor and a cache
+    /// blob.
+    /// @param pd Primitive descriptor for sum primitive.
+    /// @param cache_blob Cache blob.
+    sum(const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd.get(), cache_blob) {}
+};
+
+/// @} dnnl_api_sum
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// A base class for descriptors of all primitives that support iteration
+///     over multiple implementations.
+struct primitive_desc : public primitive_desc_base {
+    using primitive_desc_base::primitive_desc_base;
+
+    /// Default constructor. Produces an empty object.
+    primitive_desc() = default;
+
+    /// Changes the primitive descriptor to point to the next available
+    /// implementation.
+    ///
+    /// Any status other than success or #dnnl_last_impl_reached returned by
+    /// the C API is reported through error::wrap_c_api.
+    ///
+    /// @returns @c true on success and @c false if the last available
+    /// implementation has already been reached. In the latter case, the
+    /// primitive descriptor itself is kept unchanged.
+    bool next_impl() {
+        const dnnl_status_t status = dnnl_primitive_desc_next_impl(get());
+        // Running out of implementations is an expected outcome and is
+        // signalled through the return value, not as an error.
+        if (status == dnnl_last_impl_reached) return false;
+        // Past this point a non-success status is a genuine failure, so the
+        // message must not claim that the last implementation was reached.
+        error::wrap_c_api(status,
+                "could not advance a primitive descriptor to the next "
+                "implementation");
+        return true;
+    }
+};
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_convolution Convolution
+///
+/// A primitive to perform 1D, 2D or 3D convolution. Supported variants are
+/// forward propagation, backward propagation, and weights gradient with or
+/// without bias.
+///
+/// @sa @ref dev_guide_convolution in developer guide
+///
+/// @{
+
+/// Convolution forward propagation primitive.
+struct convolution_forward : public primitive {
+    /// Primitive descriptor for a convolution forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a convolution forward
+        ///     propagation primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param bias_desc Bias memory descriptor. Passing zero memory
+        ///     descriptor disables the bias term.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, &bias_desc, dst_desc, strides, nullptr,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution forward
+        ///     propagation primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &dst_desc,
+                const memory::dims &strides, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, nullptr, dst_desc, strides, nullptr,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution forward
+        ///     propagation primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param bias_desc Bias memory descriptor. Passing zero memory
+        ///     descriptor disables the bias term.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, &bias_desc, dst_desc, strides, &dilates,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution forward
+        ///     propagation primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &dst_desc,
+                const memory::dims &strides, const memory::dims &dilates,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, nullptr, dst_desc, strides, &dilates,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a convolution forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// Returns the bias memory descriptor.
+        /// @returns The bias memory descriptor.
+        /// @returns A zero memory descriptor if the primitive does not have a
+        ///     bias parameter.
+        memory::desc bias_desc() const { return base::weights_desc(1); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+
+    private:
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc *bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r, const primitive_attr &attr,
+                bool allow_empty) {
+            memory::validate_dims(strides, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_l, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_r, src_desc.get_ndims() - 2);
+            // dilates is optional: a null pointer skips the check.
+            if (dilates)
+                memory::validate_dims(*dilates, src_desc.get_ndims() - 2);
+            // data() stays valid for an empty vector; &v[0] would be undefined.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_convolution_forward_primitive_desc_create(&pd,
+                            aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            convert_to_c(aalgorithm), src_desc.get(),
+                            weights_desc.get(), optional_arg(bias_desc),
+                            dst_desc.get(), strides.data(),
+                            optional_arg(dilates), padding_l.data(),
+                            padding_r.data(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the convolution forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    convolution_forward() = default;
+
+    /// Constructs a convolution forward propagation primitive.
+    /// @param pd Primitive descriptor for a convolution forward propagation
+    ///     primitive.
+    convolution_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a convolution forward propagation primitive from a cache
+    ///     blob.
+    /// @param pd Primitive descriptor for a convolution forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    convolution_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Convolution backward propagation primitive.
+struct convolution_backward_data : public primitive {
+    /// Primitive descriptor for a convolution backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a convolution backward
+        ///     propagation primitive.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc,
+                    diff_dst_desc, strides, nullptr, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution backward
+        ///     propagation primitive.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc,
+                    diff_dst_desc, strides, &dilates, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a convolution backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+
+    private:
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+
+            memory::validate_dims(strides, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_l, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_r, diff_src_desc.get_ndims() - 2);
+            // dilates is null for the overload that takes no dilation vector.
+            if (dilates)
+                memory::validate_dims(*dilates, diff_src_desc.get_ndims() - 2);
+            // data() stays valid for an empty vector; &v[0] would be undefined.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_convolution_backward_data_primitive_desc_create(&pd,
+                            aengine.get(), convert_to_c(aalgorithm),
+                            diff_src_desc.get(), weights_desc.get(),
+                            diff_dst_desc.get(), strides.data(),
+                            optional_arg(dilates), padding_l.data(),
+                            padding_r.data(), hint_fwd_pd.get(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the convolution backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    convolution_backward_data() = default;
+
+    /// Constructs a convolution backward propagation primitive.
+    /// @param pd Primitive descriptor for a convolution backward propagation
+    ///     primitive.
+    convolution_backward_data(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a convolution backward propagation primitive from a cache
+    ///     blob.
+    /// @param pd Primitive descriptor for a convolution backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    convolution_backward_data(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Convolution weights gradient primitive.
+struct convolution_backward_weights : public primitive {
+    /// Primitive descriptor for a convolution weights gradient primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a convolution weights gradient
+        ///     primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_bias_desc Diff bias memory descriptor. Passing zero
+        ///     memory descriptor disables the bias term.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    &diff_bias_desc, diff_dst_desc, strides, nullptr, padding_l,
+                    padding_r, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution weights gradient
+        ///     primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    nullptr, diff_dst_desc, strides, nullptr, padding_l,
+                    padding_r, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution weights
+        ///     gradient primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_bias_desc Diff bias memory descriptor. Passing zero
+        ///     memory descriptor disables the bias term.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    &diff_bias_desc, diff_dst_desc, strides, &dilates,
+                    padding_l, padding_r, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution weights
+        ///     gradient primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Convolution algorithm. Possible values are
+        ///     #dnnl::algorithm::convolution_direct,
+        ///     #dnnl::algorithm::convolution_winograd, and
+        ///     #dnnl::algorithm::convolution_auto.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a convolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    nullptr, diff_dst_desc, strides, &dilates, padding_l,
+                    padding_r, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a convolution weights gradient
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a convolution weights
+        ///     gradient primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::convolution,
+                    dnnl::prop_kind::backward_weights) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// Returns the diff bias memory descriptor.
+        /// @returns The diff bias memory descriptor.
+        /// @returns A zero memory descriptor if the primitive does not have a
+        ///          diff bias parameter.
+        memory::desc diff_bias_desc() const {
+            return base::diff_weights_desc(1);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+
+    private:
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc *diff_bias_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const convolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+            memory::validate_dims(strides, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_l, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_r, src_desc.get_ndims() - 2);
+            // dilates is null for the overloads that take no dilation vector.
+            if (dilates)
+                memory::validate_dims(*dilates, src_desc.get_ndims() - 2);
+            // data() stays valid for an empty vector; &v[0] would be undefined.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_convolution_backward_weights_primitive_desc_create(
+                            &pd, aengine.get(), convert_to_c(aalgorithm),
+                            src_desc.get(), diff_weights_desc.get(),
+                            optional_arg(diff_bias_desc), diff_dst_desc.get(),
+                            strides.data(), optional_arg(dilates),
+                            padding_l.data(), padding_r.data(),
+                            hint_fwd_pd.get(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the convolution weights update primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    convolution_backward_weights() = default;
+
+    /// Constructs a convolution weights gradient primitive.
+    /// @param pd Primitive descriptor for a convolution weights gradient
+    ///     primitive.
+    convolution_backward_weights(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a convolution weights gradient primitive from a cache blob.
+    /// @param pd Primitive descriptor for a convolution weights gradient
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    convolution_backward_weights(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_convolution
+//
+/// @addtogroup dnnl_api_deconvolution Deconvolution
+///
+/// A primitive to perform 1D, 2D or 3D deconvolution. Supported variants are
+/// forward propagation, backward propagation, and weights gradient with or
+/// without bias.
+///
+/// @{
+
+/// Deconvolution forward propagation primitive.
+struct deconvolution_forward : public primitive {
+    /// Primitive descriptor for a deconvolution forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a deconvolution forward
+        ///     propagation primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Deconvolution algorithm:
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param bias_desc Bias memory descriptor. Passing zero memory
+        ///     descriptor disables the bias term.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, &bias_desc, dst_desc, strides, nullptr,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution forward
+        ///     propagation primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Deconvolution algorithm:
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &dst_desc,
+                const memory::dims &strides, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, nullptr, dst_desc, strides, nullptr,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution forward
+        ///     propagation primitive with bias and dilation.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Deconvolution algorithm:
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param bias_desc Bias memory descriptor. Passing zero memory
+        ///     descriptor disables the bias term.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, &bias_desc, dst_desc, strides, &dilates,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution forward
+        ///     propagation primitive with dilation and without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Deconvolution algorithm:
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &dst_desc,
+                const memory::dims &strides, const memory::dims &dilates,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    weights_desc, nullptr, dst_desc, strides, &dilates,
+                    padding_l, padding_r, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a deconvolution forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::deconvolution,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const
+        memory::desc bias_desc() const { return base::weights_desc(1); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+
+    private:
+        /// Shared implementation that every public descriptor-building
+        /// constructor delegates to.
+        ///
+        /// Validates @p strides, @p padding_l, @p padding_r and, when
+        /// provided, @p dilates against the number of spatial dimensions
+        /// (`src_desc.get_ndims() - 2`), then creates the C API primitive
+        /// descriptor and takes it over via `reset()`.
+        ///
+        /// @param bias_desc Optional bias memory descriptor; `nullptr` is
+        ///     passed by the constructors without bias.
+        /// @param dilates Optional dilations; `nullptr` is passed by the
+        ///     constructors that do not take dilations.
+        /// @param allow_empty When false, the creation status is checked
+        ///     with error::wrap_c_api(); when true, the status is not
+        ///     checked.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc *bias_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r, const primitive_attr &attr,
+                bool allow_empty) {
+
+            // Number of spatial dimensions expected in every per-dimension
+            // array.
+            const auto spatial_ndims = src_desc.get_ndims() - 2;
+
+            memory::validate_dims(strides, spatial_ndims);
+            memory::validate_dims(padding_l, spatial_ndims);
+            memory::validate_dims(padding_r, spatial_ndims);
+
+            if (dilates) memory::validate_dims(*dilates, spatial_ndims);
+
+            // data() is used rather than &v[0]: indexing element 0 of an
+            // empty vector is undefined behavior, and an empty vector may
+            // pass validation when the descriptor has no spatial dimensions.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_deconvolution_forward_primitive_desc_create(&pd,
+                            aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            convert_to_c(aalgorithm), src_desc.get(),
+                            weights_desc.get(), optional_arg(bias_desc),
+                            dst_desc.get(), strides.data(),
+                            optional_arg(dilates), padding_l.data(),
+                            padding_r.data(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the deconvolution forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    deconvolution_forward() = default;
+
+    /// Constructs a deconvolution forward propagation primitive.
+    /// @param pd Primitive descriptor for a deconvolution forward propagation
+    ///     primitive.
+    deconvolution_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a deconvolution forward propagation primitive from a cache
+    ///     blob.
+    /// @param pd Primitive descriptor for a deconvolution forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    deconvolution_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Deconvolution backward propagation primitive.
+struct deconvolution_backward_data : public primitive {
+    /// Primitive descriptor for a deconvolution backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a deconvolution backward
+        ///     propagation primitive.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm
+        ///     (#dnnl::algorithm::deconvolution_direct,
+        ///     #dnnl::algorithm::deconvolution_winograd).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc,
+                    diff_dst_desc, strides, nullptr, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution backward
+        ///     propagation primitive with dilation.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm
+        ///     (#dnnl::algorithm::deconvolution_direct,
+        ///     #dnnl::algorithm::deconvolution_winograd).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param weights_desc Weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, weights_desc,
+                    diff_dst_desc, strides, &dilates, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a deconvolution backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::deconvolution,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+
+    private:
+        /// Shared implementation that both public descriptor-building
+        /// constructors delegate to.
+        ///
+        /// Validates @p strides, @p padding_l, @p padding_r and, when
+        /// provided, @p dilates against the number of spatial dimensions
+        /// (`diff_src_desc.get_ndims() - 2`), then creates the C API
+        /// primitive descriptor and takes it over via `reset()`.
+        ///
+        /// @param dilates Optional dilations; `nullptr` is passed by the
+        ///     constructor that does not take dilations.
+        /// @param allow_empty When false, the creation status is checked
+        ///     with error::wrap_c_api(); when true, the status is not
+        ///     checked.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+
+            // Number of spatial dimensions expected in every per-dimension
+            // array.
+            const auto spatial_ndims = diff_src_desc.get_ndims() - 2;
+
+            memory::validate_dims(strides, spatial_ndims);
+            memory::validate_dims(padding_l, spatial_ndims);
+            memory::validate_dims(padding_r, spatial_ndims);
+
+            if (dilates) memory::validate_dims(*dilates, spatial_ndims);
+
+            // data() is used rather than &v[0]: indexing element 0 of an
+            // empty vector is undefined behavior, and an empty vector may
+            // pass validation when the descriptor has no spatial dimensions.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_deconvolution_backward_data_primitive_desc_create(
+                            &pd, aengine.get(), convert_to_c(aalgorithm),
+                            diff_src_desc.get(), weights_desc.get(),
+                            diff_dst_desc.get(), strides.data(),
+                            optional_arg(dilates), padding_l.data(),
+                            padding_r.data(), hint_fwd_pd.get(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the deconvolution backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    deconvolution_backward_data() = default;
+
+    /// Constructs a deconvolution backward propagation primitive.
+    /// @param pd Primitive descriptor for a deconvolution backward propagation
+    ///     primitive.
+    deconvolution_backward_data(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a deconvolution backward propagation primitive from a cache
+    ///     blob.
+    /// @param pd Primitive descriptor for a deconvolution backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    deconvolution_backward_data(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Deconvolution weights gradient primitive.
+struct deconvolution_backward_weights : public primitive {
+    /// Primitive descriptor for a deconvolution weights gradient primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a deconvolution weights
+        ///     gradient primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm. Possible values are
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_bias_desc Diff bias memory descriptor. Passing zero
+        ///     memory descriptor disables the bias term.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    &diff_bias_desc, diff_dst_desc, strides, nullptr, padding_l,
+                    padding_r, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution weights
+        ///     gradient primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p padding_l, and @p padding_r contain values
+        /// for spatial dimensions only and hence must have the same number of
+        /// elements as there are spatial dimensions. The order of values is
+        /// the same as in the tensor: depth (for 3D tensors), height (for 3D
+        /// and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm. Possible values are
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::dims &strides,
+                const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            // Forward to the private pointer-based constructor with both
+            // optional inputs (bias descriptor and dilations) left out.
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    /* diff_bias_desc = */ nullptr, diff_dst_desc, strides,
+                    /* dilates = */ nullptr, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution weights
+        ///     gradient primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm. Possible values are
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_bias_desc Diff bias memory descriptor. Passing zero
+        ///     memory descriptor disables the bias term.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::dims &strides,
+                const memory::dims &dilates,
+                const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            // Forward to the private pointer-based constructor with both
+            // optional inputs (bias descriptor and dilations) supplied.
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    /* diff_bias_desc = */ &diff_bias_desc, diff_dst_desc,
+                    strides, /* dilates = */ &dilates, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution weights
+        ///     gradient primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// Arrays @p strides, @p dilates, @p padding_l, and @p padding_r
+        /// contain values for spatial dimensions only and hence must have the
+        /// same number of elements as there are spatial dimensions. The order
+        /// of values is the same as in the tensor: depth (for 3D tensors),
+        /// height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Deconvolution algorithm. Possible values are
+        ///     #dnnl::algorithm::deconvolution_direct, and
+        ///     #dnnl::algorithm::deconvolution_winograd.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_weights_desc Diff weights memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Strides for each spatial dimension.
+        /// @param dilates Dilations for each spatial dimension. A zero value
+        ///     means no dilation in the corresponding dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a deconvolution
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::dims &strides,
+                const memory::dims &dilates,
+                const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            // Forward to the private pointer-based constructor: the
+            // dilations are supplied, the bias descriptor is left out.
+            : primitive_desc(aengine, aalgorithm, src_desc, diff_weights_desc,
+                    /* diff_bias_desc = */ nullptr, diff_dst_desc, strides,
+                    /* dilates = */ &dilates, padding_l, padding_r,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a deconvolution weights
+        /// gradient primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a deconvolution weights
+        ///     gradient primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            // The base class receives the primitive kind and propagation
+            // kind that this wrapper stands for, alongside the C handle.
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::deconvolution,
+                    dnnl::prop_kind::backward_weights) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            // Delegate to the indexed base-class query, using index 0.
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            // Entry 0 of the diff-weights query; entry 1 is what
+            // diff_bias_desc() returns.
+            const int weights_idx = 0;
+            return base::diff_weights_desc(weights_idx);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const {
+            // Delegate to the indexed base-class query, using index 0.
+            return base::diff_dst_desc(0);
+        }
+
+        /// @copydoc dnnl::convolution_backward_weights::primitive_desc::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            // The bias gradient is read through the diff-weights query at
+            // index 1 (index 0 is returned by diff_weights_desc()).
+            const int bias_idx = 1;
+            return base::diff_weights_desc(bias_idx);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const {
+            // Re-exported from the base class without modification.
+            return base::get_algorithm();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const {
+            // Re-exported from the base class without modification.
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const {
+            // Re-exported from the base class without modification.
+            return base::get_strides();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const {
+            // Re-exported from the base class without modification.
+            return base::get_dilations();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const {
+            // Re-exported from the base class without modification.
+            return base::get_padding_l();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const {
+            // Re-exported from the base class without modification.
+            return base::get_padding_r();
+        }
+
+    private:
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc *diff_bias_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims *dilates, const memory::dims &padding_l,
+                const memory::dims &padding_r,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+
+            // Shared implementation behind the public constructors.
+            // diff_bias_desc and dilates are the optional inputs: the public
+            // overloads pass nullptr for whichever one they do not take.
+
+            // Every per-dimension array is validated against the number of
+            // dimensions of the source descriptor minus two.
+            const auto spatial_ndims = src_desc.get_ndims() - 2;
+            memory::validate_dims(strides, spatial_ndims);
+            memory::validate_dims(padding_l, spatial_ndims);
+            memory::validate_dims(padding_r, spatial_ndims);
+
+            if (dilates) memory::validate_dims(*dilates, spatial_ndims);
+
+            // data() is used instead of &v[0] because indexing element 0 of
+            // an empty vector is undefined behaviour, whereas data() is
+            // always well-defined.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_deconvolution_backward_weights_primitive_desc_create(
+                            &pd, aengine.get(), convert_to_c(aalgorithm),
+                            src_desc.get(), diff_weights_desc.get(),
+                            optional_arg(diff_bias_desc), diff_dst_desc.get(),
+                            strides.data(), optional_arg(dilates),
+                            padding_l.data(), padding_r.data(),
+                            hint_fwd_pd.get(), attr.get());
+
+            // With allow_empty set the status is deliberately not checked;
+            // whatever handle the C call left in pd is stored as is.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the deconvolution weights update primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    deconvolution_backward_weights() = default; // defaulted by the compiler
+
+    /// Constructs a deconvolution weights gradient primitive.
+    /// @param pd Primitive descriptor for a deconvolution weights gradient
+    ///     primitive.
+    deconvolution_backward_weights(const primitive_desc &pd)
+        // All construction work is done by the primitive base class.
+        : primitive(pd) {}
+
+    /// Constructs a deconvolution weights gradient primitive from a cache
+    ///     blob.
+    /// @param pd Primitive descriptor for a deconvolution weights gradient
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    deconvolution_backward_weights(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        // Hand both the descriptor and the blob to the primitive base class.
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_deconvolution
+
+/// @addtogroup dnnl_api_lrn LRN
+///
+/// A primitive to perform local response normalization (LRN) across or within
+/// channels.
+///
+/// @sa @ref dev_guide_lrn in developer guide
+///
+/// @{
+
+/// Local response normalization (LRN) forward propagation primitive.
+struct lrn_forward : public primitive {
+    /// Descriptor of an LRN forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Creates an empty primitive descriptor.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for an LRN forward propagation
+        ///     primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind; either
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm LRN algorithm kind; either
+        ///     #dnnl::algorithm::lrn_across_channels or
+        ///     #dnnl::algorithm::lrn_within_channel.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param local_size Regularization local size.
+        /// @param alpha The alpha regularization parameter.
+        /// @param beta The beta regularization parameter.
+        /// @param k The k regularization parameter.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty When true, a failed construction does not
+        ///     throw and an empty object is produced instead. Optional;
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, memory::dim local_size,
+                float alpha, float beta, float k,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc = dnnl_lrn_forward_primitive_desc_create(
+                    &c_pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                    convert_to_c(aalgorithm), src_desc.get(), dst_desc.get(),
+                    local_size, alpha, beta, k, attr.get());
+
+            // The status is only checked when the caller did not opt in to
+            // receiving an empty descriptor.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the lrn forward propagation primitive. Run workload "
+                        "with environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Wraps a C API primitive descriptor, which must have a matching
+        /// kind, into a primitive descriptor for an LRN forward propagation
+        /// primitive.
+        ///
+        /// @param pd C API primitive descriptor for an LRN forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::lrn,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const {
+            return base::get_algorithm();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const {
+            return base::get_alpha();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const {
+            return base::get_beta();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_local_size()const
+        memory::dim get_local_size() const {
+            return base::get_local_size();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_k()const
+        float get_k() const {
+            return base::get_k();
+        }
+    };
+
+    /// Creates an empty LRN forward propagation primitive.
+    lrn_forward() = default;
+
+    /// Creates an LRN forward propagation primitive.
+    /// @param pd Primitive descriptor for an LRN forward propagation
+    ///     primitive.
+    lrn_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates an LRN forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LRN forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lrn_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Local response normalization (LRN) backward propagation primitive.
+struct lrn_backward : public primitive {
+    /// Primitive descriptor for an LRN backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LRN backward propagation
+        ///     primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm LRN algorithm kind: either
+        ///     #dnnl::algorithm::lrn_across_channels, or
+        ///     #dnnl::algorithm::lrn_within_channel.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param local_size Regularization local size.
+        /// @param alpha The alpha regularization parameter.
+        /// @param beta The beta regularization parameter.
+        /// @param k The k regularization parameter.
+        /// @param hint_fwd_pd Primitive descriptor for an LRN forward
+        ///     propagation primitive. It is used as a hint for deciding which
+        ///     memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                memory::dim local_size, float alpha, float beta, float k,
+                const lrn_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_lrn_backward_primitive_desc_create(&pd,
+                    aengine.get(), convert_to_c(aalgorithm),
+                    diff_src_desc.get(), diff_dst_desc.get(), src_desc.get(),
+                    local_size, alpha, beta, k, hint_fwd_pd.get(), attr.get());
+
+            // The status is only checked when the caller did not opt in to
+            // receiving an empty descriptor.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the lrn backward propagation primitive. Run workload "
+                        "with environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for an LRN backward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LRN backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::lrn,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const { return base::get_alpha(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const { return base::get_beta(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_local_size()const
+        memory::dim get_local_size() const { return base::get_local_size(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_k()const
+        float get_k() const { return base::get_k(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lrn_backward() = default;
+
+    /// Constructs an LRN backward propagation primitive.
+    /// @param pd Primitive descriptor for an LRN backward propagation
+    ///     primitive.
+    lrn_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LRN backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LRN backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lrn_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_lrn
+
+/// @addtogroup dnnl_api_eltwise Eltwise
+///
+/// A primitive to perform elementwise operations such as the
+/// rectifier linear unit (ReLU).
+///
+/// Both forward and backward propagation primitives support in-place
+/// operation; that is, src and dst can refer to the same memory for forward
+/// propagation, and diff_dst and diff_src can refer to the same memory for
+/// backward propagation.
+///
+/// @warning
+///     Because the original source data is required for backward propagation,
+///     in-place forward propagation is not generally supported in the
+///     training mode. However, for algorithms supporting destination as input
+///     memory, dst can be used for the backward propagation, which makes it
+///     possible to get performance benefit even in the training mode.
+///
+/// @sa @ref dev_guide_eltwise in developer guide
+///
+/// @{
+
+/// Elementwise unary operation forward propagation primitive.
+struct eltwise_forward : public primitive {
+    /// Descriptor of an elementwise forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Creates an empty primitive descriptor.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for an elementwise forward
+        ///     propagation primitive without alpha or beta parameters
+        ///     (both are passed to the C API as 0).
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind; either
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty When true, a failed construction does not
+        ///     throw and an empty object is produced instead. Optional;
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    dst_desc, /* alpha = */ nullptr, /* beta = */ nullptr,
+                    attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for an elementwise forward
+        ///     propagation primitive with an alpha parameter (beta is
+        ///     passed to the C API as 0).
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind; either
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param alpha The alpha parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty When true, a failed construction does not
+        ///     throw and an empty object is produced instead. Optional;
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, float alpha,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    dst_desc, /* alpha = */ &alpha, /* beta = */ nullptr,
+                    attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for an elementwise forward
+        ///     propagation primitive with alpha and beta parameters.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind; either
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param alpha The alpha parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param beta The beta parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty When true, a failed construction does not
+        ///     throw and an empty object is produced instead. Optional;
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, float alpha, float beta,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, aalgorithm, src_desc,
+                    dst_desc, /* alpha = */ &alpha, /* beta = */ &beta,
+                    attr, allow_empty) {}
+
+        /// Wraps a C API primitive descriptor, which must have a matching
+        /// kind, into a primitive descriptor for an eltwise forward
+        /// propagation primitive.
+        ///
+        /// @param pd C API primitive descriptor for an eltwise forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::eltwise,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        dnnl::algorithm get_algorithm() const {
+            return base::get_algorithm();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const {
+            return base::get_alpha();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const {
+            return base::get_beta();
+        }
+
+    private:
+        // Shared implementation behind the public constructors. A null
+        // alpha or beta pointer stands for "not provided" and is replaced
+        // by 0 before the C API is called.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, const float *alpha,
+                const float *beta, const primitive_attr &attr,
+                bool allow_empty) {
+
+            const float alpha_value = alpha ? *alpha : 0.0f;
+            const float beta_value = beta ? *beta : 0.0f;
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc = dnnl_eltwise_forward_primitive_desc_create(
+                    &c_pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                    dnnl::convert_to_c(aalgorithm), src_desc.get(),
+                    dst_desc.get(), alpha_value, beta_value, attr.get());
+
+            // The status is only checked when the caller did not opt in to
+            // receiving an empty descriptor.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the eltwise forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+    };
+
+    /// Creates an empty eltwise forward propagation primitive.
+    eltwise_forward() = default;
+
+    /// Creates an eltwise forward propagation primitive.
+    /// @param pd Primitive descriptor for an eltwise forward propagation
+    ///     primitive.
+    eltwise_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates an eltwise forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an eltwise forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    eltwise_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Elementwise unary operation backward propagation primitive.
+struct eltwise_backward : public primitive {
+    /// Primitive descriptor for eltwise backward propagation.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an elementwise backward
+        ///     propagation primitive without alpha and beta parameters.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param data_desc Destination memory descriptor if one of the
+        ///     "use_dst_for_bwd" algorithms is used (such as
+        ///     #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor
+        ///     otherwise.
+        /// @param hint_fwd_pd Primitive descriptor for an elementwise
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::desc &data_desc,
+                const eltwise_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc,
+                    data_desc, nullptr, nullptr, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for an elementwise backward
+        ///     propagation primitive with an alpha parameter.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param data_desc Destination memory descriptor if one of the
+        ///     "use_dst_for_bwd" algorithms is used (such as
+        ///     #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor
+        ///     otherwise.
+        /// @param alpha The alpha parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param hint_fwd_pd Primitive descriptor for an elementwise
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::desc &data_desc, float alpha,
+                const eltwise_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc,
+                    data_desc, &alpha, nullptr, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for an elementwise backward
+        ///     propagation primitive with alpha and beta parameters.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Elementwise algorithm kind.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param data_desc Destination memory descriptor if one of the
+        ///     "use_dst_for_bwd" algorithms is used (such as
+        ///     #dnnl_eltwise_relu_use_dst_for_bwd), source memory descriptor
+        ///     otherwise.
+        /// @param alpha The alpha parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param beta The beta parameter for the elementwise operation.
+        ///     Specific meaning depends on the algorithm.
+        /// @param hint_fwd_pd Primitive descriptor for an elementwise
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::desc &data_desc, float alpha, float beta,
+                const eltwise_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aalgorithm, diff_src_desc, diff_dst_desc,
+                    data_desc, &alpha, &beta, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an eltwise backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an eltwise backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::eltwise,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        dnnl::algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const { return base::get_alpha(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const { return base::get_beta(); }
+
+    private:
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const memory::desc &data_desc, const float *alpha,
+                const float *beta,
+                const eltwise_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+            // A null alpha or beta pointer is passed to the C API as 0.0f.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_eltwise_backward_primitive_desc_create(
+                    &pd, aengine.get(), dnnl::convert_to_c(aalgorithm),
+                    diff_src_desc.get(), diff_dst_desc.get(), data_desc.get(),
+                    alpha ? *alpha : 0.0f, beta ? *beta : 0.0f,
+                    hint_fwd_pd.get(), attr.get());
+            // The creation status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the eltwise backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    eltwise_backward() = default;
+
+    /// Constructs an eltwise backward propagation primitive.
+    /// @param pd Primitive descriptor for an eltwise backward propagation
+    ///     primitive.
+    eltwise_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an eltwise backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an eltwise backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    eltwise_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_eltwise
+
+/// @addtogroup dnnl_api_softmax Softmax
+///
+/// A primitive to perform softmax.
+///
+/// @sa @ref dev_guide_softmax in developer guide
+///
+/// @{
+
+/// Softmax forward propagation primitive.
+struct softmax_forward : public primitive {
+    /// Primitive descriptor for a softmax forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a softmax forward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Softmax algorithm kind: either
+        ///     #dnnl::algorithm::softmax_accurate,
+        ///     or #dnnl::algorithm::softmax_log.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param axis Axis over which softmax is computed.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, int axis,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_softmax_forward_primitive_desc_create(
+                    &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                    dnnl::convert_to_c(aalgorithm), src_desc.get(),
+                    dst_desc.get(), axis, attr.get());
+            // The creation status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the softmax forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a softmax forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a softmax forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::softmax,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        dnnl::algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_axis()const
+        int get_axis() const { return base::get_axis(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    softmax_forward() = default;
+
+    /// Constructs a softmax forward propagation primitive.
+    /// @param pd Primitive descriptor for a softmax forward propagation
+    ///     primitive.
+    softmax_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a softmax forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a softmax forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    softmax_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Softmax backward propagation primitive.
+struct softmax_backward : public primitive {
+    /// Primitive descriptor for a softmax backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a softmax backward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Softmax algorithm kind: either
+        ///     #dnnl::algorithm::softmax_accurate,
+        ///     or #dnnl::algorithm::softmax_log.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param axis Axis over which softmax is computed.
+        /// @param hint_fwd_pd Primitive descriptor for a softmax
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &dst_desc,
+                int axis, const softmax_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_softmax_backward_primitive_desc_create(
+                    &pd, aengine.get(), dnnl::convert_to_c(aalgorithm),
+                    diff_src_desc.get(), diff_dst_desc.get(), dst_desc.get(),
+                    axis, hint_fwd_pd.get(), attr.get());
+            // The creation status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the softmax backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a softmax backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a softmax backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::softmax,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        dnnl::algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_axis()const
+        int get_axis() const { return base::get_axis(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    softmax_backward() = default;
+
+    /// Constructs a softmax backward propagation primitive.
+    /// @param pd Primitive descriptor for a softmax backward propagation
+    ///     primitive.
+    softmax_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a softmax backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a softmax backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    softmax_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_softmax
+
+/// @addtogroup dnnl_api_batch_normalization Batch Normalization
+///
+/// A primitive to perform batch normalization.
+///
+/// Both forward and backward propagation primitives support in-place
+/// operation; that is, src and dst can refer to the same memory for forward
+/// propagation, and diff_dst and diff_src can refer to the same memory for
+/// backward propagation.
+///
+/// The batch normalization primitives' computations can be controlled by
+/// specifying different @ref dnnl::normalization_flags values. For example,
+/// batch normalization forward propagation can be configured to either
+/// compute the mean and variance or take them as arguments. It can either
+/// perform scaling and shifting using gamma and beta parameters or not.
+/// Optionally, it can also perform a fused ReLU, which in case of training
+/// would also require a workspace.
+///
+/// @sa @ref dev_guide_batch_normalization in developer guide
+///
+/// @{
+
+/// Batch normalization forward propagation primitive.
+struct batch_normalization_forward : public primitive {
+    /// Primitive descriptor for a batch normalization forward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a batch normalization forward
+        /// propagation primitive.
+        ///
+        /// @note
+        ///     In-place operation is supported: the dst can refer to the same
+        ///     memory as the src.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param epsilon Batch normalization epsilon parameter.
+        /// @param flags Batch normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                float epsilon, normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_batch_normalization_forward_primitive_desc_create(
+                            &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            src_desc.get(), dst_desc.get(), epsilon,
+                            convert_to_c(flags), attr.get());
+            // The creation status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the batch normalization forward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a batch normalization
+        /// forward propagation primitive from a C API primitive descriptor
+        /// that must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a batch normalization
+        ///     forward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::batch_normalization,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// Returns memory descriptor for mean.
+        /// @returns Memory descriptor for mean.
+        memory::desc mean_desc() const { return stat_desc(mean); }
+
+        /// Returns memory descriptor for variance.
+        /// @returns Memory descriptor for variance.
+        memory::desc variance_desc() const { return stat_desc(var); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// Returns normalization flags.
+        /// @returns Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+
+    private:
+        enum {
+            mean = 1, // query_md index used by mean_desc()
+            var = 2, // query_md index used by variance_desc()
+        };
+        memory::desc stat_desc(int kind) const { // kind: mean or var
+            const bool use_global_stats
+                    = (get_flags() & normalization_flags::use_global_stats)
+                    != normalization_flags::none;
+            return query_md( // src_md with use_global_stats, else dst_md
+                    use_global_stats ? query::src_md : query::dst_md, kind);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    batch_normalization_forward() = default;
+
+    /// Constructs a batch normalization forward propagation primitive.
+    /// @param pd Primitive descriptor for a batch normalization forward
+    ///     propagation primitive.
+    batch_normalization_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a batch normalization forward propagation primitive from
+    ///     a cache blob.
+    /// @param pd Primitive descriptor for a batch normalization forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    batch_normalization_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Batch normalization backward propagation primitive.
+struct batch_normalization_backward : public primitive {
+    /// Primitive descriptor for a batch normalization backward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a batch normalization backward
+        /// propagation primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param epsilon Batch normalization epsilon parameter.
+        /// @param flags Batch normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a batch normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                float epsilon, normalization_flags flags,
+                const batch_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_batch_normalization_backward_primitive_desc_create(
+                            &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            diff_src_desc.get(), diff_dst_desc.get(),
+                            src_desc.get(), epsilon, convert_to_c(flags),
+                            hint_fwd_pd.get(), attr.get());
+            // The creation status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the batch normalization backward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a batch normalization
+        /// backward propagation primitive from a C API primitive descriptor
+        /// that must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a batch normalization
+        ///     backward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::batch_normalization,
+                    dnnl::prop_kind::backward, dnnl::prop_kind::backward_data) {
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+        // Unlike the forward pd, mean/variance always use query::src_md here.
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const
+        memory::desc mean_desc() const { return query_md(query::src_md, 1); }
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const
+        memory::desc variance_desc() const {
+            return query_md(query::src_md, 2); // index 2: variance
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// Returns normalization flags.
+        /// @returns Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    batch_normalization_backward() = default;
+
+    /// Constructs a batch normalization backward propagation primitive.
+    /// @param pd Primitive descriptor for a batch normalization backward
+    ///     propagation primitive.
+    batch_normalization_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a batch normalization backward propagation primitive from
+    ///     a cache blob.
+    /// @param pd Primitive descriptor for a batch normalization backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    batch_normalization_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_batch_normalization
+
+/// @addtogroup dnnl_api_group_normalization Group Normalization
+///
+/// A primitive to perform group normalization.
+///
+/// Both forward and backward propagation primitives support in-place
+/// operation; that is, src and dst can refer to the same memory for forward
+/// propagation, and diff_dst and diff_src can refer to the same memory for
+/// backward propagation.
+///
+/// The computations of the group normalization primitives can be controlled
+/// by specifying different @ref dnnl::normalization_flags values. For example,
+/// group normalization forward propagation can be configured to either
+/// compute the mean and variance or take them as arguments. It can either
+/// perform scaling and shifting using gamma and beta parameters or not.
+///
+/// @sa @ref dev_guide_group_normalization in developer guide
+///
+/// @{
+
+/// Forward propagation primitive for group normalization.
+struct group_normalization_forward : public primitive {
+    /// Primitive descriptor describing a group normalization forward
+    /// propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a group normalization forward
+        /// propagation primitive.
+        ///
+        /// @note
+        ///     In-place operation is supported: the dst can refer to the same
+        ///     memory as the src.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param groups Group normalization groups parameter.
+        /// @param epsilon Group normalization epsilon parameter.
+        /// @param flags Group normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                memory::dim groups, float epsilon, normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_group_normalization_forward_primitive_desc_create(
+                            &c_pd, aengine.get(),
+                            dnnl::convert_to_c(aprop_kind), src_desc.get(),
+                            dst_desc.get(), groups, epsilon,
+                            convert_to_c(flags), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for an empty object on failure.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the group normalization forward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Wraps a C API primitive descriptor, which must have a matching
+        /// kind, into a primitive descriptor for a group normalization
+        /// forward propagation primitive.
+        ///
+        /// @param pd C API primitive descriptor for a group normalization
+        ///     forward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::group_normalization,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const {
+            return base::weights_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return base::workspace_desc();
+        }
+
+        /// Returns memory descriptor for mean.
+        /// @returns Memory descriptor for mean.
+        memory::desc mean_desc() const {
+            return stat_desc(mean);
+        }
+
+        /// Returns memory descriptor for variance.
+        /// @returns Memory descriptor for variance.
+        memory::desc variance_desc() const {
+            return stat_desc(var);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_group_size()const
+        memory::dim get_group_size() const {
+            return base::get_group_size();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const {
+            return base::get_epsilon();
+        }
+
+        /// Returns the normalization flags of this primitive descriptor.
+        /// @return Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+
+    private:
+        // Indices used when querying the statistics memory descriptors.
+        enum {
+            mean = 1,
+            var = 2,
+        };
+
+        // Queries a statistics memory descriptor by index. With the
+        // use_global_stats flag set the statistics are looked up among the
+        // source descriptors, otherwise among the destination descriptors.
+        memory::desc stat_desc(int kind) const {
+            if ((get_flags() & normalization_flags::use_global_stats)
+                    != normalization_flags::none) {
+                return query_md(query::src_md, kind);
+            }
+            return query_md(query::dst_md, kind);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    group_normalization_forward() = default;
+
+    /// Creates a group normalization forward propagation primitive from its
+    ///     primitive descriptor.
+    /// @param pd Primitive descriptor for a group normalization forward
+    ///     propagation primitive.
+    group_normalization_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a group normalization forward propagation primitive from its
+    ///     primitive descriptor and a cache blob.
+    /// @param pd Primitive descriptor for a group normalization forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    group_normalization_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Backward propagation primitive for group normalization.
+struct group_normalization_backward : public primitive {
+    /// Primitive descriptor describing a group normalization backward
+    /// propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a group normalization backward
+        /// propagation primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param groups Group normalization groups parameter.
+        /// @param epsilon Group normalization epsilon parameter.
+        /// @param flags Group normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a group normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                memory::dim groups, float epsilon, normalization_flags flags,
+                const group_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_group_normalization_backward_primitive_desc_create(
+                            &c_pd, aengine.get(),
+                            dnnl::convert_to_c(aprop_kind),
+                            diff_src_desc.get(), diff_dst_desc.get(),
+                            src_desc.get(), groups, epsilon,
+                            convert_to_c(flags), hint_fwd_pd.get(),
+                            attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for an empty object on failure.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the group normalization backward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Wraps a C API primitive descriptor, which must have a matching
+        /// kind, into a primitive descriptor for a group normalization
+        /// backward propagation primitive.
+        ///
+        /// @param pd C API primitive descriptor for a group normalization
+        ///     backward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::group_normalization,
+                    dnnl::prop_kind::backward,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const {
+            return base::weights_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const {
+            return base::diff_src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const {
+            return base::diff_dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+
+        /// @copydoc dnnl::group_normalization_forward::primitive_desc::mean_desc()const
+        memory::desc mean_desc() const {
+            // The mean is queried as the source memory descriptor at index 1.
+            return query_md(query::src_md, 1);
+        }
+
+        /// @copydoc dnnl::group_normalization_forward::primitive_desc::variance_desc()const
+        memory::desc variance_desc() const {
+            // The variance is queried as the source memory descriptor at
+            // index 2.
+            return query_md(query::src_md, 2);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_group_size()const
+        memory::dim get_group_size() const {
+            return base::get_group_size();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const {
+            return base::get_epsilon();
+        }
+
+        /// Returns the normalization flags of this primitive descriptor.
+        /// @return Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    group_normalization_backward() = default;
+
+    /// Creates a group normalization backward propagation primitive from its
+    ///     primitive descriptor.
+    /// @param pd Primitive descriptor for a group normalization backward
+    ///     propagation primitive.
+    group_normalization_backward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a group normalization backward propagation primitive from its
+    ///     primitive descriptor and a cache blob.
+    /// @param pd Primitive descriptor for a group normalization backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    group_normalization_backward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_group_normalization
+
+/// @addtogroup dnnl_api_layer_normalization Layer Normalization
+///
+/// A primitive to perform layer normalization. Normalization is performed
+/// within the last logical dimension of data tensor.
+///
+/// Both forward and backward propagation primitives support in-place
+/// operation; that is, src and dst can refer to the same memory for forward
+/// propagation, and diff_dst and diff_src can refer to the same memory for
+/// backward propagation.
+///
+/// The computations of the layer normalization primitives can be controlled
+/// by specifying different @ref dnnl::normalization_flags values. For example,
+/// layer normalization forward propagation can be configured to either
+/// compute the mean and variance or take them as arguments. It can either
+/// perform scaling and shifting using gamma and beta parameters or not.
+///
+/// @sa @ref dev_guide_layer_normalization in developer guide
+///
+/// @{
+
+/// Forward propagation primitive for layer normalization.
+struct layer_normalization_forward : public primitive {
+    /// Primitive descriptor describing a layer normalization forward
+    /// propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a layer normalization forward
+        /// propagation primitive from explicit statistics memory
+        /// descriptors.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param stat_desc Statistics memory descriptors.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                const memory::desc &stat_desc, float epsilon,
+                normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, dst_desc,
+                    &stat_desc, memory::data_type::f32, epsilon, flags,
+                    attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for a layer normalization forward
+        /// propagation primitive without explicit statistics memory
+        /// descriptors.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                float epsilon, normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, dst_desc,
+                    nullptr, memory::data_type::f32, epsilon, flags, attr,
+                    allow_empty) {}
+
+        /// Creates a primitive descriptor for a layer normalization forward
+        /// propagation primitive from explicit statistics memory
+        /// descriptors and a user-provided data type for the scale and
+        /// shift memory objects.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param stat_desc Statistics memory descriptors.
+        /// @param scale_shift_data_type Data type of scale and shift memory.
+        ///     If neither scale nor shift flag are specified the parameter
+        ///     is ignored.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                const memory::desc &stat_desc,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, dst_desc,
+                    &stat_desc, scale_shift_data_type, epsilon, flags,
+                    attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for a layer normalization forward
+        /// propagation primitive with a user-provided data type for the
+        /// scale and shift memory objects and without explicit statistics
+        /// memory descriptors.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param scale_shift_data_type Data type of scale and shift memory.
+        ///     If neither scale nor shift flag are specified the parameter
+        ///     is ignored.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param attr Primitive attributes to use. Optional; defaults to
+        ///     empty attributes.
+        /// @param allow_empty Whether construction is allowed to fail
+        ///     without throwing an exception, in which case an empty object
+        ///     is produced. Optional; defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, dst_desc,
+                    nullptr, scale_shift_data_type, epsilon, flags, attr,
+                    allow_empty) {}
+
+        /// Wraps a C API primitive descriptor, which must have a matching
+        /// kind, into a primitive descriptor for a layer normalization
+        /// forward propagation primitive.
+        ///
+        /// @param pd C API primitive descriptor for a layer normalization
+        ///     forward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::layer_normalization,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const {
+            return base::weights_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const
+        memory::desc mean_desc() const {
+            return stat_desc(mean);
+        }
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const
+        memory::desc variance_desc() const {
+            return stat_desc(var);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const {
+            return base::get_epsilon();
+        }
+
+        /// Returns the normalization flags of this primitive descriptor.
+        /// @return Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+
+    private:
+        // Indices used when querying the statistics memory descriptors.
+        enum {
+            mean = 1,
+            var = 2,
+        };
+
+        // Queries a statistics memory descriptor by index. With the
+        // use_global_stats flag set the statistics are looked up among the
+        // source descriptors, otherwise among the destination descriptors.
+        memory::desc stat_desc(int kind) const {
+            if ((get_flags() & normalization_flags::use_global_stats)
+                    != normalization_flags::none) {
+                return query_md(query::src_md, kind);
+            }
+            return query_md(query::dst_md, kind);
+        }
+
+        // Shared implementation behind the public constructors. The
+        // statistics descriptor is taken by pointer because the overloads
+        // without one pass nullptr; it is handed to the C API through
+        // optional_arg().
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                const memory::desc *stat_desc,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags, const primitive_attr &attr,
+                bool allow_empty) {
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_layer_normalization_forward_primitive_desc_create_v2(
+                            &c_pd, aengine.get(),
+                            dnnl::convert_to_c(aprop_kind), src_desc.get(),
+                            dst_desc.get(), optional_arg(stat_desc),
+                            memory::convert_to_c(scale_shift_data_type),
+                            epsilon, convert_to_c(flags), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for an empty object on failure.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the layer normalization forward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            }
+            reset(c_pd);
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    layer_normalization_forward() = default;
+
+    /// Creates a layer normalization forward propagation primitive from its
+    ///     primitive descriptor.
+    /// @param pd Primitive descriptor for a layer normalization forward
+    ///     propagation primitive.
+    layer_normalization_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a layer normalization forward propagation primitive from its
+    ///     primitive descriptor and a cache blob.
+    /// @param pd Primitive descriptor for a layer normalization forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    layer_normalization_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Layer normalization backward propagation primitive.
+struct layer_normalization_backward : public primitive {
+    /// Primitive descriptor for a layer normalization backward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a layer normalization backward
+        /// propagation primitive.
+        ///
+        /// Delegates to another constructor overload, passing the address of
+        /// @p stat_desc and #dnnl::memory::data_type::f32 for both of its data
+        /// type arguments.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param stat_desc Statistics memory descriptors.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a layer normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                const memory::desc &stat_desc, float epsilon,
+                normalization_flags flags,
+                const layer_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc,
+                    src_desc, &stat_desc, memory::data_type::f32,
+                    memory::data_type::f32, epsilon, flags, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for a layer normalization backward
+        /// propagation primitive.
+        ///
+        /// Delegates to another constructor overload, passing a null
+        /// statistics descriptor pointer and #dnnl::memory::data_type::f32 for
+        /// both of its data type arguments.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a layer normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                float epsilon, normalization_flags flags,
+                const layer_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc,
+                    src_desc, nullptr, memory::data_type::f32,
+                    memory::data_type::f32, epsilon, flags, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for a layer normalization backward
+        /// propagation primitive with a statistics memory descriptor and
+        /// user-provided data types for the scale and shift memory objects.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param stat_desc Statistics memory descriptors.
+        /// @param diff_scale_shift_data_type Data type of diff scale and shift
+        ///     memory. If neither the scale nor the shift flag is specified,
+        ///     the parameter is ignored.
+        /// @param scale_shift_data_type Data type of scale and shift memory.
+        ///     If neither the scale nor the shift flag is specified, the
+        ///     parameter is ignored.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a layer normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                const memory::desc &stat_desc,
+                memory::data_type diff_scale_shift_data_type,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags,
+                const layer_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc,
+                    src_desc, &stat_desc, diff_scale_shift_data_type,
+                    scale_shift_data_type, epsilon, flags, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for a layer normalization backward
+        /// propagation primitive with user-provided data types for the scale
+        /// and shift memory objects and no statistics memory descriptor.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::backward_data and #dnnl::prop_kind::backward
+        ///     (diffs for all parameters are computed in this case).
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param src_desc Source memory descriptor.
+        /// @param diff_scale_shift_data_type Data type of diff scale and shift
+        ///     memory. If neither the scale nor the shift flag is specified,
+        ///     the parameter is ignored.
+        /// @param scale_shift_data_type Data type of scale and shift memory.
+        ///     If neither the scale nor the shift flag is specified, the
+        ///     parameter is ignored.
+        /// @param epsilon Layer normalization epsilon parameter.
+        /// @param flags Layer normalization flags (@ref
+        ///     dnnl::normalization_flags).
+        /// @param hint_fwd_pd Primitive descriptor for a layer normalization
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                memory::data_type diff_scale_shift_data_type,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags,
+                const layer_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, diff_src_desc, diff_dst_desc,
+                    src_desc, nullptr, diff_scale_shift_data_type,
+                    scale_shift_data_type, epsilon, flags, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for a layer normalization
+        /// backward propagation primitive from a C API primitive descriptor
+        /// that must have a matching primitive kind and propagation kind.
+        ///
+        /// @param pd C API primitive descriptor for a layer normalization
+        ///     backward propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd,
+                    dnnl::primitive::kind::layer_normalization,
+                    dnnl::prop_kind::backward, dnnl::prop_kind::backward_data) {
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+        // Mean and variance are queried as source memory descriptors 1 and 2.
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::mean_desc()const
+        memory::desc mean_desc() const { return query_md(query::src_md, 1); }
+
+        /// @copydoc dnnl::batch_normalization_forward::primitive_desc::variance_desc()const
+        memory::desc variance_desc() const {
+            return query_md(query::src_md, 2);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        dnnl::prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// Returns normalization flags.
+        /// @returns Normalization flags.
+        normalization_flags get_flags() const {
+            return base::get_flags<normalization_flags>();
+        }
+
+    private:
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::desc &src_desc,
+                const memory::desc *stat_desc,
+                memory::data_type diff_scale_shift_data_type,
+                memory::data_type scale_shift_data_type, float epsilon,
+                normalization_flags flags,
+                const layer_normalization_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+            // Shared by the public constructors; stat_desc may be nullptr.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_layer_normalization_backward_primitive_desc_create_v2(
+                            &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            diff_src_desc.get(), diff_dst_desc.get(),
+                            src_desc.get(), optional_arg(stat_desc),
+                            memory::convert_to_c(diff_scale_shift_data_type),
+                            memory::convert_to_c(scale_shift_data_type),
+                            epsilon, convert_to_c(flags), hint_fwd_pd.get(),
+                            attr.get());
+            // The status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the layer normalization backward propagation "
+                        "primitive. Run workload with environment variable "
+                        "ONEDNN_VERBOSE=all to get additional diagnostic "
+                        "information.");
+            reset(pd); // NOTE(review): pd presumably nullptr on failure
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    layer_normalization_backward() = default;
+
+    /// Constructs a layer normalization backward propagation primitive.
+    /// @param pd Primitive descriptor for a layer normalization backward
+    ///     propagation primitive.
+    layer_normalization_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a layer normalization backward propagation primitive from
+    /// a cache blob.
+    /// @param pd Primitive descriptor for a layer normalization backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    layer_normalization_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_layer_normalization
+
+/// @addtogroup dnnl_api_inner_product Inner Product
+///
+/// A primitive to compute an inner product.
+///
+/// @sa @ref dev_guide_inner_product in developer guide
+///
+/// @{
+
+/// Inner product forward propagation primitive.
+struct inner_product_forward : public primitive {
+    /// Primitive descriptor for an inner product forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an inner product forward
+        /// propagation primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Memory descriptor for src.
+        /// @param weights_desc Memory descriptor for weights.
+        /// @param bias_desc Memory descriptor for bias.
+        /// @param dst_desc Memory descriptor for dst.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &weights_desc,
+                const memory::desc &bias_desc, const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, weights_desc,
+                    &bias_desc, dst_desc, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an inner product forward
+        /// propagation primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Memory descriptor for src.
+        /// @param weights_desc Memory descriptor for weights.
+        /// @param dst_desc Memory descriptor for dst.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &weights_desc,
+                const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, aprop_kind, src_desc, weights_desc,
+                    nullptr, dst_desc, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an inner product forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching primitive kind and propagation kind.
+        ///
+        /// @param pd C API primitive descriptor for an inner product forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const
+        memory::desc bias_desc() const { return base::weights_desc(1); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+    private:
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &weights_desc,
+                const memory::desc *bias_desc, const memory::desc &dst_desc,
+                const primitive_attr &attr, bool allow_empty) {
+            // Shared by both public constructors; bias_desc may be nullptr.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_inner_product_forward_primitive_desc_create(&pd,
+                            aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            src_desc.get(), weights_desc.get(),
+                            optional_arg(bias_desc), dst_desc.get(),
+                            attr.get());
+            // The status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the inner product forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd); // NOTE(review): pd presumably nullptr on failure
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    inner_product_forward() = default;
+
+    /// Constructs an inner product forward propagation primitive.
+    /// @param pd Primitive descriptor for an inner product forward
+    ///     propagation primitive.
+    inner_product_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an inner product forward propagation primitive from
+    /// a cache blob.
+    /// @param pd Primitive descriptor for an inner product forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    inner_product_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Inner product backward propagation primitive (prop kind backward_data).
+struct inner_product_backward_data : public primitive {
+    /// Primitive descriptor for an inner product backward propagation
+    /// primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an inner product backward
+        /// propagation primitive.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param diff_src_desc Memory descriptor for diff src.
+        /// @param weights_desc Memory descriptor for weights.
+        /// @param diff_dst_desc Memory descriptor for diff dst.
+        /// @param hint_fwd_pd Primitive descriptor for an inner product
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &diff_src_desc,
+                const memory::desc &weights_desc,
+                const memory::desc &diff_dst_desc,
+                const inner_product_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_inner_product_backward_data_primitive_desc_create(
+                            &pd, aengine.get(), diff_src_desc.get(),
+                            weights_desc.get(), diff_dst_desc.get(),
+                            hint_fwd_pd.get(), attr.get());
+            // The status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the inner product backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd); // NOTE(review): pd presumably nullptr on failure
+        }
+
+        /// Constructs a primitive descriptor for an inner product backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching primitive kind and propagation kind.
+        ///
+        /// @param pd C API primitive descriptor for an inner product backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const { return base::weights_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    inner_product_backward_data() = default;
+
+    /// Constructs an inner product backward propagation primitive.
+    /// @param pd Primitive descriptor for an inner product backward
+    ///     propagation primitive.
+    inner_product_backward_data(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an inner product backward propagation primitive from
+    /// a cache blob.
+    /// @param pd Primitive descriptor for an inner product backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    inner_product_backward_data(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Inner product weights gradient primitive.
+struct inner_product_backward_weights : public primitive {
+    /// Primitive descriptor for an inner product weights gradient primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an inner product weights
+        /// gradient primitive with bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param src_desc Memory descriptor for src.
+        /// @param diff_weights_desc Memory descriptor for diff weights.
+        /// @param diff_bias_desc Memory descriptor for diff bias.
+        /// @param diff_dst_desc Memory descriptor for diff dst.
+        /// @param hint_fwd_pd Primitive descriptor for an inner product
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_desc,
+                const inner_product_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, src_desc, diff_weights_desc,
+                    &diff_bias_desc, diff_dst_desc, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Constructs a primitive descriptor for an inner product weights
+        /// gradient primitive without bias.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param src_desc Memory descriptor for src.
+        /// @param diff_weights_desc Memory descriptor for diff weights.
+        /// @param diff_dst_desc Memory descriptor for diff dst.
+        /// @param hint_fwd_pd Primitive descriptor for an inner product
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc,
+                const inner_product_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(aengine, src_desc, diff_weights_desc, nullptr,
+                    diff_dst_desc, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an inner product weights
+        /// gradient primitive from a C API primitive descriptor that must
+        /// have a matching primitive kind and propagation kind.
+        ///
+        /// @param pd C API primitive descriptor for an inner product weights
+        ///     gradient primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::inner_product,
+                    dnnl::prop_kind::backward_weights) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_weights_desc()const
+        memory::desc diff_weights_desc() const {
+            return base::diff_weights_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::convolution_backward_weights::primitive_desc::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return base::diff_weights_desc(1);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+    private:
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc *diff_bias_desc,
+                const memory::desc &diff_dst_desc,
+                const inner_product_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+            // Shared by the public constructors; diff_bias_desc may be null.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status
+                    = dnnl_inner_product_backward_weights_primitive_desc_create(
+                            &pd, aengine.get(), src_desc.get(),
+                            diff_weights_desc.get(),
+                            optional_arg(diff_bias_desc), diff_dst_desc.get(),
+                            hint_fwd_pd.get(), attr.get());
+            // The status is checked only when allow_empty is false.
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the inner product weights gradient primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd); // NOTE(review): pd presumably nullptr on failure
+        }
+    };
+
+    /// Default constructor. Produces an empty object.
+    inner_product_backward_weights() = default;
+
+    /// Constructs an inner product weights gradient primitive.
+    /// @param pd Primitive descriptor for an inner product weights gradient
+    ///     primitive.
+    inner_product_backward_weights(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an inner product weights gradient primitive from a cache
+    /// blob.
+    /// @param pd Primitive descriptor for an inner product weights gradient
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    inner_product_backward_weights(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_inner_product
+
+/// @addtogroup dnnl_api_rnn RNN
+///
+/// A primitive to compute recurrent neural network layers.
+///
+/// @sa @ref dev_guide_rnn in developer guide
+///
+/// @{
+
+/// Base class for primitive descriptors for RNN primitives.
+struct rnn_primitive_desc_base : public primitive_desc {
+    using primitive_desc::primitive_desc;
+
+    /// Default constructor. Produces an empty object.
+    rnn_primitive_desc_base() = default;
+
+    /// Constructs an RNN primitive descriptor base from a C API primitive
+    /// descriptor while checking that it actually describes the expected
+    /// primitive by comparing propagation and primitive kinds.
+    ///
+    /// @param pd C API primitive descriptor.
+    /// @param aprop_kind Expected propagation kind.
+    /// @param cell_kind Expected cell kind.
+    rnn_primitive_desc_base(dnnl_primitive_desc_t pd,
+            dnnl::prop_kind aprop_kind, dnnl::algorithm cell_kind)
+        : rnn_primitive_desc_base(pd, aprop_kind, aprop_kind, cell_kind) {}
+
+    /// Returns source layer memory descriptor.
+    /// @returns Source layer memory descriptor.
+    memory::desc src_layer_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_LAYER);
+    }
+
+    /// Returns AUGRU attention memory descriptor.
+    /// @returns AUGRU attention memory descriptor.
+    memory::desc augru_attention_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_AUGRU_ATTENTION);
+    }
+
+    /// Returns source iteration memory descriptor.
+    /// @returns Source iteration memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          source iteration parameter.
+    memory::desc src_iter_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_ITER);
+    }
+
+    /// Returns source recurrent cell state memory descriptor.
+    /// @returns Source recurrent cell state memory descriptor.
+    memory::desc src_iter_c_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_SRC_ITER_C);
+    }
+
+    /// Returns weights layer memory descriptor.
+    /// @returns Weights layer memory descriptor.
+    memory::desc weights_layer_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_LAYER);
+    }
+
+    /// Returns weights iteration memory descriptor.
+    /// @returns Weights iteration memory descriptor.
+    memory::desc weights_iter_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_ITER);
+    }
+
+    /// Returns weights peephole memory descriptor.
+    /// @returns Weights peephole memory descriptor.
+    memory::desc weights_peephole_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_PEEPHOLE);
+    }
+
+    /// Returns weights projection memory descriptor.
+    /// @returns Weights projection memory descriptor.
+    memory::desc weights_projection_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_WEIGHTS_PROJECTION);
+    }
+
+    /// Returns bias memory descriptor.
+    /// @returns Bias memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          bias parameter.
+    memory::desc bias_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_BIAS);
+    }
+
+    /// Returns destination layer memory descriptor.
+    /// @returns Destination layer memory descriptor.
+    memory::desc dst_layer_desc() const {
+        return base::query_md(query::exec_arg_md, DNNL_ARG_DST_LAYER);
+    }
+
+    /// Returns destination iteration memory descriptor.
+    /// @returns Destination iteration memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          destination iteration parameter.
+    memory::desc dst_iter_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DST_ITER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns destination recurrent cell state memory descriptor.
+    /// @returns Destination recurrent cell state memory descriptor.
+    memory::desc dst_iter_c_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DST_ITER_C;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff source layer memory descriptor.
+    /// @returns Diff source layer memory descriptor.
+    memory::desc diff_src_layer_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_SRC_LAYER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff AUGRU attention memory descriptor.
+    /// @returns Diff AUGRU attention memory descriptor.
+    memory::desc diff_augru_attention_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_AUGRU_ATTENTION;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff source iteration memory descriptor.
+    /// @returns Diff source iteration memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          diff source iteration parameter.
+    memory::desc diff_src_iter_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_SRC_ITER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff source recurrent cell state memory descriptor.
+    /// @returns Diff source recurrent cell state memory descriptor.
+    memory::desc diff_src_iter_c_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_SRC_ITER_C;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff weights layer memory descriptor.
+    /// @returns Diff weights layer memory descriptor.
+    memory::desc diff_weights_layer_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_WEIGHTS_LAYER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff weights iteration memory descriptor.
+    /// @returns Diff weights iteration memory descriptor.
+    memory::desc diff_weights_iter_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_WEIGHTS_ITER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff weights peephole memory descriptor.
+    /// @returns Diff weights peephole memory descriptor.
+    memory::desc diff_weights_peephole_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff weights projection memory descriptor.
+    /// @returns Diff weights projection memory descriptor.
+    memory::desc diff_weights_projection_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_WEIGHTS_PROJECTION;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff bias memory descriptor.
+    /// @returns Diff bias memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          diff bias parameter.
+    memory::desc diff_bias_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_BIAS;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff destination layer memory descriptor.
+    /// @returns Diff destination layer memory descriptor.
+    memory::desc diff_dst_layer_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_DST_LAYER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff destination iteration memory descriptor.
+    /// @returns Diff destination iteration memory descriptor.
+    /// @returns A zero memory descriptor if the primitive does not have a
+    ///          diff destination iteration parameter.
+    memory::desc diff_dst_iter_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_DST_ITER;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+    /// Returns diff destination recurrent cell state memory descriptor.
+    /// @returns Diff destination recurrent cell state memory descriptor.
+    memory::desc diff_dst_iter_c_desc() const {
+        // Select the exec_arg_md entry via its DNNL_ARG_* index.
+        const auto arg_index = DNNL_ARG_DIFF_DST_ITER_C;
+        return base::query_md(query::exec_arg_md, arg_index);
+    }
+
+protected:
+    using rnn_base = rnn_primitive_desc_base;
+
+    // (Deliberately not using doxygen comments)
+    //
+    // Constructs an RNN primitive descriptor base from a C API primitive
+    // descriptor while checking that it actually describes the expected
+    // primitive by comparing primitive, propagation, and cell kinds. Either
+    // of the two given propagation kinds is accepted. This is typically used
+    // to check for inference or training forward propagation.
+    //
+    // @param pd C API primitive descriptor.
+    // @param prop_kind1 Expected propagation kind.
+    // @param prop_kind2 Expected propagation kind.
+    // @param cell_kind Expected cell kind.
+    rnn_primitive_desc_base(dnnl_primitive_desc_t pd,
+            dnnl::prop_kind prop_kind1, dnnl::prop_kind prop_kind2,
+            dnnl::algorithm cell_kind) {
+
+        // Read back the primitive kind stored in the C descriptor.
+        dnnl_primitive_kind_t actual_primitive_kind;
+        dnnl_status_t status = dnnl_primitive_desc_query(
+                pd, dnnl_query_primitive_kind, 0, &actual_primitive_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a primitive kind from a primitive "
+                "descriptor for an RNN primitive");
+
+        // Read back the propagation kind.
+        dnnl_prop_kind_t actual_prop_kind;
+        status = dnnl_primitive_desc_query(
+                pd, dnnl_query_prop_kind, 0, &actual_prop_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a propagation kind from a primitive "
+                "descriptor for an RNN primitive");
+
+        // Read back the cell kind.
+        dnnl_alg_kind_t actual_cell_kind;
+        status = dnnl_primitive_desc_query(
+                pd, dnnl_query_cell_kind, 0, &actual_cell_kind);
+        error::wrap_c_api(status,
+                "could not retrieve a cell kind from a primitive descriptor "
+                "for an RNN primitive");
+
+        // Express the caller's expectations as C enum values so they can be
+        // compared with the queried ones.
+        const dnnl_prop_kind_t expected_prop_kind_a = convert_to_c(prop_kind1);
+        const dnnl_prop_kind_t expected_prop_kind_b = convert_to_c(prop_kind2);
+        const dnnl_alg_kind_t expected_cell_kind = convert_to_c(cell_kind);
+
+        const bool is_rnn = actual_primitive_kind == dnnl_rnn;
+        const bool prop_kind_accepted
+                = actual_prop_kind == expected_prop_kind_a
+                || actual_prop_kind == expected_prop_kind_b;
+        const bool cell_kind_matches = actual_cell_kind == expected_cell_kind;
+
+        // Any single mismatch rejects the descriptor.
+        if (!(is_rnn && prop_kind_accepted && cell_kind_matches))
+            DNNL_THROW_ERROR(dnnl_invalid_arguments,
+                    "mismatch between expected and provided descriptors for an "
+                    "RNN primitive");
+
+        // All checks passed: adopt the descriptor through reset_with_clone().
+        reset_with_clone(pd);
+    }
+
+    // Constructs an RNN forward propagation primitive descriptor base for
+    // any cell kind.
+    rnn_primitive_desc_base(const engine &aengine, algorithm cell_kind,
+            prop_kind aprop_kind, algorithm activation, rnn_direction direction,
+            const memory::desc &src_layer_desc,
+            const memory::desc &src_iter_desc,
+            const memory::desc *src_iter_c_desc, // forwarded only by the vanilla_lstm case
+            const memory::desc *attention_desc, // forwarded only by the two AUGRU cases
+            const memory::desc &weights_layer_desc,
+            const memory::desc &weights_iter_desc,
+            const memory::desc *weights_peephole_desc, // forwarded only by the vanilla_lstm case
+            const memory::desc *weights_projection_desc, // forwarded only by the vanilla_lstm case
+            const memory::desc &bias_desc, const memory::desc &dst_layer_desc,
+            const memory::desc &dst_iter_desc,
+            const memory::desc *dst_iter_c_desc, rnn_flags flags, float alpha, // dst_iter_c_desc: vanilla_lstm only; alpha: vanilla_rnn only
+            float beta, const primitive_attr &attr, bool allow_empty) { // beta: vanilla_rnn only
+
+        dnnl_status_t status = dnnl_success;
+        const char *msg
+                = "could not create a primitive descriptor for a requested "
+                  "cell kind"; // fallback text, kept when no case below matches
+
+        dnnl_primitive_desc_t pd = nullptr; // passed by address to the create call of the selected case
+        switch (cell_kind) { // each case forwards only the arguments its C API create call takes
+            case algorithm::vanilla_rnn: // the only case that forwards activation, alpha and beta
+                status = dnnl_vanilla_rnn_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(activation),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), alpha, beta, attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the vanilla RNN forward propagation primitive. Run "
+                      "workload with environment variable ONEDNN_VERBOSE=all "
+                      "to get additional diagnostic information.";
+                break;
+            case algorithm::vanilla_lstm: // pointer descriptors go through optional_arg()
+                status = dnnl_lstm_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(src_iter_c_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        optional_arg(weights_peephole_desc),
+                        optional_arg(weights_projection_desc), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        optional_arg(dst_iter_c_desc), convert_to_c(flags),
+                        attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LSTM forward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::vanilla_gru:
+                status = dnnl_gru_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the GRU forward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::lbr_gru: // same argument list as vanilla_gru, different create function
+                status = dnnl_lbr_gru_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        convert_to_c(flags), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LBR GRU forward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::vanilla_augru: // additionally forwards attention_desc
+                status = dnnl_augru_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(), dst_layer_desc.get(),
+                        dst_iter_desc.get(), convert_to_c(flags), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the AUGRU forward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::lbr_augru: // same argument list as vanilla_augru, different create function
+                status = dnnl_lbr_augru_forward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(), dst_layer_desc.get(),
+                        dst_iter_desc.get(), convert_to_c(flags), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LBR AUGRU forward propagation primitive. Run "
+                      "workload with environment variable ONEDNN_VERBOSE=all "
+                      "to get additional diagnostic information.";
+                break;
+            default: status = dnnl_unimplemented; // any other cell kind; pd keeps its initial nullptr
+        }
+
+        if (!allow_empty) error::wrap_c_api(status, msg); // status is not checked here when allow_empty is true
+        reset(pd); // runs on every path that reaches this line, including the default branch
+    }
+
+    // Constructs an RNN backward propagation primitive descriptor base for
+    // any cell kind.
+    rnn_primitive_desc_base(const engine &aengine, algorithm cell_kind,
+            prop_kind aprop_kind, algorithm activation, rnn_direction direction,
+            const memory::desc &src_layer_desc,
+            const memory::desc &src_iter_desc,
+            const memory::desc *src_iter_c_desc,
+            const memory::desc *attention_desc,
+            const memory::desc &weights_layer_desc,
+            const memory::desc &weights_iter_desc,
+            const memory::desc *weights_peephole_desc,
+            const memory::desc *weights_projection_desc,
+            const memory::desc &bias_desc, const memory::desc &dst_layer_desc,
+            const memory::desc &dst_iter_desc,
+            const memory::desc *dst_iter_c_desc,
+            const memory::desc &diff_src_layer_desc,
+            const memory::desc &diff_src_iter_desc,
+            const memory::desc *diff_src_iter_c_desc,
+            const memory::desc *diff_attention_desc,
+            const memory::desc &diff_weights_layer_desc,
+            const memory::desc &diff_weights_iter_desc,
+            const memory::desc *diff_weights_peephole_desc,
+            const memory::desc *diff_weights_projection_desc,
+            const memory::desc &diff_bias_desc,
+            const memory::desc &diff_dst_layer_desc,
+            const memory::desc &diff_dst_iter_desc,
+            const memory::desc *diff_dst_iter_c_desc, rnn_flags flags,
+            float alpha, float beta, const rnn_primitive_desc_base &hint_fwd_pd,
+            const primitive_attr &attr, bool allow_empty) {
+        // (Deliberately not using doxygen comments)
+        //
+        // Dispatches on cell_kind to the matching C API backward create
+        // function. Each case forwards only the arguments that function
+        // takes:
+        //  - activation, alpha and beta are forwarded only for vanilla_rnn;
+        //  - the *_iter_c, peephole and projection descriptors (and their
+        //    diff counterparts) are forwarded only for vanilla_lstm;
+        //  - attention_desc and diff_attention_desc are forwarded only for
+        //    vanilla_augru and lbr_augru.
+        // Pointer descriptors are passed through optional_arg().
+        //
+        // When allow_empty is false the resulting status is handed to
+        // error::wrap_c_api() together with a cell-specific message; when it
+        // is true the status is not checked here. reset(pd) is then called in
+        // either case.
+
+        dnnl_status_t status = dnnl_success;
+        // Fallback text for cell kinds that no case below handles. It matches
+        // the fallback used by the forward constructor so that an unsupported
+        // cell kind is never reported with an empty message.
+        const char *msg
+                = "could not create a primitive descriptor for a requested "
+                  "cell kind";
+
+        dnnl_primitive_desc_t pd = nullptr;
+        switch (cell_kind) {
+            case algorithm::vanilla_rnn:
+                status = dnnl_vanilla_rnn_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(activation),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        diff_src_layer_desc.get(), diff_src_iter_desc.get(),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(), diff_bias_desc.get(),
+                        diff_dst_layer_desc.get(), diff_dst_iter_desc.get(),
+                        convert_to_c(flags), alpha, beta, hint_fwd_pd.get(),
+                        attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the vanilla RNN backward propagation primitive. Run "
+                      "workload with environment variable ONEDNN_VERBOSE=all "
+                      "to get additional diagnostic information.";
+                break;
+            case algorithm::vanilla_lstm:
+                status = dnnl_lstm_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(src_iter_c_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        optional_arg(weights_peephole_desc),
+                        optional_arg(weights_projection_desc), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        optional_arg(dst_iter_c_desc),
+                        diff_src_layer_desc.get(), diff_src_iter_desc.get(),
+                        optional_arg(diff_src_iter_c_desc),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(),
+                        optional_arg(diff_weights_peephole_desc),
+                        optional_arg(diff_weights_projection_desc),
+                        diff_bias_desc.get(), diff_dst_layer_desc.get(),
+                        diff_dst_iter_desc.get(),
+                        optional_arg(diff_dst_iter_c_desc), convert_to_c(flags),
+                        hint_fwd_pd.get(), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LSTM backward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::vanilla_gru:
+                status = dnnl_gru_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        diff_src_layer_desc.get(), diff_src_iter_desc.get(),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(), diff_bias_desc.get(),
+                        diff_dst_layer_desc.get(), diff_dst_iter_desc.get(),
+                        convert_to_c(flags), hint_fwd_pd.get(), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the GRU backward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::lbr_gru:
+                status = dnnl_lbr_gru_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), weights_layer_desc.get(),
+                        weights_iter_desc.get(), bias_desc.get(),
+                        dst_layer_desc.get(), dst_iter_desc.get(),
+                        diff_src_layer_desc.get(), diff_src_iter_desc.get(),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(), diff_bias_desc.get(),
+                        diff_dst_layer_desc.get(), diff_dst_iter_desc.get(),
+                        convert_to_c(flags), hint_fwd_pd.get(), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LBR GRU backward propagation primitive. Run "
+                      "workload with environment variable ONEDNN_VERBOSE=all "
+                      "to get additional diagnostic information.";
+                break;
+            case algorithm::vanilla_augru:
+                status = dnnl_augru_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(), dst_layer_desc.get(),
+                        dst_iter_desc.get(), diff_src_layer_desc.get(),
+                        diff_src_iter_desc.get(),
+                        optional_arg(diff_attention_desc),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(), diff_bias_desc.get(),
+                        diff_dst_layer_desc.get(), diff_dst_iter_desc.get(),
+                        convert_to_c(flags), hint_fwd_pd.get(), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the AUGRU backward propagation primitive. Run workload "
+                      "with environment variable ONEDNN_VERBOSE=all to get "
+                      "additional diagnostic information.";
+                break;
+            case algorithm::lbr_augru:
+                status = dnnl_lbr_augru_backward_primitive_desc_create(&pd,
+                        aengine.get(), dnnl::convert_to_c(aprop_kind),
+                        dnnl::convert_to_c(direction), src_layer_desc.get(),
+                        src_iter_desc.get(), optional_arg(attention_desc),
+                        weights_layer_desc.get(), weights_iter_desc.get(),
+                        bias_desc.get(), dst_layer_desc.get(),
+                        dst_iter_desc.get(), diff_src_layer_desc.get(),
+                        diff_src_iter_desc.get(),
+                        optional_arg(diff_attention_desc),
+                        diff_weights_layer_desc.get(),
+                        diff_weights_iter_desc.get(), diff_bias_desc.get(),
+                        diff_dst_layer_desc.get(), diff_dst_iter_desc.get(),
+                        convert_to_c(flags), hint_fwd_pd.get(), attr.get());
+                msg = "could not create a primitive descriptor for "
+                      "the LBR AUGRU backward propagation primitive. Run "
+                      "workload with environment variable ONEDNN_VERBOSE=all "
+                      "to get additional diagnostic information.";
+                break;
+            // Any other cell kind: pd keeps its initial nullptr and msg keeps
+            // the fallback text set above.
+            default: status = dnnl_unimplemented;
+        }
+        if (!allow_empty) error::wrap_c_api(status, msg);
+        reset(pd);
+    }
+};
+
+/// Vanilla RNN forward propagation primitive.
+struct vanilla_rnn_forward : public primitive {
+    /// Primitive descriptor for a vanilla RNN forward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a vanilla RNN forward
+        ///     propagation primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the RNN forward propagation primitive
+        /// should not use them and should default to zero values instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc can be
+        ///     initialized with an #dnnl::memory::format_tag::any value of @p
+        ///     format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param activation Activation kind. Possible values are
+        ///     #dnnl::algorithm::eltwise_relu,
+        ///     #dnnl::algorithm::eltwise_tanh, or
+        ///     #dnnl::algorithm::eltwise_logistic.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm activation, rnn_direction direction,
+                const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn,
+                    aprop_kind, activation, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc, // nullptrs: src_iter_c_desc, attention_desc
+                    weights_iter_desc, nullptr, nullptr, bias_desc, // nullptrs: weights_peephole_desc, weights_projection_desc
+                    dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, // nullptr: dst_iter_c_desc
+                    0.0f, 0.0f, attr, allow_empty) {} // alpha and beta are both passed as 0.0f
+
+        /// Constructs a primitive descriptor for a vanilla RNN forward
+        ///     propagation primitive with alpha parameter.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the RNN forward propagation primitive
+        /// should not use them and should default to zero values instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc can be
+        ///     initialized with an #dnnl::memory::format_tag::any value of @p
+        ///     format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param activation Activation kind. Possible values are
+        ///     #dnnl::algorithm::eltwise_relu,
+        ///     #dnnl::algorithm::eltwise_tanh, or
+        ///     #dnnl::algorithm::eltwise_logistic.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param alpha Negative slope if activation is
+        ///     #dnnl::algorithm::eltwise_relu.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm activation, rnn_direction direction,
+                const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc, float alpha,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn,
+                    aprop_kind, activation, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc, // nullptrs: src_iter_c_desc, attention_desc
+                    weights_iter_desc, nullptr, nullptr, bias_desc, // nullptrs: weights_peephole_desc, weights_projection_desc
+                    dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef, // nullptr: dst_iter_c_desc
+                    alpha, 0.0f, attr, allow_empty) {} // beta is passed as 0.0f
+
+        /// Constructs a primitive descriptor for a vanilla RNN forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a vanilla RNN forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference, // either forward propagation kind is accepted
+                    dnnl::algorithm::vanilla_rnn) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_activation_kind()const
+        algorithm get_activation_kind() const {
+            return base::get_activation_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const { return base::get_alpha(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const { return base::get_beta(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    vanilla_rnn_forward() = default;
+
+    /// Constructs a vanilla RNN forward propagation primitive.
+    /// @param pd Primitive descriptor for a vanilla RNN forward
+    ///     propagation primitive.
+    vanilla_rnn_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a vanilla RNN forward propagation primitive from
+    ///     a cache blob.
+    /// @param pd Primitive descriptor for a vanilla RNN forward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    vanilla_rnn_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Vanilla RNN backward propagation primitive.
+struct vanilla_rnn_backward : public primitive {
+    /// Primitive descriptor for a vanilla RNN backward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a vanilla RNN backward
+        ///     propagation primitive; alpha is fixed to 0.0f.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the RNN backward propagation
+        /// primitive should not use the respective data and should use zero
+        /// values instead.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param activation Activation kind. Possible values are
+        ///     #dnnl::algorithm::eltwise_relu,
+        ///     #dnnl::algorithm::eltwise_tanh, or
+        ///     #dnnl::algorithm::eltwise_logistic.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param hint_fwd_pd Primitive descriptor for a vanilla RNN
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm activation, rnn_direction direction,
+                const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const vanilla_rnn_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn,
+                    aprop_kind, activation, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc,
+                    diff_src_iter_desc, nullptr, nullptr,
+                    diff_weights_layer_desc, diff_weights_iter_desc, nullptr,
+                    nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a vanilla RNN backward
+        ///     propagation primitive with an alpha parameter.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the RNN backward propagation
+        /// primitive should not use the respective data and should use zero
+        /// values instead.
+        ///
+        /// @note
+        ///     All the memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param activation Activation kind. Possible values are
+        ///     #dnnl::algorithm::eltwise_relu,
+        ///     #dnnl::algorithm::eltwise_tanh, or
+        ///     #dnnl::algorithm::eltwise_logistic.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param alpha Negative slope if activation is
+        ///     #dnnl::algorithm::eltwise_relu.
+        /// @param hint_fwd_pd Primitive descriptor for a vanilla RNN
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm activation, rnn_direction direction,
+                const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc, float alpha,
+                const vanilla_rnn_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_rnn,
+                    aprop_kind, activation, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc,
+                    diff_src_iter_desc, nullptr, nullptr,
+                    diff_weights_layer_desc, diff_weights_iter_desc, nullptr,
+                    nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, alpha, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a vanilla RNN backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a vanilla RNN backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward,
+                    dnnl::algorithm::vanilla_rnn) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_activation_kind()const
+        algorithm get_activation_kind() const {
+            return base::get_activation_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_alpha()const
+        float get_alpha() const { return base::get_alpha(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_beta()const
+        float get_beta() const { return base::get_beta(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    vanilla_rnn_backward() = default;
+
+    /// Constructs a vanilla RNN backward propagation primitive.
+    /// @param pd Primitive descriptor for a vanilla RNN backward
+    ///     propagation primitive.
+    vanilla_rnn_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a vanilla RNN backward propagation primitive from
+    ///     a cache blob.
+    /// @param pd Primitive descriptor for a vanilla RNN backward
+    ///     propagation primitive.
+    /// @param cache_blob Cache blob.
+    vanilla_rnn_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LSTM forward propagation primitive.
+struct lstm_forward : public primitive {
+    /// Primitive descriptor for an LSTM forward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LSTM (with or without
+        ///     peephole and with or without projection) forward propagation
+        ///     primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        /// - @p weights_peephole_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// The @p weights_projection_desc may point to a zero memory
+        /// descriptor. This would then indicate that the LSTM doesn't have
+        /// a recurrent projection layer.
+        ///
+        /// @note
+        ///     All memory descriptors can be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param weights_peephole_desc Memory descriptor for the weights
+        ///     applied to the cell states (according to the Peephole LSTM
+        ///     formula).
+        /// @param weights_projection_desc Memory descriptor for the weights
+        ///     applied to the hidden states to get the recurrent projection
+        ///     (according to the Projection LSTM formula).
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &weights_peephole_desc,
+                const memory::desc &weights_projection_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc,
+                    &weights_peephole_desc, &weights_projection_desc, bias_desc,
+                    dst_layer_desc, dst_iter_desc, &dst_iter_c_desc,
+                    rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LSTM (with or without
+        ///     peephole, without projection) forward propagation primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        /// - @p weights_peephole_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors can be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param weights_peephole_desc Memory descriptor for the weights
+        ///     applied to the cell states (according to the Peephole LSTM
+        ///     formula).
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &weights_peephole_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc,
+                    &weights_peephole_desc, nullptr, bias_desc, dst_layer_desc,
+                    dst_iter_desc, &dst_iter_c_desc, rnn_flags::undef, 0.0f,
+                    0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LSTM forward propagation
+        ///     primitive without peephole and projection weights.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors can be initialized with the
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc, nullptr, nullptr,
+                    bias_desc, dst_layer_desc, dst_iter_desc, &dst_iter_c_desc,
+                    rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LSTM forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LSTM forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference,
+                    dnnl::algorithm::vanilla_lstm) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_c_desc()const
+        memory::desc src_iter_c_desc() const {
+            return rnn_base::src_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_peephole_desc()const
+        memory::desc weights_peephole_desc() const {
+            return rnn_base::weights_peephole_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_projection_desc()const
+        memory::desc weights_projection_desc() const {
+            return rnn_base::weights_projection_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_c_desc()const
+        memory::desc dst_iter_c_desc() const {
+            return rnn_base::dst_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lstm_forward() = default;
+
+    /// Constructs an LSTM forward propagation primitive.
+    /// @param pd Primitive descriptor for an LSTM forward propagation
+    ///     primitive.
+    lstm_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LSTM forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LSTM forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lstm_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LSTM backward propagation primitive.
+struct lstm_backward : public primitive {
+    /// Primitive descriptor for an LSTM backward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs an LSTM (with or without peephole and with or without
+        ///     projection) primitive descriptor for backward propagation
+        ///     using @p aprop_kind, @p direction, and memory descriptors.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        ///   @p diff_src_iter_desc, and @p diff_src_iter_c_desc,
+        /// - @p weights_peephole_desc together with
+        ///   @p diff_weights_peephole_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc,
+        ///   @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// The @p weights_projection_desc together with @p
+        /// diff_weights_projection_desc may point to a zero memory descriptor.
+        /// This would then indicate that the LSTM doesn't have recurrent
+        /// projection layer.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param weights_peephole_desc Memory descriptor for the weights
+        ///     applied to the cell states (according to the Peephole LSTM
+        ///     formula).
+        /// @param weights_projection_desc Memory descriptor for the weights
+        ///     applied to the hidden states to get the recurrent projection
+        ///     (according to the Projection LSTM formula).
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_src_iter_c_desc Memory descriptor for the diff of
+        ///     input recurrent cell state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_weights_peephole_desc Memory descriptor for the diff of
+        ///     weights applied to the cell states (according to the Peephole
+        ///     LSTM formula).
+        /// @param diff_weights_projection_desc Memory descriptor for the diff
+        ///     of weights applied to the hidden states to get the recurrent
+        ///     projection (according to the Projection LSTM formula).
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param diff_dst_iter_c_desc Memory descriptor for the diff of
+        ///     output recurrent cell state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an LSTM
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &weights_peephole_desc,
+                const memory::desc &weights_projection_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_src_iter_c_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_weights_peephole_desc,
+                const memory::desc &diff_weights_projection_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const memory::desc &diff_dst_iter_c_desc,
+                const lstm_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc,
+                    &weights_peephole_desc, &weights_projection_desc, bias_desc,
+                    dst_layer_desc, dst_iter_desc, &dst_iter_c_desc,
+                    diff_src_layer_desc, diff_src_iter_desc,
+                    &diff_src_iter_c_desc, nullptr, diff_weights_layer_desc,
+                    diff_weights_iter_desc, &diff_weights_peephole_desc,
+                    &diff_weights_projection_desc, diff_bias_desc,
+                    diff_dst_layer_desc, diff_dst_iter_desc,
+                    &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs an LSTM (with or without peephole) primitive descriptor
+        ///     for backward propagation using @p aprop_kind, @p direction,
+        ///     and memory descriptors.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        ///   @p diff_src_iter_desc, and @p diff_src_iter_c_desc,
+        /// - @p weights_peephole_desc together with
+        ///   @p diff_weights_peephole_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc,
+        ///   @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param weights_peephole_desc Memory descriptor for the weights
+        ///     applied to the cell states (according to the Peephole LSTM
+        ///     formula).
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_src_iter_c_desc Memory descriptor for the diff of
+        ///     input recurrent cell state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_weights_peephole_desc Memory descriptor for the diff of
+        ///     weights applied to the cell states (according to the Peephole
+        ///     LSTM formula).
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param diff_dst_iter_c_desc Memory descriptor for the diff of
+        ///     output recurrent cell state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an LSTM
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &weights_peephole_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_src_iter_c_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_weights_peephole_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const memory::desc &diff_dst_iter_c_desc,
+                const lstm_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc,
+                    &weights_peephole_desc, nullptr, bias_desc, dst_layer_desc,
+                    dst_iter_desc, &dst_iter_c_desc, diff_src_layer_desc,
+                    diff_src_iter_desc, &diff_src_iter_c_desc, nullptr,
+                    diff_weights_layer_desc, diff_weights_iter_desc,
+                    &diff_weights_peephole_desc, nullptr, diff_bias_desc,
+                    diff_dst_layer_desc, diff_dst_iter_desc,
+                    &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs an LSTM primitive descriptor for backward propagation
+        ///     using @p aprop_kind, @p direction, and memory descriptors.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p src_iter_c_desc,
+        ///   @p diff_src_iter_desc, and @p diff_src_iter_c_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p dst_iter_c_desc,
+        ///   @p diff_dst_iter_desc, and @p diff_dst_iter_c_desc.
+        ///
+        /// This would then indicate that the LSTM backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param src_iter_c_desc Memory descriptor for the input recurrent
+        ///     cell state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param dst_iter_c_desc Memory descriptor for the output recurrent
+        ///     cell state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_src_iter_c_desc Memory descriptor for the diff of
+        ///     input recurrent cell state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param diff_dst_iter_c_desc Memory descriptor for the diff of
+        ///     output recurrent cell state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an LSTM
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &src_iter_c_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &dst_iter_c_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_src_iter_c_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const memory::desc &diff_dst_iter_c_desc,
+                const lstm_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_lstm,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, &src_iter_c_desc, nullptr,
+                    weights_layer_desc, weights_iter_desc, nullptr, nullptr,
+                    bias_desc, dst_layer_desc, dst_iter_desc, &dst_iter_c_desc,
+                    diff_src_layer_desc, diff_src_iter_desc,
+                    &diff_src_iter_c_desc, nullptr, diff_weights_layer_desc,
+                    diff_weights_iter_desc, nullptr, nullptr, diff_bias_desc,
+                    diff_dst_layer_desc, diff_dst_iter_desc,
+                    &diff_dst_iter_c_desc, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LSTM backward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LSTM backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward,
+                    dnnl::algorithm::vanilla_lstm) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_c_desc()const
+        memory::desc src_iter_c_desc() const {
+            return rnn_base::src_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_peephole_desc()const
+        memory::desc weights_peephole_desc() const {
+            return rnn_base::weights_peephole_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_projection_desc()const
+        memory::desc weights_projection_desc() const {
+            return rnn_base::weights_projection_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_c_desc()const
+        memory::desc dst_iter_c_desc() const {
+            return rnn_base::dst_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_c_desc()const
+        memory::desc diff_src_iter_c_desc() const {
+            return rnn_base::diff_src_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_peephole_desc()const
+        memory::desc diff_weights_peephole_desc() const {
+            return rnn_base::diff_weights_peephole_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_projection_desc()const
+        memory::desc diff_weights_projection_desc() const {
+            return rnn_base::diff_weights_projection_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_c_desc()const
+        memory::desc diff_dst_iter_c_desc() const {
+            return rnn_base::diff_dst_iter_c_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lstm_backward() = default;
+
+    /// Constructs an LSTM backward propagation primitive.
+    /// @param pd Primitive descriptor for an LSTM backward propagation
+    ///     primitive.
+    lstm_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LSTM backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LSTM backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lstm_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// GRU forward propagation primitive.
+struct gru_forward : public primitive {
+    /// Primitive descriptor for a GRU forward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a GRU forward propagation
+        ///     primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the GRU forward propagation primitive
+        /// should not use them and should default to zero values instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc may be
+        ///     initialized with a #dnnl::memory::format_tag::any value of @p
+        ///     format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_gru,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef,
+                    0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a GRU forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a GRU forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference,
+                    dnnl::algorithm::vanilla_gru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    gru_forward() = default;
+
+    /// Constructs a GRU forward propagation primitive.
+    /// @param pd Primitive descriptor for a GRU forward propagation
+    ///     primitive.
+    gru_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a GRU forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a GRU forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    gru_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// GRU backward propagation primitive.
+struct gru_backward : public primitive {
+    /// Primitive descriptor for a GRU backward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a GRU backward propagation
+        /// primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the GRU backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any format tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param hint_fwd_pd Primitive descriptor for a GRU
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const gru_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_gru,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, nullptr, nullptr, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc,
+                    diff_src_iter_desc, nullptr, nullptr,
+                    diff_weights_layer_desc, diff_weights_iter_desc, nullptr,
+                    nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for a GRU backward propagation
+        /// primitive from a C API primitive descriptor whose kind must match
+        /// #dnnl::prop_kind::backward and #dnnl::algorithm::vanilla_gru.
+        ///
+        /// @param pd C API primitive descriptor for a GRU backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward,
+                    dnnl::algorithm::vanilla_gru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    gru_backward() = default;
+
+    /// Constructs a GRU backward propagation primitive.
+    /// @param pd Primitive descriptor for a GRU backward propagation
+    ///     primitive.
+    gru_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a GRU backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a GRU backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob, passed on to the primitive base class.
+    gru_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LBR GRU forward propagation primitive.
+struct lbr_gru_forward : public primitive {
+    /// Primitive descriptor for an LBR GRU forward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LBR GRU forward propagation
+        /// primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the LBR GRU forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc may be
+        ///     initialized with the #dnnl::memory::format_tag::any format
+        ///     tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::lbr_gru, aprop_kind,
+                    algorithm::undef, direction, src_layer_desc, src_iter_desc,
+                    nullptr, nullptr, weights_layer_desc, weights_iter_desc,
+                    nullptr, nullptr, bias_desc, dst_layer_desc, dst_iter_desc,
+                    nullptr, rnn_flags::undef, 0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LBR GRU forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LBR GRU forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference,
+                    dnnl::algorithm::lbr_gru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lbr_gru_forward() = default;
+
+    /// Constructs an LBR GRU forward propagation primitive.
+    /// @param pd Primitive descriptor for an LBR GRU forward propagation
+    ///     primitive.
+    lbr_gru_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LBR GRU forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LBR GRU forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob, passed on to the primitive base class.
+    lbr_gru_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LBR GRU backward propagation primitive.
+struct lbr_gru_backward : public primitive {
+    /// Primitive descriptor for an LBR GRU backward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LBR GRU backward
+        /// propagation primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the LBR GRU backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with the
+        ///     #dnnl::memory::format_tag::any format tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an LBR GRU
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const lbr_gru_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::lbr_gru, aprop_kind,
+                    algorithm::undef, direction, src_layer_desc, src_iter_desc,
+                    nullptr, nullptr, weights_layer_desc, weights_iter_desc,
+                    nullptr, nullptr, bias_desc, dst_layer_desc, dst_iter_desc,
+                    nullptr, diff_src_layer_desc, diff_src_iter_desc, nullptr,
+                    nullptr, diff_weights_layer_desc, diff_weights_iter_desc,
+                    nullptr, nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LBR GRU backward
+        /// propagation primitive from a C API primitive descriptor whose kind
+        /// must match #dnnl::prop_kind::backward and #dnnl::algorithm::lbr_gru.
+        ///
+        /// @param pd C API primitive descriptor for an LBR GRU backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(
+                    pd, dnnl::prop_kind::backward, dnnl::algorithm::lbr_gru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lbr_gru_backward() = default;
+
+    /// Constructs an LBR GRU backward propagation primitive.
+    /// @param pd Primitive descriptor for an LBR GRU backward propagation
+    ///     primitive.
+    lbr_gru_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LBR GRU backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LBR GRU backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob, passed on to the primitive base class.
+    lbr_gru_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// AUGRU forward propagation primitive.
+struct augru_forward : public primitive {
+    /// Primitive descriptor for an AUGRU forward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an AUGRU forward propagation
+        /// primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the AUGRU forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc may be
+        ///     initialized with the #dnnl::memory::format_tag::any format
+        ///     tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param attention_desc Memory descriptor for the attention vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &attention_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_augru,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, nullptr, &attention_desc, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef,
+                    0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an AUGRU forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an AUGRU forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference,
+                    dnnl::algorithm::vanilla_augru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const
+        memory::desc attention_desc() const {
+            return rnn_base::augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    augru_forward() = default;
+
+    /// Constructs an AUGRU forward propagation primitive.
+    /// @param pd Primitive descriptor for an AUGRU forward propagation
+    ///     primitive.
+    augru_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an AUGRU forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an AUGRU forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob, passed on to the primitive base class.
+    augru_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// AUGRU backward propagation primitive.
+struct augru_backward : public primitive {
+    /// Primitive descriptor for an AUGRU backward propagation primitive.
+    /// Its accessors simply forward to rnn_base and base member functions.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an AUGRU backward propagation
+        ///     primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the AUGRU backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with a
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param attention_desc Memory descriptor for the attention vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_attention_desc Memory descriptor for the diff of
+        ///     attention vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an AUGRU
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &attention_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_attention_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const augru_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::vanilla_augru,
+                    aprop_kind, algorithm::undef, direction, src_layer_desc,
+                    src_iter_desc, nullptr, &attention_desc, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc,
+                    diff_src_iter_desc, nullptr, &diff_attention_desc,
+                    diff_weights_layer_desc, diff_weights_iter_desc, nullptr,
+                    nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an AUGRU backward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an AUGRU backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward,
+                    dnnl::algorithm::vanilla_augru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const
+        memory::desc attention_desc() const {
+            return rnn_base::augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_augru_attention_desc()const
+        memory::desc diff_attention_desc() const {
+            return rnn_base::diff_augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    augru_backward() = default;
+
+    /// Constructs an AUGRU backward propagation primitive.
+    /// @param pd Primitive descriptor for an AUGRU backward propagation
+    ///     primitive.
+    augru_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an AUGRU backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an AUGRU backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    augru_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LBR AUGRU forward propagation primitive.
+struct lbr_augru_forward : public primitive {
+    /// Primitive descriptor for an LBR AUGRU forward propagation primitive.
+    ///
+    /// Its accessors simply forward to rnn_base and base member functions.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LBR AUGRU forward
+        /// propagation primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc,
+        /// - @p bias_desc,
+        /// - @p dst_iter_desc.
+        ///
+        /// This would then indicate that the LBR AUGRU forward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors except @p src_iter_desc may be
+        ///     initialized with a #dnnl::memory::format_tag::any value of @p
+        ///     format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param attention_desc Memory descriptor for the attention vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &attention_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::lbr_augru, aprop_kind,
+                    algorithm::undef, direction, src_layer_desc, src_iter_desc,
+                    nullptr, &attention_desc, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, rnn_flags::undef,
+                    0.0f, 0.0f, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LBR AUGRU forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LBR AUGRU forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference,
+                    dnnl::algorithm::lbr_augru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const
+        memory::desc attention_desc() const {
+            return rnn_base::augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lbr_augru_forward() = default;
+
+    /// Constructs an LBR AUGRU forward propagation primitive.
+    /// @param pd Primitive descriptor for an LBR AUGRU forward propagation
+    ///     primitive.
+    lbr_augru_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LBR AUGRU forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LBR AUGRU forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lbr_augru_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// LBR AUGRU backward propagation primitive.
+struct lbr_augru_backward : public primitive {
+    /// Primitive descriptor for an LBR AUGRU backward propagation primitive.
+    struct primitive_desc : public rnn_primitive_desc_base {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for an LBR AUGRU backward
+        /// propagation primitive.
+        ///
+        /// The following arguments may point to a zero memory descriptor:
+        /// - @p src_iter_desc together with @p diff_src_iter_desc,
+        /// - @p bias_desc together with @p diff_bias_desc,
+        /// - @p dst_iter_desc together with @p diff_dst_iter_desc.
+        ///
+        /// This would then indicate that the LBR AUGRU backward propagation
+        /// primitive should not use them and should default to zero values
+        /// instead.
+        ///
+        /// @note
+        ///     All memory descriptors may be initialized with a
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Must be
+        ///     #dnnl::prop_kind::backward.
+        /// @param direction RNN direction. See @ref dnnl::rnn_direction for
+        ///     more info.
+        /// @param src_layer_desc Memory descriptor for the input vector.
+        /// @param src_iter_desc Memory descriptor for the input recurrent
+        ///     hidden state vector.
+        /// @param attention_desc Memory descriptor for the attention vector.
+        /// @param weights_layer_desc Memory descriptor for the weights
+        ///     applied to the layer input.
+        /// @param weights_iter_desc Memory descriptor for the weights applied
+        ///     to the recurrent input.
+        /// @param bias_desc Bias memory descriptor.
+        /// @param dst_layer_desc Memory descriptor for the output vector.
+        /// @param dst_iter_desc Memory descriptor for the output recurrent
+        ///     hidden state vector.
+        /// @param diff_src_layer_desc Memory descriptor for the diff of input
+        ///     vector.
+        /// @param diff_src_iter_desc Memory descriptor for the diff of input
+        ///     recurrent hidden state vector.
+        /// @param diff_attention_desc Memory descriptor for the diff of
+        ///     attention vector.
+        /// @param diff_weights_layer_desc Memory descriptor for the diff of
+        ///     weights applied to the layer input.
+        /// @param diff_weights_iter_desc Memory descriptor for the diff of
+        ///     weights applied to the recurrent input.
+        /// @param diff_bias_desc Diff bias memory descriptor.
+        /// @param diff_dst_layer_desc Memory descriptor for the diff of
+        ///     output vector.
+        /// @param diff_dst_iter_desc Memory descriptor for the diff of output
+        ///     recurrent hidden state vector.
+        /// @param hint_fwd_pd Primitive descriptor for an LBR AUGRU
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                rnn_direction direction, const memory::desc &src_layer_desc,
+                const memory::desc &src_iter_desc,
+                const memory::desc &attention_desc,
+                const memory::desc &weights_layer_desc,
+                const memory::desc &weights_iter_desc,
+                const memory::desc &bias_desc,
+                const memory::desc &dst_layer_desc,
+                const memory::desc &dst_iter_desc,
+                const memory::desc &diff_src_layer_desc,
+                const memory::desc &diff_src_iter_desc,
+                const memory::desc &diff_attention_desc,
+                const memory::desc &diff_weights_layer_desc,
+                const memory::desc &diff_weights_iter_desc,
+                const memory::desc &diff_bias_desc,
+                const memory::desc &diff_dst_layer_desc,
+                const memory::desc &diff_dst_iter_desc,
+                const lbr_augru_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : rnn_primitive_desc_base(aengine, algorithm::lbr_augru, aprop_kind,
+                    algorithm::undef, direction, src_layer_desc, src_iter_desc,
+                    nullptr, &attention_desc, weights_layer_desc,
+                    weights_iter_desc, nullptr, nullptr, bias_desc,
+                    dst_layer_desc, dst_iter_desc, nullptr, diff_src_layer_desc,
+                    diff_src_iter_desc, nullptr, &diff_attention_desc,
+                    diff_weights_layer_desc, diff_weights_iter_desc, nullptr,
+                    nullptr, diff_bias_desc, diff_dst_layer_desc,
+                    diff_dst_iter_desc, nullptr, rnn_flags::undef, 0.0f, 0.0f,
+                    hint_fwd_pd, attr, allow_empty) {}
+
+        /// Constructs a primitive descriptor for an LBR AUGRU backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for an LBR AUGRU backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : rnn_primitive_desc_base(pd, dnnl::prop_kind::backward,
+                    dnnl::algorithm::lbr_augru) {}
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_layer_desc()const
+        memory::desc src_layer_desc() const {
+            return rnn_base::src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::src_iter_desc()const
+        memory::desc src_iter_desc() const { return rnn_base::src_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::augru_attention_desc()const
+        memory::desc attention_desc() const {
+            return rnn_base::augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_layer_desc()const
+        memory::desc weights_layer_desc() const {
+            return rnn_base::weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::weights_iter_desc()const
+        memory::desc weights_iter_desc() const {
+            return rnn_base::weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::bias_desc()const
+        memory::desc bias_desc() const { return rnn_base::bias_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_layer_desc()const
+        memory::desc dst_layer_desc() const {
+            return rnn_base::dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::dst_iter_desc()const
+        memory::desc dst_iter_desc() const { return rnn_base::dst_iter_desc(); }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const {
+            return rnn_base::workspace_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_layer_desc()const
+        memory::desc diff_src_layer_desc() const {
+            return rnn_base::diff_src_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_src_iter_desc()const
+        memory::desc diff_src_iter_desc() const {
+            return rnn_base::diff_src_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_augru_attention_desc()const
+        memory::desc diff_attention_desc() const {
+            return rnn_base::diff_augru_attention_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_layer_desc()const
+        memory::desc diff_weights_layer_desc() const {
+            return rnn_base::diff_weights_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_weights_iter_desc()const
+        memory::desc diff_weights_iter_desc() const {
+            return rnn_base::diff_weights_iter_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_bias_desc()const
+        memory::desc diff_bias_desc() const {
+            return rnn_base::diff_bias_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_layer_desc()const
+        memory::desc diff_dst_layer_desc() const {
+            return rnn_base::diff_dst_layer_desc();
+        }
+
+        /// @copydoc dnnl::rnn_primitive_desc_base::diff_dst_iter_desc()const
+        memory::desc diff_dst_iter_desc() const {
+            return rnn_base::diff_dst_iter_desc();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_cell_kind()const
+        algorithm get_cell_kind() const { return base::get_cell_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_direction()const
+        rnn_direction get_direction() const { return base::get_direction(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    lbr_augru_backward() = default;
+
+    /// Constructs an LBR AUGRU backward propagation primitive.
+    /// @param pd Primitive descriptor for an LBR AUGRU backward propagation
+    ///     primitive.
+    lbr_augru_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs an LBR AUGRU backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for an LBR AUGRU backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    lbr_augru_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_rnn
+
+/// @addtogroup dnnl_api_shuffle Shuffle
+///
+/// A primitive to shuffle tensor data along an axis.
+///
+/// @sa @ref dev_guide_shuffle in developer guide
+///
+/// @{
+
+/// Shuffle forward propagation primitive.
+struct shuffle_forward : public primitive {
+    /// Primitive descriptor for a shuffle forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a shuffle forward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aprop_kind Propagation kind; one of
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Memory descriptor of the source.
+        /// @param dst_desc Memory descriptor of the destination.
+        /// @param axis Axis along which the data is shuffled.
+        /// @param group_size Size of the shuffle group.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                int axis, int group_size,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_shuffle_forward_primitive_desc_create(&c_pd,
+                            aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            src_desc.get(), dst_desc.get(), axis, group_size,
+                            attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the shuffle forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Builds a shuffle forward propagation primitive descriptor on top
+        /// of an existing C API primitive descriptor, which must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a shuffle forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::shuffle,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_axis()const
+        int get_axis() const {
+            return base::get_axis();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_group_size()const
+        memory::dim get_group_size() const {
+            return base::get_group_size();
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    shuffle_forward() = default;
+
+    /// Creates a shuffle forward propagation primitive.
+    /// @param pd Primitive descriptor of a shuffle forward propagation
+    ///     primitive.
+    shuffle_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a shuffle forward propagation primitive using a cache blob.
+    /// @param pd Primitive descriptor of a shuffle forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    shuffle_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Primitive implementing shuffle backward propagation.
+struct shuffle_backward : public primitive {
+    /// Primitive descriptor of a shuffle backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor; the resulting object is empty.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a shuffle backward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param diff_src_desc Memory descriptor of the diff source.
+        /// @param diff_dst_desc Memory descriptor of the diff destination.
+        /// @param axis Axis along which the data is shuffled.
+        /// @param group_size Size of the shuffle group.
+        /// @param hint_fwd_pd Primitive descriptor of a shuffle forward
+        ///     propagation primitive, used as a hint when deciding which
+        ///     memory format to use.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, int axis, int group_size,
+                const shuffle_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_shuffle_backward_primitive_desc_create(&c_pd,
+                            aengine.get(), diff_src_desc.get(),
+                            diff_dst_desc.get(), axis, group_size,
+                            hint_fwd_pd.get(), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the shuffle backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Builds a shuffle backward propagation primitive descriptor on
+        /// top of an existing C API primitive descriptor, which must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a shuffle backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::shuffle,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::diff_src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::diff_dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const {
+            return base::get_prop_kind();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_axis()const
+        int get_axis() const {
+            return base::get_axis();
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_group_size()const
+        memory::dim get_group_size() const {
+            return base::get_group_size();
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    shuffle_backward() = default;
+
+    /// Creates a shuffle backward propagation primitive.
+    /// @param pd Primitive descriptor of a shuffle backward propagation
+    ///     primitive.
+    shuffle_backward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a shuffle backward propagation primitive using a cache blob.
+    /// @param pd Primitive descriptor of a shuffle backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    shuffle_backward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_shuffle
+
+/// @addtogroup dnnl_api_binary Binary
+///
+/// A primitive to perform tensor operations over two tensors.
+///
+/// @sa @ref dev_guide_binary in developer guide
+///
+/// @{
+
+/// Primitive implementing an elementwise binary operator.
+struct binary : public primitive {
+    /// Primitive descriptor of an elementwise binary operator primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor; the resulting object is empty.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for an elementwise binary operator
+        /// primitive.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aalgorithm Elementwise binary algorithm.
+        /// @param src0 Memory descriptor of source tensor #0.
+        /// @param src1 Memory descriptor of source tensor #1.
+        /// @param dst Memory descriptor of the destination tensor.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src0, const memory::desc &src1,
+                const memory::desc &dst,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc = dnnl_binary_primitive_desc_create(
+                    &c_pd, aengine.get(), dnnl::convert_to_c(aalgorithm),
+                    src0.get(), src1.get(), dst.get(), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the binary operation primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Creates a primitive descriptor for an elementwise binary operator
+        /// primitive that also supports ternary operators.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aalgorithm Elementwise binary algorithm.
+        /// @param src0 Memory descriptor of source tensor #0.
+        /// @param src1 Memory descriptor of source tensor #1.
+        /// @param src2 Memory descriptor of source tensor #2, used by
+        ///     ternary operations. Might be empty.
+        /// @param dst Memory descriptor of the destination tensor.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src0, const memory::desc &src1,
+                const memory::desc &src2, const memory::desc &dst,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc = dnnl_binary_primitive_desc_create_v2(
+                    &c_pd, aengine.get(), dnnl::convert_to_c(aalgorithm),
+                    src0.get(), src1.get(), src2.get(), dst.get(),
+                    attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the binary v2 operation primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+
+        /// Builds a binary primitive descriptor on top of an existing C API
+        /// primitive descriptor, which must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a binary primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::binary) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc(int)const
+        memory::desc src_desc(int idx = 0) const {
+            return base::src_desc(idx);
+        }
+
+        /// Returns the memory descriptor of source #0.
+        memory::desc src0_desc() const {
+            return base::src_desc(0);
+        }
+
+        /// Returns the memory descriptor of source #1.
+        memory::desc src1_desc() const {
+            return base::src_desc(1);
+        }
+
+        /// Returns the memory descriptor of source #2.
+        memory::desc src2_desc() const {
+            return base::src_desc(2);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::dst_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const {
+            return base::get_algorithm();
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    binary() = default;
+
+    /// Creates an elementwise binary operation primitive.
+    /// @param pd Primitive descriptor of an elementwise binary operation
+    ///     primitive.
+    binary(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates an elementwise binary operation primitive using a cache blob.
+    /// @param pd Primitive descriptor of an elementwise binary operation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    binary(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_binary
+
+/// @addtogroup dnnl_api_matmul Matrix Multiplication
+///
+/// A primitive to perform matrix-matrix multiplication. The batched mode
+/// is supported with 3D tensors.
+///
+/// @sa @ref dev_guide_matmul in developer guide
+///
+///
+/// @{
+
+/// Primitive implementing matrix multiplication (matmul).
+struct matmul : public primitive {
+    /// Primitive descriptor of a matmul primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor; the resulting object is empty.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a matmul primitive that has
+        /// no bias.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param src_desc Memory descriptor of the source (matrix A).
+        /// @param weights_desc Memory descriptor of the weights (matrix B).
+        /// @param dst_desc Memory descriptor of the destination (matrix C).
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, src_desc, weights_desc,
+                    nullptr, // no bias descriptor
+                    dst_desc, attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for a matmul primitive that has a
+        /// bias.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param src_desc Memory descriptor of the source (matrix A).
+        /// @param weights_desc Memory descriptor of the weights (matrix B).
+        /// @param bias_desc Memory descriptor of the bias.
+        /// @param dst_desc Memory descriptor of the destination (matrix C).
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc &bias_desc,
+                const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, src_desc, weights_desc,
+                    &bias_desc,
+                    dst_desc, attr, allow_empty) {}
+
+        /// Builds a matmul primitive descriptor on top of an existing C API
+        /// primitive descriptor, which must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a matmul primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::matmul) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            return query_md(query::src_md, 0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::weights_desc()const
+        memory::desc weights_desc() const {
+            return query_md(query::weights_md, 0);
+        }
+
+        /// @copydoc dnnl::convolution_forward::primitive_desc::bias_desc()const
+        memory::desc bias_desc() const {
+            // The bias is queried as the weights memory descriptor with
+            // index 1.
+            return query_md(query::weights_md, 1);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            return query_md(query::dst_md, 0);
+        }
+
+    private:
+        // Common implementation behind the public constructors. The bias
+        // descriptor is passed by pointer; the bias-less public constructor
+        // passes a null pointer here.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &weights_desc, const memory::desc *bias_desc,
+                const memory::desc &dst_desc, const primitive_attr &attr,
+                bool allow_empty) {
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc = dnnl_matmul_primitive_desc_create(
+                    &c_pd, aengine.get(), src_desc.get(), weights_desc.get(),
+                    optional_arg(bias_desc), dst_desc.get(), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the matmul primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    matmul() = default;
+
+    /// Creates a matmul primitive.
+    /// @param pd Primitive descriptor of a matmul primitive.
+    matmul(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a matmul primitive using a cache blob.
+    /// @param pd Primitive descriptor of a matmul primitive.
+    /// @param cache_blob Cache blob.
+    matmul(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_matmul
+
+/// @addtogroup dnnl_api_resampling Resampling
+///
+/// A primitive to compute a resampling operation on a 1D, 2D, or 3D data
+/// tensor using the Nearest Neighbor or Linear (Bilinear, Trilinear)
+/// interpolation method.
+///
+/// @sa @ref dev_guide_resampling in developer guide
+///
+/// @{
+
+/// Primitive implementing resampling forward propagation.
+struct resampling_forward : public primitive {
+    /// Primitive descriptor of a resampling forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor; the resulting object is empty.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a resampling forward
+        ///     propagation primitive from source and destination memory
+        ///     descriptors.
+        ///
+        /// @note
+        ///     The destination memory descriptor may be initialized with
+        ///     the #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aprop_kind Propagation kind; one of
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Resampling algorithm kind; one of
+        ///     #dnnl::algorithm::resampling_nearest or
+        ///     #dnnl::algorithm::resampling_linear.
+        /// @param src_desc Memory descriptor of the source.
+        /// @param dst_desc Memory descriptor of the destination.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, aprop_kind, aalgorithm,
+                    nullptr, // no factors
+                    src_desc, &dst_desc, attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for a resampling forward
+        ///     propagation primitive from a source memory descriptor and
+        ///     scaling factors.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aprop_kind Propagation kind; one of
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Resampling algorithm kind; one of
+        ///     #dnnl::algorithm::resampling_nearest or
+        ///     #dnnl::algorithm::resampling_linear.
+        /// @param factors Vector of scaling factors for spatial dimension.
+        /// @param src_desc Memory descriptor of the source.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const std::vector<float> &factors,
+                const memory::desc &src_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, aprop_kind, aalgorithm, &factors, src_desc,
+                    nullptr, // no destination descriptor
+                    attr, allow_empty) {}
+
+        /// Creates a primitive descriptor for a resampling forward
+        ///     propagation primitive from scaling factors together with
+        ///     source and destination memory descriptors.
+        ///
+        /// @note
+        ///     The destination memory descriptor may be initialized with
+        ///     the #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aprop_kind Propagation kind; one of
+        ///     #dnnl::prop_kind::forward_training or
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Resampling algorithm kind; one of
+        ///     #dnnl::algorithm::resampling_nearest or
+        ///     #dnnl::algorithm::resampling_linear.
+        /// @param factors Vector of scaling factors for spatial dimension.
+        /// @param src_desc Memory descriptor of the source.
+        /// @param dst_desc Memory descriptor of the destination.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const std::vector<float> &factors,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, aprop_kind, aalgorithm, &factors, src_desc,
+                    &dst_desc, attr, allow_empty) {}
+
+        /// Builds a resampling forward propagation primitive descriptor on
+        /// top of an existing C API primitive descriptor, which must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a resampling forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::resampling,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::dst_desc(0);
+        }
+
+    private:
+        // Common implementation behind the public constructors. Both the
+        // factors and the destination descriptor are passed by pointer and
+        // either may be null, depending on the public constructor used.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const std::vector<float> *factors,
+                const memory::desc &src_desc, const memory::desc *dst_desc,
+                const primitive_attr &attr, bool allow_empty) {
+
+            // Factors, when supplied, are validated against the number of
+            // dimensions of the source descriptor minus two.
+            if (factors != nullptr) {
+                memory::validate_dims(*factors, src_desc.get_ndims() - 2);
+            }
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_resampling_forward_primitive_desc_create(&c_pd,
+                            aengine.get(), dnnl::convert_to_c(aprop_kind),
+                            convert_to_c(aalgorithm), optional_arg(factors),
+                            src_desc.get(), optional_arg(dst_desc), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the resampling forward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    resampling_forward() = default;
+
+    /// Creates a resampling forward propagation primitive.
+    /// @param pd Primitive descriptor of a resampling forward propagation
+    ///     primitive.
+    resampling_forward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a resampling forward propagation primitive using a cache
+    ///     blob.
+    /// @param pd Primitive descriptor of a resampling forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    resampling_forward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Primitive implementing resampling backward propagation.
+struct resampling_backward : public primitive {
+    /// Primitive descriptor of a resampling backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor; the resulting object is empty.
+        primitive_desc() = default;
+
+        /// Creates a primitive descriptor for a resampling backward
+        ///     propagation primitive from diff source and diff destination
+        ///     memory descriptors.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aalgorithm Resampling algorithm kind; one of
+        ///     #dnnl::algorithm::resampling_nearest or
+        ///     #dnnl::algorithm::resampling_linear.
+        /// @param diff_src_desc Memory descriptor of the diff source.
+        /// @param diff_dst_desc Memory descriptor of the diff destination.
+        /// @param hint_fwd_pd Primitive descriptor of a resampling forward
+        ///     propagation primitive, used as a hint when deciding which
+        ///     memory format to use.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const resampling_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, aalgorithm,
+                    nullptr, // no factors
+                    diff_src_desc, diff_dst_desc, hint_fwd_pd, attr,
+                    allow_empty) {}
+
+        /// Creates a primitive descriptor for a resampling backward
+        ///     propagation primitive from scaling factors together with
+        ///     diff source and diff destination memory descriptors.
+        ///
+        /// @param aengine Engine the primitive descriptor is created for.
+        /// @param aalgorithm Resampling algorithm kind; one of
+        ///     #dnnl::algorithm::resampling_nearest or
+        ///     #dnnl::algorithm::resampling_linear.
+        /// @param factors Vector of scaling factors for spatial dimension.
+        /// @param diff_src_desc Memory descriptor of the diff source.
+        /// @param diff_dst_desc Memory descriptor of the diff destination.
+        /// @param hint_fwd_pd Primitive descriptor of a resampling forward
+        ///     propagation primitive, used as a hint when deciding which
+        ///     memory format to use.
+        /// @param attr Primitive attributes. Optional; empty attributes are
+        ///     used by default.
+        /// @param allow_empty Optional flag, false by default. When true,
+        ///     a failed construction does not throw an exception and an
+        ///     empty object is produced instead.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const std::vector<float> &factors,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const resampling_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false)
+            : primitive_desc(
+                    aengine, aalgorithm, &factors, diff_src_desc,
+                    diff_dst_desc, hint_fwd_pd, attr, allow_empty) {}
+
+        /// Builds a resampling backward propagation primitive descriptor on
+        /// top of an existing C API primitive descriptor, which must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor of a resampling backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(
+                    pd,
+                    dnnl::primitive::kind::resampling,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::diff_src_desc(0);
+        }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const {
+            // Forwards to the indexed base query with index 0.
+            return base::diff_dst_desc(0);
+        }
+
+    private:
+        // Common implementation behind the public constructors. The factors
+        // are passed by pointer; the factor-less public constructor passes
+        // a null pointer here.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const std::vector<float> *factors,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc,
+                const resampling_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr, bool allow_empty) {
+
+            // Factors, when supplied, are validated against the number of
+            // dimensions of the diff source descriptor minus two.
+            if (factors != nullptr) {
+                memory::validate_dims(*factors, diff_src_desc.get_ndims() - 2);
+            }
+
+            dnnl_primitive_desc_t c_pd = nullptr;
+            const dnnl_status_t rc
+                    = dnnl_resampling_backward_primitive_desc_create(&c_pd,
+                            aengine.get(), convert_to_c(aalgorithm),
+                            optional_arg(factors), diff_src_desc.get(),
+                            diff_dst_desc.get(), hint_fwd_pd.get(), attr.get());
+
+            // The creation status is only checked when the caller did not
+            // ask for failures to be tolerated.
+            if (!allow_empty) {
+                error::wrap_c_api(rc,
+                        "could not create a primitive descriptor for "
+                        "the resampling backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            }
+            reset(c_pd);
+        }
+    };
+
+    /// Default constructor; the resulting object is empty.
+    resampling_backward() = default;
+
+    /// Creates a resampling backward propagation primitive.
+    /// @param pd Primitive descriptor of a resampling backward propagation
+    ///     primitive.
+    resampling_backward(const primitive_desc &pd)
+        : primitive(pd) {}
+
+    /// Creates a resampling backward propagation primitive using a cache
+    ///     blob.
+    /// @param pd Primitive descriptor of a resampling backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    resampling_backward(const primitive_desc &pd,
+            const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_resampling
+
+/// @addtogroup dnnl_api_pooling Pooling
+///
+/// A primitive to perform max or average pooling with dilation.
+///
+/// @sa @ref dev_guide_pooling in developer guide
+///
+/// @{
+
+/// Pooling forward propagation primitive.
+struct pooling_forward : public primitive {
+    /// Primitive descriptor for a pooling forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a pooling forward propagation
+        ///     primitive.
+        ///
+        /// Vectors @p strides, @p kernel, @p dilation, @p padding_l
+        /// and @p padding_r contain values for spatial dimensions only and
+        /// hence must have the same number of elements as there are spatial
+        /// dimensions. The order of values is the same as in the tensor:
+        /// depth (for 3D tensors), height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param aalgorithm Pooling algorithm kind: either
+        ///     #dnnl::algorithm::pooling_max,
+        ///     #dnnl::algorithm::pooling_avg_include_padding,
+        ///     or #dnnl::algorithm::pooling_avg_exclude_padding.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param strides Vector of strides for each spatial dimension.
+        /// @param kernel Vector of kernel spatial dimensions.
+        /// @param dilation Vector of dilations for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                algorithm aalgorithm, const memory::desc &src_desc,
+                const memory::desc &dst_desc, const memory::dims &strides,
+                const memory::dims &kernel, const memory::dims &dilation,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            memory::validate_dims(strides, src_desc.get_ndims() - 2);
+            memory::validate_dims(kernel, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_l, src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_r, src_desc.get_ndims() - 2);
+            memory::validate_dims(dilation, src_desc.get_ndims() - 2);
+            // Use data(): &v[0] is undefined behavior on an empty vector.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_pooling_forward_primitive_desc_create(
+                    &pd, aengine.get(), dnnl::convert_to_c(aprop_kind),
+                    convert_to_c(aalgorithm), src_desc.get(), dst_desc.get(),
+                    strides.data(), kernel.data(), dilation.data(),
+                    padding_l.data(), padding_r.data(), attr.get());
+
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a descriptor for a pooling forward "
+                        "propagation primitive");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a pooling forward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a pooling forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::pooling,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_kernel()const
+        memory::dims get_kernel() const { return base::get_kernel(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    pooling_forward() = default;
+
+    /// Constructs a pooling forward propagation primitive.
+    ///
+    /// @param pd Primitive descriptor for a pooling forward propagation
+    ///     primitive.
+    pooling_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a pooling forward propagation primitive from a cache blob.
+    ///
+    /// @param pd Primitive descriptor for a pooling forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    pooling_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// Pooling backward propagation primitive.
+struct pooling_backward : public primitive {
+    /// Primitive descriptor for a pooling backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a pooling backward propagation
+        ///     primitive.
+        ///
+        /// Vectors @p strides, @p kernel, @p dilation, @p padding_l
+        /// and @p padding_r contain values for spatial dimensions only and
+        /// hence must have the same number of elements as there are spatial
+        /// dimensions. The order of values is the same as in the tensor:
+        /// depth (for 3D tensors), height (for 3D and 2D tensors), and width.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Pooling algorithm kind: either
+        ///     #dnnl::algorithm::pooling_max,
+        ///     #dnnl::algorithm::pooling_avg_include_padding,
+        ///     or #dnnl::algorithm::pooling_avg_exclude_padding.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param strides Vector of strides for each spatial dimension.
+        /// @param kernel Vector of kernel spatial dimensions.
+        /// @param dilation Vector of dilations for each spatial dimension.
+        /// @param padding_l Vector of padding values for low indices for each
+        ///     spatial dimension `([[front,] top,] left)`.
+        /// @param padding_r Vector of padding values for high indices for
+        ///     each spatial dimension `([[back,] bottom,] right)`.
+        /// @param hint_fwd_pd Primitive descriptor for a pooling
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_dst_desc, const memory::dims &strides,
+                const memory::dims &kernel, const memory::dims &dilation,
+                const memory::dims &padding_l, const memory::dims &padding_r,
+                const pooling_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            memory::validate_dims(strides, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(kernel, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_l, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(padding_r, diff_src_desc.get_ndims() - 2);
+            memory::validate_dims(dilation, diff_src_desc.get_ndims() - 2);
+            // Use data(): &v[0] is undefined behavior on an empty vector.
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_pooling_backward_primitive_desc_create(
+                    &pd, aengine.get(), convert_to_c(aalgorithm),
+                    diff_src_desc.get(), diff_dst_desc.get(), strides.data(),
+                    kernel.data(), dilation.data(), padding_l.data(),
+                    padding_r.data(), hint_fwd_pd.get(), attr.get());
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a descriptor for a pooling backward "
+                        "propagation primitive");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a pooling backward propagation
+        /// primitive from a C API primitive descriptor that must have a
+        /// matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a pooling backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::pooling,
+                    dnnl::prop_kind::backward_data) {}
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::workspace_desc()const
+        memory::desc workspace_desc() const { return base::workspace_desc(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_strides()const
+        memory::dims get_strides() const { return base::get_strides(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_kernel()const
+        memory::dims get_kernel() const { return base::get_kernel(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_dilations()const
+        memory::dims get_dilations() const { return base::get_dilations(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_l()const
+        memory::dims get_padding_l() const { return base::get_padding_l(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_padding_r()const
+        memory::dims get_padding_r() const { return base::get_padding_r(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    pooling_backward() = default;
+
+    /// Constructs a pooling backward propagation primitive.
+    ///
+    /// @param pd Primitive descriptor for a pooling backward propagation
+    ///     primitive.
+    pooling_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a pooling backward propagation primitive from a cache blob.
+    ///
+    /// @param pd Primitive descriptor for a pooling backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    pooling_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_pooling
+
+/// @addtogroup dnnl_api_prelu PReLU
+///
+/// The PReLU primitive: a primitive to perform PReLU (leaky ReLU with a
+/// trainable alpha parameter).
+///
+/// @sa @ref dev_guide_prelu in developer guide
+///
+/// @{
+
+/// PReLU forward propagation primitive.
+struct prelu_forward : public primitive {
+    /// Primitive descriptor for a PReLU forward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a PReLU forward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aprop_kind Propagation kind. Possible values are
+        ///     #dnnl::prop_kind::forward_training, and
+        ///     #dnnl::prop_kind::forward_inference.
+        /// @param src_desc Source memory descriptor.
+        /// @param weight_desc Alpha parameters memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, prop_kind aprop_kind,
+                const memory::desc &src_desc, const memory::desc &weight_desc,
+                const memory::desc &dst_desc,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_prelu_forward_primitive_desc_create(&pd,
+                    aengine.get(), dnnl::convert_to_c(aprop_kind),
+                    src_desc.get(), weight_desc.get(), dst_desc.get(),
+                    attr.get());
+
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the prelu forward propagation primitive. Run workload "
+                        "with environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a PReLU forward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a PReLU forward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::prelu,
+                    dnnl::prop_kind::forward_training,
+                    dnnl::prop_kind::forward_inference) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    prelu_forward() = default;
+
+    /// Constructs a PReLU forward propagation primitive.
+    /// @param pd Primitive descriptor for a PReLU forward propagation
+    ///     primitive.
+    prelu_forward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a PReLU forward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a PReLU forward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    prelu_forward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// PReLU backward propagation primitive.
+struct prelu_backward : public primitive {
+    /// Primitive descriptor for a PReLU backward propagation primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a PReLU backward propagation
+        /// primitive.
+        ///
+        /// @param aengine Engine to use.
+        /// @param src_desc Source memory descriptor.
+        /// @param weight_desc Alpha parameters memory descriptor.
+        /// @param diff_src_desc Diff source memory descriptor.
+        /// @param diff_weights_desc Diff alpha parameters memory descriptor.
+        /// @param diff_dst_desc Diff destination memory descriptor.
+        /// @param hint_fwd_pd Primitive descriptor for a PReLU
+        ///     forward propagation primitive. It is used as a hint for
+        ///     deciding which memory format to use.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, const memory::desc &src_desc,
+                const memory::desc &weight_desc,
+                const memory::desc &diff_src_desc,
+                const memory::desc &diff_weights_desc,
+                const memory::desc &diff_dst_desc,
+                const prelu_forward::primitive_desc &hint_fwd_pd,
+                const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_prelu_backward_primitive_desc_create(
+                    &pd, aengine.get(), src_desc.get(), weight_desc.get(),
+                    diff_src_desc.get(), diff_weights_desc.get(),
+                    diff_dst_desc.get(), hint_fwd_pd.get(), attr.get());
+
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the prelu backward propagation primitive. Run "
+                        "workload with environment variable ONEDNN_VERBOSE=all "
+                        "to get additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a PReLU backward
+        /// propagation primitive from a C API primitive descriptor that must
+        /// have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a PReLU backward
+        ///     propagation primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::prelu,
+                    dnnl::prop_kind::backward) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_src_desc()const
+        memory::desc diff_src_desc() const { return base::diff_src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::diff_dst_desc()const
+        memory::desc diff_dst_desc() const { return base::diff_dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_prop_kind()const
+        prop_kind get_prop_kind() const { return base::get_prop_kind(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    prelu_backward() = default;
+
+    /// Constructs a PReLU backward propagation primitive.
+    /// @param pd Primitive descriptor for a PReLU backward propagation
+    ///     primitive.
+    prelu_backward(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a PReLU backward propagation primitive from a cache blob.
+    /// @param pd Primitive descriptor for a PReLU backward propagation
+    ///     primitive.
+    /// @param cache_blob Cache blob.
+    prelu_backward(
+            const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_prelu
+
+/// @addtogroup dnnl_api_reduction Reduction
+///
+/// A primitive to compute a reduction operation on a data tensor
+/// using min, max, mul, sum, mean, and norm_lp operations.
+///
+/// @sa @ref dev_guide_reduction in developer guide
+///
+/// @{
+
+/// Reduction.
+struct reduction : public primitive {
+    /// Primitive descriptor for a reduction primitive.
+    struct primitive_desc : public dnnl::primitive_desc {
+        /// Default constructor. Produces an empty object.
+        primitive_desc() = default;
+
+        /// Constructs a primitive descriptor for a reduction primitive using
+        ///     algorithm-specific parameters, source and destination memory
+        ///     descriptors.
+        ///
+        /// @note
+        ///     Destination memory descriptor may be initialized with
+        ///     #dnnl::memory::format_tag::any value of @p format_tag.
+        ///
+        /// @param aengine Engine to use.
+        /// @param aalgorithm Reduction algorithm kind. Possible values:
+        ///     #dnnl_reduction_max, #dnnl_reduction_min, #dnnl_reduction_sum,
+        ///     #dnnl_reduction_mul, #dnnl_reduction_mean,
+        ///     #dnnl_reduction_norm_lp_max, #dnnl_reduction_norm_lp_sum,
+        ///     #dnnl_reduction_norm_lp_power_p_max,
+        ///     #dnnl_reduction_norm_lp_power_p_sum.
+        /// @param src_desc Source memory descriptor.
+        /// @param dst_desc Destination memory descriptor.
+        /// @param p Algorithm-specific parameter.
+        /// @param eps Algorithm-specific parameter.
+        /// @param attr Primitive attributes to use. Attributes are optional
+        ///     and default to empty attributes.
+        /// @param allow_empty A flag signifying whether construction is
+        ///     allowed to fail without throwing an exception. In this case an
+        ///     empty object will be produced. This flag is optional and
+        ///     defaults to false.
+        primitive_desc(const engine &aengine, algorithm aalgorithm,
+                const memory::desc &src_desc, const memory::desc &dst_desc,
+                float p, float eps, const primitive_attr &attr = default_attr(),
+                bool allow_empty = false) {
+
+            dnnl_primitive_desc_t pd = nullptr;
+            dnnl_status_t status = dnnl_reduction_primitive_desc_create(&pd,
+                    aengine.get(), convert_to_c(aalgorithm), src_desc.get(),
+                    dst_desc.get(), p, eps, attr.get());
+
+            if (!allow_empty)
+                error::wrap_c_api(status,
+                        "could not create a primitive descriptor for "
+                        "the reduction primitive. Run workload with "
+                        "environment variable ONEDNN_VERBOSE=all to get "
+                        "additional diagnostic information.");
+            reset(pd);
+        }
+
+        /// Constructs a primitive descriptor for a reduction primitive from a C
+        /// API primitive descriptor that must have a matching kind.
+        ///
+        /// @param pd C API primitive descriptor for a reduction primitive.
+        primitive_desc(dnnl_primitive_desc_t pd)
+            : dnnl::primitive_desc(pd, dnnl::primitive::kind::reduction) {}
+
+        /// @copydoc dnnl::primitive_desc_base::src_desc()const
+        memory::desc src_desc() const { return base::src_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::dst_desc()const
+        memory::desc dst_desc() const { return base::dst_desc(0); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_p()const
+        float get_p() const { return base::get_p(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_epsilon()const
+        float get_epsilon() const { return base::get_epsilon(); }
+
+        /// @copydoc dnnl::primitive_desc_base::get_algorithm()const
+        algorithm get_algorithm() const { return base::get_algorithm(); }
+    };
+
+    /// Default constructor. Produces an empty object.
+    reduction() = default;
+
+    /// Constructs a reduction primitive.
+    /// @param pd Primitive descriptor for a reduction primitive.
+    reduction(const primitive_desc &pd) : primitive(pd) {}
+
+    /// Constructs a reduction primitive from a cache blob.
+    /// @param pd Primitive descriptor for a reduction primitive.
+    /// @param cache_blob Cache blob.
+    reduction(const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+        : primitive(pd, cache_blob) {}
+};
+
+/// @} dnnl_api_reduction
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_service Service
+///
+/// A set of functions that aid in oneDNN debugging and profiling.
+///
+/// @{
+
+/// @copydoc dnnl_version_t
+using version_t = dnnl_version_t; // same type as the C API; see version()
+
+/// Status values returned by library functions, mirroring the C API codes.
+enum class status {
+    /// @copydoc dnnl_success
+    success = dnnl_success,
+    /// @copydoc dnnl_out_of_memory
+    out_of_memory = dnnl_out_of_memory,
+    /// @copydoc dnnl_invalid_arguments
+    invalid_arguments = dnnl_invalid_arguments,
+    /// @copydoc dnnl_unimplemented
+    unimplemented = dnnl_unimplemented,
+    /// @copydoc dnnl_last_impl_reached
+    last_impl_reached = dnnl_last_impl_reached,
+    /// @copydoc dnnl_runtime_error
+    runtime_error = dnnl_runtime_error,
+    /// @copydoc dnnl_not_required
+    not_required = dnnl_not_required,
+};
+
+/// @copydoc dnnl_set_verbose()
+inline status set_verbose(int level) {
+    return static_cast<status>(dnnl_set_verbose(level)); // C code -> status
+}
+
+/// @copydoc dnnl_version()
+inline const version_t *version() {
+    return dnnl_version(); // forwards the C API pointer unchanged
+}
+
+/// Returns the floating-point math mode that will be used by default
+/// for all subsequently created primitives.
+///
+/// @returns Output FP math mode.
+inline fpmath_mode get_default_fpmath_mode() {
+    dnnl_fpmath_mode_t c_mode; // written by the C API query below
+    const dnnl_status_t rc = dnnl_get_default_fpmath_mode(&c_mode);
+    error::wrap_c_api(rc, "could not get a default fpmath mode");
+    return static_cast<fpmath_mode>(c_mode);
+}
+
+/// @copydoc dnnl_set_default_fpmath_mode()
+inline status set_default_fpmath_mode(fpmath_mode mode) {
+    const dnnl_status_t rc = dnnl_set_default_fpmath_mode(convert_to_c(mode));
+    return static_cast<status>(rc); // C code -> status
+}
+
+/// @copydoc dnnl_set_jit_dump()
+inline status set_jit_dump(int enable) {
+    return static_cast<status>(dnnl_set_jit_dump(enable)); // C code -> status
+}
+
+/// @copydoc dnnl_set_jit_profiling_flags()
+inline status set_jit_profiling_flags(unsigned flags) { // C code -> status
+    return static_cast<status>(dnnl_set_jit_profiling_flags(flags));
+}
+
+/// @copydoc dnnl_set_jit_profiling_jitdumpdir()
+inline status set_jit_profiling_jitdumpdir(const std::string &dir) { // via c_str()
+    return static_cast<status>(dnnl_set_jit_profiling_jitdumpdir(dir.c_str()));
+}
+
+/// @copydoc dnnl_cpu_isa_t
+enum class cpu_isa { // used by set_max_cpu_isa() / get_effective_cpu_isa()
+    /// @copydoc dnnl_cpu_isa_default
+    isa_default = dnnl_cpu_isa_default,
+    /// @copydoc dnnl_cpu_isa_sse41
+    sse41 = dnnl_cpu_isa_sse41,
+    /// @copydoc dnnl_cpu_isa_avx
+    avx = dnnl_cpu_isa_avx,
+    /// @copydoc dnnl_cpu_isa_avx2
+    avx2 = dnnl_cpu_isa_avx2,
+    /// @copydoc dnnl_cpu_isa_avx2_vnni
+    avx2_vnni = dnnl_cpu_isa_avx2_vnni,
+    /// @copydoc dnnl_cpu_isa_avx2_vnni_2
+    avx2_vnni_2 = dnnl_cpu_isa_avx2_vnni_2,
+    /// @copydoc dnnl_cpu_isa_avx512_core
+    avx512_core = dnnl_cpu_isa_avx512_core,
+    /// @copydoc dnnl_cpu_isa_avx512_core_vnni
+    avx512_core_vnni = dnnl_cpu_isa_avx512_core_vnni,
+    /// @copydoc dnnl_cpu_isa_avx512_core_bf16
+    avx512_core_bf16 = dnnl_cpu_isa_avx512_core_bf16,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512
+    avx10_1_512 = dnnl_cpu_isa_avx10_1_512,
+    /// @copydoc dnnl_cpu_isa_avx512_core_fp16
+    avx512_core_fp16 = dnnl_cpu_isa_avx512_core_fp16,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512_amx
+    avx10_1_512_amx = dnnl_cpu_isa_avx10_1_512_amx,
+    /// @copydoc dnnl_cpu_isa_avx512_core_amx
+    avx512_core_amx = dnnl_cpu_isa_avx512_core_amx,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16
+    avx10_1_512_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16,
+    /// @copydoc dnnl_cpu_isa_avx512_core_amx_fp16
+    avx512_core_amx_fp16 = dnnl_cpu_isa_avx512_core_amx_fp16,
+};
+
+/// @copydoc dnnl_set_max_cpu_isa()
+inline status set_max_cpu_isa(cpu_isa isa) {
+    const auto c_isa = static_cast<dnnl_cpu_isa_t>(isa);
+    return static_cast<status>(dnnl_set_max_cpu_isa(c_isa));
+}
+
+/// @copydoc dnnl_get_effective_cpu_isa()
+inline cpu_isa get_effective_cpu_isa() { // converts the C API enum value
+    return static_cast<cpu_isa>(dnnl_get_effective_cpu_isa());
+}
+
+/// @copydoc dnnl_cpu_isa_hints_t
+enum class cpu_isa_hints { // used by set_cpu_isa_hints() / get_cpu_isa_hints()
+    /// @copydoc dnnl_cpu_isa_no_hints
+    no_hints = dnnl_cpu_isa_no_hints,
+    /// @copydoc dnnl_cpu_isa_prefer_ymm
+    prefer_ymm = dnnl_cpu_isa_prefer_ymm,
+};
+
+/// @copydoc dnnl_set_cpu_isa_hints()
+inline status set_cpu_isa_hints(cpu_isa_hints isa_hints) {
+    const auto c_hints = static_cast<dnnl_cpu_isa_hints_t>(isa_hints);
+    return static_cast<status>(dnnl_set_cpu_isa_hints(c_hints));
+}
+
+/// @copydoc dnnl_get_cpu_isa_hints()
+inline cpu_isa_hints get_cpu_isa_hints() { // converts the C API enum value
+    return static_cast<cpu_isa_hints>(dnnl_get_cpu_isa_hints());
+}
+
+/// @} dnnl_api_service
+
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+/// @addtogroup dnnl_api_profiling Profiling
+/// @{
+
+/// Kind of profiling data requested from #dnnl::get_profiling_data.
+enum class profiling_data_kind {
+    /// Undefined profiling data kind.
+    undef = dnnl_profiling_data_kind_undef,
+    /// Data kind to query an execution time in nanoseconds.
+    time = dnnl_profiling_data_kind_time,
+};
+
+/// Resets a profiler's state.
+///
+/// @param stream Stream associated with the profiler.
+inline void reset_profiling(stream &stream) {
+    // Run the C call first, then let wrap_c_api report any failure.
+    const auto c_status = dnnl_reset_profiling(stream.get());
+    error::wrap_c_api(c_status, "could not reset profiling");
+}
+
+/// Returns requested profiling data. The profiling data accumulates for each
+/// primitive execution. The size of the vector will be equal to the number
+/// of executions since the last `dnnl::reset_profiling` call.
+///
+/// The profiling data can be reset by calling #dnnl::reset_profiling.
+///
+/// @note
+///     It is required to wait for all submitted primitives to complete
+///     using #dnnl::stream::wait prior to querying profiling data.
+///
+/// @param stream Stream that was used for executing a primitive that
+///     is being profiled.
+/// @param data_kind Profiling data kind to query.
+///
+/// @returns A vector with the requested profiling data.
+inline std::vector<uint64_t> get_profiling_data(
+        stream &stream, profiling_data_kind data_kind) {
+    const auto c_kind = static_cast<dnnl_profiling_data_kind_t>(data_kind);
+
+    // First query: no output buffer is supplied, only the entry count is
+    // requested.
+    int num_entries = 0;
+    error::wrap_c_api(dnnl_query_profiling_data(stream.get(), c_kind,
+                              &num_entries, nullptr),
+            "could not get number of entries for profiling data");
+
+    // Nothing to fetch. Testing `<= 0` rather than `== 0` keeps a negative
+    // count from being implicitly converted into a huge unsigned size by the
+    // vector constructor below.
+    if (num_entries <= 0) return {};
+
+    // Second query: fill a buffer sized from the count obtained above.
+    std::vector<uint64_t> data(static_cast<size_t>(num_entries));
+    error::wrap_c_api(dnnl_query_profiling_data(stream.get(), c_kind,
+                              &num_entries, data.data()),
+            "could not get profiling data");
+    return data;
+}
+
+/// @} dnnl_api_profiling
+#endif
+
+/// @addtogroup dnnl_api_primitive_cache Primitive Cache
+///
+/// A set of functions that provide primitive cache control.
+///
+/// @{
+
+/// Returns the number of primitives that can be held in the primitive cache
+/// at the same time.
+inline int get_primitive_cache_capacity() {
+    // The C API writes the capacity through an out-parameter.
+    int capacity = 0;
+    const auto c_status = dnnl_get_primitive_cache_capacity(&capacity);
+    error::wrap_c_api(c_status, "could not get primitive cache capacity");
+    return capacity;
+}
+
+/// @copydoc dnnl_set_primitive_cache_capacity(int capacity)
+inline void set_primitive_cache_capacity(int capacity) {
+    // Forward the value unchanged; failures are reported by wrap_c_api.
+    const auto c_status = dnnl_set_primitive_cache_capacity(capacity);
+    error::wrap_c_api(c_status, "could not set primitive cache capacity");
+}
+
+/// @} dnnl_api_primitive_cache
+
+/// @addtogroup dnnl_api_blas BLAS functions
+///
+/// A subset of Basic Linear Algebra (BLAS) functions that perform
+/// matrix-matrix multiplication.
+///
+/// @{
+
+/// @copydoc dnnl_sgemm()
+inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N,
+        dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda,
+        const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc) {
+    // All arguments are forwarded unchanged; only the status is re-typed.
+    const auto c_status = dnnl_sgemm(
+            transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+    return static_cast<status>(c_status);
+}
+
+/// @copydoc dnnl_gemm_u8s8s32()
+inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A,
+        dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co) {
+    // All arguments are forwarded unchanged; only the status is re-typed.
+    const auto c_status = dnnl_gemm_u8s8s32(transa, transb, offsetc, M, N, K,
+            alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co);
+    return static_cast<status>(c_status);
+}
+
+/// @copydoc dnnl_gemm_s8s8s32()
+inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A,
+        dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co) {
+    // All arguments are forwarded unchanged; only the status is re-typed.
+    const auto c_status = dnnl_gemm_s8s8s32(transa, transb, offsetc, M, N, K,
+            alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co);
+    return static_cast<status>(c_status);
+}
+
+/// @} dnnl_api_blas
+
+// implementation section
+
+/// @cond DO_NOT_DOCUMENT_THIS
+inline primitive::primitive(const_dnnl_primitive_desc_t c_pd) {
+    dnnl_primitive_t c_primitive;
+    const auto c_status = dnnl_primitive_create(&c_primitive, c_pd);
+    error::wrap_c_api(c_status, "could not create a primitive");
+    // Reached only when creation succeeded: wrap the new C handle.
+    reset(c_primitive);
+}
+
+inline primitive::primitive(const_dnnl_primitive_desc_t c_pd,
+        const std::vector<uint8_t> &cache_blob) {
+    // The C API takes the blob as a (size, pointer) pair.
+    const size_t blob_size = cache_blob.size();
+    const uint8_t *blob_bytes = cache_blob.data();
+
+    dnnl_primitive_t c_primitive;
+    const auto c_status = dnnl_primitive_create_from_cache_blob(
+            &c_primitive, c_pd, blob_size, blob_bytes);
+    error::wrap_c_api(
+            c_status, "could not create a primitive from a cache blob");
+    // Reached only when creation succeeded: wrap the new C handle.
+    reset(c_primitive);
+}
+
+// Both constructors below delegate to the overloads that take the raw C
+// primitive descriptor handle, obtained through pd.get().
+inline primitive::primitive(const primitive_desc &pd) : primitive(pd.get()) {}
+inline primitive::primitive(
+        const primitive_desc &pd, const std::vector<uint8_t> &cache_blob)
+    : primitive(pd.get(), cache_blob) {}
+
+inline void primitive::execute(const stream &astream,
+        const std::unordered_map<int, memory> &args) const {
+    // Flatten the (argument index -> memory) map into the array form the C
+    // API expects.
+    std::vector<dnnl_exec_arg_t> c_args;
+    c_args.reserve(args.size());
+    for (const auto &entry : args) {
+        // NOTE(review): get(true) presumably allows an empty memory object
+        // to be passed through as a null handle -- confirm against memory.
+        const dnnl_exec_arg_t c_arg = {entry.first, entry.second.get(true)};
+        c_args.push_back(c_arg);
+    }
+
+    const int nargs = static_cast<int>(c_args.size());
+    const auto c_status = dnnl_primitive_execute(
+            get(), astream.get(), nargs, c_args.data());
+    error::wrap_c_api(c_status, "could not execute a primitive");
+}
+
+/// @endcond
+
+#undef DNNL_DEFINE_BITMASK_OPS
+
+} // namespace dnnl
+
+/// oneAPI namespace
+
+/// The oneAPI namespace.
+/// Contains the oneapi::dnnl namespace as an alias to the ::dnnl namespace.
+namespace oneapi {
+// Note: without this guard, doxygen warns of potentially recursive namespace
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+/// oneDNN alias namespace
+// A namespace alias, not a new namespace: oneapi::dnnl::X and ::dnnl::X name
+// the same entity.
+namespace dnnl = ::dnnl;
+#endif
+} // namespace oneapi
+
+/// @} dnnl_api
+
+#endif /* ONEAPI_DNNL_DNNL_HPP */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..611e025c4bbe2a194fb799dfe7d1a45583bfb182
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.h
@@ -0,0 +1,180 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2022-2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C common API
+
+#ifndef ONEAPI_DNNL_DNNL_COMMON_H
+#define ONEAPI_DNNL_DNNL_COMMON_H
+
+#include "oneapi/dnnl/dnnl_common_types.h"
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_version.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api  oneDNN API
+/// @{
+
+/// @addtogroup dnnl_api_common Common API
+/// @{
+
+/// @addtogroup dnnl_api_engine Engine
+/// @{
+
+/// Returns the number of engines of a particular kind.
+///
+/// @param kind Kind of engines to count.
+/// @returns Count of the engines.
+size_t DNNL_API dnnl_engine_get_count(dnnl_engine_kind_t kind);
+
+/// Creates an engine.
+///
+/// @param engine Output engine.
+/// @param kind Engine kind.
+/// @param index Engine index. Must be at least 0 and less than the count of
+///     engines of the requested kind (see dnnl_engine_get_count()).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_engine_create(
+        dnnl_engine_t *engine, dnnl_engine_kind_t kind, size_t index);
+
+/// Returns the kind of an engine.
+///
+/// @param engine Engine to query.
+/// @param kind Output engine kind.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_engine_get_kind(
+        dnnl_engine_t engine, dnnl_engine_kind_t *kind);
+
+/// Destroys an engine.
+///
+/// @param engine Engine to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_engine_destroy(dnnl_engine_t engine);
+
+/// @} dnnl_api_engine
+
+/// @addtogroup dnnl_api_stream Stream
+/// @{
+
+/// Creates an execution stream.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param flags Stream behavior flags (@sa dnnl_stream_flags_t).
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, unsigned flags);
+
+/// Returns the engine of a stream object.
+///
+/// @param stream Stream object.
+/// @param engine Output engine on which the stream is created.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_stream_get_engine(
+        const_dnnl_stream_t stream, dnnl_engine_t *engine);
+
+/// Waits for all primitives in the execution stream to finish computations.
+///
+/// @param stream Execution stream.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_stream_wait(dnnl_stream_t stream);
+
+/// Destroys an execution stream.
+///
+/// @param stream Execution stream to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_stream_destroy(dnnl_stream_t stream);
+
+/// @} dnnl_api_stream
+
+/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode
+/// @{
+
+/// Returns the floating-point math mode that will be used by default
+/// for all subsequently created primitives.
+///
+/// @param mode Output FP math mode.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_get_default_fpmath_mode(dnnl_fpmath_mode_t *mode);
+
+/// Sets the floating-point math mode that will be used by default
+/// for all subsequently created primitives.
+///
+/// @param mode FP math mode. The possible values are:
+///     #dnnl_fpmath_mode_strict,
+///     #dnnl_fpmath_mode_bf16,
+///     #dnnl_fpmath_mode_f16,
+///     #dnnl_fpmath_mode_tf32,
+///     #dnnl_fpmath_mode_any.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_set_default_fpmath_mode(dnnl_fpmath_mode_t mode);
+
+/// @} dnnl_api_fpmath_mode
+
+/// @addtogroup dnnl_api_service
+/// @{
+
+/// Configures verbose output to stdout.
+///
+/// @note
+///     Enabling verbose output affects performance.
+///     This setting overrides the ONEDNN_VERBOSE environment variable.
+///
+/// @param level Verbosity level:
+///  - 0: no verbose output (default),
+///  - 1: primitive and graph information at execution,
+///  - 2: primitive and graph information at creation/compilation and execution.
+/// @returns #dnnl_invalid_arguments/#dnnl::status::invalid_arguments if the
+///     @p level value is invalid, and #dnnl_success/#dnnl::status::success on
+///     success.
+dnnl_status_t DNNL_API dnnl_set_verbose(int level);
+
+/// Returns library version information.
+/// @returns Pointer to a constant structure containing
+///  - major: major version number,
+///  - minor: minor version number,
+///  - patch: patch release number,
+///  - hash: git commit hash.
+const dnnl_version_t DNNL_API *dnnl_version(void);
+
+/// @} dnnl_api_service
+
+/// @} dnnl_api_common
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ONEAPI_DNNL_DNNL_COMMON_H */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..694fe419c786be7a8449c3a7083dfd865627871c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common.hpp
@@ -0,0 +1,484 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2022-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C++ common API
+
+#ifndef ONEAPI_DNNL_DNNL_COMMON_HPP
+#define ONEAPI_DNNL_DNNL_COMMON_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "oneapi/dnnl/dnnl_common.h"
+
+/// @endcond
+
+// __cpp_exceptions is referred from
+// https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_exceptions.html
+// gcc < 5 does not define __cpp_exceptions but __EXCEPTIONS,
+// Microsoft C++ Compiler does not provide an option to disable exceptions
+#ifndef DNNL_ENABLE_EXCEPTIONS
+#if __cpp_exceptions || __EXCEPTIONS \
+        || (defined(_MSC_VER) && !defined(__clang__))
+#define DNNL_ENABLE_EXCEPTIONS 1
+#else
+#define DNNL_ENABLE_EXCEPTIONS 0
+#endif
+#endif
+
+// DNNL_TRAP() aborts execution through a compiler intrinsic; compilers that
+// match none of the branches below are rejected at preprocessing time.
+#if defined(__GNUC__) || defined(__clang__)
+#define DNNL_TRAP() __builtin_trap()
+#elif defined(__INTEL_COMPILER) || defined(_MSC_VER)
+#define DNNL_TRAP() __debugbreak()
+#else
+#error "unknown compiler"
+#endif
+
+// DNNL_THROW_ERROR(status, msg) reports a failure:
+//  - with exceptions enabled it throws `error(status, msg)`;
+//  - otherwise it writes msg to stderr and calls DNNL_TRAP(). The status
+//    argument is not used in that configuration.
+#if DNNL_ENABLE_EXCEPTIONS
+#define DNNL_THROW_ERROR(status, msg) throw error(status, msg)
+#else
+#include <cstdio>
+#define DNNL_THROW_ERROR(status, msg) \
+    do { \
+        fputs(msg, stderr); \
+        DNNL_TRAP(); \
+    } while (0)
+#endif
+
+/// @addtogroup dnnl_api oneDNN API
+/// @{
+
+/// oneDNN namespace
+namespace dnnl {
+
+/// @addtogroup dnnl_api_common Common API
+/// @{
+
+/// @addtogroup dnnl_api_utils Utilities
+/// Utility types and definitions.
+/// @{
+
+/// oneDNN exception class.
+///
+/// This class captures the status returned by a failed C API function and
+/// the error message from the call site.
+struct error : public std::exception {
+    dnnl_status_t status;
+    const char *message;
+
+    /// Creates an exception carrying a C API status and a message.
+    ///
+    /// @param status The error status returned by a C API function.
+    /// @param message The error message. Only the pointer is stored; the
+    ///     string itself is not copied.
+    error(dnnl_status_t status, const char *message)
+        : status(status), message(message) {}
+
+    /// Returns the explanatory string passed at construction.
+    const char *what() const noexcept override {
+        return message;
+    }
+
+    /// A convenience function for wrapping calls to C API functions. Does
+    /// nothing when @p status is #dnnl_success; otherwise reports the
+    /// failure through DNNL_THROW_ERROR, which throws a dnnl::error when
+    /// exceptions are enabled.
+    ///
+    /// @param status The error status returned by a C API function.
+    /// @param message The error message.
+    static void wrap_c_api(dnnl_status_t status, const char *message) {
+        if (status == dnnl_success) return;
+        DNNL_THROW_ERROR(status, message);
+    }
+};
+
+/// A class that provides the destructor for a oneDNN C API handle.
+///
+/// The primary template is empty. dnnl::handle::reset() uses
+/// `traits::destructor` as the deleter for non-weak wrappers, so every handle
+/// type supplies a specialization with a static `destructor` function.
+template <typename T>
+struct handle_traits {};
+
+/// oneDNN C API handle wrapper class.
+///
+/// This class is used as the base class for primitive (dnnl::primitive),
+/// engine (dnnl::engine), and stream (dnnl::stream) classes, as well as
+/// others. An object of the dnnl::handle class can be passed by value.
+///
+/// A handle can be weak, in which case it follows std::weak_ptr semantics.
+/// Otherwise, it follows `std::shared_ptr` semantics.
+///
+/// @note
+///     The implementation stores oneDNN C API handles in a `std::shared_ptr`
+///     with deleter set to a dummy function in the weak mode.
+///
+template <typename T, typename traits = handle_traits<T>>
+struct handle {
+private:
+    // Deleter installed for weak wrappers: reports success without touching
+    // the C handle.
+    static dnnl_status_t dummy_destructor(T) { return dnnl_success; }
+    std::shared_ptr<typename std::remove_pointer<T>::type> data_ {nullptr};
+
+protected:
+    bool operator==(const T other) const { return data_.get() == other; }
+    bool operator!=(const T other) const { return !(*this == other); }
+
+public:
+    /// Constructs an empty handle object.
+    ///
+    /// @warning
+    ///     An uninitialized object is equivalent to a null pointer and
+    ///     cannot be used in most library calls. Any attempt to use its
+    ///     methods, or to pass it to another library function, will cause
+    ///     an exception to be thrown.
+    handle() = default;
+
+    /// Copy constructor.
+    handle(const handle<T, traits> &) = default;
+    /// Assignment operator.
+    handle<T, traits> &operator=(const handle<T, traits> &) = default;
+    /// Move constructor.
+    handle(handle<T, traits> &&) = default;
+    /// Move assignment operator.
+    handle<T, traits> &operator=(handle<T, traits> &&) = default;
+
+    /// Constructs a handle wrapper object from a C API handle.
+    ///
+    /// @param t The C API handle to wrap.
+    /// @param weak A flag specifying whether to construct a weak wrapper;
+    ///     defaults to @c false.
+    explicit handle(T t, bool weak = false) { reset(t, weak); }
+
+    /// Resets the handle wrapper object to wrap a new C API handle.
+    ///
+    /// @param t The new value of the C API handle.
+    /// @param weak A flag specifying whether the wrapper should be weak;
+    ///     defaults to @c false.
+    void reset(T t, bool weak = false) {
+        // Weak wrappers get the no-op deleter; all others get the
+        // traits-provided destructor.
+        auto deleter = weak ? &dummy_destructor : traits::destructor;
+        data_.reset(t, deleter);
+    }
+
+    /// Returns the underlying C API handle.
+    ///
+    /// @param allow_empty A flag signifying whether the method is allowed to
+    ///     return an empty (null) object without throwing an exception.
+    /// @returns The underlying C API handle.
+    T get(bool allow_empty = false) const {
+        T c_handle = data_.get();
+        const bool is_empty = (c_handle == nullptr);
+        if (is_empty && !allow_empty)
+            DNNL_THROW_ERROR(
+                    dnnl_invalid_arguments, "object is not initialized");
+        return c_handle;
+    }
+
+    /// Converts a handle to the underlying C API handle type. Does not throw
+    /// and returns `nullptr` if the object is empty.
+    ///
+    /// @returns The underlying C API handle.
+    explicit operator T() const { return data_.get(); }
+
+    /// Checks whether the object is not empty.
+    ///
+    /// @returns Whether the object is not empty.
+    explicit operator bool() const { return static_cast<bool>(data_); }
+
+    /// Equality operator.
+    ///
+    /// @param other Another handle wrapper.
+    /// @returns @c true if this and the other handle wrapper manage the same
+    ///     underlying C API handle, and @c false otherwise. Empty handle
+    ///     objects are considered to be equal.
+    bool operator==(const handle<T, traits> &other) const {
+        return data_.get() == other.data_.get();
+    }
+
+    /// Inequality operator.
+    ///
+    /// @param other Another handle wrapper.
+    /// @returns @c true if this and the other handle wrapper manage different
+    ///     underlying C API handles, and @c false otherwise. Empty handle
+    ///     objects are considered to be equal.
+    bool operator!=(const handle &other) const {
+        return data_.get() != other.data_.get();
+    }
+};
+
+/// @} dnnl_api_utils
+
+/// @addtogroup dnnl_api_engine Engine
+///
+/// An abstraction of a computational device: a CPU, a specific GPU
+/// card in the system, etc. Most primitives are created to execute
+/// computations on one specific engine. The only exceptions are reorder
+/// primitives that transfer data between two different engines.
+///
+/// @sa @ref dev_guide_basic_concepts
+///
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+template <>
+struct handle_traits<dnnl_engine_t> {
+    // Deleter used by handle<dnnl_engine_t> for non-weak wrappers.
+    static dnnl_status_t destructor(dnnl_engine_t engine_handle) {
+        const dnnl_status_t c_status = dnnl_engine_destroy(engine_handle);
+        return c_status;
+    }
+};
+/// @endcond
+
+/// An execution engine.
+struct engine : public handle<dnnl_engine_t> {
+    friend struct primitive;
+    friend struct reorder;
+
+    /// Kinds of engines.
+    enum class kind {
+        /// An unspecified engine
+        any = dnnl_any_engine,
+        /// CPU engine
+        cpu = dnnl_cpu,
+        /// GPU engine
+        gpu = dnnl_gpu,
+    };
+
+    using handle::handle;
+
+    /// Constructs an empty engine. An empty engine cannot be used in any
+    /// operations.
+    engine() = default;
+
+    /// Returns the number of engines of a certain kind.
+    ///
+    /// @param akind The kind of engines to count.
+    /// @returns The number of engines of the specified kind.
+    static size_t get_count(kind akind) {
+        const dnnl_engine_kind_t c_kind = convert_to_c(akind);
+        return dnnl_engine_get_count(c_kind);
+    }
+
+    /// Constructs an engine.
+    ///
+    /// @param akind The kind of engine to construct.
+    /// @param index The index of the engine. Must be less than the value
+    ///     returned by #get_count() for this particular kind of engine.
+    engine(kind akind, size_t index) {
+        dnnl_engine_t c_engine;
+        const dnnl_status_t c_status
+                = dnnl_engine_create(&c_engine, convert_to_c(akind), index);
+        error::wrap_c_api(c_status, "could not create an engine");
+        // Reached only when creation succeeded: wrap the new C handle.
+        reset(c_engine);
+    }
+
+    /// Returns the kind of the engine.
+    /// @returns The kind of the engine.
+    kind get_kind() const {
+        dnnl_engine_kind_t c_kind;
+        const dnnl_status_t c_status = dnnl_engine_get_kind(get(), &c_kind);
+        error::wrap_c_api(c_status, "could not get kind of an engine");
+        return static_cast<kind>(c_kind);
+    }
+
+private:
+    // Maps the C++ engine kind onto the C enum; the enumerators share values.
+    static dnnl_engine_kind_t convert_to_c(kind akind) {
+        return static_cast<dnnl_engine_kind_t>(akind);
+    }
+};
+
+/// Converts engine kind enum value from C++ API to C API type.
+///
+/// @param akind C++ API engine kind enum value.
+/// @returns Corresponding C API engine kind enum value.
+inline dnnl_engine_kind_t convert_to_c(engine::kind akind) {
+    // engine::kind enumerators are defined from the C constants, so a plain
+    // cast is enough.
+    const auto c_kind = static_cast<dnnl_engine_kind_t>(akind);
+    return c_kind;
+}
+
+/// @} dnnl_api_engine
+
+/// @addtogroup dnnl_api_stream Stream
+///
+/// An encapsulation of execution context tied to a particular engine.
+///
+/// @sa @ref dev_guide_basic_concepts
+///
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+template <>
+struct handle_traits<dnnl_stream_t> {
+    // Deleter used by handle<dnnl_stream_t> for non-weak wrappers.
+    static dnnl_status_t destructor(dnnl_stream_t stream_handle) {
+        const dnnl_status_t c_status = dnnl_stream_destroy(stream_handle);
+        return c_status;
+    }
+};
+/// @endcond
+
+/// An execution stream.
+struct stream : public handle<dnnl_stream_t> {
+    using handle::handle;
+
+    /// Stream flags. Can be combined using the bitwise OR operator.
+    enum class flags : unsigned {
+        /// In-order execution.
+        in_order = dnnl_stream_in_order,
+        /// Out-of-order execution.
+        out_of_order = dnnl_stream_out_of_order,
+        /// Default stream configuration.
+        default_flags = dnnl_stream_default_flags,
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+        /// Enables profiling capabilities.
+        profiling = dnnl_stream_profiling,
+#endif
+    };
+
+    /// Constructs an empty stream. An empty stream cannot be used in any
+    /// operations.
+    stream() = default;
+
+    /// Constructs a stream for the specified engine and with behavior
+    /// controlled by the specified flags.
+    ///
+    /// @param aengine Engine to create the stream on.
+    /// @param aflags Flags controlling stream behavior.
+    explicit stream(
+            const engine &aengine, flags aflags = flags::default_flags) {
+        const auto c_flags = static_cast<dnnl_stream_flags_t>(aflags);
+        dnnl_stream_t c_stream;
+        const dnnl_status_t c_status
+                = dnnl_stream_create(&c_stream, aengine.get(), c_flags);
+        error::wrap_c_api(c_status, "could not create a stream");
+        // Reached only when creation succeeded: wrap the new C handle.
+        reset(c_stream);
+    }
+
+    /// Returns the associated engine.
+    engine get_engine() const {
+        dnnl_engine_t c_engine;
+        const dnnl_status_t c_status
+                = dnnl_stream_get_engine(get(), &c_engine);
+        error::wrap_c_api(
+                c_status, "could not get an engine from a stream object");
+        // Weak wrapper: the returned object does not destroy the engine.
+        const bool weak = true;
+        return engine(c_engine, weak);
+    }
+
+    /// Waits for all primitives executing in the stream to finish.
+    /// @returns The stream itself.
+    stream &wait() {
+        const dnnl_status_t c_status = dnnl_stream_wait(get());
+        error::wrap_c_api(c_status, "could not wait on a stream");
+        return *this;
+    }
+};
+
+// Defines the bitwise operators |, &, ^, |=, &=, ^= and ~ for `enum_name` as
+// inline free functions. Every operator casts its operands to `unsigned`,
+// applies the built-in operator, and casts the result back to `enum_name`.
+// The compound-assignment forms store the result in `lhs` and return it by
+// reference.
+#define DNNL_DEFINE_BITMASK_OPS(enum_name) \
+    inline enum_name operator|(enum_name lhs, enum_name rhs) { \
+        return static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs)); \
+    } \
+\
+    inline enum_name operator&(enum_name lhs, enum_name rhs) { \
+        return static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs)); \
+    } \
+\
+    inline enum_name operator^(enum_name lhs, enum_name rhs) { \
+        return static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) ^ static_cast<unsigned>(rhs)); \
+    } \
+\
+    inline enum_name &operator|=(enum_name &lhs, enum_name rhs) { \
+        lhs = static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs)); \
+        return lhs; \
+    } \
+\
+    inline enum_name &operator&=(enum_name &lhs, enum_name rhs) { \
+        lhs = static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs)); \
+        return lhs; \
+    } \
+\
+    inline enum_name &operator^=(enum_name &lhs, enum_name rhs) { \
+        lhs = static_cast<enum_name>( \
+                static_cast<unsigned>(lhs) ^ static_cast<unsigned>(rhs)); \
+        return lhs; \
+    } \
+\
+    inline enum_name operator~(enum_name rhs) { \
+        return static_cast<enum_name>(~static_cast<unsigned>(rhs)); \
+    }
+
+DNNL_DEFINE_BITMASK_OPS(stream::flags)
+
+/// @} dnnl_api_stream
+
+/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode
+/// @{
+
+/// Floating-point math mode
+enum class fpmath_mode {
+    // Values are taken from the dnnl_fpmath_mode_* C constants, so the enum
+    // can be static_cast to dnnl_fpmath_mode_t.
+    /// Default behavior, no downconversions allowed.
+    strict = dnnl_fpmath_mode_strict,
+    /// Implicit f32->bf16 conversions allowed.
+    bf16 = dnnl_fpmath_mode_bf16,
+    /// Implicit f32->f16 conversions allowed.
+    f16 = dnnl_fpmath_mode_f16,
+    /// Implicit f32->tf32 conversions allowed.
+    tf32 = dnnl_fpmath_mode_tf32,
+    /// Implicit f32->f16, f32->tf32 or f32->bf16 conversions allowed.
+    any = dnnl_fpmath_mode_any
+};
+
+/// Converts an fpmath mode enum value from C++ API to C API type.
+///
+/// @param mode C++ API fpmath mode enum value.
+/// @returns Corresponding C API fpmath mode enum value.
+inline dnnl_fpmath_mode_t convert_to_c(fpmath_mode mode) {
+    // fpmath_mode enumerators are defined from the C constants, so a plain
+    // cast is enough.
+    const auto c_mode = static_cast<dnnl_fpmath_mode_t>(mode);
+    return c_mode;
+}
+
+/// @} dnnl_api_fpmath_mode
+
+/// @addtogroup dnnl_api_accumulation_mode Accumulation Mode
+/// @{
+
+/// Accumulation mode
+enum class accumulation_mode {
+    // Values are taken from the dnnl_accumulation_mode_* C constants, so the
+    // enum can be static_cast to dnnl_accumulation_mode_t.
+    /// Default behavior: f32 for floating-point computation, s32 for integer.
+    strict = dnnl_accumulation_mode_strict,
+    /// Same as strict, except that some partial accumulators can be rounded
+    /// to the src/dst datatype in memory.
+    relaxed = dnnl_accumulation_mode_relaxed,
+    /// Uses the fastest implementation; could use the src/dst datatype or a
+    /// wider datatype for accumulators.
+    any = dnnl_accumulation_mode_any,
+    /// Use s32 accumulators during computation.
+    s32 = dnnl_accumulation_mode_s32,
+    /// Use f32 accumulators during computation.
+    f32 = dnnl_accumulation_mode_f32,
+    /// Use f16 accumulators during computation.
+    f16 = dnnl_accumulation_mode_f16
+};
+
+/// Converts an accumulation mode enum value from C++ API to C API type.
+///
+/// @param mode C++ API accumulation mode enum value.
+/// @returns Corresponding C API accumulation mode enum value.
+inline dnnl_accumulation_mode_t convert_to_c(accumulation_mode mode) {
+    // accumulation_mode enumerators are defined from the C constants, so a
+    // plain cast is enough.
+    const auto c_mode = static_cast<dnnl_accumulation_mode_t>(mode);
+    return c_mode;
+}
+
+/// @} dnnl_api_accumulation_mode
+
+/// @} dnnl_api_common
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..116dd79eae86acfc0f153d0dc11970f55fb38571
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_common_types.h
@@ -0,0 +1,268 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2022-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C API common types definitions
+
+#ifndef ONEAPI_DNNL_DNNL_COMMON_TYPES_H
+#define ONEAPI_DNNL_DNNL_COMMON_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <stddef.h>
+#include <stdint.h>
+
+#include "oneapi/dnnl/dnnl_config.h"
+
+/// @endcond
+
+/// @addtogroup dnnl_api oneDNN API
+/// @{
+
+/// @addtogroup dnnl_api_common Common API
+/// @{
+
+/// @addtogroup dnnl_api_utils
+/// @{
+
+/// Status values returned by the library functions.
+typedef enum {
+    /// The operation was successful
+    dnnl_success = 0,
+    /// The operation failed due to an out-of-memory condition
+    dnnl_out_of_memory = 1,
+    /// The operation failed because of incorrect function arguments
+    dnnl_invalid_arguments = 2,
+    /// The operation failed because requested functionality is not implemented
+    dnnl_unimplemented = 3,
+    /// The last available implementation is reached
+    dnnl_last_impl_reached = 4,
+    /// Primitive or engine failed on execution
+    dnnl_runtime_error = 5,
+    /// Queried element is not required for given primitive
+    dnnl_not_required = 6,
+    /// The graph is not legitimate
+    dnnl_invalid_graph = 7,
+    /// The operation is not legitimate according to op schema
+    dnnl_invalid_graph_op = 8,
+    /// The shape cannot be inferred or compiled
+    dnnl_invalid_shape = 9,
+    /// The data type cannot be inferred or compiled
+    dnnl_invalid_data_type = 10,
+} dnnl_status_t;
+
+/// @} dnnl_api_utils
+
+/// @addtogroup dnnl_api_data_types Data types
+/// @{
+
+/// Data type specification
+typedef enum {
+    /// Undefined data type, used for empty memory descriptors.
+    dnnl_data_type_undef = 0,
+    /// 16-bit/half-precision floating point.
+    dnnl_f16 = 1,
+    /// Non-standard 16-bit (bfloat16 w/ 7-bit mantissa) floating point.
+    dnnl_bf16 = 2,
+    /// 32-bit/single-precision floating point.
+    dnnl_f32 = 3,
+    /// 32-bit signed integer.
+    dnnl_s32 = 4,
+    /// 8-bit signed integer.
+    dnnl_s8 = 5,
+    /// 8-bit unsigned integer.
+    dnnl_u8 = 6,
+    /// 64-bit/double-precision floating point.
+    dnnl_f64 = 7,
+    /// Boolean data type. Size is C++ implementation defined.
+    dnnl_boolean = 8,
+    /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+    /// with a 5-bit exponent and a 2-bit mantissa.
+    dnnl_f8_e5m2 = 9,
+    /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+    /// with a 4-bit exponent and a 3-bit mantissa.
+    dnnl_f8_e4m3 = 10,
+    /// 4-bit signed integer.
+    dnnl_s4 = 11,
+    /// 4-bit unsigned integer.
+    dnnl_u4 = 12,
+    /// [MX-compliant 8-bit scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with an 8-bit exponent.
+    dnnl_e8m0 = 13,
+    /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with a 2-bit exponent and a 1-bit mantissa.
+    dnnl_f4_e2m1 = 14,
+    /// 4-bit float data type with a 3-bit exponent and a 0-bit mantissa.
+    dnnl_f4_e3m0 = 15,
+
+    /// Parameter to allow internal-only data types without undefined behavior.
+    /// This parameter is chosen to be valid for so long as sizeof(int) >= 2.
+    dnnl_data_type_max = 0x7fff,
+} dnnl_data_type_t;
+
+/// Maximum number of dimensions a tensor can have. Only restricts the amount
+/// of space used for the tensor description. Individual computational
+/// primitives may support only tensors of certain dimensions.
+#define DNNL_MAX_NDIMS 12
+
+/// A type to describe tensor dimension.
+typedef int64_t dnnl_dim_t;
+
+/// A type to describe tensor dimensions.
+typedef dnnl_dim_t dnnl_dims_t[DNNL_MAX_NDIMS];
+
+/// @} dnnl_api_data_types
+
+/// @addtogroup dnnl_api_fpmath_mode Floating-point Math Mode
+/// @{
+
+/// Floating-point math mode
+typedef enum {
+    /// Default behavior, no downconversions allowed
+    dnnl_fpmath_mode_strict,
+    /// Implicit f32->bf16 conversions allowed
+    dnnl_fpmath_mode_bf16,
+    /// Implicit f32->f16 conversions allowed
+    dnnl_fpmath_mode_f16,
+    /// Implicit f32->f16, f32->tf32 or f32->bf16 conversions allowed
+    dnnl_fpmath_mode_any,
+    /// Implicit f32->tf32 conversions allowed
+    dnnl_fpmath_mode_tf32,
+} dnnl_fpmath_mode_t;
+
+/// @} dnnl_api_fpmath_mode
+
+/// @addtogroup dnnl_api_accumulation_mode Accumulation Mode
+/// @{
+
+/// Accumulation mode
+typedef enum {
+    /// Default behavior: f32/f64 for floating point computation, s32
+    /// for integer.
+    dnnl_accumulation_mode_strict,
+    /// Same as strict but allows some partial accumulators to be
+    /// rounded to the src/dst data type in memory.
+    dnnl_accumulation_mode_relaxed,
+    /// Uses the fastest implementation; could use the src/dst data type or
+    /// a wider data type for accumulators.
+    dnnl_accumulation_mode_any,
+    /// Use s32 accumulators during computation.
+    dnnl_accumulation_mode_s32,
+    /// Use f32 accumulators during computation.
+    dnnl_accumulation_mode_f32,
+    /// Use f16 accumulators during computation.
+    dnnl_accumulation_mode_f16
+} dnnl_accumulation_mode_t;
+
+/// @} dnnl_api_accumulation_mode
+
+/// @addtogroup dnnl_api_engine Engine
+/// @{
+
+/// @brief Kinds of engines.
+typedef enum {
+    /// An unspecified engine.
+    dnnl_any_engine,
+    /// CPU engine.
+    dnnl_cpu,
+    /// GPU engine.
+    dnnl_gpu,
+} dnnl_engine_kind_t;
+
+/// @struct dnnl_engine
+/// @brief An opaque structure to describe an engine.
+struct dnnl_engine;
+/// @brief An engine handle.
+typedef struct dnnl_engine *dnnl_engine_t;
+#if 0
+// FIXME: looks like this never happens
+/// @brief A constant engine handle.
+typedef const struct dnnl_engine *const_dnnl_engine_t;
+#endif
+
+/// @} dnnl_api_engine
+
+/// @addtogroup dnnl_api_stream Stream
+/// @{
+
+/// @brief Stream flags.
+typedef enum {
+    /// In-order execution.
+    dnnl_stream_in_order = 0x1U,
+    /// Out-of-order execution.
+    dnnl_stream_out_of_order = 0x2U,
+    /// Default stream configuration.
+    dnnl_stream_default_flags = dnnl_stream_in_order,
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+    /// Enables profiling capabilities.
+    dnnl_stream_profiling = 0x4U,
+#endif
+} dnnl_stream_flags_t;
+
+/// @struct dnnl_stream
+/// An opaque structure to describe an execution stream.
+struct dnnl_stream;
+/// An execution stream handle.
+typedef struct dnnl_stream *dnnl_stream_t;
+/// A constant execution stream handle.
+typedef const struct dnnl_stream *const_dnnl_stream_t;
+
+/// @} dnnl_api_stream
+
+/// @addtogroup dnnl_api_service
+/// @{
+
+/// Structure containing version information as per [Semantic
+/// Versioning](https://semver.org)
+typedef struct {
+    int major; ///< Major version
+    int minor; ///< Minor version
+    int patch; ///< Patch version
+    const char *hash; ///< Git hash of the sources (may be absent)
+    unsigned cpu_runtime; ///< CPU runtime
+    unsigned gpu_runtime; ///< GPU runtime
+} dnnl_version_t;
+
+/// @} dnnl_api_service
+
+/// @addtogroup dnnl_api_memory
+/// @{
+
+/// Special pointer value that indicates that a memory object should not have
+/// an underlying buffer.
+#define DNNL_MEMORY_NONE (NULL)
+
+/// Special pointer value that indicates that the library needs to allocate an
+/// underlying buffer for a memory object.
+#define DNNL_MEMORY_ALLOCATE ((void *)(size_t)-1)
+
+/// @} dnnl_api_memory
+
+/// @} dnnl_api_common
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..700cf6ce936c072fb56a50d1dbf83431a9a5ad5c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_config.h
@@ -0,0 +1,237 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2019-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_CONFIG_H
+#define ONEAPI_DNNL_DNNL_CONFIG_H
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+// All symbols shall be internal unless marked as DNNL_API
+#if defined _WIN32 || defined __CYGWIN__
+#define DNNL_HELPER_DLL_IMPORT __declspec(dllimport)
+#define DNNL_HELPER_DLL_EXPORT __declspec(dllexport)
+#else
+#if __GNUC__ >= 4
+#define DNNL_HELPER_DLL_IMPORT __attribute__((visibility("default")))
+#define DNNL_HELPER_DLL_EXPORT __attribute__((visibility("default")))
+#else
+#define DNNL_HELPER_DLL_IMPORT
+#define DNNL_HELPER_DLL_EXPORT
+#endif
+#endif
+
+#ifdef DNNL_DLL
+#ifdef DNNL_DLL_EXPORTS
+#define DNNL_API DNNL_HELPER_DLL_EXPORT
+#else
+#define DNNL_API DNNL_HELPER_DLL_IMPORT
+#endif
+#else
+#define DNNL_API
+#endif
+
+#if defined(__GNUC__)
+#define DNNL_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define DNNL_DEPRECATED __declspec(deprecated)
+#else
+#define DNNL_DEPRECATED
+#endif
+
+/// @endcond
+
+// clang-format off
+
+/// @addtogroup dnnl_api_service
+/// @{
+
+/// No runtime (disabled)
+#define DNNL_RUNTIME_NONE 0u
+
+/// Sequential runtime (CPU only)
+#define DNNL_RUNTIME_SEQ 1u
+
+/// OpenMP runtime (CPU only)
+#define DNNL_RUNTIME_OMP 2u
+
+/// TBB runtime (CPU only)
+#define DNNL_RUNTIME_TBB 4u
+
+/// Threadpool runtime (CPU only)
+#define DNNL_RUNTIME_THREADPOOL 8u
+
+/// OpenCL runtime
+#define DNNL_RUNTIME_OCL 256u
+
+/// SYCL runtime
+#define DNNL_RUNTIME_SYCL 512u
+
+/// DPC++ runtime
+#define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL
+
+/// No vendor (corresponding runtime is disabled)
+#define DNNL_VENDOR_NONE 0u
+
+/// Intel vendor
+#define DNNL_VENDOR_INTEL 1u
+
+/// NVIDIA vendor
+#define DNNL_VENDOR_NVIDIA 2u
+
+/// AMD vendor
+#define DNNL_VENDOR_AMD 4u
+
+/// Generic vendor
+#define DNNL_VENDOR_GENERIC 8u
+
+/// @} dnnl_api_service
+
+// oneDNN CPU threading runtime
+#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP
+
+// oneDNN CPU engine runtime
+#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP
+
+// oneDNN GPU engine runtime
+#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE
+
+// oneDNN GPU vendor
+#define DNNL_GPU_VENDOR DNNL_VENDOR_NONE
+
+// clang-format on
+
+#if defined(DNNL_CPU_RUNTIME) && defined(DNNL_GPU_RUNTIME)
+#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_OCL)
+#error "Unexpected DNNL_CPU_RUNTIME"
+#endif
+#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
+        && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \
+        && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL)
+#error "Unexpected DNNL_GPU_RUNTIME"
+#endif
+#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
+        && DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE)
+#error "At least one runtime must be specified"
+#endif
+#else
+#error "BOTH DNNL_CPU_RUNTIME and DNNL_GPU_RUNTIME must be defined"
+#endif
+
+// For SYCL CPU, a primitive may be created and executed in different threads
+// hence the global scratchpad does not work. This enables concurrent execution
+// when CPU runtime is SYCL to avoid the issue.
+#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
+#ifndef DNNL_ENABLE_CONCURRENT_EXEC
+#define DNNL_ENABLE_CONCURRENT_EXEC
+#endif
+#endif
+
+// When defined, primitive cache stores runtime objects.
+/* #undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE */
+
+// When defined, DPCPP is supported.
+/* #undef DNNL_WITH_SYCL */
+
+// When defined, Level Zero is supported.
+/* #undef DNNL_WITH_LEVEL_ZERO */
+
+// When defined, SYCL CUDA backend is used.
+/* #undef DNNL_SYCL_CUDA */
+
+// When defined, SYCL HIP backend is used.
+/* #undef DNNL_SYCL_HIP */
+
+// When defined, SYCL Generic backend is used.
+/* #undef DNNL_SYCL_GENERIC */
+
+// When defined, stack checker is enabled.
+/* #undef DNNL_ENABLE_STACK_CHECKER */
+
+// When defined, experimental features are enabled.
+/* #undef DNNL_EXPERIMENTAL */
+
+// When defined, experimental functionality for sparse domain is enabled.
+/* #undef DNNL_EXPERIMENTAL_SPARSE */
+
+// When defined, experimental functionality for ukernels is enabled.
+#define DNNL_EXPERIMENTAL_UKERNEL
+
+// When defined, graph component is enabled.
+#define ONEDNN_BUILD_GRAPH
+
+// When defined, experimental profiling capabilities are enabled.
+/* #undef DNNL_EXPERIMENTAL_PROFILING */
+
+// When defined, experimental logging capabilities are enabled.
+/* #undef DNNL_EXPERIMENTAL_LOGGING */
+// When defined, it disables GPU compute reference kernels.
+/* #undef DNNL_DISABLE_GPU_REF_KERNELS */
+
+// List of build configuration controls
+// Workload controls
+#define BUILD_TRAINING 1
+#define BUILD_INFERENCE 0
+// Primitive controls
+#define BUILD_PRIMITIVE_ALL 1
+#define BUILD_BATCH_NORMALIZATION 0
+#define BUILD_BINARY 0
+#define BUILD_CONCAT 0
+#define BUILD_CONVOLUTION 0
+#define BUILD_DECONVOLUTION 0
+#define BUILD_ELTWISE 0
+#define BUILD_GROUP_NORMALIZATION 0
+#define BUILD_INNER_PRODUCT 0
+#define BUILD_LAYER_NORMALIZATION 0
+#define BUILD_LRN 0
+#define BUILD_MATMUL 0
+#define BUILD_POOLING 0
+#define BUILD_PRELU 0
+#define BUILD_REDUCTION 0
+#define BUILD_REORDER 0
+#define BUILD_RESAMPLING 0
+#define BUILD_RNN 0
+#define BUILD_SDPA 0
+#define BUILD_SHUFFLE 0
+#define BUILD_SOFTMAX 0
+#define BUILD_SUM 0
+// Primitives CPU ISA controls
+#define BUILD_PRIMITIVE_CPU_ISA_ALL 1
+#define BUILD_SSE41 0
+#define BUILD_AVX2 0
+#define BUILD_AVX512 0
+#define BUILD_AMX 0
+// Primitives GPU ISA controls
+#define BUILD_PRIMITIVE_GPU_ISA_ALL 1
+#define BUILD_GEN9 0
+#define BUILD_GEN11 0
+#define BUILD_XELP 0
+#define BUILD_XEHP 0
+#define BUILD_XEHPG 0
+#define BUILD_XEHPC 0
+#define BUILD_XE2 0
+#define BUILD_XE3 0
+// GeMM kernels ISA controls
+#define BUILD_GEMM_KERNELS_ALL 1
+#define BUILD_GEMM_KERNELS_NONE 0
+#define BUILD_GEMM_SSE41 0
+#define BUILD_GEMM_AVX2 0
+#define BUILD_GEMM_AVX512 0
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h
new file mode 100644
index 0000000000000000000000000000000000000000..6447ade13989ceceff4ead69f7a8c11d27c13cd2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_debug.h
@@ -0,0 +1,66 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2018-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+// DO NOT EDIT, AUTO-GENERATED
+// Use this script to update the file: scripts/generate_dnnl_debug.py
+
+// clang-format off
+
+#ifndef ONEAPI_DNNL_DNNL_DEBUG_H
+#define ONEAPI_DNNL_DNNL_DEBUG_H
+
+/// @file
+/// Debug capabilities
+
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char DNNL_API *dnnl_status2str(dnnl_status_t v);
+const char DNNL_API *dnnl_dt2str(dnnl_data_type_t v);
+const char DNNL_API *dnnl_fpmath_mode2str(dnnl_fpmath_mode_t v);
+const char DNNL_API *dnnl_accumulation_mode2str(dnnl_accumulation_mode_t v);
+const char DNNL_API *dnnl_engine_kind2str(dnnl_engine_kind_t v);
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+const char DNNL_API *dnnl_sparse_encoding2str(dnnl_sparse_encoding_t v);
+#endif
+const char DNNL_API *dnnl_fmt_tag2str(dnnl_format_tag_t v);
+const char DNNL_API *dnnl_prop_kind2str(dnnl_prop_kind_t v);
+const char DNNL_API *dnnl_prim_kind2str(dnnl_primitive_kind_t v);
+const char DNNL_API *dnnl_alg_kind2str(dnnl_alg_kind_t v);
+const char DNNL_API *dnnl_rnn_flags2str(dnnl_rnn_flags_t v);
+const char DNNL_API *dnnl_rnn_direction2str(dnnl_rnn_direction_t v);
+const char DNNL_API *dnnl_scratchpad_mode2str(dnnl_scratchpad_mode_t v);
+const char DNNL_API *dnnl_rounding_mode2str(dnnl_rounding_mode_t v);
+const char DNNL_API *dnnl_cpu_isa2str(dnnl_cpu_isa_t v);
+const char DNNL_API *dnnl_cpu_isa_hints2str(dnnl_cpu_isa_hints_t v);
+
+const char DNNL_API *dnnl_runtime2str(unsigned v);
+const char DNNL_API *dnnl_fmt_kind2str(dnnl_format_kind_t v);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc34d2713c00890d3de8a5a87153c0d81a8dfb6d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.h
@@ -0,0 +1,777 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// Graph C API
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_H
+#define ONEAPI_DNNL_DNNL_GRAPH_H
+
+#include "oneapi/dnnl/dnnl_common.h"
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_graph_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_allocator
+/// @{
+
+/// Creates a host allocator with the given allocation and deallocation
+/// call-back function pointers.
+///
+/// @param allocator Output allocator.
+/// @param host_malloc A pointer to malloc function for host.
+/// @param host_free A pointer to free function for host.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_allocator_create(
+        dnnl_graph_allocator_t *allocator,
+        dnnl_graph_host_allocate_f host_malloc,
+        dnnl_graph_host_deallocate_f host_free);
+
+/// Destroys an allocator.
+///
+/// @param allocator The allocator to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_allocator_destroy(
+        dnnl_graph_allocator_t allocator);
+
+/// @} dnnl_graph_api_allocator
+
+/// @addtogroup dnnl_graph_api_engine
+/// @{
+
+/// This API is a supplement to the existing oneDNN engine API.
+dnnl_status_t DNNL_API dnnl_graph_make_engine_with_allocator(
+        dnnl_engine_t *engine, dnnl_engine_kind_t kind, size_t index,
+        const_dnnl_graph_allocator_t alloc);
+
+/// @} dnnl_graph_api_engine
+
+/// @addtogroup dnnl_graph_api_logical_tensor
+/// @{
+
+/// Initializes a logical tensor with id, data type, number of dimensions,
+/// layout type, and property. The logical tensor's dims are unknown with this
+/// interface.
+///
+/// @param logical_tensor Output logical tensor.
+/// @param tid The unique id of the output logical tensor.
+/// @param dtype Elements data type.
+/// @param ndims Number of dimensions.
+/// @param ltype Layout type of the underlying tensor buffer.
+/// @param ptype Tensor property type.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init(
+        dnnl_graph_logical_tensor_t *logical_tensor, size_t tid,
+        dnnl_data_type_t dtype, int32_t ndims, dnnl_graph_layout_type_t ltype,
+        dnnl_graph_tensor_property_t ptype);
+
+/// Initializes a logical tensor with basic information and dims. The logical
+/// tensor's dimensions and layout will be initialized according to the input
+/// arguments.
+///
+/// @note
+///     If dims contains all valid values and layout type is
+///     #dnnl_graph_layout_type_strided, the strides field in
+///     #dnnl_graph_logical_tensor_t will be calculated in a row-major and
+///     contiguous way. Otherwise, accessing the strides field is undefined
+///     behavior.
+///
+///     E.g., dims (2, 3, 4, 5) will get strides (60, 20, 5, 1).
+///
+/// @param logical_tensor Output logical tensor.
+/// @param tid The unique id of output logical tensor.
+/// @param dtype Elements data type.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param ltype Layout type of the underlying tensor memory.
+/// @param ptype Tensor property type.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init_with_dims(
+        dnnl_graph_logical_tensor_t *logical_tensor, size_t tid,
+        dnnl_data_type_t dtype, int32_t ndims, const dnnl_dims_t dims,
+        dnnl_graph_layout_type_t ltype, dnnl_graph_tensor_property_t ptype);
+
+/// Initializes a logical tensor with dimensions and strides provided by user.
+///
+/// @note
+///     Once strides are explicitly provided through the API, the `layout_type`
+///     in #dnnl_graph_logical_tensor_t can only be
+///     #dnnl_graph_layout_type_strided or #dnnl_graph_layout_type_any.
+///
+/// @param logical_tensor Output logical tensor.
+/// @param tid The unique id of output logical tensor.
+/// @param dtype Elements data type.
+/// @param ndims Number of dimensions.
+/// @param dims Array of dimensions.
+/// @param strides Array of strides.
+/// @param ptype Tensor property type.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_init_with_strides(
+        dnnl_graph_logical_tensor_t *logical_tensor, size_t tid,
+        dnnl_data_type_t dtype, int32_t ndims, const dnnl_dims_t dims,
+        const dnnl_dims_t strides, dnnl_graph_tensor_property_t ptype);
+
+/// Returns the memory size described by the logical tensor. If it's a strided
+/// layout, the size will be calculated by `dims` and `strides`. If it's an
+/// opaque layout, the size will be decided by `layout_id`.
+///
+/// @param logical_tensor Logical tensor.
+/// @param size Output memory size in bytes.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_get_mem_size(
+        const dnnl_graph_logical_tensor_t *logical_tensor, size_t *size);
+
+/// Compares if two logical tensors are equal. Users can decide accordingly
+/// if layout reordering is needed for two logical tensors. The method will
+/// return true in the following two circumstances:
+///
+/// 1. The two logical tensors are equal regarding each field in the struct,
+/// e.g. id, ndims, dims, layout type, property, etc.
+/// 2. If all other fields are equal but the layout types in two logical
+/// tensors are different, the method will return true when the underlying
+/// memory layout is the same. For example, one logical tensor has strided
+/// layout type while the other one has opaque layout type, but underneath,
+/// both layouts are NHWC, the method will still return true for this case.
+///
+/// @param lt1 The handle of first logical tensor.
+/// @param lt2 The handle of second logical tensor.
+/// @param is_equal 1 if these two logical tensors are equal, 0 otherwise.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_logical_tensor_is_equal(
+        const dnnl_graph_logical_tensor_t *lt1,
+        const dnnl_graph_logical_tensor_t *lt2, uint8_t *is_equal);
+
+/// @} dnnl_graph_api_logical_tensor
+
+/// @addtogroup dnnl_graph_api_tensor
+/// @{
+
+/// Creates a tensor with logical tensor, engine, and data handle.
+///
+/// @param tensor Output tensor.
+/// @param logical_tensor Description for this tensor.
+/// @param engine Engine to use.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer for the tensor. In this case the library
+///       owns the buffer.
+///     - DNNL_MEMORY_NONE to create tensor without an underlying buffer.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_create(dnnl_graph_tensor_t *tensor,
+        const dnnl_graph_logical_tensor_t *logical_tensor, dnnl_engine_t engine,
+        void *handle);
+
+/// Destroys a tensor.
+///
+/// @param tensor The tensor to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_destroy(dnnl_graph_tensor_t tensor);
+
+/// Gets the data handle of a tensor.
+///
+/// @param tensor The input tensor.
+/// @param handle Pointer to the data of input tensor.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_get_data_handle(
+        const_dnnl_graph_tensor_t tensor, void **handle);
+
+/// Sets the data handle of a tensor.
+///
+/// @param tensor The input tensor.
+/// @param handle New data handle for tensor.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_set_data_handle(
+        dnnl_graph_tensor_t tensor, void *handle);
+
+/// Returns the engine of a tensor object.
+///
+/// @param tensor The input tensor.
+/// @param engine Output engine on which the tensor is located.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_get_engine(
+        const_dnnl_graph_tensor_t tensor, dnnl_engine_t *engine);
+
+/// Returns the logical tensor of a tensor object.
+///
+/// @param tensor The input tensor.
+/// @param logical_tensor Output logical tensor of the tensor object.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_tensor_get_logical_tensor(
+        const_dnnl_graph_tensor_t tensor,
+        dnnl_graph_logical_tensor_t *logical_tensor);
+
+/// @} dnnl_graph_api_tensor
+
+/// @addtogroup dnnl_graph_api_op
+/// @{
+
+/// Initializes an op with unique id, kind, and name.
+///
+/// @param op Output op.
+/// @param id The unique id of the output op.
+/// @param kind The op kind.
+/// @param verbose_name The string added as the op name.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_create(dnnl_graph_op_t *op, size_t id,
+        dnnl_graph_op_kind_t kind, const char *verbose_name);
+
+/// Destroys an op.
+///
+/// @param op The op to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_destroy(dnnl_graph_op_t op);
+
+/// Adds input logical tensor to the op.
+///
+/// @param op Input op.
+/// @param input The input logical tensor to be added.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_add_input(
+        dnnl_graph_op_t op, const dnnl_graph_logical_tensor_t *input);
+
+/// Adds output logical tensor to the op.
+///
+/// @param op Input op.
+/// @param output The output logical tensor to be added.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_add_output(
+        dnnl_graph_op_t op, const dnnl_graph_logical_tensor_t *output);
+
+/// Sets a floating point attribute on an op.
+///
+/// @param op Input op.
+/// @param name The attribute's name.
+/// @param value The attribute's value.
+/// @param value_len The number of elements in value.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_set_attr_f32(dnnl_graph_op_t op,
+        dnnl_graph_op_attr_t name, const float *value, size_t value_len);
+
+/// Sets a boolean attribute on an op.
+///
+/// @param op Input op.
+/// @param name The attribute's name.
+/// @param value The attribute's value.
+/// @param value_len The number of elements in value.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_set_attr_bool(dnnl_graph_op_t op,
+        dnnl_graph_op_attr_t name, const uint8_t *value, size_t value_len);
+
+/// Sets an integer attribute on an op.
+///
+/// @param op Input op.
+/// @param name The attribute's name.
+/// @param value The attribute's value.
+/// @param value_len The number of elements in value.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_set_attr_s64(dnnl_graph_op_t op,
+        dnnl_graph_op_attr_t name, const int64_t *value, size_t value_len);
+
+/// Sets string attribute to an op.
+///
+/// @param op Input op.
+/// @param name The attribute's name.
+/// @param value The attribute's value.
+/// @param value_len The length of the string value.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_set_attr_str(dnnl_graph_op_t op,
+        dnnl_graph_op_attr_t name, const char *value, size_t value_len);
+
+/// Returns the unique id of an op.
+///
+/// @param op Input op.
+/// @param id Output the unique id.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_get_id(
+        const_dnnl_graph_op_t op, size_t *id);
+
+/// Returns the kind of an op.
+///
+/// @param op Input op.
+/// @param kind Output op kind.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_op_get_kind(
+        const_dnnl_graph_op_t op, dnnl_graph_op_kind_t *kind);
+
+/// @} dnnl_graph_api_op
+
+/// @addtogroup dnnl_graph_api_partition
+/// @{
+
+/// Creates a new partition with a given operator and engine kind. The API is
+/// used to create a partition from an operation directly without creating the
+/// graph and calling `get_partitions()`. The output partition contains only one
+/// operation specified by the parameter. The output partition instance should
+/// be destroyed via #dnnl_graph_partition_destroy after use.
+///
+/// @param partition The handle of output partition.
+/// @param op The operation used to create partition.
+/// @param ekind The engine kind used to create partition.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_create_with_op(
+        dnnl_graph_partition_t *partition, const_dnnl_graph_op_t op,
+        dnnl_engine_kind_t ekind);
+
+/// Destroys a partition.
+///
+/// @param partition The partition to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_destroy(
+        dnnl_graph_partition_t partition);
+
+/// Returns the number of operations in a partition.
+///
+/// @param partition The target partition.
+/// @param num Output the number of operations.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_op_num(
+        const_dnnl_graph_partition_t partition, size_t *num);
+
+/// Returns the list of op IDs of the partition.
+///
+/// @param partition The target partition.
+/// @param num The number of ops.
+/// @param ids Output the op IDs.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_ops(
+        dnnl_graph_partition_t partition, size_t num, size_t *ids);
+
+/// Returns the ID of a partition.
+///
+/// @param partition The target partition.
+/// @param id Output the ID of the partition.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_id(
+        const_dnnl_graph_partition_t partition, size_t *id);
+
+/// Compiles a partition with given input and output logical tensors. The output
+/// logical tensors can contain unknown dimensions. For this case, the
+/// compilation will deduce the output shapes according to input shapes. The
+/// output logical tensors can also have layout type `any`. The compilation will
+/// choose the optimal layout for output tensors. The optimal layout will be
+/// represented as an opaque layout ID saved in the output logical tensor.
+///
+/// @param partition The target partition.
+/// @param compiled_partition Output compiled partition.
+/// @param in_num The number of input logical tensors.
+/// @param inputs A list of input logical tensors.
+/// @param out_num The number of output logical tensors.
+/// @param outputs A list of output logical tensors.
+/// @param engine The target engine of the compilation.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_compile(
+        dnnl_graph_partition_t partition,
+        dnnl_graph_compiled_partition_t compiled_partition, size_t in_num,
+        const dnnl_graph_logical_tensor_t **inputs, size_t out_num,
+        const dnnl_graph_logical_tensor_t **outputs, dnnl_engine_t engine);
+
+/// Returns the number of input logical tensors of a partition.
+///
+/// @param partition The target partition.
+/// @param num Output the number of input logical tensors.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_input_ports_num(
+        const_dnnl_graph_partition_t partition, size_t *num);
+
+/// Returns a list of input logical tensors from a partition.
+///
+/// @param partition The target partition.
+/// @param num The number of input logical tensors.
+/// @param inputs The list of input logical tensors.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_input_ports(
+        const_dnnl_graph_partition_t partition, size_t num,
+        dnnl_graph_logical_tensor_t *inputs);
+
+/// Returns the number of output logical tensors of a partition.
+///
+/// @param partition The target partition.
+/// @param num Output the number of output logical tensors.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_output_ports_num(
+        const_dnnl_graph_partition_t partition, size_t *num);
+
+/// Returns a list of output logical tensors from a partition.
+///
+/// @param partition The target partition.
+/// @param num The number of output logical tensors.
+/// @param outputs The list of output logical tensors.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_output_ports(
+        const_dnnl_graph_partition_t partition, size_t num,
+        dnnl_graph_logical_tensor_t *outputs);
+
+/// Returns the supporting status of a partition. Some operations may not be
+/// supported by the library under certain circumstances. During partitioning
+/// stage, unsupported partitions will be returned to users with each containing
+/// an unsupported operation. Users should check the supporting status of a
+/// partition before transforming the computation graph or compiling the
+/// partition.
+///
+/// @param partition The target partition.
+/// @param is_supported Output flag to indicate the supporting status. 0 means
+///     unsupported while 1 means supported.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_is_supported(
+        const_dnnl_graph_partition_t partition, uint8_t *is_supported);
+
+/// Returns the engine kind of a partition.
+///
+/// @param partition The target partition.
+/// @param kind The output engine kind.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_partition_get_engine_kind(
+        const_dnnl_graph_partition_t partition, dnnl_engine_kind_t *kind);
+
+/// @} dnnl_graph_api_partition
+
+/// @addtogroup dnnl_graph_api_compiled_partition
+/// @{
+
+/// Creates a new compiled partition handle.
+///
+/// @param compiled_partition The handle of output compiled partition.
+/// @param partition The handle of input partition.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_compiled_partition_create(
+        dnnl_graph_compiled_partition_t *compiled_partition,
+        dnnl_graph_partition_t partition);
+
+/// Executes a compiled partition.
+///
+/// @param compiled_partition The handle of target compiled partition.
+/// @param stream The stream used for execution.
+/// @param num_inputs The number of input tensors.
+/// @param inputs A list of input tensors.
+/// @param num_outputs The number of output tensors.
+/// @param outputs A non-empty list of output tensors.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_compiled_partition_execute(
+        const_dnnl_graph_compiled_partition_t compiled_partition,
+        dnnl_stream_t stream, size_t num_inputs,
+        const_dnnl_graph_tensor_t *inputs, size_t num_outputs,
+        const_dnnl_graph_tensor_t *outputs);
+
+/// Destroys a compiled partition.
+///
+/// @param compiled_partition The compiled partition to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_compiled_partition_destroy(
+        dnnl_graph_compiled_partition_t compiled_partition);
+
+/// Queries an input or output logical tensor according to tensor ID. If the
+/// tensor ID doesn't belong to any input or output of the compiled partition,
+/// an error status #dnnl_invalid_arguments will be returned by the API.
+///
+/// @param compiled_partition The handle of target compiled_partition.
+/// @param tid The unique id of required tensor.
+/// @param lt The output logical tensor.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_compiled_partition_query_logical_tensor(
+        const_dnnl_graph_compiled_partition_t compiled_partition, size_t tid,
+        dnnl_graph_logical_tensor_t *lt);
+
+/// Returns the hint of in-place pairs from a compiled partition. It indicates
+/// that an input and an output of the partition can share the same memory
+/// buffer for computation. In-place computation helps to reduce the memory
+/// footprint and improves cache locality. But since the library may not have a
+/// global view of user's application, it's possible that the tensor with
+/// `input_id` is used at other places in user's computation graph. In this
+/// case, the user should take the in-place pair as a hint and pass a different
+/// memory buffer for output tensor to avoid overwriting the input memory buffer
+/// which will probably cause unexpected incorrect results.
+///
+/// @param compiled_partition The handle of target compiled_partition.
+/// @param num_inplace_pairs The number of in-place pairs.
+/// @param inplace_pairs The handle of in-place pairs.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_compiled_partition_get_inplace_ports(
+        const_dnnl_graph_compiled_partition_t compiled_partition,
+        size_t *num_inplace_pairs,
+        const dnnl_graph_inplace_pair_t **inplace_pairs);
+
+/// @} dnnl_graph_api_compiled_partition
+
+/// @addtogroup dnnl_graph_api_graph
+/// @{
+
+/// Creates a new empty graph. A graph is associated to a specific engine kind.
+/// The partitions returned from the graph will inherit the engine kind of the
+/// graph.
+///
+/// @param graph The handle of output graph.
+/// @param engine_kind The target engine kind.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_create(
+        dnnl_graph_graph_t *graph, dnnl_engine_kind_t engine_kind);
+
+/// Creates a new empty graph with an engine kind and a floating-point math
+/// mode. All partitions returned from the graph will inherit the engine kind
+/// and floating-point math mode.
+///
+/// @param graph The handle of output graph.
+/// @param engine_kind The kind for engine.
+/// @param mode The floating-point math mode.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_create_with_fpmath_mode(
+        dnnl_graph_graph_t *graph, dnnl_engine_kind_t engine_kind,
+        dnnl_fpmath_mode_t mode);
+
+/// Destroys a graph.
+///
+/// @param graph The graph to be destroyed.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_destroy(dnnl_graph_graph_t graph);
+
+/// Set the floating point math mode for a graph.
+///
+/// @param graph The target graph.
+/// @param mode The floating-point math mode.
+/// @param apply_to_int The flag that controls whether to use floating-point
+///     arithmetic for integral operations.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_set_fpmath_mode(
+        dnnl_graph_graph_t graph, dnnl_fpmath_mode_t mode, int apply_to_int);
+
+/// Get the floating point math mode for a graph.
+///
+/// @param graph The target graph.
+/// @param mode The floating-point math mode.
+/// @param apply_to_int The flag that controls whether to use floating-point
+///     arithmetic for integral operations.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_get_fpmath_mode(
+        dnnl_graph_graph_t graph, dnnl_fpmath_mode_t *mode, int *apply_to_int);
+
+/// Adds an operation into a graph. The API will return failure if the operator
+/// has already been added to the graph or the operation cannot pass the schema
+/// check in the library (e.g. input and output numbers and data types, the
+/// attributes of the operation, etc.).
+///
+/// @param graph The target graph.
+/// @param op The operation to be added.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_add_op(
+        dnnl_graph_graph_t graph, dnnl_graph_op_t op);
+
+/// Finalizes a graph. It means users have finished adding operations into the
+/// graph and the graph is ready for partitioning. Adding a new operation into a
+/// finalized graph will return failures. Similarly, partitioning on an
+/// un-finalized graph will also return failures.
+///
+/// @param graph The target graph to be finalized.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_finalize(dnnl_graph_graph_t graph);
+
+/// Checks if a graph is finalized.
+///
+/// @param graph The target graph to be checked.
+/// @param finalized Output the finalization status. 0 means the graph is not
+///     finalized. Other values mean the graph is finalized.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_is_finalized(
+        dnnl_graph_graph_t graph, uint8_t *finalized);
+
+/// Filters a graph. Partitions will be claimed internally according to the
+/// capability of the library, the engine kind, and the policy.
+///
+/// @param graph The target graph.
+/// @param policy The partition policy.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_filter(
+        dnnl_graph_graph_t graph, dnnl_graph_partition_policy_t policy);
+
+/// Returns the number of partitions of a graph. The API should be called after
+/// the graph is already filtered. Otherwise, the output number is zero.
+///
+/// @param graph The graph.
+/// @param num Output the number of partitions.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_get_partition_num(
+        const_dnnl_graph_graph_t graph, size_t *num);
+
+/// Returns the partitions from a filtered graph. Output partition instances
+/// will be written into the parameter `partitions`. Users need to make sure
+/// `partitions` is valid and has enough space to accept the partition
+/// instances. Each output partition instance should be destroyed via
+/// #dnnl_graph_partition_destroy explicitly after use.
+///
+/// @param graph The target graph.
+/// @param num The number of partitions.
+/// @param partitions Output the partitions.
+/// @returns #dnnl_success on success or a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_graph_get_partitions(dnnl_graph_graph_t graph,
+        size_t num, dnnl_graph_partition_t *partitions);
+
+/// @} dnnl_graph_api_graph
+
+/// @addtogroup dnnl_graph_api_compiled_partition_cache
+/// @{
+
+/// Returns the number of compiled partitions that can be held in the compiled
+/// partition cache at the same time.
+///
+/// @param capacity Compiled partition cache capacity to query. Concurrently
+/// accessing @p capacity is safe.
+/// @returns #dnnl_invalid_arguments if the @p capacity value
+///     is invalid, and #dnnl_success on success.
+dnnl_status_t DNNL_API dnnl_graph_get_compiled_partition_cache_capacity(
+        int *capacity);
+
+/// Sets the number of compiled partitions that can be held in the compiled
+/// partition cache at the same time. The default capacity of the compiled
+/// partition cache is 1024.
+///
+/// @param capacity Compiled partition cache capacity to set. The default cache
+/// capacity is 1024. If the new @p capacity is less than the number of compiled
+/// partitions that the compiled partition cache already holds, then the excess
+/// entries will be evicted. Setting the @p capacity to 0 clears the compiled
+/// partition cache and disables it. Concurrently modifying @p capacity is safe.
+/// @returns #dnnl_invalid_arguments if the @p capacity value
+/// is invalid, and #dnnl_success on success.
+dnnl_status_t DNNL_API dnnl_graph_set_compiled_partition_cache_capacity(
+        int capacity);
+
+/// @} dnnl_graph_api_compiled_partition_cache
+
+/// @addtogroup dnnl_graph_api_constant_tensor_cache
+/// @{
+
+/// Control the enabling or disabling of constant tensor cache. This API must
+/// be called once before compilation stage. By default, constant tensor cache is
+/// disabled in the library.
+///
+/// @param flag Set to positive value to enable the cache and set to 0 to
+/// disable the cache. Negative values are invalid.
+/// @returns #dnnl_invalid_arguments if the @p flag value is
+/// invalid, and #dnnl_success on success.
+/// @note This API is deprecated and will be removed in a future release. Please
+/// use the dnnl_graph_set_constant_tensor_cache_capacity API to disable the
+/// constant tensor cache by setting its capacity to zero.
+dnnl_status_t DNNL_API dnnl_graph_set_constant_tensor_cache(int flag);
+
+/// Return the enabling or disabling status of constant tensor cache.
+///
+/// @param flag The constant tensor cache enabling status to query.
+/// @returns #dnnl_invalid_arguments if the @p flag value is
+/// nullptr, and #dnnl_success on success.
+/// @note This API is deprecated and will be removed in a future release. Please
+/// use the dnnl_graph_get_constant_tensor_cache_capacity API to check the
+/// enabling status by checking its capacity.
+dnnl_status_t DNNL_API dnnl_graph_get_constant_tensor_cache(int *flag);
+
+/// Control the capacity of the constant tensor cache used for a specific
+/// engine kind. This API is thread safe and can be called multiple times at
+/// runtime. The capacity is set to zero by default which means the cache is
+/// disabled. When calling this API, the corresponding cache will be flushed.
+/// Setting capacity to 0 means to clear all cached tensors and disable cache.
+/// Once the capacity limit is reached, no new tensors will be cached. If there
+/// are multiple devices for an engine kind, the capacity set here is for each
+/// device.
+///
+/// @param eng_kind The engine kind that the constant tensor cache is used for.
+/// @param size The constant tensor cache capacity size to set.
+/// @returns #dnnl_invalid_arguments if the @p eng_kind value is invalid, and
+/// #dnnl_success on success.
+dnnl_status_t DNNL_API dnnl_graph_set_constant_tensor_cache_capacity(
+        dnnl_engine_kind_t eng_kind, size_t size);
+
+/// Return the current capacity of constant tensor cache.
+///
+/// @param eng_kind The engine kind that the constant tensor cache is used for.
+/// @param size The constant tensor cache capacity size to query.
+/// @returns #dnnl_invalid_arguments if the @p eng_kind value is
+/// invalid or the @p size is nullptr, and #dnnl_success on success.
+dnnl_status_t DNNL_API dnnl_graph_get_constant_tensor_cache_capacity(
+        dnnl_engine_kind_t eng_kind, size_t *size);
+
+/// @} dnnl_graph_api_constant_tensor_cache
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9db7c9bbb0742d3c564951c6f60aa80f80967e6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph.hpp
@@ -0,0 +1,1639 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// Graph C++ API
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_HPP
+#define ONEAPI_DNNL_DNNL_GRAPH_HPP
+
+#include "oneapi/dnnl/dnnl_common.hpp"
+#include "oneapi/dnnl/dnnl_graph.h"
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_graph_api Graph API
+/// oneDNN Graph API
+/// @{
+
+/// oneDNN Graph namespace
+namespace graph {
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+// Alias for common engine and stream API.
+using engine = dnnl::engine;
+using stream = dnnl::stream;
+using fpmath_mode = dnnl::fpmath_mode;
+
+/// @endcond
+
+/// @addtogroup dnnl_graph_api_utils Utilities
+/// Utility types and definitions
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+/// Destructor traits for oneDNN graph C API handles; defers to handle_traits.
+template <class T>
+struct graph_handle_traits : dnnl::handle_traits<T> {};
+
+template <> // op handles are released with dnnl_graph_op_destroy()
+struct graph_handle_traits<dnnl_graph_op_t> {
+    static dnnl_status_t destructor(dnnl_graph_op_t c_op) {
+        return dnnl_graph_op_destroy(c_op);
+    }
+};
+
+template <> // graph handles are released with dnnl_graph_graph_destroy()
+struct graph_handle_traits<dnnl_graph_graph_t> {
+    static dnnl_status_t destructor(dnnl_graph_graph_t c_graph) {
+        return dnnl_graph_graph_destroy(c_graph);
+    }
+};
+
+template <> // tensor handles are released with dnnl_graph_tensor_destroy()
+struct graph_handle_traits<dnnl_graph_tensor_t> {
+    static dnnl_status_t destructor(dnnl_graph_tensor_t c_tensor) {
+        return dnnl_graph_tensor_destroy(c_tensor);
+    }
+};
+
+template <> // partition handles are released with dnnl_graph_partition_destroy()
+struct graph_handle_traits<dnnl_graph_partition_t> {
+    static dnnl_status_t destructor(dnnl_graph_partition_t c_part) {
+        return dnnl_graph_partition_destroy(c_part);
+    }
+};
+
+template <> // released with dnnl_graph_compiled_partition_destroy()
+struct graph_handle_traits<dnnl_graph_compiled_partition_t> {
+    static dnnl_status_t destructor(dnnl_graph_compiled_partition_t c_cpart) {
+        return dnnl_graph_compiled_partition_destroy(c_cpart);
+    }
+};
+
+template <> // allocator handles are released with dnnl_graph_allocator_destroy()
+struct graph_handle_traits<dnnl_graph_allocator_t> {
+    static dnnl_status_t destructor(dnnl_graph_allocator_t c_alloc) {
+        return dnnl_graph_allocator_destroy(c_alloc);
+    }
+};
+
+#define DNNL_GRAPH_HANDLE_ALIAS(name) \
+    using name##_handle = dnnl::handle<dnnl_graph_##name##_t, \
+            graph_handle_traits<dnnl_graph_##name##_t>>
+// Declare <name>_handle for each handle type below, then drop the macro.
+DNNL_GRAPH_HANDLE_ALIAS(op);
+DNNL_GRAPH_HANDLE_ALIAS(graph);
+DNNL_GRAPH_HANDLE_ALIAS(tensor);
+DNNL_GRAPH_HANDLE_ALIAS(partition);
+DNNL_GRAPH_HANDLE_ALIAS(compiled_partition);
+DNNL_GRAPH_HANDLE_ALIAS(allocator);
+
+#undef DNNL_GRAPH_HANDLE_ALIAS
+
+template <bool Cond> // names bool when Cond is true; otherwise no type (SFINAE)
+using req = typename std::enable_if<Cond, bool>::type;
+
+/// @endcond
+
+/// @} dnnl_graph_api_utils
+
+/// @addtogroup dnnl_graph_api_status Status
+/// Definitions of status values returned by the library functions.
+/// @{
+
+/// Status values returned by library functions; mirrors the C status codes.
+enum class status {
+    /// The operation was successful.
+    success = dnnl_success,
+    /// The operation failed due to an out-of-memory condition.
+    out_of_memory = dnnl_out_of_memory,
+    /// The operation failed because of incorrect function arguments.
+    invalid_arguments = dnnl_invalid_arguments,
+    /// The operation failed because requested functionality is not implemented.
+    unimplemented = dnnl_unimplemented,
+    /// The last available implementation is reached.
+    last_impl_reached = dnnl_last_impl_reached,
+    /// Primitive or engine failed on execution.
+    runtime_error = dnnl_runtime_error,
+    /// Queried element is not required for given primitive.
+    not_required = dnnl_not_required,
+    /// The graph is not legitimate.
+    invalid_graph = dnnl_invalid_graph,
+    /// The operation is not legitimate according to op schema.
+    invalid_graph_op = dnnl_invalid_graph_op,
+    /// The shape cannot be inferred or compiled.
+    invalid_shape = dnnl_invalid_shape,
+    /// The data type cannot be inferred or compiled.
+    invalid_data_type = dnnl_invalid_data_type,
+};
+
+/// @} dnnl_graph_api_status
+
+/// @addtogroup dnnl_graph_api_allocator Allocator
+///
+/// Definitions of allocator which is used to acquire memory resources in
+/// partition compilation and execution. SYCL allocator
+/// (#dnnl::graph::sycl_interop::make_allocator) should be used for SYCL runtime
+/// and host allocator should be used for non-SYCL.
+///
+/// @{
+
+/// Allocator handle that provides memory for partition compilation/execution.
+class allocator : public allocator_handle {
+public:
+    using allocator_handle::handle;
+
+    /// Creates an allocator from user-supplied host callbacks.
+    ///
+    /// @param host_malloc Allocation callback used on the host (CPU).
+    /// @param host_free Deallocation callback used on the host (CPU).
+    allocator(dnnl_graph_host_allocate_f host_malloc,
+            dnnl_graph_host_deallocate_f host_free) {
+        dnnl_graph_allocator_t raw = nullptr;
+        const auto rc
+                = dnnl_graph_allocator_create(&raw, host_malloc, host_free);
+        error::wrap_c_api(rc, "could not create allocator for cpu");
+        reset(raw);
+    }
+
+    /// Creates an allocator without callbacks (nullptr is passed for both).
+    allocator() {
+        dnnl_graph_allocator_t raw = nullptr;
+        const auto rc = dnnl_graph_allocator_create(&raw, nullptr, nullptr);
+        error::wrap_c_api(rc, "could not create allocator");
+        reset(raw);
+    }
+};
+
+/// @} dnnl_graph_api_allocator
+
+/// @addtogroup dnnl_graph_api_engine Engine
+/// @{
+
+/// Supplements the oneDNN engine API: makes an engine that uses @p alloc.
+inline engine make_engine_with_allocator(
+        engine::kind kind, size_t index, const allocator &alloc) {
+    dnnl_engine_t c_engine = nullptr; // never hand out an indeterminate handle
+    error::wrap_c_api(
+            dnnl_graph_make_engine_with_allocator(&c_engine,
+                    static_cast<dnnl_engine_kind_t>(kind), index, alloc.get()),
+            "could not make an engine with allocator");
+    return engine(c_engine);
+}
+
+/// @} dnnl_graph_api_engine
+
+/// @addtogroup dnnl_graph_api_logical_tensor Logical Tensor
+///
+/// Logical tensor describes the meta-data of the input or output tensor, like
+/// elements data type, number of dimensions, size for each dimension (shape),
+/// layout, and the property of the tensor.
+///
+/// Each logical tensor has a unique ID. The library uses logical tensor IDs to
+/// build up the connections between operations if the output of one operation
+/// has the same ID as the input of another operation. The meta-data in a
+/// logical tensor may be enriched in the framework graph as it progresses
+/// toward final execution. For example, the library doesn't require detailed
+/// shape information at the operation and graph creation stage. But shape
+/// information of input logical tensor will be required at partition
+/// compilation stage. Logical tensor is not mutable. Users must create a new
+/// logical tensor with the same ID to pass any new additional information to
+/// oneDNN Graph API. Please note that the library also has unique IDs for
+/// operations. The ID should be unique among different logical tensors, but it
+/// can have the same value between a logical tensor and an operation.
+///
+/// @{
+
+/// Logical tensor object
+class logical_tensor {
+    friend class op;
+    friend class tensor;
+    friend class partition;
+    friend class compiled_partition;
+
+    dnnl_graph_logical_tensor_t data;
+
+public:
+    /// Integer type for representing dimension sizes and indices.
+    using dim = dnnl_dim_t;
+    /// Vector of dimensions. Implementations are free to force a limit on the
+    /// vector's length.
+    using dims = std::vector<dim>;
+
+    /// Data Type
+    enum class data_type {
+        undef = dnnl_data_type_undef,
+        /// 16-bit/half-precision floating point.
+        f16 = dnnl_f16,
+        /// non-standard 16-bit (bfloat16 w/ 7 bit mantissa) floating point.
+        bf16 = dnnl_bf16,
+        /// 32-bit/single-precision floating point.
+        f32 = dnnl_f32,
+        /// 32-bit signed integer.
+        s32 = dnnl_s32,
+        /// 8-bit signed integer.
+        s8 = dnnl_s8,
+        /// 8-bit unsigned integer.
+        u8 = dnnl_u8,
+        /// Boolean data type. Size is C++ implementation defined.
+        boolean = dnnl_boolean,
+        /// [OFP8 standard 8-bit
+        /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+        /// with a 5-bit exponent and a 2-bit mantissa.
+        f8_e5m2 = dnnl_f8_e5m2,
+        /// [OFP8 standard 8-bit
+        /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
+        /// with a 4-bit exponent and a 3-bit mantissa.
+        f8_e4m3 = dnnl_f8_e4m3,
+        /// 4-bit signed integer.
+        s4 = dnnl_s4,
+        /// 4-bit unsigned integer.
+        u4 = dnnl_u4,
+    };
+
+    /// Layout type
+    enum class layout_type {
+        /// Undefined layout type.
+        undef = dnnl_graph_layout_type_undef,
+        /// Any means to let the library to decide the layout for a tensor
+        /// during partition compilation.
+        any = dnnl_graph_layout_type_any,
+        /// Strided means that the layout of a tensor is determined by the
+        /// strides field in the logical tensor.
+        strided = dnnl_graph_layout_type_strided,
+        /// Opaque means that the layout of a tensor is the library specific.
+        /// Usually, an opaque layout is generated by a partition which is
+        /// compiled with layout type any.
+        opaque = dnnl_graph_layout_type_opaque,
+    };
+
+    /// Tensor property. Each enumerator maps to the C API value of the same
+    /// name.
+    enum class property_type {
+        /// Undefined tensor property.
+        undef = dnnl_graph_tensor_property_undef,
+        /// Variable means the tensor may be changed during computation or
+        /// between different iterations.
+        variable = dnnl_graph_tensor_property_variable,
+        /// Constant means the tensor stays unchanged during computation and
+        /// between different iterations. This is useful for the library to
+        /// apply optimizations for constant tensors or to cache constant
+        /// tensors inside the library. For example, constant weight tensors
+        /// in inference scenarios.
+        constant = dnnl_graph_tensor_property_constant,
+    };
+
+    /// Default constructor. Constructs an empty object.
+    logical_tensor() = default;
+
+    /// Constructs a logical tensor object by copying the given C API logical
+    /// tensor structure.
+    ///
+    /// @param c_data The C API logical tensor structure to copy from.
+    explicit logical_tensor(const dnnl_graph_logical_tensor_t &c_data)
+        : data(c_data) {}
+
+    /// Copy constructor.
+    ///
+    /// @param other The logical tensor to copy from.
+    logical_tensor(const logical_tensor &other) = default;
+
+    /// Copy assignment operator.
+    ///
+    /// @param other The logical tensor to copy from.
+    /// @returns A reference to this object.
+    logical_tensor &operator=(const logical_tensor &other) = default;
+
+    /// Creates a logical tensor from an ID, an element data type, a number of
+    /// dimensions, a layout type, and a property type.
+    ///
+    /// @param tid Logical tensor ID.
+    /// @param dtype Elements data type.
+    /// @param ndims Number of dimensions. -1 means unknown (see
+    ///     #DNNL_GRAPH_UNKNOWN_NDIMS) and 0 means a scalar tensor.
+    /// @param ltype Layout type.
+    /// @param ptype Property type.
+    logical_tensor(size_t tid, data_type dtype, int32_t ndims,
+            layout_type ltype, property_type ptype = property_type::undef) {
+        dnnl_graph_logical_tensor_t c_lt;
+        const dnnl_status_t status = dnnl_graph_logical_tensor_init(&c_lt, tid,
+                convert_to_c(dtype), ndims, convert_to_c(ltype),
+                convert_to_c(ptype));
+        error::wrap_c_api(
+                status, "could not create logical_tensor with property");
+        // Store the structure only after the C API call has been checked.
+        data = c_lt;
+    }
+
+    /// Creates a logical tensor with an unknown number of dimensions
+    /// (#DNNL_GRAPH_UNKNOWN_NDIMS) and an undefined property type by
+    /// delegating to the constructor that takes ndims.
+    ///
+    /// @param tid Logical tensor ID.
+    /// @param dtype Elements data type.
+    /// @param ltype Layout type.
+    logical_tensor(
+            size_t tid, data_type dtype, layout_type ltype = layout_type::undef)
+        : logical_tensor(tid, dtype, DNNL_GRAPH_UNKNOWN_NDIMS, ltype,
+                property_type::undef) {}
+
+    /// Creates a logical tensor from basic information and explicit
+    /// dimensions.
+    ///
+    /// @param tid Logical tensor ID.
+    /// @param dtype Elements data type.
+    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
+    ///     the size of that dimension is unknown. 0 is used to define
+    ///     zero-dimension tensor.
+    /// @param ltype Layout type. If it's strided, the strides field in the
+    ///     output logical tensor will be deduced accordingly.
+    /// @param ptype Property type.
+    logical_tensor(size_t tid, data_type dtype, const dims &adims,
+            layout_type ltype, property_type ptype = property_type::undef) {
+        dnnl_graph_logical_tensor_t c_lt;
+        const auto c_dtype = convert_to_c(dtype);
+        const auto c_ltype = convert_to_c(ltype);
+        const auto c_ptype = convert_to_c(ptype);
+
+        if (adims.empty()) {
+            // An empty dims vector describes a scalar, i.e. ndims == 0.
+            error::wrap_c_api(dnnl_graph_logical_tensor_init(&c_lt, tid,
+                                      c_dtype, 0, c_ltype, c_ptype),
+                    "could not create logical_tensor with property");
+        } else {
+            const auto ndims = static_cast<int32_t>(adims.size());
+            error::wrap_c_api(
+                    dnnl_graph_logical_tensor_init_with_dims(&c_lt, tid,
+                            c_dtype, ndims, adims.data(), c_ltype, c_ptype),
+                    "could not create logical_tensor with dims and property");
+        }
+        data = c_lt;
+    }
+
+    /// Constructs a logical tensor object with detailed dims and strides. The
+    /// layout_type of the output logical tensor object will always be strided.
+    ///
+    /// An exception is raised if @p strides has fewer elements than @p adims
+    /// or if the C API rejects the arguments.
+    ///
+    /// @param tid Logical tensor ID.
+    /// @param dtype Elements data type.
+    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
+    ///     the size of that dimension is unknown. 0 is used to define
+    ///     zero-dimension tensor.
+    /// @param strides Logical tensor strides. #DNNL_GRAPH_UNKNOWN_DIM means
+    ///     the stride of the dimension is unknown. The library currently
+    ///     doesn't support other negative stride values. Should have the same
+    ///     number of elements as @p adims.
+    /// @param ptype Property type.
+    logical_tensor(size_t tid, data_type dtype, const dims &adims,
+            const dims &strides, property_type ptype = property_type::undef) {
+        // Only one length (adims.size()) is passed to the C API together with
+        // both arrays, so a shorter strides vector would be read past its
+        // end. Reject it up front. A longer strides vector is still accepted
+        // to stay backward compatible with existing callers.
+        if (strides.size() < adims.size()) {
+            error::wrap_c_api(dnnl_invalid_arguments,
+                    "could not create logical_tensor with strides and "
+                    "property: strides has fewer elements than dims");
+        }
+
+        dnnl_graph_logical_tensor_t val;
+        error::wrap_c_api(
+                dnnl_graph_logical_tensor_init_with_strides(&val, tid,
+                        convert_to_c(dtype), static_cast<int32_t>(adims.size()),
+                        adims.data(), strides.data(), convert_to_c(ptype)),
+                "could not create logical_tensor with strides and property");
+        data = val;
+    }
+
+    /// Creates a logical tensor from explicit dimensions and an opaque layout
+    /// ID. The layout_type of the resulting logical tensor is always opaque.
+    ///
+    /// @param tid Logical tensor ID.
+    /// @param dtype Elements data type.
+    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
+    ///     the size of that dimension is unknown. 0 is used to define
+    ///     zero-dimension tensor.
+    /// @param lid Opaque layout id.
+    /// @param ptype Property type.
+    logical_tensor(size_t tid, data_type dtype, const dims &adims, size_t lid,
+            property_type ptype = property_type::undef) {
+        dnnl_graph_logical_tensor_t c_lt;
+        const auto c_dtype = convert_to_c(dtype);
+        const auto c_ltype = convert_to_c(layout_type::opaque);
+        const auto c_ptype = convert_to_c(ptype);
+
+        if (!adims.empty()) {
+            const auto ndims = static_cast<int32_t>(adims.size());
+            error::wrap_c_api(
+                    dnnl_graph_logical_tensor_init_with_dims(&c_lt, tid,
+                            c_dtype, ndims, adims.data(), c_ltype, c_ptype),
+                    "could not create logical_tensor with dims");
+        } else {
+            // Empty dims: initialize with ndims == 0.
+            error::wrap_c_api(dnnl_graph_logical_tensor_init(&c_lt, tid,
+                                      c_dtype, 0, c_ltype, c_ptype),
+                    "could not create logical_tensor");
+        }
+
+        // The layout ID is written after initialization.
+        c_lt.layout.layout_id = lid;
+        data = c_lt;
+    }
+
+    /// Returns the dimensions of a logical tensor. An exception is raised
+    /// when the number of dimensions is negative.
+    ///
+    /// @returns A vector describing the size of each dimension.
+    dims get_dims() const {
+        const auto ndims = data.ndims;
+        if (ndims < 0) {
+            error::wrap_c_api(dnnl_invalid_arguments,
+                    "cannot return dims when ndims < 0");
+        }
+
+        // Copy the first ndims entries of the fixed-size dims array.
+        const auto *first = data.dims;
+        return dims(first, first + ndims);
+    }
+
+    /// Returns the unique ID stored in a logical tensor.
+    ///
+    /// @returns An integer value describing the ID.
+    size_t get_id() const {
+        const size_t tid = data.id;
+        return tid;
+    }
+
+    /// Returns the element data type of a logical tensor, converted from the
+    /// C API value.
+    ///
+    /// @returns The data type.
+    data_type get_data_type() const {
+        const auto c_dtype = data.data_type;
+        return static_cast<data_type>(c_dtype);
+    }
+
+    /// Returns the property type of a logical tensor, converted from the C
+    /// API value.
+    ///
+    /// @returns The property type.
+    property_type get_property_type() const {
+        const auto c_ptype = data.property;
+        return static_cast<property_type>(c_ptype);
+    }
+
+    /// Returns the layout type of a logical tensor, converted from the C API
+    /// value.
+    ///
+    /// @returns The layout type.
+    layout_type get_layout_type() const {
+        const auto c_ltype = data.layout_type;
+        return static_cast<layout_type>(c_ltype);
+    }
+
+    /// Returns the layout ID of a logical tensor. This is only valid for a
+    /// logical tensor with opaque layout type; for any other layout type an
+    /// exception will be raised.
+    ///
+    /// @returns Layout ID.
+    size_t get_layout_id() const {
+        const bool is_opaque = get_layout_type() == layout_type::opaque;
+        if (!is_opaque) {
+            error::wrap_c_api(
+                    dnnl_invalid_arguments, "layout type should be opaque");
+        }
+
+        return data.layout.layout_id;
+    }
+
+    /// Returns the strides of a logical tensor. This is only valid for a
+    /// logical tensor with strided layout type and a non-negative number of
+    /// dimensions; otherwise an exception will be raised.
+    ///
+    /// @returns A vector describing the stride size of each dimension.
+    dims get_strides() const {
+        const bool is_strided = get_layout_type() == layout_type::strided;
+        if (!is_strided) {
+            error::wrap_c_api(
+                    dnnl_invalid_arguments, "layout type should be strided");
+        }
+
+        const auto ndims = data.ndims;
+        if (ndims < 0) {
+            error::wrap_c_api(dnnl_invalid_arguments,
+                    "cannot return strides when ndims < 0");
+        }
+
+        // Copy the first ndims entries of the fixed-size strides array.
+        const auto *first = data.layout.strides;
+        return dims(first, first + ndims);
+    }
+
+    /// Queries the memory size in bytes required by this logical tensor.
+    ///
+    /// @returns The memory size in bytes.
+    size_t get_mem_size() const {
+        size_t bytes = 0;
+        const dnnl_status_t status
+                = dnnl_graph_logical_tensor_get_mem_size(&data, &bytes);
+        error::wrap_c_api(
+                status, "could not get memory size from the logical_tensor");
+        return bytes;
+    }
+
+    /// Checks whether two logical tensors are equal. Users can decide
+    /// accordingly if layout reordering is needed for two logical tensors.
+    /// The method returns true in the following two circumstances:
+    ///
+    /// 1. The two logical tensors are equal regarding each field in the
+    /// struct, e.g. id, ndims, dims, layout type, property, etc.
+    /// 2. All other fields are equal but the layout types of the two logical
+    /// tensors are different, and the underlying memory layout is the same.
+    /// For example, one logical tensor has strided layout type while the
+    /// other one has opaque layout type, but underneath, both layouts are
+    /// NHWC; the method still returns true in this case.
+    ///
+    /// @param lt The input logical tensor to be compared.
+    /// @returns @c true if the two logical tensors are equal. @c false otherwise
+    bool is_equal(const logical_tensor &lt) const {
+        uint8_t flag = 0;
+        const dnnl_status_t status
+                = dnnl_graph_logical_tensor_is_equal(&data, &lt.data, &flag);
+        error::wrap_c_api(
+                status, "could not compare between the two logical tensors");
+        const bool same = (flag != 0);
+        return same;
+    }
+
+private:
+    /// Casts a C++ data_type value to the C API data type enum.
+    static dnnl_data_type_t convert_to_c(data_type dtype) {
+        const auto c_dtype = static_cast<dnnl_data_type_t>(dtype);
+        return c_dtype;
+    }
+
+    /// Casts a C++ layout_type value to the C API layout type enum.
+    static dnnl_graph_layout_type_t convert_to_c(layout_type ltype) {
+        const auto c_ltype = static_cast<dnnl_graph_layout_type_t>(ltype);
+        return c_ltype;
+    }
+
+    /// Casts a C++ property_type value to the C API tensor property enum.
+    static dnnl_graph_tensor_property_t convert_to_c(property_type ptype) {
+        const auto c_ptype = static_cast<dnnl_graph_tensor_property_t>(ptype);
+        return c_ptype;
+    }
+};
+
+/// @} dnnl_graph_api_logical_tensor
+
+/// @addtogroup dnnl_graph_api_tensor Tensor
+///
+/// Tensor is an abstraction for multi-dimensional input and output data needed
+/// in the execution of a compiled partition. A tensor object encapsulates a
+/// handle to a memory buffer allocated on a specific engine and a logical
+/// tensor which describes the dimensions, elements data type, and memory
+/// layout.
+///
+/// @{
+
+/// A tensor object. Wraps a C API tensor handle.
+class tensor : public tensor_handle {
+public:
+    /// Default constructor. Constructs an empty object.
+    tensor() = default;
+
+    /// Creates a tensor from a logical tensor, an engine, and a memory
+    /// handle.
+    ///
+    /// @param lt The given logical tensor
+    /// @param aengine Engine to store the data on.
+    /// @param handle Handle of memory buffer to use as an underlying storage.
+    ///     - A pointer to the user-allocated buffer. In this case the library
+    ///       doesn't own the buffer.
+    ///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+    ///       allocate the buffer for the tensor. In this case the library
+    ///       owns the buffer.
+    ///     - DNNL_MEMORY_NONE to create tensor without an underlying buffer.
+    tensor(const logical_tensor &lt, const engine &aengine, void *handle) {
+        dnnl_graph_tensor_t c_tensor = nullptr;
+        const dnnl_status_t status = dnnl_graph_tensor_create(
+                &c_tensor, &(lt.data), aengine.get(), handle);
+        error::wrap_c_api(status,
+                "could not create tensor object with the logical_tensor, "
+                "engine, and handle");
+        reset(c_tensor);
+    }
+
+    /// Creates a tensor whose underlying buffer is allocated by the library
+    /// (delegates with the DNNL_MEMORY_ALLOCATE special value).
+    ///
+    /// @param lt The given logical tensor
+    /// @param aengine Engine to store the data on.
+    tensor(const logical_tensor &lt, const engine &aengine)
+        : tensor(lt, aengine, DNNL_MEMORY_ALLOCATE) {}
+
+    /// Returns the underlying memory buffer.
+    ///
+    /// On the CPU engine, or when using USM, this is a pointer to the
+    /// allocated memory.
+    ///
+    /// @returns The data handle reported by the C API.
+    void *get_data_handle() const {
+        void *buffer = nullptr;
+        const dnnl_status_t status
+                = dnnl_graph_tensor_get_data_handle(get(), &buffer);
+        error::wrap_c_api(status, "could not get data handle from the tensor");
+        return buffer;
+    }
+
+    /// Sets the underlying memory handle.
+    ///
+    /// @param handle Memory handle.
+    void set_data_handle(void *handle) {
+        const dnnl_status_t status
+                = dnnl_graph_tensor_set_data_handle(get(), handle);
+        error::wrap_c_api(status, "setting data handle to the tensor failed");
+    }
+
+    /// Returns the engine associated with the tensor.
+    ///
+    /// @returns An engine object
+    engine get_engine() const {
+        dnnl_engine_t c_engine = nullptr;
+        const dnnl_status_t status
+                = dnnl_graph_tensor_get_engine(get(), &c_engine);
+        error::wrap_c_api(
+                status, "could not get an engine from a tensor object");
+        engine result(c_engine, true);
+        return result;
+    }
+
+    /// Returns the logical tensor describing a tensor object.
+    ///
+    /// @returns A logical_tensor object.
+    logical_tensor get_logical_tensor() const {
+        dnnl_graph_logical_tensor_t c_lt;
+        const dnnl_status_t status
+                = dnnl_graph_tensor_get_logical_tensor(get(), &c_lt);
+        error::wrap_c_api(
+                status, "could not get logical tensor from a tensor object");
+        return logical_tensor {c_lt};
+    }
+};
+
+/// @} dnnl_graph_api_tensor
+
+/// @addtogroup dnnl_graph_api_compiled_partition Compiled Partition
+///
+/// A compiled partition represents the generated kernels specialized for a
+/// partition on a target hardware (engine) with input and output information
+/// specified by the logical tensors.
+///
+/// @{
+
+/// A compiled partition object. Wraps a C API compiled partition handle.
+class compiled_partition : public compiled_partition_handle {
+public:
+    /// Default constructor. Constructs an empty object.
+    compiled_partition() = default;
+
+    /// Constructs a compiled partition object from a C API handle.
+    ///
+    /// @param compiled_partition The C API compiled partition handle.
+    compiled_partition(dnnl_graph_compiled_partition_t compiled_partition) {
+        reset(compiled_partition, false);
+    }
+
+    /// Queries an input or output logical tensor according to tensor ID. If the
+    /// tensor ID doesn't belong to any input or output of the compiled
+    /// partition, an exception will be raised by the API.
+    ///
+    /// @param tid The unique id of required tensor.
+    /// @returns The logical tensor.
+    logical_tensor query_logical_tensor(size_t tid) const {
+        dnnl_graph_logical_tensor_t c_lt;
+        const dnnl_status_t status
+                = dnnl_graph_compiled_partition_query_logical_tensor(
+                        get(), tid, &c_lt);
+        error::wrap_c_api(
+                status, "query logical tensor from compiled_partition failed");
+        return logical_tensor {c_lt};
+    }
+
+    /// Returns the hint of in-place pairs from a compiled partition. It
+    /// indicates that an input and an output of the partition can share the
+    /// same memory buffer for computation. In-place computation helps to reduce
+    /// the memory footprint and improves cache locality. But since the library
+    /// may not have a global view of user's application, it's possible that the
+    /// input tensor is used at other places in user's computation graph. In
+    /// this case, the user should take the in-place pair as a hint and pass a
+    /// different memory buffer for output tensor to avoid overwriting the input
+    /// memory buffer which will probably cause unexpected incorrect results.
+    ///
+    /// @returns A list of pairs of input and output IDs.
+    std::vector<std::pair<size_t, size_t>> get_inplace_ports() const {
+        size_t count = 0;
+        const dnnl_graph_inplace_pair_t *pairs;
+
+        const dnnl_status_t status
+                = dnnl_graph_compiled_partition_get_inplace_ports(
+                        get(), &count, &pairs);
+        error::wrap_c_api(status,
+                "could not get the in-place pairs from a compiled partition");
+
+        std::vector<std::pair<size_t, size_t>> ports;
+        // The pairs pointer is only dereferenced when count is non-zero.
+        if (count == 0) return ports;
+
+        ports.reserve(count);
+        for (size_t idx = 0; idx < count; ++idx) {
+            const dnnl_graph_inplace_pair_t &pair = pairs[idx];
+            ports.emplace_back(pair.input_id, pair.output_id);
+        }
+        return ports;
+    }
+
+    /// Execute a compiled partition.
+    ///
+    /// @param astream Stream object to run over.
+    /// @param inputs A list of input tensors.
+    /// @param outputs A list of output tensors.
+    void execute(stream &astream, const std::vector<tensor> &inputs,
+            const std::vector<tensor> &outputs) const {
+        // Collect the underlying C handles of the input tensors.
+        std::vector<const_dnnl_graph_tensor_t> c_inputs;
+        c_inputs.reserve(inputs.size());
+        for (const tensor &t : inputs) {
+            c_inputs.push_back(t.get());
+        }
+
+        // Collect the underlying C handles of the output tensors.
+        std::vector<const_dnnl_graph_tensor_t> c_outputs;
+        c_outputs.reserve(outputs.size());
+        for (const tensor &t : outputs) {
+            c_outputs.push_back(t.get());
+        }
+
+        const dnnl_status_t status = dnnl_graph_compiled_partition_execute(
+                get(), astream.get(), c_inputs.size(), c_inputs.data(),
+                c_outputs.size(), c_outputs.data());
+        error::wrap_c_api(status, "could not execute the compiled_partition");
+    }
+};
+
+/// @} dnnl_graph_api_compiled_partition
+
+/// @addtogroup dnnl_graph_api_op Op
+///
+/// An op is an abstraction of computation logic for deep neural network
+/// operations. An op object encapsulates an operation kind which describes the
+/// computation logic, a unique ID which differentiates operations with the
+/// same kind, and logical tensors which describe the inputs and outputs of the
+/// operation and its connections to other operations in the graph.
+///
+/// @{
+
+/// An op object.
+class op : public op_handle {
+public:
+    /// Kinds of operations. Each enumerator is assigned the value of a
+    /// dnnl_graph_op_* C API constant.
+    enum class kind {
+        Abs = dnnl_graph_op_abs,
+        AbsBackward = dnnl_graph_op_abs_backward,
+        Add = dnnl_graph_op_add,
+        AvgPool = dnnl_graph_op_avg_pool,
+        AvgPoolBackward = dnnl_graph_op_avg_pool_backward,
+        BatchNormForwardTraining = dnnl_graph_op_batch_norm_forward_training,
+        BatchNormInference = dnnl_graph_op_batch_norm_inference,
+        BatchNormTrainingBackward = dnnl_graph_op_batch_norm_backward,
+        BiasAdd = dnnl_graph_op_bias_add,
+        BiasAddBackward = dnnl_graph_op_bias_add_backward,
+        Clamp = dnnl_graph_op_clamp,
+        ClampBackward = dnnl_graph_op_clamp_backward,
+        Concat = dnnl_graph_op_concat,
+        Convolution = dnnl_graph_op_convolution,
+        ConvolutionBackwardData = dnnl_graph_op_convolution_backward_data,
+        ConvolutionBackwardWeights = dnnl_graph_op_convolution_backward_weights,
+        ConvTranspose = dnnl_graph_op_conv_transpose,
+        ConvTransposeBackwardData = dnnl_graph_op_conv_transpose_backward_data,
+        ConvTransposeBackwardWeights
+        = dnnl_graph_op_conv_transpose_backward_weights,
+        Dequantize = dnnl_graph_op_dequantize,
+        Divide = dnnl_graph_op_divide,
+        DynamicDequantize = dnnl_graph_op_dynamic_dequantize,
+        DynamicQuantize = dnnl_graph_op_dynamic_quantize,
+        Elu = dnnl_graph_op_elu,
+        EluBackward = dnnl_graph_op_elu_backward,
+        End = dnnl_graph_op_end,
+        Exp = dnnl_graph_op_exp,
+        GELU = dnnl_graph_op_gelu,
+        GELUBackward = dnnl_graph_op_gelu_backward,
+        GroupNorm = dnnl_graph_op_group_norm,
+        HardSigmoid = dnnl_graph_op_hard_sigmoid,
+        HardSigmoidBackward = dnnl_graph_op_hard_sigmoid_backward,
+        HardSwish = dnnl_graph_op_hard_swish,
+        HardSwishBackward = dnnl_graph_op_hard_swish_backward,
+        Interpolate = dnnl_graph_op_interpolate,
+        InterpolateBackward = dnnl_graph_op_interpolate_backward,
+        LayerNorm = dnnl_graph_op_layer_norm,
+        LayerNormBackward = dnnl_graph_op_layer_norm_backward,
+        LeakyReLU = dnnl_graph_op_leaky_relu,
+        Log = dnnl_graph_op_log,
+        LogSoftmax = dnnl_graph_op_log_softmax,
+        LogSoftmaxBackward = dnnl_graph_op_log_softmax_backward,
+        MatMul = dnnl_graph_op_matmul,
+        Maximum = dnnl_graph_op_maximum,
+        MaxPool = dnnl_graph_op_max_pool,
+        MaxPoolBackward = dnnl_graph_op_max_pool_backward,
+        Minimum = dnnl_graph_op_minimum,
+        Mish = dnnl_graph_op_mish,
+        MishBackward = dnnl_graph_op_mish_backward,
+        Multiply = dnnl_graph_op_multiply,
+        Pow = dnnl_graph_op_pow,
+        PReLU = dnnl_graph_op_prelu,
+        PReLUBackward = dnnl_graph_op_prelu_backward,
+        Quantize = dnnl_graph_op_quantize,
+        Reciprocal = dnnl_graph_op_reciprocal,
+        ReduceL1 = dnnl_graph_op_reduce_l1,
+        ReduceL2 = dnnl_graph_op_reduce_l2,
+        ReduceMax = dnnl_graph_op_reduce_max,
+        ReduceMean = dnnl_graph_op_reduce_mean,
+        ReduceMin = dnnl_graph_op_reduce_min,
+        ReduceProd = dnnl_graph_op_reduce_prod,
+        ReduceSum = dnnl_graph_op_reduce_sum,
+        ReLU = dnnl_graph_op_relu,
+        ReLUBackward = dnnl_graph_op_relu_backward,
+        Reorder = dnnl_graph_op_reorder,
+        Round = dnnl_graph_op_round,
+        Select = dnnl_graph_op_select,
+        Sigmoid = dnnl_graph_op_sigmoid,
+        SigmoidBackward = dnnl_graph_op_sigmoid_backward,
+        SoftMax = dnnl_graph_op_softmax,
+        SoftMaxBackward = dnnl_graph_op_softmax_backward,
+        SoftPlus = dnnl_graph_op_softplus,
+        SoftPlusBackward = dnnl_graph_op_softplus_backward,
+        Sqrt = dnnl_graph_op_sqrt,
+        SqrtBackward = dnnl_graph_op_sqrt_backward,
+        Square = dnnl_graph_op_square,
+        SquaredDifference = dnnl_graph_op_squared_difference,
+        StaticReshape = dnnl_graph_op_static_reshape,
+        StaticTranspose = dnnl_graph_op_static_transpose,
+        Subtract = dnnl_graph_op_subtract,
+        Tanh = dnnl_graph_op_tanh,
+        TanhBackward = dnnl_graph_op_tanh_backward,
+        TypeCast = dnnl_graph_op_type_cast,
+        Wildcard = dnnl_graph_op_wildcard,
+        GenIndex = dnnl_graph_op_gen_index,
+        GreaterEqual = dnnl_graph_op_greater_equal,
+        // Sentinel: mirrors the C API's last-symbol marker and is not an
+        // operation kind itself.
+        LastSymbol = dnnl_graph_op_last_symbol,
+    };
+
+    /// Attributes of operations. Different operations support different
+    /// attributes. Check the documentation of each operation for which
+    /// attributes are supported and what their potential values are. A
+    /// missing required attribute or an illegal attribute value may lead to
+    /// failure when adding the operation to a graph.
+    enum class attr {
+        /// Undefined op attribute.
+        undef = dnnl_graph_op_attr_undef,
+
+        // float32 attributes. The value of these attributes can be any single
+        // float32 number.
+
+        /// Specifies an alpha attribute to an op.
+        alpha = dnnl_graph_op_attr_alpha,
+        /// Specifies a beta attribute to an op.
+        beta = dnnl_graph_op_attr_beta,
+        /// Specifies an epsilon attribute to an op.
+        epsilon = dnnl_graph_op_attr_epsilon,
+        /// Specifies a max attribute to an op.
+        max = dnnl_graph_op_attr_max,
+        /// Specifies a min attribute to an op.
+        min = dnnl_graph_op_attr_min,
+        /// Specifies a momentum attribute to an op.
+        momentum = dnnl_graph_op_attr_momentum,
+
+        // float32 vector attributes. The value of these attributes can be a
+        // vector of float32 numbers.
+
+        /// Specifies a scales attribute to an op.
+        scales = dnnl_graph_op_attr_scales,
+
+        // int64_t attributes. The value of these attributes can be any single
+        // int64 number.
+
+        /// Specifies an axis attribute to an op.
+        axis = dnnl_graph_op_attr_axis,
+        /// Specifies a begin_norm_axis attribute to an op.
+        begin_norm_axis = dnnl_graph_op_attr_begin_norm_axis,
+        /// Specifies a groups attribute to an op.
+        groups = dnnl_graph_op_attr_groups,
+
+        // int64_t vector attributes. The value of these attributes can be a
+        // vector of int64 numbers.
+
+        /// Specifies an axes attribute to an op.
+        axes = dnnl_graph_op_attr_axes,
+        /// Specifies a dilations attribute to an op.
+        dilations = dnnl_graph_op_attr_dilations,
+        /// Specifies a dst_shape attribute to an op.
+        dst_shape = dnnl_graph_op_attr_dst_shape,
+        /// Specifies a kernel attribute to an op.
+        kernel = dnnl_graph_op_attr_kernel,
+        /// Specifies an order attribute to an op.
+        order = dnnl_graph_op_attr_order,
+        /// Specifies an output_padding attribute to an op.
+        output_padding = dnnl_graph_op_attr_output_padding,
+        /// Specifies a pads_begin attribute to an op.
+        pads_begin = dnnl_graph_op_attr_pads_begin,
+        /// Specifies a pads_end attribute to an op.
+        pads_end = dnnl_graph_op_attr_pads_end,
+        /// Specifies a shape attribute to an op.
+        shape = dnnl_graph_op_attr_shape,
+        /// Specifies a sizes attribute to an op.
+        sizes = dnnl_graph_op_attr_sizes,
+        /// Specifies a src_shape attribute to an op.
+        src_shape = dnnl_graph_op_attr_src_shape,
+        /// Specifies a strides attribute to an op.
+        strides = dnnl_graph_op_attr_strides,
+        /// Specifies a weights_shape attribute to an op.
+        weights_shape = dnnl_graph_op_attr_weights_shape,
+        /// Specifies a zps attribute to an op.
+        zps = dnnl_graph_op_attr_zps,
+        /// Specifies the group shape of an op. The size of the vector should
+        /// match that of the input. For the dimensions where the grouped
+        /// quantization occurs, the values should correspond to the group
+        /// size, which indicates the number of elements that will share the
+        /// same scaling factor.
+        group_shape = dnnl_graph_op_attr_group_shape,
+
+        // bool attributes. The value of these attributes can be any single bool
+        // value.
+
+        /// Specifies an exclude_pad attribute to an op.
+        exclude_pad = dnnl_graph_op_attr_exclude_pad,
+        /// Specifies a keep_dims attribute to an op.
+        keep_dims = dnnl_graph_op_attr_keep_dims,
+        /// Specifies a keep_stats attribute to an op.
+        keep_stats = dnnl_graph_op_attr_keep_stats,
+        /// Specifies a per_channel_broadcast attribute to an op.
+        per_channel_broadcast = dnnl_graph_op_attr_per_channel_broadcast,
+        /// Specifies a special_zero attribute to an op.
+        special_zero = dnnl_graph_op_attr_special_zero,
+        /// Specifies a transpose_a attribute to an op.
+        transpose_a = dnnl_graph_op_attr_transpose_a,
+        /// Specifies a transpose_b attribute to an op.
+        transpose_b = dnnl_graph_op_attr_transpose_b,
+        /// Specifies a use_affine attribute to an op.
+        use_affine = dnnl_graph_op_attr_use_affine,
+        /// Specifies a use_dst attribute to an op.
+        use_dst = dnnl_graph_op_attr_use_dst,
+
+        // string attributes. The value of these attributes can be a string.
+
+        /// Specifies an auto_broadcast attribute to an op. The value can be
+        /// "none" or "numpy".
+        auto_broadcast = dnnl_graph_op_attr_auto_broadcast,
+        /// Specifies an auto_pad attribute to an op. The value can be "none",
+        /// "same_upper", "same_lower", or "valid".
+        auto_pad = dnnl_graph_op_attr_auto_pad,
+        /// Specifies a coordinate_transformation_mode attribute to an op. The
+        /// value can be "half_pixel" or "align_corners". The attribute is
+        /// defined for Interpolate operations.
+        coordinate_transformation_mode
+        = dnnl_graph_op_attr_coordinate_transformation_mode,
+        /// Specifies a data_format of an op. The value can be "NCX" or "NXC".
+        data_format = dnnl_graph_op_attr_data_format,
+        /// Specifies a mode attribute of an op. The value can be "nearest",
+        /// "linear", "bilinear", or "trilinear". The attribute is defined for
+        /// Interpolate operations.
+        mode = dnnl_graph_op_attr_mode,
+        /// Specifies a qtype attribute to an op. The value can be "per_channel"
+        /// or "per_tensor". The attribute is defined for quantization
+        /// operations.
+        qtype = dnnl_graph_op_attr_qtype,
+        /// Specifies a rounding_type attribute to an op. The value can be
+        /// "ceil" or "floor".
+        rounding_type = dnnl_graph_op_attr_rounding_type,
+        /// Specifies a weights_format of an op. The value can be "OIX", "XIO",
+        /// "IOX", or "XOI". Different operations may support different values.
+        weights_format = dnnl_graph_op_attr_weights_format,
+
+        /// Specifies the end of all above external attributes for check.
+        end = dnnl_graph_op_attr_end,
+    };
+
+    /// Creates an op from a unique ID, an operation kind, and a name string.
+    ///
+    /// @param id The unique ID of the op.
+    /// @param akind The op kind specifies which computation is represented by
+    ///     the op, such as Convolution or ReLU.
+    /// @param verbose_name The string added as the op name.
+    op(size_t id, kind akind, const std::string &verbose_name = "") {
+        dnnl_graph_op_t c_op = nullptr;
+        const dnnl_status_t status = dnnl_graph_op_create(
+                &c_op, id, convert_to_c(akind), verbose_name.c_str());
+        error::wrap_c_api(status, "could not create op with id and op kind");
+        reset(c_op);
+    }
+
+    /// Creates an op from a unique ID, an operation kind, and input/output
+    /// logical tensors. Inputs are bound first, then outputs, each in the
+    /// order given.
+    ///
+    /// @param id The unique ID of this op.
+    /// @param akind The op kind specifies which computation is represented by
+    ///     this op, such as Convolution or ReLU.
+    /// @param inputs Input logical tensors to be bound to this op.
+    /// @param outputs Output logical tensors to be bound to this op.
+    /// @param verbose_name The string added as the op name.
+    op(size_t id, kind akind, const std::vector<logical_tensor> &inputs,
+            const std::vector<logical_tensor> &outputs,
+            const std::string &verbose_name = "")
+        : op(id, akind, verbose_name) {
+        add_inputs(inputs);
+        add_outputs(outputs);
+    }
+
+    /// Binds one input logical tensor to the op.
+    ///
+    /// @param t Input logical tensor.
+    void add_input(const logical_tensor &t) {
+        const dnnl_status_t status = dnnl_graph_op_add_input(get(), &(t.data));
+        error::wrap_c_api(status, "adding input to the op failed");
+    }
+
+    /// Binds every logical tensor of a vector to the op as an input, in order.
+    ///
+    /// @param ts The list of input logical tensors.
+    void add_inputs(const std::vector<logical_tensor> &ts) {
+        // Reuse the single-tensor path so both entry points report failures
+        // identically.
+        for (const auto &t : ts)
+            add_input(t);
+    }
+
+    /// Binds one more output logical tensor to the op.
+    ///
+    /// @param t Output logical tensor.
+    void add_output(const logical_tensor &t) {
+        const dnnl_status_t st = dnnl_graph_op_add_output(get(), &(t.data));
+        error::wrap_c_api(st, "adding output to the op failed");
+    }
+
+    /// Binds every logical tensor of a vector to the op as an output, in order.
+    ///
+    /// @param ts The list of output logical tensors.
+    void add_outputs(const std::vector<logical_tensor> &ts) {
+        // Reuse the single-tensor path so both entry points report failures
+        // identically.
+        for (const auto &t : ts)
+            add_output(t);
+    }
+
+    /// Sets a scalar attribute of type int64_t on the op.
+    ///
+    /// @tparam Type_i Attribute's type; this overload requires int64_t.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_i, req<std::is_same<Type_i, int64_t>::value> = true>
+    op &set_attr(attr name, const Type_i &value) {
+        const dnnl_status_t st = dnnl_graph_op_set_attr_s64(
+                get(), convert_to_c(name), &value, 1); // scalar: one element
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+    /// Sets a scalar attribute of type float on the op.
+    ///
+    /// @tparam Type_f Attribute's type; this overload requires float.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_f, req<std::is_same<Type_f, float>::value> = true>
+    op &set_attr(attr name, const Type_f &value) {
+        const dnnl_status_t st = dnnl_graph_op_set_attr_f32(
+                get(), convert_to_c(name), &value, 1); // scalar: one element
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+    /// Sets a scalar bool attribute on the op (handed to the C API as uint8_t).
+    ///
+    /// @tparam Type_b Attribute's type; this overload requires bool.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_b, req<std::is_same<Type_b, bool>::value> = true>
+    op &set_attr(attr name, const Type_b &value) {
+        const uint8_t c_value = value;
+        const dnnl_status_t st = dnnl_graph_op_set_attr_bool(
+                get(), convert_to_c(name), &c_value, 1);
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+    /// Sets a string attribute on the op.
+    ///
+    /// @tparam Type_s Attribute's type; this overload requires std::string.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_s,
+            req<std::is_same<Type_s, std::string>::value> = true>
+    op &set_attr(attr name, const Type_s &value) {
+        // The string length is passed explicitly alongside the characters.
+        const dnnl_status_t st = dnnl_graph_op_set_attr_str(
+                get(), convert_to_c(name), value.c_str(), value.size());
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+    /// Sets an attribute holding a list of int64_t values
+    /// (std::vector<int64_t>) on the op.
+    ///
+    /// @tparam Type_is Attribute's type; requires std::vector<int64_t>.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_is,
+            req<std::is_same<Type_is, std::vector<int64_t>>::value> = true>
+    op &set_attr(attr name, const Type_is &value) {
+        // The element count is passed together with the contiguous storage.
+        const dnnl_status_t st = dnnl_graph_op_set_attr_s64(
+                get(), convert_to_c(name), value.data(), value.size());
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+    /// Sets an attribute holding a list of floats (std::vector<float>).
+    ///
+    /// @tparam Type_fs Attribute's type; requires std::vector<float>.
+    /// @param name Attribute's name.
+    /// @param value The attribute's value.
+    /// @returns A reference to this op.
+    template <typename Type_fs,
+            req<std::is_same<Type_fs, std::vector<float>>::value> = true>
+    op &set_attr(attr name, const Type_fs &value) {
+        // The element count is passed together with the contiguous storage.
+        const dnnl_status_t st = dnnl_graph_op_set_attr_f32(
+                get(), convert_to_c(name), value.data(), value.size());
+        error::wrap_c_api(st, "could not set attribute to the op");
+        return *this;
+    }
+
+private:
+    // Plain casts to the C API enums; static because no object state is used.
+    static dnnl_graph_op_kind_t convert_to_c(kind akind) {
+        return static_cast<dnnl_graph_op_kind_t>(akind);
+    }
+    static dnnl_graph_op_attr_t convert_to_c(attr aattr) {
+        return static_cast<dnnl_graph_op_attr_t>(aattr);
+    }
+};
+
+/// @} dnnl_graph_api_op
+
+/// @addtogroup dnnl_graph_api_partition Partition
+///
+/// Partition represents a collection of operations and their input and output
+/// logical tensors identified by library as the basic unit for compilation and
+/// execution.
+///
+/// @{
+
+/// A partition object.
+class partition : public partition_handle {
+public:
+    /// Policy specifications for partitioning.
+    enum class policy {
+        /// Fusion policy returns partitions with typical post-op fusions, e.g.
+        /// Convolution + ReLU or other element-wise operations or a chain of
+        /// post-ops.
+        fusion = dnnl_graph_partition_policy_fusion,
+        /// Debug policy does not apply any fusions. It returns partitions
+        /// with single operations in each partition. The policy is useful when
+        /// users notice any bug or correctness issue in fusion policy.
+        debug = dnnl_graph_partition_policy_debug,
+    };
+
+    partition() = default;
+
+    /// Constructs a partition object
+    ///
+    /// @param p A raw pointer to the C API handle
+    partition(dnnl_graph_partition_t p) { reset(p, false); }
+
+    /// Creates a new partition with a given operator and engine kind. The API
+    /// is used to create a partition from an operation directly without
+    /// creating the graph and calling `get_partitions()`. The output partition
+    /// contains only one operation.
+    ///
+    /// @param aop An operation used to create the partition.
+    /// @param ekind Engine kind.
+    partition(const op &aop, engine::kind ekind) {
+        dnnl_graph_partition_t p = nullptr;
+        error::wrap_c_api(dnnl_graph_partition_create_with_op(&p, aop.get(),
+                                  static_cast<dnnl_engine_kind_t>(ekind)),
+                "could not create a partition with the op and engine kind");
+        reset(p);
+    }
+
+    /// Returns the number of operations contained in the partition.
+    ///
+    /// @returns Number of operations.
+    size_t get_ops_num() const {
+        size_t num {0};
+        error::wrap_c_api(dnnl_graph_partition_get_op_num(get(), &num),
+                "could not get number of ops from the partition");
+        return num;
+    }
+
+    /// Returns all operation IDs contained in the partition.
+    ///
+    /// @returns A vector of operation IDs.
+    std::vector<size_t> get_ops() const {
+        auto num = get_ops_num();
+        std::vector<size_t> ops(num);
+
+        error::wrap_c_api(dnnl_graph_partition_get_ops(get(), num, ops.data()),
+                "could not get op ids from the partition");
+        return ops;
+    }
+
+    /// Returns the unique ID of the partition. Partition ID is generated by the
+    /// library internally. The ID can be used for debugging purpose or verbose.
+    ///
+    /// @returns ID of the partition.
+    size_t get_id() const {
+        size_t id {};
+        error::wrap_c_api(dnnl_graph_partition_get_id(get(), &id),
+                "could not get id of the partition");
+        return id;
+    }
+
+    /// Compiles a partition with given input and output logical tensors. The
+    /// output logical tensors can contain unknown dimensions. For this case,
+    /// the compilation will deduce the output shapes according to input shapes.
+    /// The output logical tensors can also have layout type `any`. The
+    /// compilation will choose the optimal layout for output tensors. The
+    /// optimal layout will be represented as an opaque layout ID saved in the
+    /// output logical tensor.
+    ///
+    /// @param inputs A list of input logical tensors.
+    /// @param outputs A list of output logical tensors.
+    /// @param e The engine used to compile the partition.
+    /// @returns A compiled partition.
+    compiled_partition compile(const std::vector<logical_tensor> &inputs,
+            const std::vector<logical_tensor> &outputs, const engine &e) const {
+        if (!is_supported()) {
+            error::wrap_c_api(dnnl_invalid_arguments,
+                    "could not compile an unsupported partition");
+        }
+
+        return compile_(inputs, outputs, e);
+    }
+
+    /// Returns the supporting status of a partition. Some operations may not be
+    /// supported by the library under certain circumstances. During
+    /// partitioning stage, unsupported partitions will be returned to users
+    /// with each containing an unsupported operation. Users should check the
+    /// supporting status of a partition before transforming the computation
+    /// graph or compiling the partition.
+    ///
+    /// @returns @c true if this partition is supported or @c false if this
+    ///     partition isn't supported by the library
+    bool is_supported() const {
+        uint8_t supported {0};
+        error::wrap_c_api(dnnl_graph_partition_is_supported(get(), &supported),
+                "could not get supporting status of the partition");
+        return supported != 0;
+    }
+
+    /// Returns a list of input logical tensors from the partition.
+    ///
+    /// @returns A list of input logical tensors.
+    std::vector<logical_tensor> get_input_ports() const {
+        size_t num = 0;
+        error::wrap_c_api(dnnl_graph_partition_get_input_ports_num(get(), &num),
+                "could not get number of inputs of the partition");
+        if (num == 0) return {};
+
+        std::vector<dnnl_graph_logical_tensor_t> c_inputs(num);
+        error::wrap_c_api(dnnl_graph_partition_get_input_ports(
+                                  get(), num, c_inputs.data()),
+                "could not get input logical tensors of the partition");
+
+        std::vector<logical_tensor> inputs;
+        inputs.reserve(num);
+        for (auto &c_lt : c_inputs)
+            inputs.emplace_back(c_lt);
+        return inputs;
+    }
+
+    /// Returns a list of output logical tensors from the partition.
+    ///
+    /// @returns A list of output logical tensors.
+    std::vector<logical_tensor> get_output_ports() const {
+        size_t num = 0;
+        error::wrap_c_api(
+                dnnl_graph_partition_get_output_ports_num(get(), &num),
+                "cannot get number of outputs of the partition");
+        if (num == 0) return {};
+
+        std::vector<dnnl_graph_logical_tensor_t> c_outputs(num);
+        error::wrap_c_api(dnnl_graph_partition_get_output_ports(
+                                  get(), num, c_outputs.data()),
+                "could not get output logical tensors of the partition");
+
+        std::vector<logical_tensor> outputs;
+        outputs.reserve(num);
+        for (auto &c_lt : c_outputs)
+            outputs.emplace_back(c_lt);
+        return outputs;
+    }
+
+    /// Returns the engine kind of the partition
+    ///
+    /// @returns The engine kind
+    engine::kind get_engine_kind() const {
+        dnnl_engine_kind_t akind;
+        error::wrap_c_api(dnnl_graph_partition_get_engine_kind(get(), &akind),
+                "cannot get the engine kind from the partition");
+
+        return static_cast<engine::kind>(akind);
+    }
+
+private:
+    compiled_partition compile_(const std::vector<logical_tensor> &inputs,
+            const std::vector<logical_tensor> &outputs, const engine &e) const {
+        std::vector<const dnnl_graph_logical_tensor_t *> c_inputs;
+        std::vector<const dnnl_graph_logical_tensor_t *> c_outputs;
+        // Collect pointers to the C logical tensors held by the wrappers.
+        c_inputs.reserve(inputs.size());
+        for (const auto &in : inputs) {
+            c_inputs.push_back(&(in.data));
+        }
+
+        c_outputs.reserve(outputs.size());
+        for (const auto &out : outputs) {
+            c_outputs.push_back(&(out.data));
+        }
+
+        dnnl_graph_compiled_partition_t c_cp = nullptr;
+        error::wrap_c_api(dnnl_graph_compiled_partition_create(&c_cp, get()),
+                "could not create compiled_partition");
+        // Wrap the raw handle first so it is not orphaned if compile throws.
+        compiled_partition cp(c_cp);
+        error::wrap_c_api(dnnl_graph_partition_compile(get(), cp.get(),
+                                  c_inputs.size(), c_inputs.data(),
+                                  c_outputs.size(), c_outputs.data(), e.get()),
+                "partition compile failed");
+        return cp;
+    }
+};
+
+/// @} dnnl_graph_api_partition
+
+/// @addtogroup dnnl_graph_api_graph Graph
+///
+/// Graph represents a computational DAG with a set of operations.
+/// #dnnl::graph::graph::add_op() adds an operation and its input and output
+/// logical tensors into a graph. The library accumulates the operations and
+/// logical tensors and constructs and validates the graph as an internal state.
+/// A graph object is associated to a specific engine kind. The partitions
+/// returned from the graph will inherit the engine kind of the graph.
+///
+/// @{
+
+/// A graph object.
+class graph : public graph_handle {
+public:
+    /// Constructs a graph bound to an engine kind.
+    ///
+    /// @param engine_kind Engine kind.
+    graph(engine::kind engine_kind) {
+        dnnl_graph_graph_t c_graph = nullptr;
+        const dnnl_status_t st
+                = dnnl_graph_graph_create(&c_graph, convert_to_c(engine_kind));
+        error::wrap_c_api(st, "could not create graph with engine kind");
+        reset(c_graph);
+    }
+
+    /// Creates a new empty graph with an engine kind and a floating-point math
+    /// mode. Every partition returned from the graph inherits both the engine
+    /// kind and the floating-point math mode.
+    ///
+    /// Setting the floating-point math mode enables automatic down-conversion
+    /// of inputs for the given graph, which can speed execution up by using
+    /// lower-precision data types when they are available.
+    ///
+    /// @param engine_kind Engine kind.
+    /// @param mode Floating-point math mode.
+    graph(engine::kind engine_kind, fpmath_mode mode) {
+        dnnl_graph_graph_t c_graph = nullptr;
+        const dnnl_status_t st = dnnl_graph_graph_create_with_fpmath_mode(
+                &c_graph, convert_to_c(engine_kind), convert_to_c(mode));
+        error::wrap_c_api(
+                st, "could not create graph with engine kind and math mode");
+        reset(c_graph);
+    }
+
+    /// Sets the floating-point math mode for a graph. Users can enforce the
+    /// graph to comply with the mode by specifying a boolean flag with the
+    /// setter function.
+    ///
+    /// @param mode The floating-point math mode.
+    /// @param apply_to_int The flag that controls whether to use
+    ///     floating-point arithmetic for integral operations.
+    void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) {
+        const dnnl_status_t st = dnnl_graph_graph_set_fpmath_mode(
+                get(), convert_to_c(mode), apply_to_int);
+        error::wrap_c_api(st, "could not set fpmath mode graph attribute");
+    }
+
+    /// Gets the floating-point math mode and the boolean flag that specifies
+    /// whether the graph will be enforced to comply with the mode.
+    ///
+    /// @param mode Receives the floating-point math mode.
+    /// @param apply_to_int Receives the flag that controls whether to use
+    ///     floating-point arithmetic for integral operations.
+    void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const {
+        dnnl_fpmath_mode_t raw_mode;
+        int raw_flag;
+
+        const dnnl_status_t st
+                = dnnl_graph_graph_get_fpmath_mode(get(), &raw_mode, &raw_flag);
+        error::wrap_c_api(st, "could not get fpmath mode graph attribute");
+
+        mode = fpmath_mode(raw_mode);
+        apply_to_int = static_cast<bool>(raw_flag);
+    }
+
+    /// Adds an op into the graph to construct a computational DAG. The API will
+    /// return failure if the operator has already been added to the graph or
+    /// the operation cannot pass the schema check in the library (e.g. input
+    /// and output numbers and data types, the attributes of the operation).
+    ///
+    /// @param op An operation to be added.
+    /// @param allow_exception A flag indicating whether the method is allowed
+    ///     to throw an exception if it fails to add the op to the graph.
+    /// @returns #status::success or a status describing the error otherwise.
+    status add_op(const op &op, bool allow_exception = true) {
+        const dnnl_status_t c_status = dnnl_graph_add_op(get(), op.get());
+
+        // Without allow_exception the raw status is just handed back.
+        if (allow_exception)
+            error::wrap_c_api(c_status, "could not add op to the graph");
+
+        return static_cast<status>(c_status);
+    }
+
+    /// Finalizes a graph. It means users have finished adding operations into
+    /// the graph and the graph is ready for partitioning. Adding a new
+    /// operation into a finalized graph will return failures. Similarly,
+    /// partitioning an un-finalized graph will also return failures.
+    void finalize() {
+        const dnnl_status_t st = dnnl_graph_graph_finalize(get());
+        error::wrap_c_api(st, "could not finalize the graph");
+    }
+
+    /// Checks if a graph is finalized.
+    ///
+    /// @return True if the graph is finalized or false if the graph is not
+    ///     finalized.
+    bool is_finalized() const {
+        uint8_t flag = 0;
+        const dnnl_status_t st = dnnl_graph_graph_is_finalized(get(), &flag);
+        error::wrap_c_api(
+                st, "could not get the finalization status of the graph");
+        return flag != 0;
+    }
+
+    /// Gets filtered partitions from a graph. Partitions will be claimed
+    /// internally according to the capability of the library, the engine kind
+    /// of the graph, and the policy.
+    ///
+    /// @param policy Partition policy, defaults to policy
+    ///     #dnnl::graph::partition::policy::fusion.
+    /// @return A vector storing the partitions.
+    std::vector<partition> get_partitions(
+            partition::policy policy = partition::policy::fusion) {
+        // Partitioning is only defined for a finalized graph.
+        if (!is_finalized())
+            error::wrap_c_api(
+                    dnnl_invalid_graph, "the graph is not finalized yet");
+
+        const auto c_policy
+                = static_cast<dnnl_graph_partition_policy_t>(policy);
+        error::wrap_c_api(dnnl_graph_graph_filter(get(), c_policy),
+                "could not filter the graph");
+
+        size_t count = 0;
+        error::wrap_c_api(dnnl_graph_graph_get_partition_num(get(), &count),
+                "could not get number of partitions from the graph");
+
+        // Nothing to wrap when the graph produced no partitions.
+        if (count == 0) return {};
+
+        std::vector<partition> result;
+        result.reserve(count);
+
+        std::vector<dnnl_graph_partition_t> c_partitions(count);
+        error::wrap_c_api(dnnl_graph_graph_get_partitions(
+                                  get(), count, c_partitions.data()),
+                "could not get partitions from the graph");
+
+        // Each raw handle is wrapped through partition's handle constructor.
+        for (dnnl_graph_partition_t c_p : c_partitions)
+            result.emplace_back(c_p);
+
+        return result;
+    }
+
+private:
+    static dnnl_fpmath_mode_t convert_to_c(fpmath_mode amode) {
+        return static_cast<dnnl_fpmath_mode_t>(amode);
+    }
+
+    static dnnl_engine_kind_t convert_to_c(engine::kind ekind) {
+        return static_cast<dnnl_engine_kind_t>(ekind);
+    }
+};
+
+/// @} dnnl_graph_api_graph
+
+/// @addtogroup dnnl_graph_api_compiled_partition_cache Compiled Partition Cache
+///
+/// A set of functions that provide compiled partition cache control.
+///
+/// @{
+
+/// Returns how many compiled partitions the cache can hold at the same time.
+inline int get_compiled_partition_cache_capacity() {
+    int capacity = 0;
+    error::wrap_c_api(
+            dnnl_graph_get_compiled_partition_cache_capacity(&capacity),
+            "could not get compiled partition cache capacity");
+    return capacity;
+}
+
+/// @copydoc dnnl_graph_set_compiled_partition_cache_capacity(int capacity)
+inline void set_compiled_partition_cache_capacity(int capacity) {
+    const dnnl_status_t st
+            = dnnl_graph_set_compiled_partition_cache_capacity(capacity);
+    error::wrap_c_api(st, "could not set compiled partition cache capacity");
+}
+
+/// @} dnnl_graph_api_compiled_partition_cache
+
+/// @addtogroup dnnl_graph_api_constant_tensor_cache Constant Tensor Cache
+///
+/// A set of functions that provide constant tensor cache control
+///
+/// @{
+
+/// Enables or disables the constant tensor cache. This API must be called
+/// once before the compilation stage. By default, the constant tensor cache
+/// is disabled in the library.
+/// @note This API is deprecated and will be removed in a future release;
+/// please use the set_constant_tensor_cache_capacity API to disable the
+/// constant tensor cache by setting its capacity to zero.
+///
+/// @param flag Set to a positive value to enable the cache and to 0 to
+/// disable the cache. Negative values are invalid.
+inline void set_constant_tensor_cache(int flag) {
+    const dnnl_status_t st = dnnl_graph_set_constant_tensor_cache(flag);
+    error::wrap_c_api(st, "fail to set constant tensor cache");
+}
+
+/// Returns the enabling status of the constant tensor cache.
+/// @note This API is deprecated and will be removed in a future release;
+/// please use the get_constant_tensor_cache_capacity API to check the
+/// enabling status by checking its capacity.
+inline int get_constant_tensor_cache() {
+    int enabled = 0;
+    const dnnl_status_t st = dnnl_graph_get_constant_tensor_cache(&enabled);
+    error::wrap_c_api(st, "fail to get constant tensor cache");
+    return enabled;
+}
+
+/// Controls the capacity of the constant tensor cache used for a specific
+/// engine kind. This API is thread safe and can be called multiple times at
+/// runtime. The capacity is set to zero by default which means the cache is
+/// disabled. When calling this API, the corresponding cache will be flushed.
+/// Setting capacity to 0 means to clear all cached tensors and disable cache.
+/// Once the capacity limit is reached, no new tensors will be cached. If there
+/// are multiple devices for an engine kind, the capacity set here is for each
+/// device.
+///
+/// @param kind The engine kind that the constant tensor cache is used for.
+/// @param size The constant tensor cache capacity size to set.
+inline void set_constant_tensor_cache_capacity(engine::kind kind, size_t size) {
+    const dnnl_status_t st = dnnl_graph_set_constant_tensor_cache_capacity(
+            static_cast<dnnl_engine_kind_t>(kind), size);
+    error::wrap_c_api(st, "fail to set constant tensor cache capacity");
+}
+
+/// Returns the current capacity of the constant tensor cache.
+///
+/// @param kind The engine kind that the constant tensor cache is used for.
+inline size_t get_constant_tensor_cache_capacity(engine::kind kind) {
+    size_t capacity = 0;
+    const dnnl_status_t st = dnnl_graph_get_constant_tensor_cache_capacity(
+            static_cast<dnnl_engine_kind_t>(kind), &capacity);
+    error::wrap_c_api(st, "fail to get constant tensor cache capacity");
+    return capacity;
+}
+
+/// @} dnnl_graph_api_constant_tensor_cache
+
+} // namespace graph
+
+/// @} dnnl_graph_api
+
+} // namespace dnnl
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+/// oneAPI namespace
+// Exposes the ::dnnl namespace as oneapi::dnnl through a namespace alias.
+namespace oneapi {
+// Note: without this guard, doxygen warns of a potentially recursive namespace
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+/// oneDNN alias namespace
+namespace dnnl = ::dnnl;
+#endif
+} // namespace oneapi
+
+/// @endcond
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f33c90c5e5e130c982d3f2fad00559734261c15a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.h
@@ -0,0 +1,154 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_OCL_H
+#define ONEAPI_DNNL_DNNL_GRAPH_OCL_H
+
+#include "oneapi/dnnl/dnnl_graph.h"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Set target version for OpenCL explicitly to suppress a compiler warning.
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 120
+#endif
+
+#include <CL/cl.h>
+/// @endcond
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_interop
+/// @{
+
+/// @addtogroup dnnl_graph_api_ocl_interop
+/// @{
+
+/// Allocation call-back function interface for OpenCL. An OpenCL allocator
+/// should be used with the OpenCL GPU runtime. The call-back should return a
+/// USM device memory pointer.
+///
+/// @param size Memory size in bytes for the requested allocation
+/// @param alignment The minimum alignment in bytes for the requested allocation
+/// @param device A valid OpenCL device used for the allocation
+/// @param context A valid OpenCL context used for the allocation
+/// @returns The memory address of the requested USM allocation.
+typedef void *(*dnnl_graph_ocl_allocate_f)(
+        size_t size, size_t alignment, cl_device_id device, cl_context context);
+
+/// Deallocation call-back function interface for OpenCL. An OpenCL allocator
+/// should be used with the OpenCL runtime. The call-back should deallocate USM
+/// device memory returned by #dnnl_graph_ocl_allocate_f. The event should be
+/// completed before the USM allocation is deallocated.
+///
+/// @param buf The USM allocation to be released
+/// @param device A valid OpenCL device the USM allocation is associated with
+/// @param context A valid OpenCL context used to free the USM allocation
+/// @param event An event which the USM deallocation depends on
+typedef void (*dnnl_graph_ocl_deallocate_f)(
+        void *buf, cl_device_id device, cl_context context, cl_event event);
+
+/// Creates an allocator with the given allocation and deallocation call-back
+/// function pointers.
+///
+/// @param allocator Output allocator
+/// @param ocl_malloc Allocation call-back (#dnnl_graph_ocl_allocate_f)
+/// @param ocl_free Deallocation call-back (#dnnl_graph_ocl_deallocate_f)
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_ocl_interop_allocator_create(
+        dnnl_graph_allocator_t *allocator, dnnl_graph_ocl_allocate_f ocl_malloc,
+        dnnl_graph_ocl_deallocate_f ocl_free);
+
+/// This API is a supplement to the existing oneDNN engine API:
+/// dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create(
+///     dnnl_engine_t *engine, cl_device_id device, cl_context context);
+///
+/// @param engine Output engine.
+/// @param device Underlying OpenCL device to use for the engine.
+/// @param context Underlying OpenCL context to use for the engine.
+/// @param alloc Underlying allocator to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_graph_ocl_interop_make_engine_with_allocator(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context,
+        const_dnnl_graph_allocator_t alloc);
+
+/// This API is a supplement to the existing oneDNN engine API:
+/// dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob(
+///     dnnl_engine_t *engine, cl_device_id device, cl_context context,
+///     size_t size, const uint8_t *cache_blob);
+///
+/// @param engine Output engine.
+/// @param device The OpenCL device that this engine will encapsulate.
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @param alloc Underlying allocator to use for the engine.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API
+dnnl_graph_ocl_interop_make_engine_from_cache_blob_with_allocator(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context,
+        const_dnnl_graph_allocator_t alloc, size_t size,
+        const uint8_t *cache_blob);
+
+/// Executes a compiled partition with the OpenCL runtime.
+///
+/// @param compiled_partition The handle of the target compiled_partition.
+/// @param stream The stream used for execution
+/// @param num_inputs The number of input tensors
+/// @param inputs A list of input tensors
+/// @param num_outputs The number of output tensors
+/// @param outputs A non-empty list of output tensors
+/// @param deps Optional pointer to a list of `cl_event` dependencies.
+/// @param ndeps Number of dependencies.
+/// @param return_event The handle of cl_event.
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_ocl_interop_compiled_partition_execute(
+        const_dnnl_graph_compiled_partition_t compiled_partition,
+        dnnl_stream_t stream, size_t num_inputs,
+        const_dnnl_graph_tensor_t *inputs, size_t num_outputs,
+        const_dnnl_graph_tensor_t *outputs, const cl_event *deps, int ndeps,
+        cl_event *return_event);
+
+/// @} dnnl_graph_api_ocl_interop
+
+/// @} dnnl_graph_api_interop
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be50bd934d46d4cd1ef8178e4435ad405b862675
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_ocl.hpp
@@ -0,0 +1,161 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// Graph OpenCL interop API
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP
+#define ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <vector>
+
+#include <CL/cl.h>
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+#include "oneapi/dnnl/dnnl_graph_ocl.h"
+#include "oneapi/dnnl/dnnl_ocl.hpp"
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+namespace graph {
+
+/// @addtogroup dnnl_graph_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_graph_api_ocl_interop OpenCL interoperability API
+/// API extensions to interact with the underlying OpenCL run-time.
+/// @{
+
+/// OpenCL interoperability namespace
+namespace ocl_interop {
+
+/// Creates an allocator backed by user-provided OpenCL allocation and
+/// deallocation call-backs. Use an OpenCL allocator with the OpenCL GPU
+/// runtime. Currently, only device USM allocator is supported.
+///
+/// @param ocl_malloc The pointer to OpenCL malloc function
+/// @param ocl_free The pointer to OpenCL free function
+/// @returns Created allocator
+inline allocator make_allocator(dnnl_graph_ocl_allocate_f ocl_malloc,
+        dnnl_graph_ocl_deallocate_f ocl_free) {
+    dnnl_graph_allocator_t handle = nullptr;
+    const dnnl_status_t status = dnnl_graph_ocl_interop_allocator_create(
+            &handle, ocl_malloc, ocl_free);
+    error::wrap_c_api(status, "could not create allocator for opencl device");
+    return allocator(handle);
+}
+
+/// Constructs an engine from an OpenCL device, an OpenCL context, and an
+/// allocator.
+///
+/// @param device A valid OpenCL device to construct the engine.
+/// @param context A valid OpenCL context to construct the engine.
+/// @param alloc An allocator to associate with the engine.
+/// @returns Created engine.
+inline engine make_engine_with_allocator(
+        cl_device_id device, cl_context context, const allocator &alloc) {
+    dnnl_engine_t c_engine = nullptr; // never hold an indeterminate handle
+    error::wrap_c_api(dnnl_graph_ocl_interop_make_engine_with_allocator(
+                              &c_engine, device, context, alloc.get()),
+            "could not make an engine with allocator");
+    return engine(c_engine);
+}
+
+/// Constructs an engine from an OpenCL device, an OpenCL context, an
+/// allocator, and a serialized engine cache blob.
+///
+/// @param device A valid OpenCL device to construct the engine.
+/// @param context A valid OpenCL context to construct the engine.
+/// @param alloc An allocator to associate with the engine.
+/// @param cache_blob Cache blob serialized beforehand.
+/// @returns Created engine.
+inline engine make_engine_with_allocator(cl_device_id device,
+        cl_context context, const allocator &alloc,
+        const std::vector<uint8_t> &cache_blob) {
+    dnnl_engine_t c_engine = nullptr; // never hold an indeterminate handle
+    error::wrap_c_api(
+            dnnl_graph_ocl_interop_make_engine_from_cache_blob_with_allocator(
+                    &c_engine, device, context, alloc.get(), cache_blob.size(),
+                    cache_blob.data()),
+            "could not make an engine with allocator from cache blob");
+    return engine(c_engine);
+}
+
+/// Executes a compiled partition in a specified stream and returns an OpenCL
+/// event.
+///
+/// @param c_partition Compiled partition to execute.
+/// @param astream Stream object to run over.
+/// @param inputs Vector of input tensors.
+/// @param outputs Vector of output tensors.
+/// @param deps Optional vector with `cl_event` dependencies.
+/// @returns Output event.
+inline cl_event execute(compiled_partition &c_partition, stream &astream,
+        const std::vector<tensor> &inputs, std::vector<tensor> &outputs,
+        const std::vector<cl_event> &deps = {}) {
+    std::vector<const_dnnl_graph_tensor_t> c_inputs;
+    c_inputs.reserve(inputs.size());
+    for (const auto &in : inputs) {
+        c_inputs.push_back(in.get());
+    }
+    std::vector<const_dnnl_graph_tensor_t> c_outputs;
+    c_outputs.reserve(outputs.size());
+    for (const auto &out : outputs) {
+        c_outputs.push_back(out.get());
+    }
+
+    // An empty dependency list is handed to the C API as a null pointer.
+    const cl_event *c_deps = deps.empty() ? nullptr : deps.data();
+    cl_event ocl_event = nullptr; // passed by address as the output event
+    error::wrap_c_api(
+            dnnl_graph_ocl_interop_compiled_partition_execute(c_partition.get(),
+                    astream.get(), c_inputs.size(), c_inputs.data(),
+                    c_outputs.size(), c_outputs.data(), c_deps,
+                    static_cast<int>(deps.size()), &ocl_event),
+            "could not execute the compiled_partition on a specified opencl "
+            "stream");
+    return ocl_event;
+}
+
+} // namespace ocl_interop
+
+/// @} dnnl_graph_api_ocl_interop
+
+/// @} dnnl_graph_api_interop
+
+} // namespace graph
+
+/// @} dnnl_graph_api
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaf1380b364a74260f65707ba3082b872a2dce80
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h
@@ -0,0 +1,104 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_H
+#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_H
+
+#include "oneapi/dnnl/dnnl_graph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_interop
+/// @{
+
+/// @addtogroup dnnl_graph_api_sycl_interop
+/// @{
+
+/// Allocation call-back function interface for SYCL. SYCL allocator should be
+/// used for SYCL runtime and host allocator should be used for non-SYCL. The
+/// call-back should return a USM device memory pointer.
+typedef void *(*dnnl_graph_sycl_allocate_f)(
+        size_t size, size_t alignment, const void *dev, const void *context);
+
+/// Deallocation call-back function interface for SYCL. SYCL allocator should be
+/// used for SYCL runtime and host allocator should be used for non-SYCL. The
+/// call-back should deallocate a USM device memory returned by
+/// #dnnl_graph_sycl_allocate_f.
+typedef void (*dnnl_graph_sycl_deallocate_f)(
+        void *buf, const void *dev, const void *context, void *event);
+
+/// Creates an allocator with the given allocation and deallocation call-back
+/// function pointers.
+///
+/// @param allocator Output allocator
+/// @param sycl_malloc A pointer to SYCL malloc function
+/// @param sycl_free A pointer to SYCL free function
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_allocator_create(
+        dnnl_graph_allocator_t *allocator,
+        dnnl_graph_sycl_allocate_f sycl_malloc,
+        dnnl_graph_sycl_deallocate_f sycl_free);
+
+/// Supplements the oneDNN engine API: makes an engine with a graph allocator.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_make_engine_with_allocator(
+        dnnl_engine_t *engine, const void *device, const void *context,
+        const_dnnl_graph_allocator_t alloc);
+
+/// Execute a compiled partition with SYCL runtime.
+///
+/// @param compiled_partition The handle of target compiled_partition.
+/// @param stream The stream used for execution.
+/// @param num_inputs The number of input tensors.
+/// @param inputs A list of input tensors.
+/// @param num_outputs The number of output tensors.
+/// @param outputs A non-empty list of output tensors.
+/// @param deps Optional handle of list with `sycl::event` dependencies.
+/// @param sycl_event The handle of the output `sycl::event`.
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_compiled_partition_execute(
+        const_dnnl_graph_compiled_partition_t compiled_partition,
+        dnnl_stream_t stream, size_t num_inputs,
+        const_dnnl_graph_tensor_t *inputs, size_t num_outputs,
+        const_dnnl_graph_tensor_t *outputs, const void *deps, void *sycl_event);
+
+/// @} dnnl_graph_api_sycl_interop
+
+/// @} dnnl_graph_api_interop
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bfa589ed0f739429711eadd36418391d18f2033
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp
@@ -0,0 +1,136 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// Graph SYCL interop API
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP
+#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#error "Unsupported compiler"
+#endif
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+#include "oneapi/dnnl/dnnl_graph_sycl.h"
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+namespace graph {
+
+/// @addtogroup dnnl_graph_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_graph_api_sycl_interop SYCL interoperability API
+/// API extensions to interact with the underlying SYCL run-time.
+/// @{
+
+/// SYCL interoperability namespace
+namespace sycl_interop {
+
+/// Creates an allocator backed by user-provided SYCL allocation and
+/// deallocation call-backs. Use a SYCL allocator with the SYCL runtime and a
+/// host allocator otherwise. Currently, only device USM allocator is supported.
+///
+/// @param sycl_malloc The pointer to SYCL malloc function
+/// @param sycl_free The pointer to SYCL free function
+/// @returns Created allocator
+inline allocator make_allocator(dnnl_graph_sycl_allocate_f sycl_malloc,
+        dnnl_graph_sycl_deallocate_f sycl_free) {
+    dnnl_graph_allocator_t handle = nullptr;
+    const dnnl_status_t status = dnnl_graph_sycl_interop_allocator_create(
+            &handle, sycl_malloc, sycl_free);
+    error::wrap_c_api(status, "could not create allocator for sycl device");
+    return allocator(handle);
+}
+
+/// Constructs an engine from a SYCL device, a SYCL context, and an allocator.
+inline engine make_engine_with_allocator(const sycl::device &adevice,
+        const sycl::context &acontext, const allocator &alloc) {
+    dnnl_engine_t c_engine = nullptr; // never hold an indeterminate handle
+    // The C API takes the SYCL objects by address as `const void *`.
+    error::wrap_c_api(dnnl_graph_sycl_interop_make_engine_with_allocator(
+                              &c_engine, &adevice, &acontext, alloc.get()),
+            "could not make an engine with allocator");
+    return engine(c_engine);
+}
+
+/// Runs a compiled partition on the given stream and returns the SYCL event
+/// produced by the execution call.
+///
+/// @param c_partition Compiled partition to execute.
+/// @param astream Stream object to run over.
+/// @param inputs Vector of input tensors.
+/// @param outputs Vector of output tensors.
+/// @param deps Optional vector with `sycl::event` dependencies.
+/// @returns Output event.
+inline sycl::event execute(compiled_partition &c_partition, stream &astream,
+        const std::vector<tensor> &inputs, std::vector<tensor> &outputs,
+        const std::vector<sycl::event> &deps = {}) {
+    // Unwrap the C++ tensors into the raw handle arrays the C API consumes.
+    std::vector<const_dnnl_graph_tensor_t> in_handles, out_handles;
+    in_handles.reserve(inputs.size());
+    out_handles.reserve(outputs.size());
+    for (const tensor &t : inputs)
+        in_handles.push_back(t.get());
+    for (const tensor &t : outputs)
+        out_handles.push_back(t.get());
+
+    sycl::event done;
+    const dnnl_status_t status
+            = dnnl_graph_sycl_interop_compiled_partition_execute(
+                    c_partition.get(), astream.get(), in_handles.size(),
+                    in_handles.data(), out_handles.size(), out_handles.data(),
+                    &deps, &done);
+    error::wrap_c_api(status,
+            "could not execute the compiled_partition on a specified sycl "
+            "stream");
+    return done;
+}
+
+} // namespace sycl_interop
+
+/// @} dnnl_graph_api_sycl_interop
+
+/// @} dnnl_graph_api_interop
+
+} // namespace graph
+
+/// @} dnnl_graph_api
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..421b3db10427d3536a4031f363e8af3b7d358269
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h
@@ -0,0 +1,480 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+ * Copyright 2020-2025 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+/// @file
+/// C API definitions
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_TYPES_H
+#define ONEAPI_DNNL_DNNL_GRAPH_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <limits.h>
+#include <stddef.h>
+
+#include "oneapi/dnnl/dnnl_common_types.h"
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_logical_tensor
+/// @{
+
+/// A wildcard value for number of dimensions which is unknown at a tensor or
+/// operation creation time.
+#define DNNL_GRAPH_UNKNOWN_NDIMS -1
+
+/// A wildcard value for dimensions that are unknown at a tensor or operation
+/// creation time.
+#define DNNL_GRAPH_UNKNOWN_DIM INT64_MIN
+
+/// Layout type specification
+typedef enum {
+    /// Undefined layout type
+    dnnl_graph_layout_type_undef = 0,
+    /// Any means that the library decides the layout for a tensor during
+    /// partition compilation.
+    dnnl_graph_layout_type_any = 1,
+    /// Strided means that the layout of a tensor is determined by the strides
+    /// field in the logical tensor.
+    dnnl_graph_layout_type_strided = 2,
+    /// Opaque means that the layout of a tensor is library specific.
+    /// Usually, an opaque layout is generated by a partition which is compiled
+    /// with layout type any.
+    dnnl_graph_layout_type_opaque = 3,
+} dnnl_graph_layout_type_t;
+
+/// Logical tensor property
+typedef enum {
+    /// Undefined tensor property
+    dnnl_graph_tensor_property_undef = 0,
+    /// Variable means the tensor may be changed during computation or between
+    /// different iterations.
+    dnnl_graph_tensor_property_variable = 1,
+    /// Constant means the tensor will remain unchanged during computation and
+    /// between different iterations. It's useful for the library to apply
+    /// optimizations for constant tensors or cache constant tensors inside the
+    /// library. For example, constant weight tensors in inference scenarios.
+    dnnl_graph_tensor_property_constant = 2,
+} dnnl_graph_tensor_property_t;
+
+/// Logical tensor. It is based on an ID, a number of dimensions, dimensions
+/// themselves, element data type, tensor property and tensor memory layout.
+typedef struct {
+    /// Unique id of each logical tensor. The library uses logical tensor IDs to
+    /// build up the connections between operations if the output of one
+    /// operation has the same ID as the input of another operation.
+    size_t id;
+
+    /// Number of dimensions. -1 means unknown (DNNL_GRAPH_UNKNOWN_NDIMS). 0 is
+    /// used to define a scalar tensor.
+    int ndims;
+
+    /// Size of each dimension. #DNNL_GRAPH_UNKNOWN_DIM means the size of that
+    /// dimension is unknown. 0 is used to define a zero-dimension tensor. The
+    /// library is able to deduce output shapes according to input shapes
+    /// during compilation. Unlike memory descriptor in oneDNN primitive API,
+    /// the order of dimensions is not defined in logical tensor. It is defined
+    /// by the operations which respect the order through the attributes
+    /// #dnnl_graph_op_attr_data_format or #dnnl_graph_op_attr_weights_format.
+    /// For example, for a Convolution with `data_format=NXC`, it means the
+    /// first element of dims of activation tensor is mini-batch size, the last
+    /// effective element of dims is channel size, and other elements between
+    /// them are spatial dimensions.
+    dnnl_dims_t dims;
+
+    /// Data type of the tensor elements.
+    dnnl_data_type_t data_type;
+
+    /// Property type of the tensor.
+    dnnl_graph_tensor_property_t property;
+
+    /// Layout type of the tensor.
+    dnnl_graph_layout_type_t layout_type;
+    union {
+        /// The field is valid when `layout_type` is
+        /// #dnnl_graph_layout_type_strided. #DNNL_GRAPH_UNKNOWN_DIM means the
+        /// stride of the dimension is unknown. The library currently doesn't
+        /// support other negative stride values.
+        dnnl_dims_t strides;
+
+        /// The field is valid when `layout_type` is
+        /// #dnnl_graph_layout_type_opaque. An opaque layout ID is usually
+        /// generated by a partition which is compiled with layout type any.
+        size_t layout_id;
+    } layout;
+} dnnl_graph_logical_tensor_t;
+
+/// @} dnnl_graph_api_logical_tensor
+
+/// @addtogroup dnnl_graph_api_partition
+/// @{
+
+/// Policy specifications for partitioning
+typedef enum {
+    /// Fusion policy returns partitions with typical post-op fusions, e.g.
+    /// Convolution + ReLU or other element-wise operations or a chain of
+    /// post-ops.
+    dnnl_graph_partition_policy_fusion = 1,
+    /// Debug policy does not apply any fusions. It returns partitions with
+    /// a single operation in each partition. The policy is useful when users
+    /// notice any bug or correctness issue in fusion policy.
+    dnnl_graph_partition_policy_debug = 2,
+} dnnl_graph_partition_policy_t;
+
+/// An opaque structure to describe a partition.
+struct dnnl_graph_partition;
+
+/// A partition handle.
+typedef struct dnnl_graph_partition *dnnl_graph_partition_t;
+
+/// A constant partition handle.
+typedef const struct dnnl_graph_partition *const_dnnl_graph_partition_t;
+
+/// @} dnnl_graph_api_partition
+
+/// @addtogroup dnnl_graph_api_graph
+/// @{
+
+/// An opaque structure to describe a graph.
+struct dnnl_graph_graph;
+
+/// A graph handle.
+typedef struct dnnl_graph_graph *dnnl_graph_graph_t;
+
+/// A constant graph handle.
+typedef const struct dnnl_graph_graph *const_dnnl_graph_graph_t;
+
+/// @} dnnl_graph_api_graph
+
+/// @addtogroup dnnl_graph_api_op
+/// @{
+
+/// Kinds of operations. Values are implicit and follow declaration order.
+typedef enum {
+    dnnl_graph_op_abs,
+    dnnl_graph_op_abs_backward,
+    dnnl_graph_op_add,
+    dnnl_graph_op_avg_pool,
+    dnnl_graph_op_avg_pool_backward,
+    dnnl_graph_op_batch_norm_backward,
+    dnnl_graph_op_batch_norm_forward_training,
+    dnnl_graph_op_batch_norm_inference,
+    dnnl_graph_op_bias_add,
+    dnnl_graph_op_bias_add_backward,
+    dnnl_graph_op_clamp,
+    dnnl_graph_op_clamp_backward,
+    dnnl_graph_op_concat,
+    dnnl_graph_op_convolution,
+    dnnl_graph_op_convolution_backward_data,
+    dnnl_graph_op_convolution_backward_weights,
+    dnnl_graph_op_conv_transpose,
+    dnnl_graph_op_conv_transpose_backward_data,
+    dnnl_graph_op_conv_transpose_backward_weights,
+    dnnl_graph_op_dequantize,
+    dnnl_graph_op_divide,
+    dnnl_graph_op_dynamic_dequantize,
+    dnnl_graph_op_dynamic_quantize,
+    dnnl_graph_op_elu,
+    dnnl_graph_op_elu_backward,
+    dnnl_graph_op_end,
+    dnnl_graph_op_exp,
+    dnnl_graph_op_gelu,
+    dnnl_graph_op_gelu_backward,
+    dnnl_graph_op_hard_swish,
+    dnnl_graph_op_hard_swish_backward,
+    dnnl_graph_op_interpolate,
+    dnnl_graph_op_interpolate_backward,
+    dnnl_graph_op_layer_norm,
+    dnnl_graph_op_layer_norm_backward,
+    dnnl_graph_op_leaky_relu,
+    dnnl_graph_op_log,
+    dnnl_graph_op_log_softmax,
+    dnnl_graph_op_log_softmax_backward,
+    dnnl_graph_op_matmul,
+    dnnl_graph_op_maximum,
+    dnnl_graph_op_max_pool,
+    dnnl_graph_op_max_pool_backward,
+    dnnl_graph_op_minimum,
+    dnnl_graph_op_mish,
+    dnnl_graph_op_mish_backward,
+    dnnl_graph_op_multiply,
+    dnnl_graph_op_prelu,
+    dnnl_graph_op_prelu_backward,
+    dnnl_graph_op_quantize,
+    dnnl_graph_op_reciprocal,
+    dnnl_graph_op_reduce_l1,
+    dnnl_graph_op_reduce_l2,
+    dnnl_graph_op_reduce_max,
+    dnnl_graph_op_reduce_mean,
+    dnnl_graph_op_reduce_min,
+    dnnl_graph_op_reduce_prod,
+    dnnl_graph_op_reduce_sum,
+    dnnl_graph_op_relu,
+    dnnl_graph_op_relu_backward,
+    dnnl_graph_op_reorder,
+    dnnl_graph_op_round,
+    dnnl_graph_op_sigmoid,
+    dnnl_graph_op_sigmoid_backward,
+    dnnl_graph_op_softmax,
+    dnnl_graph_op_softmax_backward,
+    dnnl_graph_op_softplus,
+    dnnl_graph_op_softplus_backward,
+    dnnl_graph_op_sqrt,
+    dnnl_graph_op_sqrt_backward,
+    dnnl_graph_op_square,
+    dnnl_graph_op_squared_difference,
+    dnnl_graph_op_static_reshape,
+    dnnl_graph_op_static_transpose,
+    dnnl_graph_op_subtract,
+    dnnl_graph_op_tanh,
+    dnnl_graph_op_tanh_backward,
+    dnnl_graph_op_type_cast,
+    dnnl_graph_op_wildcard,
+    dnnl_graph_op_hard_sigmoid,
+    dnnl_graph_op_hard_sigmoid_backward,
+    dnnl_graph_op_select,
+    dnnl_graph_op_pow,
+    dnnl_graph_op_group_norm,
+    dnnl_graph_op_gen_index,
+    dnnl_graph_op_greater_equal,
+    dnnl_graph_op_last_symbol,
+} dnnl_graph_op_kind_t;
+
+/// Attributes of operations
+typedef enum {
+    /// Undefined op attribute.
+    dnnl_graph_op_attr_undef = 0,
+
+    // float32 attributes. The value of these attributes can be any single
+    // float32 number.
+
+    /// Specifies an alpha attribute to an op.
+    dnnl_graph_op_attr_alpha = 0x1,
+    /// Specifies a beta attribute to an op.
+    dnnl_graph_op_attr_beta,
+    /// Specifies an epsilon attribute to an op.
+    dnnl_graph_op_attr_epsilon,
+    /// Specifies a max attribute to an op.
+    dnnl_graph_op_attr_max,
+    /// Specifies a min attribute to an op.
+    dnnl_graph_op_attr_min,
+    /// Specifies a momentum attribute to an op.
+    dnnl_graph_op_attr_momentum,
+
+    // float32 vector attributes. The value of these attributes can be a vector
+    // of float32 numbers.
+
+    /// Specifies a scales attribute to an op.
+    dnnl_graph_op_attr_scales = 0x20,
+
+    // int64_t attributes. The value of these attributes can be any single int64
+    // number.
+
+    /// Specifies an axis attribute to an op.
+    dnnl_graph_op_attr_axis = 0x30,
+    /// Specifies a begin_norm_axis attribute to an op.
+    dnnl_graph_op_attr_begin_norm_axis,
+    /// Specifies a groups attribute to an op.
+    dnnl_graph_op_attr_groups,
+
+    // int64_t vector attributes. The value of these attributes can be a vector
+    // of int64 numbers.
+
+    /// Specifies an axes attribute to an op.
+    dnnl_graph_op_attr_axes = 0x40,
+    /// Specifies a dilations attribute to an op.
+    dnnl_graph_op_attr_dilations,
+    /// Specifies a dst_shape attribute to an op.
+    dnnl_graph_op_attr_dst_shape,
+    /// Specifies a kernel attribute to an op.
+    dnnl_graph_op_attr_kernel,
+    /// Specifies an order attribute to an op.
+    dnnl_graph_op_attr_order,
+    /// Specifies an output_padding attribute to an op.
+    dnnl_graph_op_attr_output_padding,
+    /// Specifies a pads_begin attribute to an op.
+    dnnl_graph_op_attr_pads_begin,
+    /// Specifies a pads_end attribute to an op.
+    dnnl_graph_op_attr_pads_end,
+    /// Specifies a shape attribute to an op.
+    dnnl_graph_op_attr_shape,
+    /// Specifies a sizes attribute to an op.
+    dnnl_graph_op_attr_sizes,
+    /// Specifies a src_shape attribute to an op.
+    dnnl_graph_op_attr_src_shape,
+    /// Specifies a strides attribute to an op.
+    dnnl_graph_op_attr_strides,
+    /// Specifies a weights_shape attribute to an op.
+    dnnl_graph_op_attr_weights_shape,
+    /// Specifies a zps attribute to an op.
+    dnnl_graph_op_attr_zps,
+    /// Specifies a group shape attribute to an op.
+    dnnl_graph_op_attr_group_shape,
+
+    // bool attributes. The value of these attributes can be any single bool
+    // value.
+
+    /// Specifies an exclude_pad attribute to an op.
+    dnnl_graph_op_attr_exclude_pad = 0x60,
+    /// Specifies a keep_dims attribute to an op.
+    dnnl_graph_op_attr_keep_dims,
+    /// Specifies a keep_stats attribute to an op.
+    dnnl_graph_op_attr_keep_stats,
+    /// Specifies a per_channel_broadcast attribute to an op.
+    dnnl_graph_op_attr_per_channel_broadcast,
+    /// Specifies a special_zero attribute to an op.
+    dnnl_graph_op_attr_special_zero,
+    /// Specifies a transpose_a attribute to an op.
+    dnnl_graph_op_attr_transpose_a,
+    /// Specifies a transpose_b attribute to an op.
+    dnnl_graph_op_attr_transpose_b,
+    /// Specifies a use_affine attribute to an op.
+    dnnl_graph_op_attr_use_affine,
+    /// Specifies a use_dst attribute to an op.
+    dnnl_graph_op_attr_use_dst,
+
+    // string attributes. The value of these attributes can be a string.
+
+    /// Specifies an auto_broadcast attribute to an op. The value can be "none"
+    /// or "numpy".
+    dnnl_graph_op_attr_auto_broadcast = 0x80,
+    /// Specifies an auto_pad attribute to an op. The value can be "none",
+    /// "same_upper", "same_lower", or "valid".
+    dnnl_graph_op_attr_auto_pad,
+    /// Specifies a coordinate_transformation_mode attribute to an op. The
+    /// value can be "half_pixel" or "align_corners". The attribute is defined
+    /// for Interpolate operations.
+    dnnl_graph_op_attr_coordinate_transformation_mode,
+    /// Specifies a data_format of an op. The value can be "NCX" or "NXC".
+    dnnl_graph_op_attr_data_format,
+    /// Specifies a mode attribute of an op. The value can be "nearest",
+    /// "linear", "bilinear", or "trilinear". The attribute is defined for
+    /// Interpolate operations.
+    dnnl_graph_op_attr_mode,
+    /// Specifies a qtype attribute to an op. The value can be "per_channel" or
+    /// "per_tensor". The attribute is defined for quantization operations.
+    dnnl_graph_op_attr_qtype,
+    /// Specifies a rounding_type attribute to an op. The value can be "ceil" or
+    /// "floor".
+    dnnl_graph_op_attr_rounding_type,
+    /// Specifies a weights_format of an op. The value can be "OIX", "XIO",
+    /// "IOX", or "XOI". Different operations may support different values.
+    dnnl_graph_op_attr_weights_format,
+
+    /// Specifies the end of all above external attributes for check.
+    dnnl_graph_op_attr_end = 0xFF,
+} dnnl_graph_op_attr_t;
+
+/// An opaque structure to describe an operation.
+struct dnnl_graph_op;
+
+/// An operation handle.
+typedef struct dnnl_graph_op *dnnl_graph_op_t;
+
+/// A constant operation handle.
+typedef const struct dnnl_graph_op *const_dnnl_graph_op_t;
+
+/// @} dnnl_graph_api_op
+
+/// @addtogroup dnnl_graph_api_allocator
+/// @{
+
+/// Allocation call-back function interface for host. For device allocators,
+/// see #dnnl_graph_sycl_allocate_f and #dnnl_graph_ocl_allocate_f.
+typedef void *(*dnnl_graph_host_allocate_f)(size_t size, size_t alignment);
+
+/// Deallocation call-back function interface for host. For device allocators,
+/// see #dnnl_graph_sycl_deallocate_f and #dnnl_graph_ocl_deallocate_f.
+typedef void (*dnnl_graph_host_deallocate_f)(void *);
+
+/// An opaque structure to describe an allocator.
+struct dnnl_graph_allocator;
+
+/// An allocator handle.
+typedef struct dnnl_graph_allocator *dnnl_graph_allocator_t;
+
+/// A constant allocator handle.
+typedef const struct dnnl_graph_allocator *const_dnnl_graph_allocator_t;
+
+/// @} dnnl_graph_api_allocator
+
+/// @addtogroup dnnl_graph_api_compiled_partition
+/// @{
+
+/// In-place pair definition. It can be queried from a compiled partition
+/// indicating that an input and an output of the partition can share the same
+/// memory buffer for computation. In-place computation helps to reduce the
+/// memory footprint and improves cache locality. But since the library may not
+/// have a global view of user's application, it's possible that the tensor with
+/// `input_id` is used at other places in user's computation graph. In this
+/// case, the user should take the in-place pair as a hint and pass a different
+/// memory buffer for output tensor to avoid overwriting the input memory buffer
+/// which will probably cause unexpected incorrect results.
+typedef struct {
+    /// The id of input tensor
+    size_t input_id;
+
+    /// The id of output tensor
+    size_t output_id;
+} dnnl_graph_inplace_pair_t;
+
+/// An opaque structure to describe a compiled partition.
+struct dnnl_graph_compiled_partition;
+
+/// A compiled partition handle.
+typedef struct dnnl_graph_compiled_partition *dnnl_graph_compiled_partition_t;
+
+/// A constant compiled partition handle.
+typedef const struct dnnl_graph_compiled_partition
+        *const_dnnl_graph_compiled_partition_t;
+
+/// @} dnnl_graph_api_compiled_partition
+
+/// @addtogroup dnnl_graph_api_tensor
+/// @{
+
+/// An opaque structure to describe a tensor.
+struct dnnl_graph_tensor;
+
+/// A tensor handle.
+typedef struct dnnl_graph_tensor *dnnl_graph_tensor_t;
+
+/// A constant tensor handle.
+typedef const struct dnnl_graph_tensor *const_dnnl_graph_tensor_t;
+
+/// @} dnnl_graph_api_tensor
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7e33c54c3d7bcdb98a0908a8a7ceade34a3d485
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h
@@ -0,0 +1,281 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_H
+#define ONEAPI_DNNL_DNNL_OCL_H
+
+#include "oneapi/dnnl/dnnl.h"
+
+#include "oneapi/dnnl/dnnl_ocl_types.h"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Set target version for OpenCL explicitly to suppress a compiler warning.
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 120
+#endif
+
+#include <CL/cl.h>
+/// @endcond
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop
+/// @{
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl_memory_set_data_handle() has been called, if @p memory_kind is equal
+///   to dnnl_ocl_interop_usm, or
+/// - dnnl_ocl_interop_memory_set_mem_object() has been called, if @p memory_kind
+///   is equal to dnnl_ocl_interop_buffer.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_ocl_interop_memory_kind_t memory_kind, void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_ocl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param memory Memory to query.
+/// @param memory_kind Output underlying memory allocation kind of the memory
+///     object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_memory_kind(
+        const_dnnl_memory_t memory,
+        dnnl_ocl_interop_memory_kind_t *memory_kind);
+
+/// Returns an OpenCL memory object associated with a memory object.
+///
+/// @param memory Memory object.
+/// @param mem_object Output OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object(
+        const_dnnl_memory_t memory, cl_mem *mem_object);
+
+/// Sets OpenCL memory object associated with a memory object.
+///
+/// For behavioral details, see dnnl_memory_set_data_handle().
+///
+/// @param memory Memory object.
+/// @param mem_object OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object(
+        dnnl_memory_t memory, cl_mem mem_object);
+
+/// Retrieves a cache blob ID for the OpenCL device.
+///
+/// @warning
+///     This API is intended to be used with
+///     #dnnl_ocl_interop_engine_get_cache_blob() and
+///     #dnnl_ocl_interop_engine_create_from_cache_blob(). The returned cache
+///     blob ID can only be used as an ID of the cache blob returned by
+///     #dnnl_ocl_interop_engine_get_cache_blob().
+///
+/// @note The cache blob ID can be empty (@p size will be 0 and
+///     @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
+///     put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
+///     return an empty cache blob).
+///
+/// @param device An OpenCL device.
+/// @param size Size of the cache blob ID in bytes.
+/// @param cache_blob_id Cache blob id of size @p size. If
+///     the @p cache_blob_id is nullptr then the size of the cache blob ID is
+///     returned in @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob_id(
+        cl_device_id device, size_t *size, uint8_t *cache_blob_id);
+
+/// Retrieves a cache blob associated with the given engine.
+///
+/// @note The cache blob can be empty (@p size will be 0 and @p cache_blob
+///     will be nullptr) if oneDNN doesn't have anything to put in the cache
+///     blob. It's the user's responsibility to check whether it's empty
+///     prior to passing it to
+///     #dnnl_ocl_interop_engine_create_from_cache_blob().
+///
+/// @param engine Engine to query for the cache blob.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is
+///     nullptr then the size of the cache blob is returned in @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob(
+        dnnl_engine_t engine, size_t *size, uint8_t *cache_blob);
+
+/// Creates an engine from the given cache blob.
+///
+/// @param engine Output engine.
+/// @param device The OpenCL device that this engine will encapsulate.
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context,
+        size_t size, const uint8_t *cache_blob);
+
+/// Creates an engine associated with an OpenCL device and an OpenCL context.
+///
+/// @param engine Output engine.
+/// @param device Underlying OpenCL device to use for the engine.
+/// @param context Underlying OpenCL context to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context);
+
+/// Returns the OpenCL context associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param context Output underlying OpenCL context of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_context(
+        dnnl_engine_t engine, cl_context *context);
+
+/// Returns the OpenCL device associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param device Output underlying OpenCL device of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_get_device(
+        dnnl_engine_t engine, cl_device_id *device);
+
+/// Creates an execution stream for a given engine associated with
+/// an OpenCL command queue.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param queue OpenCL command queue to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, cl_command_queue queue);
+
+/// Returns the OpenCL command queue associated with an execution stream.
+///
+/// @param stream Execution stream to query.
+/// @param queue Output OpenCL command queue.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_get_command_queue(
+        dnnl_stream_t stream, cl_command_queue *queue);
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns an OpenCL event.
+///
+/// @param primitive Primitive to execute.
+/// @param stream Stream to use.
+/// @param nargs Number of arguments.
+/// @param args Array of arguments. Each argument is an
+///     <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
+///     values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
+///     #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
+///     descriptor as that returned by
+///     #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
+/// @param deps A pointer to a vector of size @p ndeps that contains
+///     dependencies.
+/// @param ndeps Number of dependencies.
+/// @param return_event Output event. It's the user's responsibility to
+///     manage lifetime of the event. Can be NULL. When @p stream is in-order
+///     NULL will be returned.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_primitive_execute(
+        const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs,
+        const dnnl_exec_arg_t *args, const cl_event *deps, int ndeps,
+        cl_event *return_event);
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc8258fe2420873476069ec78bdc13dc53ffef9c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp
@@ -0,0 +1,450 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_HPP
+#define ONEAPI_DNNL_DNNL_OCL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "oneapi/dnnl/dnnl_ocl.h"
+
+#include <CL/cl.h>
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop OpenCL interoperability API
+/// API extensions to interact with the underlying OpenCL run-time.
+///
+/// @sa @ref dev_guide_opencl_interoperability in developer guide
+/// @{
+
+/// OpenCL interoperability namespace
+namespace ocl_interop {
+
+/// Memory allocation kind; each enumerator takes its value from the C API.
+enum class memory_kind {
+    /// USM (device, shared, host, or unknown) memory allocation kind.
+    usm = dnnl_ocl_interop_usm,
+    /// Buffer memory allocation kind - default.
+    buffer = dnnl_ocl_interop_buffer,
+};
+
+/// Maps a C++ API memory allocation kind onto its C API counterpart.
+/// @param akind Memory allocation kind expressed as the C++ API enum.
+/// @returns The same kind as a C API enum value (a plain value cast).
+inline dnnl_ocl_interop_memory_kind_t convert_to_c(memory_kind akind) {
+    const auto c_kind = static_cast<dnnl_ocl_interop_memory_kind_t>(akind);
+    return c_kind;
+}
+
+/// Queries the cache blob ID of an OpenCL device.
+///
+/// @warning
+///     This function is meant to be paired with
+///     #dnnl::ocl_interop::get_engine_cache_blob() and
+///     #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &).
+///     The ID it returns is only meaningful as the ID of a cache blob
+///     obtained from #dnnl::ocl_interop::get_engine_cache_blob().
+///
+/// @note The returned vector can be empty when oneDNN has nothing to put
+///     in the cache blob (in that case
+///     #dnnl_ocl_interop_engine_get_cache_blob will return an empty cache
+///     blob).
+///
+/// @param device An OpenCL device.
+/// @returns A vector containing the cache blob ID.
+inline std::vector<uint8_t> get_engine_cache_blob_id(cl_device_id device) {
+    // First call: a null output pointer asks only for the required length.
+    size_t id_bytes = 0;
+    dnnl_status_t status = dnnl_ocl_interop_engine_get_cache_blob_id(
+            device, &id_bytes, nullptr);
+    error::wrap_c_api(status, "could not get an engine cache blob id size");
+    std::vector<uint8_t> id(id_bytes);
+    status = dnnl_ocl_interop_engine_get_cache_blob_id(
+            device, &id_bytes, id.data());
+    error::wrap_c_api(status, "could not get an engine cache blob id");
+    return id;
+}
+
+/// Retrieves the cache blob of an engine.
+///
+/// @note The returned vector can be empty when oneDNN has nothing to put in
+///     the cache blob. Callers are responsible for checking for emptiness
+///     before handing the blob to
+///     #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &)
+///
+/// @param aengine Engine whose cache blob is requested.
+/// @returns A vector holding the cache blob.
+inline std::vector<uint8_t> get_engine_cache_blob(const engine &aengine) {
+    // A null output pointer makes the C API report only the blob length.
+    size_t blob_bytes = 0;
+    dnnl_status_t status = dnnl_ocl_interop_engine_get_cache_blob(
+            aengine.get(), &blob_bytes, nullptr);
+    error::wrap_c_api(status, "could not get an engine cache blob size");
+    std::vector<uint8_t> blob(blob_bytes);
+    status = dnnl_ocl_interop_engine_get_cache_blob(
+            aengine.get(), &blob_bytes, blob.data());
+    error::wrap_c_api(status, "could not get an engine cache blob");
+    return blob;
+}
+
+/// Builds an engine for an OpenCL device/context pair from a cache blob.
+///
+/// @param device OpenCL device the engine will encapsulate.
+/// @param context OpenCL context (containing the device) the engine will
+///     use for all operations.
+/// @param cache_blob Cache blob.
+/// @returns An engine.
+inline engine make_engine(cl_device_id device, cl_context context,
+        const std::vector<uint8_t> &cache_blob) {
+    dnnl_engine_t handle;
+    const dnnl_status_t status
+            = dnnl_ocl_interop_engine_create_from_cache_blob(&handle, device,
+                    context, cache_blob.size(), cache_blob.data());
+    error::wrap_c_api(status, "could not create an engine from cache blob");
+    return engine(handle);
+}
+
+/// Builds an engine on top of an OpenCL device and context.
+///
+/// @param device OpenCL device the engine will encapsulate.
+/// @param context OpenCL context (containing the device) the engine will
+///     use for all operations.
+/// @returns An engine.
+inline engine make_engine(cl_device_id device, cl_context context) {
+    dnnl_engine_t handle;
+    const dnnl_status_t status
+            = dnnl_ocl_interop_engine_create(&handle, device, context);
+    error::wrap_c_api(status, "could not create an engine");
+    return engine(handle);
+}
+
+/// Retrieves the OpenCL context an engine is associated with.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL context.
+inline cl_context get_context(const engine &aengine) {
+    cl_context ocl_context = nullptr;
+    const dnnl_status_t status
+            = dnnl_ocl_interop_engine_get_context(aengine.get(), &ocl_context);
+    error::wrap_c_api(status, "could not get an OpenCL context from an engine");
+    return ocl_context;
+}
+
+/// Retrieves the OpenCL device an engine is associated with.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL device.
+inline cl_device_id get_device(const engine &aengine) {
+    cl_device_id ocl_device = nullptr;
+    const auto status = dnnl_ocl_interop_get_device(aengine.get(), &ocl_device);
+    error::wrap_c_api(status, "could not get an OpenCL device from an engine");
+    return ocl_device;
+}
+
+/// Builds an execution stream on an engine around an OpenCL command queue.
+///
+/// @param aengine Engine to create the stream on.
+/// @param queue OpenCL queue to use for the stream.
+/// @returns An execution stream.
+inline stream make_stream(const engine &aengine, cl_command_queue queue) {
+    dnnl_stream_t handle;
+    const dnnl_status_t status
+            = dnnl_ocl_interop_stream_create(&handle, aengine.get(), queue);
+    error::wrap_c_api(status, "could not create a stream");
+    return stream(handle);
+}
+
+/// Retrieves the OpenCL command queue an execution stream is associated with.
+///
+/// @param astream An execution stream.
+/// @returns Underlying OpenCL queue.
+inline cl_command_queue get_command_queue(const stream &astream) {
+    cl_command_queue handle = nullptr;
+    error::wrap_c_api(
+            dnnl_ocl_interop_stream_get_command_queue(astream.get(), &handle),
+            "could not get an OpenCL command queue from a stream");
+    return handle;
+}
+
+/// Returns the OpenCL memory object associated with the memory object.
+///
+/// @param amemory A memory object.
+/// @returns Underlying OpenCL memory object.
+inline cl_mem get_mem_object(const memory &amemory) {
+    cl_mem mem_object = nullptr; // defined even if the C API writes nothing
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_get_mem_object(amemory.get(), &mem_object),
+            "could not get OpenCL buffer object from a memory object");
+    return mem_object;
+}
+
+/// Attaches an OpenCL memory object to a memory object.
+///
+/// For behavioral details see memory::set_data_handle().
+///
+/// @param amemory A memory object.
+/// @param mem_object OpenCL cl_mem object to use as the underlying storage.
+///     It must have at least get_desc().get_size() bytes allocated.
+inline void set_mem_object(memory &amemory, cl_mem mem_object) {
+    const dnnl_status_t status
+            = dnnl_ocl_interop_memory_set_mem_object(amemory.get(), mem_object);
+    error::wrap_c_api(
+            status, "could not set OpenCL buffer object from a memory object");
+}
+
+/// Queries the memory allocation kind of a memory object.
+///
+/// @param amemory A memory object.
+///
+/// @returns The underlying memory allocation kind of the memory object.
+inline memory_kind get_memory_kind(const memory &amemory) {
+    dnnl_ocl_interop_memory_kind_t c_kind;
+    const dnnl_status_t status
+            = dnnl_ocl_interop_memory_get_memory_kind(amemory.get(), &c_kind);
+    error::wrap_c_api(status, "could not get memory kind");
+    return static_cast<memory_kind>(c_kind);
+}
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object backed by multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     Each element of the @p handles vector is one of the following:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+///  If the @p handles vector is empty or not provided, the library allocates
+///  all buffers as if every handle were DNNL_MEMORY_ALLOCATE.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    // No handles given: have the library allocate every buffer itself.
+    if (handles.empty())
+        handles.resize(
+                memory_desc.get_num_handles(), DNNL_MEMORY_ALLOCATE);
+
+    const int handle_count = (int)handles.size();
+    dnnl_memory_t c_mem;
+    const dnnl_status_t status = dnnl_ocl_interop_memory_create_v2(&c_mem,
+            memory_desc.get(), aengine.get(), convert_to_c(kind), handle_count,
+            handles.data());
+    error::wrap_c_api(status, "could not create a memory");
+    return memory(c_mem);
+}
+
+/// Constructs a memory object with multiple OpenCL buffers.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_objects OpenCL buffers to use, one per handle of @p memory_desc;
+///     std::out_of_range is thrown if the vector holds fewer than that.
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, std::vector<cl_mem> mem_objects) {
+    const int nhandles = memory_desc.get_num_handles();
+    std::vector<void *> handles(nhandles, DNNL_MEMORY_NONE);
+    memory amemory(memory_desc, aengine, handles);
+    for (int i = 0; i < nhandles; i++) // at() rejects a too-short vector
+        amemory.set_data_handle(mem_objects.at(i), i);
+    return amemory;
+}
+
+/// Creates a memory object from a single handle.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is equal to
+///   dnnl::ocl_interop::memory_kind::usm, or
+/// - dnnl::ocl_interop::set_mem_object() had been called, if @p kind is equal
+///   to dnnl::ocl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    std::vector<void *> single_handle(1, handle);
+    return make_memory(memory_desc, aengine, kind, single_handle);
+}
+
+/// Constructs a memory object from a single OpenCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_object An OpenCL buffer to use.
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, cl_mem mem_object) {
+    std::vector<cl_mem> single_buffer(1, mem_object);
+    return make_memory(memory_desc, aengine, single_buffer);
+}
+#else
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is equal to
+///   dnnl::ocl_interop::memory_kind::usm, or
+/// - dnnl::ocl_interop::set_mem_object() had been called, if @p kind is equal
+///   to dnnl::ocl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        void *handle = DNNL_MEMORY_ALLOCATE) {
+    const dnnl_ocl_interop_memory_kind_t c_kind = convert_to_c(kind);
+    dnnl_memory_t c_mem;
+    const dnnl_status_t status = dnnl_ocl_interop_memory_create(
+            &c_mem, memory_desc.get(), aengine.get(), c_kind, handle);
+    error::wrap_c_api(status, "could not create a memory");
+    return memory(c_mem);
+}
+
+/// Wraps an OpenCL buffer in a newly constructed memory object.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_object An OpenCL buffer to use.
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, cl_mem mem_object) {
+    // Create the object with no buffer, then attach the caller's buffer.
+    memory result(memory_desc, aengine, DNNL_MEMORY_NONE);
+    set_mem_object(result, mem_object);
+    return result;
+}
+#endif
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns an OpenCL event.
+///
+/// Arguments are passed via an arguments map containing
+/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
+/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
+/// matching the one returned by
+/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
+/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
+///
+/// @param aprimitive Primitive to execute.
+/// @param astream Stream object. The stream must belong to the same engine
+///     as the primitive.
+/// @param args Arguments map.
+/// @param deps Optional vector with `cl_event` dependencies.
+///
+/// @returns Output event (NULL when @p astream is in-order). It's the user's
+///     responsibility to manage lifetime of the event.
+inline cl_event execute(const dnnl::primitive &aprimitive,
+        const stream &astream, const std::unordered_map<int, memory> &args,
+        const std::vector<cl_event> &deps = {}) {
+    std::vector<dnnl_exec_arg_t> c_args;
+    c_args.reserve(args.size());
+    for (const auto &a : args)
+        c_args.push_back({a.first, a.second.get()});
+    // Pass a null dependency list when no dependencies are given.
+    const cl_event *c_deps = deps.empty() ? nullptr : deps.data();
+    // Null-initialized: defined result even if the C API writes no event.
+    cl_event return_event = nullptr;
+    error::wrap_c_api(dnnl_ocl_interop_primitive_execute(aprimitive.get(),
+                              astream.get(), (int)c_args.size(), c_args.data(),
+                              c_deps, (int)deps.size(), &return_event),
+            "could not execute a primitive");
+    return return_event;
+}
+
+} // namespace ocl_interop
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8e3ab7be1897a19a4db44393e0cfdf4f9a36d8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_TYPES_H
+#define ONEAPI_DNNL_DNNL_OCL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop
+/// @{
+
+/// Memory allocation kind (enumerators use the implicit values 0 and 1).
+typedef enum {
+    /// USM (device, shared, host, or unknown) memory allocation kind.
+    dnnl_ocl_interop_usm,
+    /// Buffer memory allocation kind - default.
+    dnnl_ocl_interop_buffer,
+} dnnl_ocl_interop_memory_kind_t;
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..443ff82bb93536da911a8c6dbb5be838c6f0e9ff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h
@@ -0,0 +1,204 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_H
+#define ONEAPI_DNNL_DNNL_SYCL_H
+
+#include "oneapi/dnnl/dnnl.h"
+
+#include "oneapi/dnnl/dnnl_sycl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop
+/// @{
+
+/// Creates an engine associated with a SYCL device and a SYCL context.
+///
+/// @param engine Output engine.
+/// @param device Pointer to the SYCL device to use for the engine.
+/// @param context Pointer to the SYCL context to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_create(
+        dnnl_engine_t *engine, const void *device, const void *context);
+
+/// Returns the SYCL context associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param context Pointer to the underlying SYCL context of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_context(
+        dnnl_engine_t engine, void **context);
+
+/// Returns the SYCL device associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param device Pointer to the underlying SYCL device of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_device(
+        dnnl_engine_t engine, void **device);
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is equal
+///   to dnnl_sycl_interop_usm, or
+/// - dnnl_sycl_interop_memory_set_buffer() had been called, if @p memory_kind
+///   is equal to dnnl_sycl_interop_buffer.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_usm.
+///     - A pointer to SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_sycl_interop_memory_kind_t memory_kind, void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_usm.
+///     - A pointer to SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_sycl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param memory Memory to query.
+/// @param memory_kind Output underlying memory allocation kind of the memory
+///     object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_get_memory_kind(
+        const_dnnl_memory_t memory,
+        dnnl_sycl_interop_memory_kind_t *memory_kind);
+
+/// Sets a SYCL buffer for a memory object.
+///
+/// @param memory Memory object.
+/// @param buffer SYCL buffer to be set in the memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_set_buffer(
+        dnnl_memory_t memory, void *buffer);
+
+/// Creates an execution stream for a given engine associated with a SYCL
+/// queue.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param queue SYCL queue to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, void *queue);
+
+/// Returns the SYCL queue associated with an execution stream.
+///
+/// @param stream Execution stream to query.
+/// @param queue Output SYCL command queue.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_stream_get_queue(
+        dnnl_stream_t stream, void **queue);
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns a SYCL event.
+///
+/// @param primitive Primitive to execute.
+/// @param stream Stream to use.
+/// @param nargs Number of arguments.
+/// @param args Array of arguments. Each argument is an
+///     <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
+///     values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
+///     #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
+///     descriptor as that returned by
+///     #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
+/// @param deps A pointer to std::vector<sycl::event> that contains
+///     dependencies.
+/// @param return_event Output event.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_primitive_execute(
+        const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs,
+        const dnnl_exec_arg_t *args, const void *deps, void *return_event);
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab95e4f0ea13eaecb6daa934178d887a4207998c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp
@@ -0,0 +1,389 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_HPP
+#define ONEAPI_DNNL_DNNL_SYCL_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#error "Unsupported compiler"
+#endif
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_sycl.h"
+
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop SYCL interoperability API
+/// API extensions to interact with the underlying SYCL run-time.
+///
+/// @sa @ref dev_guide_dpcpp_interoperability in developer guide
+/// @{
+
+/// SYCL interoperability namespace
+namespace sycl_interop {
+
+/// Memory allocation kind. Each enumerator takes the value of its C API counterpart.
+enum class memory_kind {
+    /// USM (device, shared, host, or unknown) memory allocation kind - default.
+    usm = dnnl_sycl_interop_usm,
+    /// Buffer memory allocation kind.
+    buffer = dnnl_sycl_interop_buffer,
+};
+
+/// Converts a memory allocation kind enum value from C++ API to C API type.
+///
+/// @param akind C++ API memory allocation kind enum value.
+/// @returns Corresponding C API memory allocation kind enum value (a plain cast).
+inline dnnl_sycl_interop_memory_kind_t convert_to_c(memory_kind akind) {
+    return static_cast<dnnl_sycl_interop_memory_kind_t>(akind);
+}
+
+/// Constructs an engine from SYCL device and context objects.
+///
+/// @param adevice SYCL device; its address is handed to the C API.
+/// @param acontext SYCL context; its address is handed to the C API.
+///
+/// @returns Created engine.
+inline engine make_engine(
+        const sycl::device &adevice, const sycl::context &acontext) {
+    dnnl_engine_t aengine; // output handle of the C call, wrapped on return
+    error::wrap_c_api(dnnl_sycl_interop_engine_create(&aengine,
+                              static_cast<const void *>(&adevice),
+                              static_cast<const void *>(&acontext)),
+            "could not create an engine");
+    return engine(aengine);
+}
+
+/// Returns the SYCL context associated with an engine.
+///
+/// @param aengine Engine to query.
+///
+/// @returns The underlying SYCL context of the engine.
+inline sycl::context get_context(const engine &aengine) {
+    void *ctx_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_engine_get_context(aengine.get(), &ctx_ptr),
+            "could not get a context handle");
+    auto ctx = *static_cast<sycl::context *>(ctx_ptr); // copy of the pointed-to context object
+    return ctx;
+}
+
+/// Returns the SYCL device associated with an engine.
+///
+/// @param aengine Engine to query.
+///
+/// @returns The underlying SYCL device of the engine.
+inline sycl::device get_device(const engine &aengine) {
+    void *dev_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_engine_get_device(aengine.get(), &dev_ptr),
+            "could not get a device handle");
+    auto dev = *static_cast<sycl::device *>(dev_ptr); // copy of the pointed-to device object
+    return dev;
+}
+
+/// Creates an execution stream for a given engine associated with a SYCL
+/// queue.
+///
+/// @param aengine Engine object to use for the stream.
+/// @param aqueue SYCL queue to use for the stream; passed to the C API by address.
+///
+/// @returns An execution stream.
+inline stream make_stream(const engine &aengine, sycl::queue &aqueue) {
+    dnnl_stream_t astream; // output handle of the C call, wrapped on return
+    error::wrap_c_api(
+            dnnl_sycl_interop_stream_create(&astream, aengine.get(), &aqueue),
+            "could not create a stream");
+    return stream(astream);
+}
+
+/// Returns the SYCL queue associated with an execution stream.
+///
+/// @param astream Execution stream to query.
+///
+/// @returns SYCL queue object (a copy of the object the C API handle points to).
+inline sycl::queue get_queue(const stream &astream) {
+    void *queue_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_stream_get_queue(astream.get(), &queue_ptr),
+            "could not get a stream handle");
+    auto queue = *static_cast<sycl::queue *>(queue_ptr); // copy of the pointed-to queue object
+    return queue;
+}
+
+/// Returns the SYCL buffer associated with a memory object.
+///
+/// Throws an exception if the memory allocation kind associated with the
+/// memory object is not equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @tparam T Type of the requested buffer.
+/// @tparam ndims Number of dimensions of the requested buffer (must be 1).
+/// @param amemory Memory object.
+///
+/// @returns SYCL buffer associated with the memory object.
+template <typename T, int ndims = 1>
+sycl::buffer<T, ndims> get_buffer(const memory &amemory) {
+    static_assert(ndims == 1, "only 1D buffers supported");
+
+    // XXX: workaround: when CPU runtime is not SYCL and amemory was created
+    // for CPU engine `get_buffer` should return an error. Use interop API to
+    // implement the check.
+    dnnl_sycl_interop_memory_kind_t ckind; // NOTE(review): value is never read; only the call status is checked
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind),
+            "could not get SYCL buffer object");
+
+    void *handle_ptr;
+    error::wrap_c_api(dnnl_memory_get_data_handle(amemory.get(), &handle_ptr),
+            "could not get SYCL buffer object");
+
+    // XXX: workaround: zero-range buffer cannot be constructed.
+    if (!handle_ptr) return sycl::buffer<T, ndims>(sycl::range<1>(1));
+
+    auto &buf_u8 = *static_cast<sycl::buffer<uint8_t, 1> *>(handle_ptr); // handle is treated as a pointer to a byte buffer
+
+    auto range = sycl::range<1>(buf_u8.byte_size() / sizeof(T)); // element count; integer division truncates
+    return buf_u8.reinterpret<T, 1>(range);
+}
+
+/// Sets SYCL buffer associated with a memory object.
+///
+/// @tparam T Type of the buffer.
+/// @tparam ndims Number of dimensions of the buffer.
+/// @param amemory Memory object to change.
+/// @param abuffer SYCL buffer.
+template <typename T, int ndims>
+void set_buffer(memory &amemory, sycl::buffer<T, ndims> &abuffer) {
+    auto range = sycl::range<1>(abuffer.byte_size()); // 1D range measured in bytes
+    auto buf_u8 = abuffer.template reinterpret<uint8_t, 1>(range); // reinterpreted as a 1D uint8_t buffer
+    error::wrap_c_api(dnnl_sycl_interop_memory_set_buffer(
+                              amemory.get(), static_cast<void *>(&buf_u8)),
+            "could not set SYCL buffer object");
+}
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param amemory A memory object.
+///
+/// @returns The underlying memory allocation kind of the memory object.
+inline memory_kind get_memory_kind(const memory &amemory) {
+    dnnl_sycl_interop_memory_kind_t ckind; // output argument of the C call below
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind),
+            "could not get memory kind");
+    return static_cast<memory_kind>(ckind);
+}
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+///  If the @p handles vector is empty or not provided the library will allocate
+///  all buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    if (handles.empty()) {
+        const int nhandles = memory_desc.get_num_handles(); // one handle per buffer the descriptor reports
+        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
+    }
+
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_create_v2(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), (int)handles.size(),
+                    handles.data()),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::usm, or
+/// - dnnl::sycl_interop::set_buffer() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    return make_memory(
+            memory_desc, aengine, kind, std::vector<void *> {handle}); // forwards as a one-element handle list
+}
+#else
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::usm, or
+/// - dnnl::sycl_interop::set_buffer() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        void *handle = DNNL_MEMORY_ALLOCATE) {
+    dnnl_memory_t c_memory; // output handle of the C call, wrapped on return
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_create(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), handle),
+            "could not create a memory");
+    return memory(c_memory);
+}
+#endif
+
+/// Constructs a memory object from a SYCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param abuffer A SYCL buffer to use; attached through set_buffer().
+///
+/// @returns Created memory object.
+template <typename T, int ndims = 1>
+memory make_memory(const memory::desc &memory_desc, const engine &aengine,
+        sycl::buffer<T, ndims> &abuffer) {
+    memory amemory(memory_desc, aengine, DNNL_MEMORY_NONE); // created with the DNNL_MEMORY_NONE handle first
+    set_buffer(amemory, abuffer);
+    return amemory;
+}
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns a SYCL event.
+///
+/// Arguments are passed via an arguments map containing
+/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
+/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
+/// matching the one returned by
+/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
+/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
+///
+/// @param aprimitive Primitive to execute.
+/// @param astream Stream object. The stream must belong to the same engine
+///     as the primitive.
+/// @param args Arguments map.
+/// @param deps Optional vector with `sycl::event` dependencies.
+///
+/// @returns Output event.
+inline sycl::event execute(const dnnl::primitive &aprimitive,
+        const stream &astream, const std::unordered_map<int, memory> &args,
+        const std::vector<sycl::event> &deps = {}) {
+    std::vector<dnnl_exec_arg_t> c_args; // flat C argument array built from the map
+    c_args.reserve(args.size());
+    for (const auto &a : args)
+        c_args.push_back({a.first, a.second.get()});
+
+    sycl::event return_event; // passed by address as the output argument below
+    error::wrap_c_api(
+            dnnl_sycl_interop_primitive_execute(aprimitive.get(), astream.get(),
+                    (int)c_args.size(), c_args.data(), &deps, &return_event), // deps goes in as a pointer to the vector itself
+            "could not execute a primitive");
+    return return_event;
+}
+
+} // namespace sycl_interop
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif // ONEAPI_DNNL_DNNL_SYCL_HPP
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..d137c666508351ef0a9aad39f38cd6f439b74c83
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h
@@ -0,0 +1,56 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_TYPES_H
+#define ONEAPI_DNNL_DNNL_SYCL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop
+/// @{
+
+/// Memory allocation kind (enumerators use the implicit values 0 and 1).
+typedef enum {
+    /// USM (device, shared, host, or unknown) memory allocation kind - default.
+    dnnl_sycl_interop_usm,
+    /// Buffer memory allocation kind.
+    dnnl_sycl_interop_buffer,
+} dnnl_sycl_interop_memory_kind_t;
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..586624e53a5775535136fefd83ef81feb0f6e527
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h
@@ -0,0 +1,123 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2022 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_H
+#define ONEAPI_DNNL_DNNL_THREADPOOL_H
+
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+/// Creates an execution stream with specified threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_iface interface.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, void *threadpool);
+
+/// Returns a threadpool to be used by the execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream Execution stream.
+/// @param threadpool Output pointer to an instance of a C++ class that
+///     implements dnnl::threadpool_iface interface. Set to NULL if the
+///     stream was created without threadpool.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_get_threadpool(
+        dnnl_stream_t astream, void **threadpool);
+
+/// Sets the maximum concurrency assumed by oneDNN when outside a
+/// parallel call.
+///
+/// @param max_concurrency The maximum concurrency assumed by oneDNN
+/// when outside a parallel call. This is a threadlocal setting.
+/// @returns #dnnl_success on success and a status describing the
+/// error otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_set_max_concurrency(
+        int max_concurrency);
+
+/// Gets the maximum concurrency assumed by oneDNN when outside a
+/// parallel call.
+///
+/// @param max_concurrency The maximum concurrency assumed by oneDNN
+/// when outside a parallel call. This is a threadlocal setting.
+/// @returns #dnnl_success on success and a status describing the
+/// error otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_get_max_concurrency(
+        int *max_concurrency);
+
+/// @copydoc dnnl_sgemm()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_sgemm(char transa, char transb,
+        dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A,
+        dnnl_dim_t lda, const float *B, dnnl_dim_t ldb, float beta, float *C,
+        dnnl_dim_t ldc, void *threadpool);
+
+/// @copydoc dnnl_gemm_u8s8s32()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_u8s8s32(char transa,
+        char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K,
+        float alpha, const uint8_t *A, dnnl_dim_t lda, uint8_t ao,
+        const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C,
+        dnnl_dim_t ldc, const int32_t *co, void *threadpool);
+
+/// @copydoc dnnl_gemm_s8s8s32()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_s8s8s32(char transa,
+        char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K,
+        float alpha, const int8_t *A, dnnl_dim_t lda, int8_t ao,
+        const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C,
+        dnnl_dim_t ldc, const int32_t *co, void *threadpool);
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f483868c6cc55d48131881dcc46d1a9c5e104cf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp
@@ -0,0 +1,118 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_threadpool.h"
+
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop Threadpool interoperability API
+/// API extensions to interact with the underlying Threadpool run-time.
+/// @{
+
+/// Threadpool interoperability namespace
+namespace threadpool_interop {
+
+/// Constructs an execution stream for the specified engine and threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param aengine Engine to create the stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_interop::threadpool_iface interface.
+/// @returns An execution stream.
+inline dnnl::stream make_stream(
+        const dnnl::engine &aengine, threadpool_iface *threadpool) {
+    dnnl_stream_t raw_stream; // output handle for the C call below
+    const dnnl_status_t rc = dnnl_threadpool_interop_stream_create(
+            &raw_stream, aengine.get(), threadpool);
+    dnnl::error::wrap_c_api(rc, "could not create stream");
+    return dnnl::stream(raw_stream);
+}
+
+/// Returns the pointer to a threadpool that is used by an execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream An execution stream.
+/// @returns Output pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_interop::threadpool_iface interface or NULL if the
+///     stream was created without threadpool.
+inline threadpool_iface *get_threadpool(const dnnl::stream &astream) {
+    void *raw_pool; // output slot for the C call below
+    const dnnl_status_t rc = dnnl_threadpool_interop_stream_get_threadpool(
+            astream.get(), &raw_pool);
+    dnnl::error::wrap_c_api(rc, "could not get stream threadpool");
+    return static_cast<threadpool_iface *>(raw_pool);
+}
+
+/// @copydoc dnnl_threadpool_interop_sgemm()
+inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N,
+        dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda,
+        const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc,
+        threadpool_iface *threadpool) { // forwarded to the C entry point
+    return static_cast<status>(dnnl_threadpool_interop_sgemm(transa, transb, M,
+            N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool));
+} // result is the C call's return value cast to dnnl::status
+/// @copydoc dnnl_threadpool_interop_gemm_u8s8s32()
+inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A,
+        dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *threadpool) { // forwarded to the C entry point
+    return static_cast<status>(dnnl_threadpool_interop_gemm_u8s8s32(transa,
+            transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C,
+            ldc, co, threadpool));
+} // result is the C call's return value cast to dnnl::status
+
+/// @copydoc dnnl_threadpool_interop_gemm_s8s8s32()
+inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A,
+        dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *threadpool) { // forwarded to the C entry point
+    return static_cast<status>(dnnl_threadpool_interop_gemm_s8s8s32(transa,
+            transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C,
+            ldc, co, threadpool));
+} // result is the C call's return value cast to dnnl::status
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..124b8cfa34ee87e2b090bbdc57085e0541cfc3ff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
@@ -0,0 +1,78 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+
+#include <cstdint>
+#include <functional>
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+namespace threadpool_interop {
+
+/// Abstract threadpool interface. The users are expected to subclass this
+/// interface and pass an object to the library during CPU stream creation or
+/// directly in case of BLAS functions.
+struct threadpool_iface {
+    /// Returns the number of worker threads.
+    virtual int get_num_threads() const = 0;
+
+    /// Returns true if the calling thread belongs to this threadpool.
+    virtual bool get_in_parallel() const = 0;
+
+    /// Submits n instances of a closure for execution in parallel:
+    ///
+    /// for (int i = 0; i < n; i++) fn(i, n);
+    ///
+    virtual void parallel_for(int n, const std::function<void(int, int)> &fn)
+            = 0;
+
+    /// Returns threadpool behavior flags bit mask (see below).
+    virtual uint64_t get_flags() const = 0;
+
+    /// If set, parallel_for() returns immediately and oneDNN needs to implement
+    /// waiting for the submitted closures to finish execution on its own.
+    static constexpr uint64_t ASYNCHRONOUS = 1; // bit in the get_flags() mask
+
+    virtual ~threadpool_iface() {} // virtual: subclasses are used via this base
+};
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..7809085bf53ae01821921d9917c3b6abaf9ea243
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_types.h
@@ -0,0 +1,2941 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2016-2025 Intel Corporation
+* Copyright 2024 FUJITSU LIMITED
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// C API types definitions
+
+#ifndef ONEAPI_DNNL_DNNL_TYPES_H
+#define ONEAPI_DNNL_DNNL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <stddef.h>
+#include <stdint.h>
+/// @endcond
+
+#include "oneapi/dnnl/dnnl_config.h"
+
+#include "oneapi/dnnl/dnnl_common_types.h"
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_memory
+/// @{
+
+/// Memory format kind
+typedef enum {
+    /// Undefined memory format kind, used for empty memory descriptors.
+    dnnl_format_kind_undef = 0,
+    /// A special format kind that indicates that the actual format will be
+    /// selected by a primitive automatically.
+    dnnl_format_kind_any,
+    /// A tensor in a generic format described by the stride and blocking
+    /// values in each dimension.
+    dnnl_blocked,
+    /// A special format kind that indicates that tensor format is opaque.
+    dnnl_format_kind_opaque,
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    /// Format kind for sparse tensors.
+    dnnl_format_kind_sparse,
+#endif // DNNL_EXPERIMENTAL_SPARSE
+    /// Parameter to allow internal only format kinds without undefined
+    /// behavior. This parameter is chosen to be valid for so long as
+    /// sizeof(int) >= 2.
+    dnnl_format_kind_max = 0x7fff,
+} dnnl_format_kind_t;
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Sparse encodings.
+typedef enum {
+    /// Undefined sparse encoding kind, used for empty memory descriptors.
+    dnnl_sparse_encoding_undef = 0,
+    /// Compressed Sparse Row (CSR) encoding.
+    dnnl_csr,
+    /// An encoding that is used for an opaque storage schema for
+    /// tensors with unstructured sparsity. A memory descriptor with the
+    /// packed encoding cannot be used to create a memory object. It can
+    /// only be used to create a primitive descriptor to query the
+    /// actual memory descriptor (similar to the format tag `any`).
+    dnnl_packed,
+    /// Coordinate Sparse Encoding (COO).
+    dnnl_coo,
+} dnnl_sparse_encoding_t;
+#endif // DNNL_EXPERIMENTAL_SPARSE
+
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+/// Profiling data kind.
+typedef enum {
+    /// Undefined profiling data kind.
+    dnnl_profiling_data_kind_undef = 0,
+    /// Data kind to query an execution time in nanoseconds.
+    dnnl_profiling_data_kind_time,
+} dnnl_profiling_data_kind_t;
+
+#endif // DNNL_EXPERIMENTAL_PROFILING
+
+/// Memory format tag specification.
+///
+/// oneDNN formats describe physical data layout. The physical layout
+/// is described as a sequence of the dimensions as they are laid out in the
+/// memory (from the outer-most to the inner-most). Note that this order
+/// doesn't affect the logical order of the dimensions that is kept in the
+/// `dims` field of the dnnl_memory_desc_t structure. The logical order of the
+/// dimensions is specified by the primitive that uses the tensor.
+///
+/// For example, CNN 5D tensor always has its logical dimensions in the order
+/// `(batch, channels, depth, height, width)`, while the physical layout might be
+/// `NCDHW` (corresponds to #dnnl_ncdhw format tag) or
+/// `NDHWC` (corresponds to #dnnl_ndhwc format tag).
+///
+/// ~~~cpp
+/// int batch = 2, channels = 16, depth = 13, height = 13, width = 13;
+///
+/// int ndims = 5; // 5D tensor
+/// dnnl_dims_t dims = {batch, channels, depth, height, width};
+/// dnnl_memory_desc_t data_in_ncdhw;
+/// dnnl_memory_desc_create_with_tag(
+///      &data_in_ncdhw, 5, dims, dnnl_f32, dnnl_ncdhw);
+///
+/// // note that in both cases dims passed are the same
+/// dnnl_memory_desc_t data_in_ndhwc;
+/// dnnl_memory_desc_create_with_tag(
+///      &data_in_ndhwc, 5, dims, dnnl_f32, dnnl_ndhwc);
+///
+/// dnnl_memory_desc_destroy(data_in_ncdhw);
+/// dnnl_memory_desc_destroy(data_in_ndhwc);
+/// ~~~
+///
+/// Memory format tags can be further divided into two categories:
+///  - Domain-agnostic names, i.e. names that do not depend on the tensor usage
+///    in the specific primitive. These names use letters from `a` to `l` to
+///    denote logical dimension from 1 to 12, and form the order in which the
+///    dimensions are laid in memory. For instance, #dnnl_ab is used to denote
+///    2D tensor where the second logical dimension (aka `b`) is the innermost,
+///    i.e. has stride = 1, and the first logical dimension (`a`) laid out in
+///    memory with stride equal to the size of second dimension. On the other
+///    hand, #dnnl_ba is just transposed version of the same tensor: the
+///    first dimension (`a`) becomes the innermost one.
+///  - Domain-specific names, i.e. names that make sense only in the context of
+///    a certain domain, such as CNN. These names are just aliases to the
+///    corresponding domain-agnostic tags and used mostly for the convenience.
+///    For example, #dnnl_nc is used to denote 2D CNN activations tensor
+///    memory format, where channels are the innermost dimension and batch is an
+///    outermost one. Moreover, #dnnl_nc is just an alias to #dnnl_ab,
+///    since for oneDNN CNN primitives the logical dimensions of
+///    activations tensors come in order: batch, channels, spatial.
+///    In other words, batch corresponds to the first logical dimension (`a`),
+///    channels correspond to the second one (`b`).
+///
+/// The following domain-specific notation applies to memory format tags:
+///  - @c 'n' denotes the mini-batch dimension
+///  - @c 'c' denotes a channels dimension
+///  - When there are multiple channel dimensions (for example, in convolution
+///    weights tensor), @c 'i' and @c 'o' denote dimensions of input and output
+///    channels
+///  - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width
+///    respectively
+///
+/// Upper-case letters indicate that the data is laid out in blocks for a
+/// particular dimension. In such cases, the format name contains both upper-
+/// and lower-case letters for that dimension with a lower-case letter preceded
+/// by the block size. For example: #dnnl_nChw8c describes a format where the
+/// outermost dimension is mini-batch, followed by the channel block number,
+/// followed by the spatial height and width, and finally followed by 8-element
+/// channel blocks.
+///
+/// @sa @ref dev_guide_understanding_memory_formats
+typedef enum {
+    /// Undefined memory format tag
+    dnnl_format_tag_undef = 0,
+    /// Undefined memory format tag.
+    /// The primitive selects a format automatically.
+    dnnl_format_tag_any,
+
+    // Semantic agnostic section
+    // The physical order of dimensions is defined by the permutation of the
+    // characters, assuming that ab..z defines the natural order.
+
+    // Plain formats
+
+    dnnl_a, ///< plain 1D tensor
+    dnnl_ab, ///< plain 2D tensor
+    dnnl_abc, ///< plain 3D tensor
+    dnnl_abcd, ///< plain 4D tensor
+    dnnl_abcde, ///< plain 5D tensor
+    dnnl_abcdef, ///< plain 6D tensor
+    dnnl_abcdefg, ///< plain 7D tensor
+    dnnl_abcdefgh, ///< plain 8D tensor
+    dnnl_abcdefghi, ///< plain 9D tensor
+    dnnl_abcdefghij, ///< plain 10D tensor
+    dnnl_abcdefghijk, ///< plain 11D tensor
+    dnnl_abcdefghijkl, ///< plain 12D tensor
+
+    // Permuted plain formats
+
+    dnnl_ba, ///< permuted 2D tensor
+    dnnl_acb, ///< permuted 3D tensor
+    dnnl_bac, ///< permuted 3D tensor
+    dnnl_bca, ///< permuted 3D tensor
+    dnnl_cab, ///< permuted 3D tensor
+    dnnl_cba, ///< permuted 3D tensor
+    dnnl_abdc, ///< permuted 4D tensor
+    dnnl_acbd, ///< permuted 4D tensor
+    dnnl_acdb, ///< permuted 4D tensor
+    dnnl_adbc, ///< permuted 4D tensor
+    dnnl_adcb, ///< permuted 4D tensor
+    dnnl_bacd, ///< permuted 4D tensor
+    dnnl_bcda, ///< permuted 4D tensor
+    dnnl_cdab, ///< permuted 4D tensor
+    dnnl_cdba, ///< permuted 4D tensor
+    dnnl_dcab, ///< permuted 4D tensor
+    dnnl_abced, ///< permuted 5D tensor
+    dnnl_abdec, ///< permuted 5D tensor
+    dnnl_acbde, ///< permuted 5D tensor
+    dnnl_acdeb, ///< permuted 5D tensor
+    dnnl_adecb, ///< permuted 5D tensor
+    dnnl_bacde, ///< permuted 5D tensor
+    dnnl_bcdea, ///< permuted 5D tensor
+    dnnl_cdeab, ///< permuted 5D tensor
+    dnnl_cdeba, ///< permuted 5D tensor
+    dnnl_decab, ///< permuted 5D tensor
+    dnnl_abcdfe, ///< permuted 6D tensor
+    dnnl_abdefc, ///< permuted 6D tensor
+    dnnl_abdfce, ///< permuted 6D tensor
+    dnnl_acbdef, ///< permuted 6D tensor
+    dnnl_adefcb, ///< permuted 6D tensor
+    dnnl_defcab, ///< permuted 6D tensor
+    dnnl_abcdegf, ///< permuted 7D tensor
+    dnnl_abcdefhg, ///< permuted 8D tensor
+    dnnl_abcdefgih, ///< permuted 9D tensor
+    dnnl_abcdefghji, ///< permuted 10D tensor
+    dnnl_abcdefghikj, ///< permuted 11D tensor
+    dnnl_abcdefghijlk, ///< permuted 12D tensor
+
+    // Opaque blocked formats
+
+    dnnl_Abc16a,
+    dnnl_ABc16a16b,
+    dnnl_ABc32a32b,
+    dnnl_ABc4a4b,
+    /// 3D tensor blocked by 2nd dimension with block size 16
+    dnnl_aBc16b,
+    dnnl_ABc16b16a,
+    dnnl_Abc4a,
+    /// 3D tensor blocked by 2nd dimension with block size 32
+    dnnl_aBc32b,
+    /// 3D tensor blocked by 2nd dimension with block size 4
+    dnnl_aBc4b,
+    dnnl_ABc4b16a4b,
+    dnnl_ABc2b8a4b,
+    dnnl_ABc16b16a4b,
+    dnnl_ABc16b16a2b,
+    dnnl_ABc4b4a,
+    dnnl_ABc8a16b2a,
+    dnnl_ABc8a8b,
+    dnnl_ABc8a4b,
+    /// 3D tensor blocked by 2nd dimension with block size 8
+    dnnl_aBc8b,
+    dnnl_ABc8b16a2b,
+    dnnl_BAc8a16b2a,
+    dnnl_ABc8b8a,
+    dnnl_Abcd16a,
+    dnnl_Abcd8a,
+    dnnl_ABcd16a16b,
+    dnnl_Abcd32a,
+    dnnl_ABcd32a32b,
+    /// 4D tensor blocked by 2nd dimension with block size 16
+    dnnl_aBcd16b,
+    dnnl_ABcd16b16a,
+    dnnl_aBCd16b16c,
+    dnnl_aBCd16c16b,
+    dnnl_Abcd4a,
+    /// 4D tensor blocked by 2nd dimension with block size 32
+    dnnl_aBcd32b,
+    /// 4D tensor blocked by 2nd dimension with block size 4
+    dnnl_aBcd4b,
+    dnnl_ABcd4b16a4b,
+    dnnl_ABcd16b16a4b,
+    dnnl_ABcd16b16a2b,
+    dnnl_ABcd4b4a,
+    dnnl_ABcd4a4b,
+    dnnl_aBCd2c4b2c,
+    dnnl_aBCd4b8c2b,
+    dnnl_aBCd4c16b4c,
+    dnnl_aBCd2c8b4c,
+    dnnl_aBCd16c16b4c,
+    dnnl_aBCd16c16b2c,
+    dnnl_aBCd4c4b,
+    dnnl_aBCd4b4c,
+    dnnl_ABcd8a16b2a,
+    dnnl_ABcd2b8a4b,
+    dnnl_ABcd8a8b,
+    dnnl_ABcd8a4b,
+    /// 4D tensor blocked by 2nd dimension with block size 8
+    dnnl_aBcd8b,
+    dnnl_aBCd4c8b2c,
+    dnnl_ABcd8b16a2b,
+    dnnl_aBCd8b16c2b,
+    dnnl_BAcd8a16b2a,
+    /// 4D tensor blocked by 1st and 2nd dimension with block size 8
+    dnnl_ABcd8b8a,
+    dnnl_aBCd8b8c,
+    dnnl_aBCd8b4c,
+    dnnl_aBCd8c16b2c,
+    dnnl_ABcde8a16b2a,
+    dnnl_aCBd8b16c2b,
+    dnnl_aBCd8c8b,
+    dnnl_Abcde16a,
+    dnnl_Abcde32a,
+    dnnl_ABcde16a16b,
+    dnnl_BAcde8a16b2a,
+    /// 4D tensor blocked by 3rd dimension with block size 4
+    dnnl_aBCd2b4c2b,
+    /// 5D tensor blocked by 1st dimension with block size 16
+    dnnl_ABcde4b16a4b,
+    /// 5D tensor blocked by 1st dimension with block size 8
+    dnnl_ABcde2b8a4b,
+    /// 5D tensor blocked by 2nd dimension with block size 16
+    dnnl_aBcde16b,
+    dnnl_ABcde16b16a,
+    dnnl_aBCde16b16c,
+    dnnl_aBCde16c16b,
+    dnnl_aBCde2c8b4c,
+    dnnl_Abcde4a,
+    /// 5D tensor blocked by 2nd dimension with block size 32
+    dnnl_aBcde32b,
+    /// 5D tensor blocked by 2nd dimension with block size 4
+    dnnl_aBcde4b,
+    dnnl_ABcde4b4a,
+    dnnl_ABcde4a4b,
+    dnnl_aBCde4b4c,
+    dnnl_aBCde2c4b2c,
+    dnnl_aBCde4b8c2b,
+    dnnl_aBCde4c16b4c,
+    dnnl_aBCde16c16b4c,
+    dnnl_aBCde16c16b2c,
+    dnnl_aBCde4c4b,
+    dnnl_Abcde8a,
+    dnnl_ABcde8a8b,
+    dnnl_ABcde8a4b,
+    dnnl_BAcde16b16a,
+    /// 5D tensor blocked by 2nd dimension with block size 8
+    dnnl_aBcde8b,
+    dnnl_ABcde8b16a2b,
+    dnnl_aBCde8b16c2b,
+    dnnl_aBCde4c8b2c,
+    dnnl_aCBde8b16c2b,
+    dnnl_ABcde8b8a,
+    dnnl_ABcde32a32b,
+    dnnl_aBCde8b8c,
+    dnnl_aBCde8b4c,
+    dnnl_ABc4a8b8a4b,
+    dnnl_ABcd4a8b8a4b,
+    dnnl_ABcde4a8b8a4b,
+    dnnl_BAc4b8a8b4a,
+    dnnl_BAcd4b8a8b4a,
+    dnnl_BAcde4b8a8b4a,
+    dnnl_ABcd2a8b8a2b,
+    dnnl_aBCd4b8c8b4c,
+    dnnl_aBCde4b8c8b4c,
+    dnnl_aBCde2b8c8b2c,
+    dnnl_aBCde8c16b2c,
+    dnnl_aBCde8c8b,
+    /// 5D tensor blocked by 3rd dimension with block size 4
+    dnnl_aBCde2b4c2b,
+    /// 6D tensor blocked by 2nd dimension with block size 16
+    dnnl_aBcdef16b,
+    dnnl_aBCdef16b16c,
+    dnnl_aBCdef16c16b,
+    dnnl_aBCdef4c16b4c,
+    /// 6D tensor blocked by 2nd dimension with block size 8
+    dnnl_aBCdef2c8b4c,
+    dnnl_aBCdef4c8b2c,
+    /// 6D tensor blocked by 3rd dimension with block size 4
+    dnnl_aBCdef2b4c2b,
+    /// 6D tensor blocked by 2nd dimension with block size 4
+    dnnl_aBcdef4b,
+    dnnl_aBCdef4c4b,
+    dnnl_aBCdef4b4c,
+    dnnl_aBCdef2c4b2c,
+    dnnl_aBCdef4b8c2b,
+    dnnl_aBCdef8b8c,
+    dnnl_aBCdef8b4c,
+    dnnl_aBCdef8c16b2c,
+    dnnl_aBCdef4b8c8b4c,
+    dnnl_aBCdef8b16c2b,
+    dnnl_aCBdef8b16c2b,
+    dnnl_aBCdef8c8b,
+    dnnl_aBdc16b,
+    dnnl_aBdC16b2c,
+    dnnl_aBdC16b4c,
+    dnnl_aBdc4b,
+    dnnl_aBdc8b,
+    dnnl_aBdec16b,
+    dnnl_aBdeC16b2c,
+    dnnl_aBdeC16b4c,
+    dnnl_aBdec32b,
+    dnnl_aBdec4b,
+    dnnl_aBdec8b,
+    dnnl_aBdefc16b,
+    dnnl_aBdefC16b2c,
+    dnnl_aCBdef16c16b,
+    dnnl_aBdefc4b,
+    dnnl_aBdefc8b,
+    dnnl_Abcdef16a,
+    dnnl_Abcdef32a,
+    dnnl_aBedc16b,
+    dnnl_Acb16a,
+    dnnl_AcB16a2b,
+    dnnl_AcB16a4b,
+    dnnl_Acb4a,
+    dnnl_Acb8a,
+    dnnl_aCBd16b16c,
+    dnnl_aCBd16c16b,
+    dnnl_aCBde16b16c,
+    dnnl_aCBde16c16b,
+    dnnl_Acdb16a,
+    dnnl_AcdB16a2b,
+    dnnl_AcdB16a4b,
+    dnnl_Acdb32a,
+    dnnl_Acdb4a,
+    dnnl_Acdb8a,
+    dnnl_Acdeb16a,
+    dnnl_AcdeB16a2b,
+    dnnl_Acdeb4a,
+    dnnl_Acdeb8a,
+    dnnl_Adcb16a,
+    dnnl_BAc16a16b,
+    dnnl_BAc16b16a,
+    dnnl_BAcd16a16b,
+    dnnl_BAcd16b16a,
+    dnnl_aCBd4c8b8c4b,
+    dnnl_aCBde4c8b8c4b,
+    dnnl_aCBdef4c8b8c4b,
+    dnnl_BAcde16a16b,
+    dnnl_aCBdef16b16c,
+    dnnl_ABc16b32a,
+    dnnl_ABc16b64a,
+    dnnl_ABc4b32a4b,
+    dnnl_ABc4b64a4b,
+    dnnl_ABc8b32a2b,
+    dnnl_ABc8b64a2b,
+    dnnl_AB16b16a,
+    dnnl_AB16b32a,
+    dnnl_AB16b64a,
+    dnnl_AB8b16a2b,
+    dnnl_AB8b32a2b,
+    dnnl_AB8b64a2b,
+    dnnl_AB4b16a4b,
+    dnnl_AB4b32a4b,
+    dnnl_AB4b64a4b,
+    dnnl_AB16b16a4b,
+    dnnl_ABcd16b32a,
+    dnnl_ABcd16b64a,
+    dnnl_ABcd4b32a4b,
+    dnnl_ABcd4b64a4b,
+    dnnl_ABcd8b32a2b,
+    dnnl_ABcd8b64a2b,
+    dnnl_ABcde4b32a4b,
+    dnnl_ABcde4b64a4b,
+    dnnl_ABcde16b16a4b,
+    dnnl_ABcde16b16a2b,
+    dnnl_ABcde16b32a,
+    dnnl_ABcde16b64a,
+    dnnl_ABcde8b32a2b,
+    dnnl_ABcde8b64a2b,
+    dnnl_aBCdef16c16b4c,
+    dnnl_aBCdef16c16b2c,
+    dnnl_AB32a32b8a4b,
+    dnnl_AB8a4b,
+    dnnl_AB32a32b8a2b,
+    dnnl_AB8a2b,
+    dnnl_abDc32d,
+    dnnl_abDC32d4c,
+    dnnl_abdEc32e,
+    dnnl_abdEC32e2c,
+    dnnl_abdEC32e4c,
+    dnnl_aBdefC16b4c,
+    dnnl_AcdeB16a4b,
+    dnnl_ABcd16a16b2a,
+    dnnl_ABc16a16b2a,
+    dnnl_aBCd16b16c2b,
+    dnnl_aBCde16b16c2b,
+    dnnl_Acb32a,
+    dnnl_AcB32a2b,
+    dnnl_AcB32a4b,
+    dnnl_Acb48a,
+    dnnl_AcB48a2b,
+    dnnl_AcB48a4b,
+    dnnl_Acb64a,
+    dnnl_AcB64a2b,
+    dnnl_AcB64a4b,
+    dnnl_cBa2b,
+    dnnl_cBa4b,
+    dnnl_aBdc32b,
+    dnnl_aBdC32b2c,
+    dnnl_aBdC32b4c,
+    dnnl_aBdc48b,
+    dnnl_aBdC48b2c,
+    dnnl_aBdC48b4c,
+    dnnl_aBdc64b,
+    dnnl_aBdC64b2c,
+    dnnl_aBdC64b4c,
+    dnnl_adCb2c,
+    dnnl_adCb4c,
+    dnnl_AcdB32a2b,
+    dnnl_AcdB32a4b,
+    dnnl_Acdb48a,
+    dnnl_AcdB48a2b,
+    dnnl_AcdB48a4b,
+    dnnl_Acdb64a,
+    dnnl_AcdB64a2b,
+    dnnl_AcdB64a4b,
+    dnnl_cdBa2b,
+    dnnl_cdBa4b,
+    dnnl_aBdeC32b2c,
+    dnnl_aBdeC32b4c,
+    dnnl_aBdec48b,
+    dnnl_aBdeC48b2c,
+    dnnl_aBdeC48b4c,
+    dnnl_aBdec64b,
+    dnnl_aBdeC64b2c,
+    dnnl_aBdeC64b4c,
+    dnnl_adeCb2c,
+    dnnl_adeCb4c,
+    dnnl_Acdeb32a,
+    dnnl_AcdeB32a2b,
+    dnnl_AcdeB32a4b,
+    dnnl_Acdeb48a,
+    dnnl_AcdeB48a2b,
+    dnnl_AcdeB48a4b,
+    dnnl_Acdeb64a,
+    dnnl_AcdeB64a2b,
+    dnnl_AcdeB64a4b,
+    dnnl_cdeBa2b,
+    dnnl_cdeBa4b,
+    dnnl_aBdefc32b,
+    dnnl_aBdefC32b2c,
+    dnnl_aBdefC32b4c,
+    dnnl_aBdefc48b,
+    dnnl_aBdefC48b2c,
+    dnnl_aBdefC48b4c,
+    dnnl_aBdefc64b,
+    dnnl_aBdefC64b2c,
+    dnnl_aBdefC64b4c,
+    dnnl_adefCb2c,
+    dnnl_adefCb4c,
+    dnnl_AB16b32a4b,
+    dnnl_AB16b48a4b,
+    dnnl_AB16b64a4b,
+    dnnl_AB16b16a2b,
+    dnnl_AB16b32a2b,
+    dnnl_AB16b48a2b,
+    dnnl_AB16b64a2b,
+    dnnl_ABc16b32a4b,
+    dnnl_ABc16b48a4b,
+    dnnl_ABc16b64a4b,
+    dnnl_ABc16b32a2b,
+    dnnl_ABc16b48a2b,
+    dnnl_ABc16b64a2b,
+    dnnl_ABcd16b32a4b,
+    dnnl_ABcd16b48a4b,
+    dnnl_ABcd16b64a4b,
+    dnnl_ABcd16b32a2b,
+    dnnl_ABcd16b48a2b,
+    dnnl_ABcd16b64a2b,
+    dnnl_ABcde16b32a4b,
+    dnnl_ABcde16b48a4b,
+    dnnl_ABcde16b64a4b,
+    dnnl_ABcde16b32a2b,
+    dnnl_ABcde16b48a2b,
+    dnnl_ABcde16b64a2b,
+    dnnl_ABc32a16b,
+    dnnl_ABcd32a16b,
+    dnnl_ABcde32a16b,
+    dnnl_AB48a16b,
+    dnnl_AB48a32b,
+    dnnl_ABc40a16b,
+    dnnl_ABc40a32b,
+    dnnl_aBC48b16c,
+    dnnl_aBC48b32c,
+    dnnl_ABcd40a16b,
+    dnnl_ABcd40a32b,
+    dnnl_abCd32c,
+    dnnl_abdCe32c,
+    dnnl_abdCE32c2e,
+    dnnl_BA16a16b2a,
+    dnnl_BA16a32b2a,
+    dnnl_BA16a48b2a,
+    dnnl_BA16a64b2a,
+    dnnl_BA16a16b4a,
+    dnnl_BA16a32b4a,
+    dnnl_BA16a48b4a,
+    dnnl_BA16a64b4a,
+    dnnl_ABcd8a2b,
+    dnnl_aBdeC16c16b2c,
+    dnnl_aBdeC16c16b4c,
+    dnnl_aBdefC16c16b2c,
+    dnnl_AcB16b16a2b,
+    dnnl_AcB16b16a4b,
+    dnnl_AcdB16b16a2b,
+    dnnl_AcdB16b16a4b,
+    dnnl_AcdeB16b16a2b,
+    dnnl_aBdefC16c16b4c,
+    dnnl_AcdeB16b16a4b,
+    dnnl_AcB16b32a2b,
+    dnnl_AcB16b32a4b,
+    dnnl_AcB16b48a2b,
+    dnnl_AcB16b48a4b,
+    dnnl_AcB16b64a2b,
+    dnnl_AcB16b64a4b,
+    dnnl_aBdC16c16b2c,
+    dnnl_aBdC16c16b4c,
+    dnnl_aBdC16c32b2c,
+    dnnl_aBdC16c32b4c,
+    dnnl_aBdC16c48b2c,
+    dnnl_aBdC16c48b4c,
+    dnnl_aBdC16c64b2c,
+    dnnl_aBdC16c64b4c,
+    dnnl_AcdB16b32a2b,
+    dnnl_AcdB16b32a4b,
+    dnnl_AcdB16b48a2b,
+    dnnl_AcdB16b48a4b,
+    dnnl_AcdB16b64a2b,
+    dnnl_AcdB16b64a4b,
+    dnnl_aBdeC16c32b2c,
+    dnnl_aBdeC16c32b4c,
+    dnnl_aBdeC16c48b2c,
+    dnnl_aBdeC16c48b4c,
+    dnnl_aBdeC16c64b2c,
+    dnnl_aBdeC16c64b4c,
+    dnnl_AcdeB16b32a2b,
+    dnnl_AcdeB16b32a4b,
+    dnnl_AcdeB16b48a2b,
+    dnnl_AcdeB16b48a4b,
+    dnnl_AcdeB16b64a2b,
+    dnnl_AcdeB16b64a4b,
+    dnnl_aBdefC16c32b2c,
+    dnnl_aBdefC16c32b4c,
+    dnnl_aBdefC16c48b2c,
+    dnnl_aBdefC16c48b4c,
+    dnnl_aBdefC16c64b2c,
+    dnnl_aBdefC16c64b4c,
+    dnnl_decbA16a,
+    dnnl_ABc4a2b,
+    dnnl_ABc8a2b,
+    dnnl_aBCd8b2c,
+    dnnl_ABcde4a2b,
+    dnnl_ABcde8a2b,
+    dnnl_ABcde40a16b,
+    dnnl_ABcde40a32b,
+    dnnl_aBCde8b2c,
+    dnnl_ABcde4a8b8a2b,
+    dnnl_ABcd4a8b8a2b,
+    dnnl_ABc4a8b8a2b,
+    dnnl_aBCdef4b8c8b2c,
+    dnnl_aBCde4b8c8b2c,
+    dnnl_aBCd4b8c8b2c,
+    dnnl_BAcde4b8a8b2a,
+    dnnl_BAcd4b8a8b2a,
+    dnnl_BAc4b8a8b2a,
+    dnnl_aCBdef4c8b8c2b,
+    dnnl_aCBde4c8b8c2b,
+    dnnl_aCBd4c8b8c2b,
+    dnnl_aBCdef8b2c,
+    dnnl_AB32a16b,
+    dnnl_AB32a32b,
+    dnnl_BA4b8a8b2a,
+    dnnl_BA4b8a8b4a,
+    dnnl_aBC32b16c,
+    dnnl_aBC32b32c,
+    dnnl_aCB4c8b8c2b,
+    dnnl_aCB4c8b8c4b,
+    dnnl_ABcd4a2b,
+    dnnl_ABc2b8a16b4a,
+    dnnl_ABcd2b8a16b4a,
+    dnnl_ABcde2b8a16b4a,
+    dnnl_ABc2a8b16a4b,
+    dnnl_ABc2a8b16a2b,
+    dnnl_ABc2b32a8b,
+    dnnl_ABcd2a8b16a4b,
+    dnnl_ABcd2a8b16a2b,
+    dnnl_aCBd2c8b16c2b,
+    dnnl_ABcd2b32a8b,
+    dnnl_aBCd2c8b16c2b,
+    dnnl_ABcde2a8b16a4b,
+    dnnl_ABcde2a8b16a2b,
+    dnnl_aCBde2c8b16c2b,
+    dnnl_ABcde2b32a8b,
+    dnnl_aBC2b8c16b2c,
+    dnnl_aBCd2b8c16b2c,
+    dnnl_aBCde2b8c16b2c,
+    dnnl_aBCdef2b8c16b2c,
+    dnnl_BAcde2b8a16b4a,
+    dnnl_BAcd2b8a16b4a,
+    dnnl_BAc2b8a16b4a,
+    dnnl_BAcde2b8a16b2a,
+    dnnl_BAcd2b8a16b2a,
+    dnnl_BAc2b8a16b2a,
+    dnnl_aBCde2c8b16c2b,
+    dnnl_aBCdef2c8b16c2b,
+    dnnl_aCBdef2c8b16c2b,
+    dnnl_aBCd2b8c16b4c,
+    dnnl_aBCde2b8c16b4c,
+    dnnl_BA4b8a16b2a,
+    dnnl_BA4b8a16b4a,
+    dnnl_aCB4c8b16c2b,
+    dnnl_aCB4c8b16c4b,
+    dnnl_BA16a16b,
+    dnnl_BA16a32b,
+    dnnl_BA16a48b,
+    dnnl_BA16a64b,
+    dnnl_aCB16c2b,
+    dnnl_aCB16c4b,
+    dnnl_BA16b2a,
+    dnnl_BA16b4a,
+    dnnl_aBC16b16c,
+    dnnl_aBC16b32c,
+    dnnl_AB16a16b,
+    dnnl_AB16a32b,
+    dnnl_ABcde16a16b2a,
+    dnnl_aBCdef16b16c2b,
+    dnnl_Acedb16a,
+    dnnl_aBdfec16b,
+    dnnl_abdEC64e2c,
+    dnnl_abdEC64e4c,
+    dnnl_aCB16b16c,
+    dnnl_aCB16b32c,
+    dnnl_aCB16b48c,
+    dnnl_aCB16b64c,
+    dnnl_aCB16b16c2b,
+    dnnl_aCB16b32c2b,
+    dnnl_aCB16b48c2b,
+    dnnl_aCB16b64c2b,
+    dnnl_aCB16b16c4b,
+    dnnl_aCB16b32c4b,
+    dnnl_aCB16b48c4b,
+    dnnl_aCB16b64c4b,
+    dnnl_abCd4c,
+    dnnl_abCde4c,
+    dnnl_abCdef4c,
+    dnnl_abCde32c,
+    dnnl_abCdef32c,
+    dnnl_ABcd16a32b,
+    dnnl_decbA8a,
+    dnnl_aCdefB16b32c2b,
+    dnnl_aCdefB16b32c4b,
+    dnnl_aCdefB16b48c2b,
+    dnnl_aCdefB16b48c4b,
+    dnnl_aCdefB16b64c2b,
+    dnnl_aCdefB16b64c4b,
+    dnnl_BcdeA16a32b2a,
+    dnnl_BcdeA16a32b4a,
+    dnnl_BcdeA16a48b2a,
+    dnnl_BcdeA16a48b4a,
+    dnnl_BcdeA16a64b2a,
+    dnnl_BcdeA16a64b4a,
+    dnnl_aCdefb32c,
+    dnnl_aCdefB32c2b,
+    dnnl_aCdefB32c4b,
+    dnnl_aCdefb48c,
+    dnnl_aCdefB48c2b,
+    dnnl_aCdefB48c4b,
+    dnnl_aCdefb64c,
+    dnnl_aCdefB64c2b,
+    dnnl_aCdefB64c4b,
+    dnnl_Bcdea32b,
+    dnnl_BcdeA32b2a,
+    dnnl_BcdeA32b4a,
+    dnnl_Bcdea48b,
+    dnnl_BcdeA48b2a,
+    dnnl_BcdeA48b4a,
+    dnnl_Bcdea64b,
+    dnnl_BcdeA64b2a,
+    dnnl_BcdeA64b4a,
+    dnnl_Bca32b,
+    dnnl_BcA32b2a,
+    dnnl_BcA32b4a,
+    dnnl_Bca48b,
+    dnnl_BcA48b2a,
+    dnnl_BcA48b4a,
+    dnnl_Bca64b,
+    dnnl_BcA64b2a,
+    dnnl_BcA64b4a,
+    dnnl_aCdb32c,
+    dnnl_aCdB32c2b,
+    dnnl_aCdB32c4b,
+    dnnl_aCdb48c,
+    dnnl_aCdB48c2b,
+    dnnl_aCdB48c4b,
+    dnnl_aCdb64c,
+    dnnl_aCdB64c2b,
+    dnnl_aCdB64c4b,
+    dnnl_BcA16a16b2a,
+    dnnl_BcA16a16b4a,
+    dnnl_BcdA16a16b2a,
+    dnnl_BcdA16a16b4a,
+    dnnl_BcdeA16a16b2a,
+    dnnl_BcdeA16a16b4a,
+    dnnl_aCdB16b16c2b,
+    dnnl_aCdB16b16c4b,
+    dnnl_aCdeB16b16c2b,
+    dnnl_aCdeB16b16c4b,
+    dnnl_aCdefB16b16c2b,
+    dnnl_aCdefB16b16c4b,
+    dnnl_BcA16a32b2a,
+    dnnl_BcA16a32b4a,
+    dnnl_BcA16a48b2a,
+    dnnl_BcA16a48b4a,
+    dnnl_BcA16a64b2a,
+    dnnl_BcA16a64b4a,
+    dnnl_aCdB16b32c2b,
+    dnnl_aCdB16b32c4b,
+    dnnl_aCdB16b48c2b,
+    dnnl_aCdB16b48c4b,
+    dnnl_aCdB16b64c2b,
+    dnnl_aCdB16b64c4b,
+    dnnl_BcdA16a32b2a,
+    dnnl_BcdA16a32b4a,
+    dnnl_BcdA16a48b2a,
+    dnnl_BcdA16a48b4a,
+    dnnl_BcdA16a64b2a,
+    dnnl_BcdA16a64b4a,
+    dnnl_aCdeB16b32c2b,
+    dnnl_aCdeB16b32c4b,
+    dnnl_aCdeB16b48c2b,
+    dnnl_aCdeB16b48c4b,
+    dnnl_aCdeB16b64c2b,
+    dnnl_aCdeB16b64c4b,
+    dnnl_Bca16b,
+    dnnl_BcA16b2a,
+    dnnl_BcA16b4a,
+    dnnl_Bcda16b,
+    dnnl_BcdA16b2a,
+    dnnl_BcdA16b4a,
+    dnnl_Bcdea16b,
+    dnnl_BcdeA16b2a,
+    dnnl_BcdeA16b4a,
+    dnnl_aCdb16c,
+    dnnl_aCdB16c2b,
+    dnnl_aCdB16c4b,
+    dnnl_aCdeb16c,
+    dnnl_aCdeB16c2b,
+    dnnl_aCdeB16c4b,
+    dnnl_aCdefb16c,
+    dnnl_aCdefB16c2b,
+    dnnl_aCdefB16c4b,
+    dnnl_Bcda32b,
+    dnnl_BcdA32b2a,
+    dnnl_BcdA32b4a,
+    dnnl_Bcda48b,
+    dnnl_BcdA48b2a,
+    dnnl_BcdA48b4a,
+    dnnl_Bcda64b,
+    dnnl_BcdA64b2a,
+    dnnl_BcdA64b4a,
+    dnnl_aCdeb32c,
+    dnnl_aCdeB32c2b,
+    dnnl_aCdeB32c4b,
+    dnnl_aCdeb48c,
+    dnnl_aCdeB48c2b,
+    dnnl_aCdeB48c4b,
+    dnnl_aCdeb64c,
+    dnnl_aCdeB64c2b,
+    dnnl_aCdeB64c4b,
+    dnnl_Acb24a,
+    dnnl_Acdb24a,
+    dnnl_Acdeb24a,
+    dnnl_aBdc24b,
+    dnnl_aBdec24b,
+    dnnl_aBdefc24b,
+    dnnl_abDc16d,
+    dnnl_abdEc16e,
+    dnnl_abdCe16c,
+    dnnl_AcB24a2b,
+    dnnl_AcdB24a2b,
+    dnnl_AcdeB24a2b,
+    dnnl_aBdC24b2c,
+    dnnl_aBdeC24b2c,
+    dnnl_aBdefC24b2c,
+    dnnl_AcB8a2b,
+    dnnl_AcdB8a2b,
+    dnnl_AcdeB8a2b,
+    dnnl_aBdC8b2c,
+    dnnl_aBdeC8b2c,
+    dnnl_aBdefC8b2c,
+    dnnl_AB8b32a,
+    dnnl_ABc8b32a,
+    dnnl_ABcd8b32a,
+    dnnl_ABcde8b32a,
+    dnnl_AB8b24a,
+    dnnl_ABc8b24a,
+    dnnl_ABcd8b24a,
+    dnnl_ABcde8b24a,
+    dnnl_AB8b16a,
+    dnnl_ABc8b16a,
+    dnnl_ABcd8b16a,
+    dnnl_ABcde8b16a,
+    dnnl_AB8b8a,
+    dnnl_AB4b8a4b,
+    dnnl_AB4b24a4b,
+    dnnl_ABc4b8a4b,
+    dnnl_ABc4b24a4b,
+    dnnl_ABcd4b8a4b,
+    dnnl_ABcd4b24a4b,
+    dnnl_ABcde4b8a4b,
+    dnnl_ABcde4b24a4b,
+    dnnl_AB8b24a2b,
+    dnnl_ABc8b24a2b,
+    dnnl_ABcd8b24a2b,
+    dnnl_ABcde8b24a2b,
+    dnnl_AB8b8a2b,
+    dnnl_ABc8b8a2b,
+    dnnl_ABcd8b8a2b,
+    dnnl_ABcde8b8a2b,
+    dnnl_AcB24a4b,
+    dnnl_AcdB24a4b,
+    dnnl_AcdeB24a4b,
+    dnnl_aBdC24b4c,
+    dnnl_aBdeC24b4c,
+    dnnl_aBdefC24b4c,
+    dnnl_AcB8a4b,
+    dnnl_AcdB8a4b,
+    dnnl_AcdeB8a4b,
+    dnnl_aBdC8b4c,
+    dnnl_aBdeC8b4c,
+    dnnl_aBdefC8b4c,
+    dnnl_Bca8b,
+    dnnl_BcA8b2a,
+    dnnl_Bcda8b,
+    dnnl_BcdA8b2a,
+    dnnl_Bcdea8b,
+    dnnl_BcdeA8b2a,
+    dnnl_aCdb8c,
+    dnnl_aCdB8c2b,
+    dnnl_aCdeb8c,
+    dnnl_aCdeB8c2b,
+    dnnl_aCdefb8c,
+    dnnl_aCdefB8c2b,
+    dnnl_Bca24b,
+    dnnl_BcA24b2a,
+    dnnl_Bcda24b,
+    dnnl_BcdA24b2a,
+    dnnl_Bcdea24b,
+    dnnl_BcdeA24b2a,
+    dnnl_aCdb24c,
+    dnnl_aCdB24c2b,
+    dnnl_aCdeb24c,
+    dnnl_aCdeB24c2b,
+    dnnl_aCdefb24c,
+    dnnl_aCdefB24c2b,
+    dnnl_BcA8b4a,
+    dnnl_BcdA8b4a,
+    dnnl_BcdeA8b4a,
+    dnnl_aCdB8c4b,
+    dnnl_aCdeB8c4b,
+    dnnl_aCdefB8c4b,
+    dnnl_BcA24b4a,
+    dnnl_BcdA24b4a,
+    dnnl_BcdeA24b4a,
+    dnnl_aCdB24c4b,
+    dnnl_aCdeB24c4b,
+    dnnl_aCdefB24c4b,
+    dnnl_AB16b48a,
+    dnnl_ABc16b48a,
+    dnnl_ABcd16b48a,
+    dnnl_ABcde16b48a,
+    dnnl_ABc16a4b,
+    dnnl_ABcd16a4b,
+    dnnl_ABcde16a4b,
+    dnnl_defcbA16a,
+    dnnl_defcbA8a,
+    dnnl_AcB16b64a,
+    dnnl_AcdB16b64a,
+    dnnl_AcdeB16b64a,
+    dnnl_AcB16b48a,
+    dnnl_AcdB16b48a,
+    dnnl_AcdeB16b48a,
+    dnnl_AcB16b32a,
+    dnnl_AcdB16b32a,
+    dnnl_AcdeB16b32a,
+    dnnl_AcB16b16a,
+    dnnl_AcdB16b16a,
+    dnnl_AcdeB16b16a,
+    dnnl_AcB8b32a,
+    dnnl_AcdB8b32a,
+    dnnl_AcdeB8b32a,
+    dnnl_AcB8b24a,
+    dnnl_AcdB8b24a,
+    dnnl_AcdeB8b24a,
+    dnnl_AcB8b16a,
+    dnnl_AcdB8b16a,
+    dnnl_AcdeB8b16a,
+    dnnl_AcB8b8a,
+    dnnl_AcdB8b8a,
+    dnnl_AcdeB8b8a,
+    dnnl_AcB8b64a2b,
+    dnnl_AcdB8b64a2b,
+    dnnl_AcdeB8b64a2b,
+    dnnl_AcB8b32a2b,
+    dnnl_AcdB8b32a2b,
+    dnnl_AcdeB8b32a2b,
+    dnnl_AcB8b24a2b,
+    dnnl_AcdB8b24a2b,
+    dnnl_AcdeB8b24a2b,
+    dnnl_AcB8b16a2b,
+    dnnl_AcdB8b16a2b,
+    dnnl_AcdeB8b16a2b,
+    dnnl_AcB8b8a2b,
+    dnnl_AcdB8b8a2b,
+    dnnl_AcdeB8b8a2b,
+    dnnl_AcB4b64a4b,
+    dnnl_AcdB4b64a4b,
+    dnnl_AcdeB4b64a4b,
+    dnnl_AcB4b32a4b,
+    dnnl_AcdB4b32a4b,
+    dnnl_AcdeB4b32a4b,
+    dnnl_AcB4b24a4b,
+    dnnl_AcdB4b24a4b,
+    dnnl_AcdeB4b24a4b,
+    dnnl_AcB4b16a4b,
+    dnnl_AcdB4b16a4b,
+    dnnl_AcdeB4b16a4b,
+    dnnl_AcB4b8a4b,
+    dnnl_AcdB4b8a4b,
+    dnnl_AcdeB4b8a4b,
+    dnnl_Ab4a,
+    dnnl_Ab8a,
+    dnnl_BA4b4a,
+    dnnl_BA8b4a,
+    dnnl_BA2a24b,
+    dnnl_aCB2b24c,
+    dnnl_BA2a8b,
+    dnnl_aCB2b8c,
+    dnnl_BA8a24b,
+    dnnl_aCB8b24c,
+    dnnl_BA8a16b,
+    dnnl_aCB8b16c,
+    dnnl_BA8a8b,
+    dnnl_aCB8b8c,
+    dnnl_bcad,
+    dnnl_cabd,
+    dnnl_dabc,
+    dnnl_Ab32a,
+    dnnl_aCBd8b8c,
+    dnnl_aCBde8b8c,
+    dnnl_BAc8a8b,
+    dnnl_BAcd8a8b,
+    dnnl_BAcde8a8b,
+    dnnl_aCBdef8b8c,
+    dnnl_abdEC16e4c,
+    dnnl_abDC16d4c,
+
+    /// Just a sentinel, not a real memory format tag. Must be changed after a
+    /// new format tag is added.
+    dnnl_format_tag_last,
+
+    // Aliases
+
+    /// 1D tensor, an alias to #dnnl_a
+    dnnl_x = dnnl_a,
+    /// 2D CNN activations tensor, an alias to #dnnl_ab
+    dnnl_nc = dnnl_ab,
+    /// 2D CNN activations tensor, an alias to #dnnl_ba
+    dnnl_cn = dnnl_ba,
+    /// 2D RNN statistics tensor, an alias to #dnnl_ab
+    dnnl_tn = dnnl_ab,
+    /// 2D RNN statistics tensor, an alias to #dnnl_ba
+    dnnl_nt = dnnl_ba,
+    /// 3D CNN activations tensor, an alias to #dnnl_abc
+    dnnl_ncw = dnnl_abc,
+    /// 3D CNN activations tensor, an alias to #dnnl_acb
+    dnnl_nwc = dnnl_acb,
+    /// 4D CNN activations tensor, an alias to #dnnl_abcd
+    dnnl_nchw = dnnl_abcd,
+    /// 4D CNN activations tensor, an alias to #dnnl_acdb
+    dnnl_nhwc = dnnl_acdb,
+    /// 4D CNN activations tensor, an alias to #dnnl_bcda
+    dnnl_chwn = dnnl_bcda,
+    /// 5D CNN activations tensor, an alias to #dnnl_abcde
+    dnnl_ncdhw = dnnl_abcde,
+    /// 5D CNN activations tensor, an alias to #dnnl_acdeb
+    dnnl_ndhwc = dnnl_acdeb,
+
+    /// 2D CNN weights tensor, an alias to #dnnl_ab
+    dnnl_oi = dnnl_ab,
+    /// 2D CNN weights tensor, an alias to #dnnl_ba
+    dnnl_io = dnnl_ba,
+    /// 3D CNN weights tensor, an alias to #dnnl_abc
+    dnnl_oiw = dnnl_abc,
+    /// 3D CNN weights tensor, an alias to #dnnl_acb
+    dnnl_owi = dnnl_acb,
+    /// 3D CNN weights tensor, an alias to #dnnl_cba
+    dnnl_wio = dnnl_cba,
+    /// 3D CNN weights tensor, an alias to #dnnl_cab
+    dnnl_woi = dnnl_cab,
+    /// 3D CNN weights tensor, an alias to #dnnl_bca
+    dnnl_iwo = dnnl_bca,
+    /// 4D CNN weights tensor, an alias to #dnnl_abcd
+    dnnl_oihw = dnnl_abcd,
+    /// 4D CNN weights tensor, an alias to #dnnl_cdba
+    dnnl_hwio = dnnl_cdba,
+    /// 4D CNN weights tensor, an alias to #dnnl_cdab
+    dnnl_hwoi = dnnl_cdab,
+    /// 4D CNN weights tensor, an alias to #dnnl_acdb
+    dnnl_ohwi = dnnl_acdb,
+    /// 4D CNN weights tensor, an alias to #dnnl_bcda
+    dnnl_ihwo = dnnl_bcda,
+    /// 4D CNN weights tensor, an alias to #dnnl_bacd
+    dnnl_iohw = dnnl_bacd,
+    /// 5D CNN weights tensor, an alias to #dnnl_abcde
+    dnnl_oidhw = dnnl_abcde,
+    /// 5D CNN weights tensor, an alias to #dnnl_bacde
+    dnnl_iodhw = dnnl_bacde,
+    /// 5D CNN weights tensor, an alias to #dnnl_cdeba
+    dnnl_dhwio = dnnl_cdeba,
+    /// 5D CNN weights tensor, an alias to #dnnl_cdeab
+    dnnl_dhwoi = dnnl_cdeab,
+    /// 5D CNN weights tensor, an alias to #dnnl_acdeb
+    dnnl_odhwi = dnnl_acdeb,
+    /// 5D CNN weights tensor, an alias to #dnnl_bcdea
+    dnnl_idhwo = dnnl_bcdea,
+
+    /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abcd
+    dnnl_goiw = dnnl_abcd,
+    /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abdc
+    dnnl_gowi = dnnl_abdc,
+    /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_dcab
+    dnnl_wigo = dnnl_dcab,
+    /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abcde
+    dnnl_goihw = dnnl_abcde,
+    /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abdec
+    dnnl_gohwi = dnnl_abdec,
+    /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_decab
+    dnnl_hwigo = dnnl_decab,
+    /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_acbde
+    dnnl_giohw = dnnl_acbde,
+    /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abcdef
+    dnnl_goidhw = dnnl_abcdef,
+    /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abdefc
+    dnnl_godhwi = dnnl_abdefc,
+    /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_acbdef
+    dnnl_giodhw = dnnl_acbdef,
+    /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_defcab
+    dnnl_dhwigo = dnnl_defcab,
+
+    /// 3D RNN data tensor in the format (seq_length, batch, input channels),
+    /// an alias to #dnnl_abc.
+    dnnl_tnc = dnnl_abc,
+    /// 3D RNN data tensor in the format (batch, seq_length, input channels),
+    /// an alias to #dnnl_bac.
+    dnnl_ntc = dnnl_bac,
+    /// 4D RNN states tensor in the format (num_layers, num_directions,
+    /// batch, state channels), an alias to #dnnl_abcd.
+    dnnl_ldnc = dnnl_abcd,
+    /// 5D RNN weights tensor in the format (num_layers, num_directions,
+    /// input_channels, num_gates, output_channels), an alias to #dnnl_abcde.
+    ///
+    ///  - For LSTM cells, the gates order is input, forget, candidate
+    ///    and output gate.
+    ///  - For GRU cells, the gates order is update, reset and output gate.
+    dnnl_ldigo = dnnl_abcde,
+    /// 5D RNN weights tensor in the format (num_layers, num_directions,
+    /// num_gates, output_channels, input_channels), an alias to #dnnl_abdec.
+    ///
+    ///  - For LSTM cells, the gates order is input, forget, candidate
+    ///    and output gate.
+    ///  - For GRU cells, the gates order is update, reset and output gate.
+    dnnl_ldgoi = dnnl_abdec,
+    /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+    /// num_channels_in_hidden_state, num_channels_in_recurrent_projection),
+    /// an alias to #dnnl_abcd.
+    dnnl_ldio = dnnl_abcd,
+    /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+    /// num_channels_in_recurrent_projection, num_channels_in_hidden_state),
+    /// an alias to #dnnl_abdc.
+    dnnl_ldoi = dnnl_abdc,
+    /// 4D RNN bias tensor in the format (num_layers, num_directions,
+    /// num_gates, output_channels), an alias to #dnnl_abcd.
+    ///
+    ///  - For LSTM cells, the gates order is input, forget, candidate
+    ///    and output gate.
+    ///  - For GRU cells, the gates order is update, reset and output gate.
+    dnnl_ldgo = dnnl_abcd,
+    /// 5D LSTM projection tensor
+    dnnl_ldOi16o = dnnl_abDc16d,
+    dnnl_ldOi32o = dnnl_abDc32d,
+    dnnl_ldOI16o4i = dnnl_abDC16d4c,
+    dnnl_ldOI32o4i = dnnl_abDC32d4c,
+    dnnl_ldIo32i = dnnl_abCd32c,
+    /// 6D RNN weights tensor
+    dnnl_ldgOi16o = dnnl_abdEc16e,
+    dnnl_ldgOI16o4i = dnnl_abdEC16e4c,
+    dnnl_ldgOi32o = dnnl_abdEc32e,
+    dnnl_ldgOI32o2i = dnnl_abdEC32e2c,
+    dnnl_ldgOI32o4i = dnnl_abdEC32e4c,
+    dnnl_ldgOI64o2i = dnnl_abdEC64e2c,
+    dnnl_ldgOI64o4i = dnnl_abdEC64e4c,
+    dnnl_ldgIo16i = dnnl_abdCe16c,
+    dnnl_ldgIo32i = dnnl_abdCe32c,
+    dnnl_ldgIO32i2o = dnnl_abdCE32c2e,
+
+    // Opaque data types; these are not to be used explicitly
+
+    // data
+    /// 5D CNN activations tensor blocked by channels with block size 32,
+    /// an alias to #dnnl_aBcde32b
+    dnnl_nCdhw32c = dnnl_aBcde32b,
+    /// 5D CNN activations tensor blocked by channels with block size 16,
+    /// an alias to #dnnl_aBcde16b
+    dnnl_nCdhw16c = dnnl_aBcde16b,
+    /// 5D CNN activations tensor blocked by channels with block size 4,
+    /// an alias to #dnnl_aBcde4b
+    dnnl_nCdhw4c = dnnl_aBcde4b,
+    /// 5D CNN activations tensor blocked by channels with block size 8,
+    /// an alias to #dnnl_aBcde8b
+    dnnl_nCdhw8c = dnnl_aBcde8b,
+    /// 4D CNN activations tensor blocked by channels with block size 32,
+    /// an alias to #dnnl_aBcd32b
+    dnnl_nChw32c = dnnl_aBcd32b,
+    /// 4D CNN activations tensor blocked by channels with block size 16,
+    /// an alias to #dnnl_aBcd16b
+    dnnl_nChw16c = dnnl_aBcd16b,
+    /// 4D CNN activations tensor blocked by channels with block size 4,
+    /// an alias to #dnnl_aBcd4b
+    dnnl_nChw4c = dnnl_aBcd4b,
+    /// 4D CNN activations tensor blocked by channels with block size 8,
+    /// an alias to #dnnl_aBcd8b
+    dnnl_nChw8c = dnnl_aBcd8b,
+    /// 3D CNN activations tensor blocked by channels with block size 32,
+    /// an alias to #dnnl_aBc32b
+    dnnl_nCw32c = dnnl_aBc32b,
+    /// 3D CNN activations tensor blocked by channels with block size 16,
+    /// an alias to #dnnl_aBc16b
+    dnnl_nCw16c = dnnl_aBc16b,
+    /// 3D CNN activations tensor blocked by channels with block size 4,
+    /// an alias to #dnnl_aBc4b
+    dnnl_nCw4c = dnnl_aBc4b,
+    /// 3D CNN activations tensor blocked by channels with block size 8,
+    /// an alias to #dnnl_aBc8b
+    dnnl_nCw8c = dnnl_aBc8b,
+    dnnl_NCw16n16c = dnnl_ABc16a16b,
+    dnnl_NCdhw16n16c = dnnl_ABcde16a16b,
+    dnnl_NChw16n16c = dnnl_ABcd16a16b,
+    dnnl_NCw32n16c = dnnl_ABc32a16b,
+    dnnl_NChw32n16c = dnnl_ABcd32a16b,
+    dnnl_NChw16n32c = dnnl_ABcd16a32b,
+    dnnl_NCdhw32n16c = dnnl_ABcde32a16b,
+    dnnl_NCw32n32c = dnnl_ABc32a32b,
+    dnnl_NChw32n32c = dnnl_ABcd32a32b,
+    dnnl_NCdhw32n32c = dnnl_ABcde32a32b,
+
+    // weights, 2D
+    dnnl_OI16i16o = dnnl_AB16b16a,
+    dnnl_OI16i32o = dnnl_AB16b32a,
+    dnnl_OI16i48o = dnnl_AB16b48a,
+    dnnl_OI16i64o = dnnl_AB16b64a,
+    dnnl_OI8i8o2i = dnnl_AB8b8a2b,
+    dnnl_OI8i16o2i = dnnl_AB8b16a2b,
+    dnnl_OI8i24o2i = dnnl_AB8b24a2b,
+    dnnl_OI8i32o2i = dnnl_AB8b32a2b,
+    dnnl_OI8i64o2i = dnnl_AB8b64a2b,
+    dnnl_OI4i8o4i = dnnl_AB4b8a4b,
+    dnnl_OI4i16o4i = dnnl_AB4b16a4b,
+    dnnl_OI4i24o4i = dnnl_AB4b24a4b,
+    dnnl_OI4i32o4i = dnnl_AB4b32a4b,
+    dnnl_OI4i64o4i = dnnl_AB4b64a4b,
+    dnnl_OI16i16o4i = dnnl_AB16b16a4b,
+    dnnl_OI8i32o = dnnl_AB8b32a,
+    dnnl_OI8i24o = dnnl_AB8b24a,
+    dnnl_OI8i16o = dnnl_AB8b16a,
+    dnnl_OI8i8o = dnnl_AB8b8a,
+
+    // weights, 3D
+    dnnl_IOw8o8i = dnnl_BAc8a8b,
+    dnnl_IOw16o16i = dnnl_BAc16a16b,
+    dnnl_IOw16i16o = dnnl_BAc16b16a,
+    dnnl_OIw16i16o = dnnl_ABc16b16a,
+    dnnl_OwI16i16o = dnnl_AcB16b16a,
+    dnnl_OIw16i32o = dnnl_ABc16b32a,
+    dnnl_OwI16i32o = dnnl_AcB16b32a,
+    dnnl_OIw16i48o = dnnl_ABc16b48a,
+    dnnl_OwI16i48o = dnnl_AcB16b48a,
+    dnnl_OIw16i64o = dnnl_ABc16b64a,
+    dnnl_OwI16i64o = dnnl_AcB16b64a,
+    dnnl_OIw16o16i = dnnl_ABc16a16b,
+    dnnl_Oiw16o = dnnl_Abc16a,
+    dnnl_OIw4i8o4i = dnnl_ABc4b8a4b,
+    dnnl_OwI4i8o4i = dnnl_AcB4b8a4b,
+    dnnl_OIw4i16o4i = dnnl_ABc4b16a4b,
+    dnnl_OwI4i16o4i = dnnl_AcB4b16a4b,
+    dnnl_OIw4i24o4i = dnnl_ABc4b24a4b,
+    dnnl_OwI4i24o4i = dnnl_AcB4b24a4b,
+    dnnl_OIw4i32o4i = dnnl_ABc4b32a4b,
+    dnnl_OwI4i32o4i = dnnl_AcB4b32a4b,
+    dnnl_OIw4i64o4i = dnnl_ABc4b64a4b,
+    dnnl_OwI4i64o4i = dnnl_AcB4b64a4b,
+    dnnl_OIw2i8o4i = dnnl_ABc2b8a4b,
+    dnnl_OIw16i16o4i = dnnl_ABc16b16a4b,
+    dnnl_OIw16i16o2i = dnnl_ABc16b16a2b,
+    dnnl_OIw16o16i2o = dnnl_ABc16a16b2a,
+    dnnl_OIw4i4o = dnnl_ABc4b4a,
+    dnnl_OIw4o4i = dnnl_ABc4a4b,
+    dnnl_Oiw4o = dnnl_Abc4a,
+    dnnl_OIw8i8o2i = dnnl_ABc8b8a2b,
+    dnnl_OwI8i8o2i = dnnl_AcB8b8a2b,
+    dnnl_OIw8i16o2i = dnnl_ABc8b16a2b,
+    dnnl_OwI8i16o2i = dnnl_AcB8b16a2b,
+    dnnl_OIw8i24o2i = dnnl_ABc8b24a2b,
+    dnnl_OwI8i24o2i = dnnl_AcB8b24a2b,
+    dnnl_OIw8i32o2i = dnnl_ABc8b32a2b,
+    dnnl_OwI8i32o2i = dnnl_AcB8b32a2b,
+    dnnl_OIw8i64o2i = dnnl_ABc8b64a2b,
+    dnnl_OwI8i64o2i = dnnl_AcB8b64a2b,
+    dnnl_OIw8i8o = dnnl_ABc8b8a,
+    dnnl_OwI8i8o = dnnl_AcB8b8a,
+    dnnl_OIw8o16i2o = dnnl_ABc8a16b2a,
+    dnnl_IOw8o16i2o = dnnl_BAc8a16b2a,
+    dnnl_OIw8o8i = dnnl_ABc8a8b,
+    dnnl_OIw8o4i = dnnl_ABc8a4b,
+    dnnl_Owi16o = dnnl_Acb16a,
+    dnnl_OwI16o2i = dnnl_AcB16a2b,
+    dnnl_OwI16o4i = dnnl_AcB16a4b,
+    dnnl_Iwo8i = dnnl_Bca8b,
+    dnnl_IwO8i2o = dnnl_BcA8b2a,
+    dnnl_IwO8i4o = dnnl_BcA8b4a,
+    dnnl_Iwo16i = dnnl_Bca16b,
+    dnnl_IwO16i2o = dnnl_BcA16b2a,
+    dnnl_IwO16i4o = dnnl_BcA16b4a,
+    dnnl_Iwo24i = dnnl_Bca24b,
+    dnnl_IwO24i2o = dnnl_BcA24b2a,
+    dnnl_IwO24i4o = dnnl_BcA24b4a,
+    dnnl_Owi4o = dnnl_Acb4a,
+    dnnl_Owi8o = dnnl_Acb8a,
+    dnnl_OwI8o2i = dnnl_AcB8a2b,
+    dnnl_OIw8i32o = dnnl_ABc8b32a,
+    dnnl_OwI8i32o = dnnl_AcB8b32a,
+    dnnl_OIw8i24o = dnnl_ABc8b24a,
+    dnnl_OwI8i24o = dnnl_AcB8b24a,
+    dnnl_OIw8i16o = dnnl_ABc8b16a,
+    dnnl_OwI8i16o = dnnl_AcB8b16a,
+    dnnl_OwI8o4i = dnnl_AcB8a4b,
+
+    // weights, 4D
+    dnnl_IOhw16i16o = dnnl_BAcd16b16a,
+    dnnl_IOhw8o8i = dnnl_BAcd8a8b,
+    dnnl_IOhw16o16i = dnnl_BAcd16a16b,
+    dnnl_Ohwi16o = dnnl_Acdb16a,
+    dnnl_OhwI16o2i = dnnl_AcdB16a2b,
+    dnnl_OhwI16o4i = dnnl_AcdB16a4b,
+    dnnl_Ihwo8i = dnnl_Bcda8b,
+    dnnl_IhwO8i2o = dnnl_BcdA8b2a,
+    dnnl_IhwO8i4o = dnnl_BcdA8b4a,
+    dnnl_Ihwo16i = dnnl_Bcda16b,
+    dnnl_IhwO16i2o = dnnl_BcdA16b2a,
+    dnnl_IhwO16i4o = dnnl_BcdA16b4a,
+    dnnl_Ihwo24i = dnnl_Bcda24b,
+    dnnl_IhwO24i2o = dnnl_BcdA24b2a,
+    dnnl_IhwO24i4o = dnnl_BcdA24b4a,
+    dnnl_Ohwi24o = dnnl_Acdb24a,
+    dnnl_Ohwi32o = dnnl_Acdb32a,
+    dnnl_Ohwi4o = dnnl_Acdb4a,
+    dnnl_Ohwi8o = dnnl_Acdb8a,
+    dnnl_OhwI8o2i = dnnl_AcdB8a2b,
+    dnnl_OhwI8o4i = dnnl_AcdB8a4b,
+    dnnl_OIhw16i16o = dnnl_ABcd16b16a,
+    dnnl_OhwI16i16o = dnnl_AcdB16b16a,
+    dnnl_OIhw16i32o = dnnl_ABcd16b32a,
+    dnnl_OhwI16i32o = dnnl_AcdB16b32a,
+    dnnl_OIhw16i48o = dnnl_ABcd16b48a,
+    dnnl_OhwI16i48o = dnnl_AcdB16b48a,
+    dnnl_OIhw16i64o = dnnl_ABcd16b64a,
+    dnnl_OhwI16i64o = dnnl_AcdB16b64a,
+    dnnl_OIhw16o16i = dnnl_ABcd16a16b,
+    dnnl_Oihw16o = dnnl_Abcd16a,
+    dnnl_OIhw4i8o4i = dnnl_ABcd4b8a4b,
+    dnnl_OhwI4i8o4i = dnnl_AcdB4b8a4b,
+    dnnl_OIhw4i16o4i = dnnl_ABcd4b16a4b,
+    dnnl_OhwI4i16o4i = dnnl_AcdB4b16a4b,
+    dnnl_OIhw4i24o4i = dnnl_ABcd4b24a4b,
+    dnnl_OhwI4i24o4i = dnnl_AcdB4b24a4b,
+    dnnl_OIhw4i32o4i = dnnl_ABcd4b32a4b,
+    dnnl_OhwI4i32o4i = dnnl_AcdB4b32a4b,
+    dnnl_OIhw4i64o4i = dnnl_ABcd4b64a4b,
+    dnnl_OhwI4i64o4i = dnnl_AcdB4b64a4b,
+    dnnl_OIhw16i16o4i = dnnl_ABcd16b16a4b,
+    dnnl_OIhw16i16o2i = dnnl_ABcd16b16a2b,
+    dnnl_OIhw16o16i2o = dnnl_ABcd16a16b2a,
+    dnnl_OIhw4i4o = dnnl_ABcd4b4a,
+    dnnl_OIhw4o4i = dnnl_ABcd4a4b,
+    dnnl_Oihw4o = dnnl_Abcd4a,
+    dnnl_OIhw8i8o2i = dnnl_ABcd8b8a2b,
+    dnnl_OhwI8i8o2i = dnnl_AcdB8b8a2b,
+    dnnl_OIhw8i16o2i = dnnl_ABcd8b16a2b,
+    dnnl_OhwI8i16o2i = dnnl_AcdB8b16a2b,
+    dnnl_OIhw8i32o2i = dnnl_ABcd8b32a2b,
+    dnnl_OhwI8i32o2i = dnnl_AcdB8b32a2b,
+    dnnl_OIhw8i24o2i = dnnl_ABcd8b24a2b,
+    dnnl_OhwI8i24o2i = dnnl_AcdB8b24a2b,
+    dnnl_OIhw8i64o2i = dnnl_ABcd8b64a2b,
+    dnnl_OhwI8i64o2i = dnnl_AcdB8b64a2b,
+    dnnl_OIhw8i8o = dnnl_ABcd8b8a,
+    dnnl_OhwI8i8o = dnnl_AcdB8b8a,
+    dnnl_OIhw8o16i2o = dnnl_ABcd8a16b2a,
+    dnnl_OIhw2i8o4i = dnnl_ABcd2b8a4b,
+    dnnl_IOhw8o16i2o = dnnl_BAcd8a16b2a,
+    dnnl_OIhw8o8i = dnnl_ABcd8a8b,
+    dnnl_OIhw8o4i = dnnl_ABcd8a4b,
+    dnnl_Owhi16o = dnnl_Adcb16a,
+    dnnl_OIhw8i32o = dnnl_ABcd8b32a,
+    dnnl_OhwI8i32o = dnnl_AcdB8b32a,
+    dnnl_OIhw8i24o = dnnl_ABcd8b24a,
+    dnnl_OhwI8i24o = dnnl_AcdB8b24a,
+    dnnl_OIhw8i16o = dnnl_ABcd8b16a,
+    dnnl_OhwI8i16o = dnnl_AcdB8b16a,
+
+    // weights, 5D
+    dnnl_Odhwi16o = dnnl_Acdeb16a,
+    dnnl_OdhwI16o2i = dnnl_AcdeB16a2b,
+    dnnl_OdhwI16o4i = dnnl_AcdeB16a4b,
+    dnnl_Idhwo8i = dnnl_Bcdea8b,
+    dnnl_IdhwO8i2o = dnnl_BcdeA8b2a,
+    dnnl_IdhwO8i4o = dnnl_BcdeA8b4a,
+    dnnl_Idhwo16i = dnnl_Bcdea16b,
+    dnnl_IdhwO16i2o = dnnl_BcdeA16b2a,
+    dnnl_IdhwO16i4o = dnnl_BcdeA16b4a,
+    dnnl_Idhwo24i = dnnl_Bcdea24b,
+    dnnl_IdhwO24i2o = dnnl_BcdeA24b2a,
+    dnnl_IdhwO24i4o = dnnl_BcdeA24b4a,
+    dnnl_Odhwi4o = dnnl_Acdeb4a,
+    dnnl_Odhwi8o = dnnl_Acdeb8a,
+    dnnl_OdhwI8o2i = dnnl_AcdeB8a2b,
+    dnnl_OdhwI8o4i = dnnl_AcdeB8a4b,
+    dnnl_Odwhi16o = dnnl_Acedb16a,
+    dnnl_OIdhw16i16o = dnnl_ABcde16b16a,
+    dnnl_OdhwI16i16o = dnnl_AcdeB16b16a,
+    dnnl_OIdhw16i32o = dnnl_ABcde16b32a,
+    dnnl_OdhwI16i32o = dnnl_AcdeB16b32a,
+    dnnl_OIdhw16i48o = dnnl_ABcde16b48a,
+    dnnl_OdhwI16i48o = dnnl_AcdeB16b48a,
+    dnnl_OIdhw16i64o = dnnl_ABcde16b64a,
+    dnnl_OdhwI16i64o = dnnl_AcdeB16b64a,
+    dnnl_OIdhw16o16i = dnnl_ABcde16a16b,
+    dnnl_Oidhw16o = dnnl_Abcde16a,
+    dnnl_OIdhw4i4o = dnnl_ABcde4b4a,
+    dnnl_OIdhw4o4i = dnnl_ABcde4a4b,
+    dnnl_Oidhw4o = dnnl_Abcde4a,
+    dnnl_OIdhw8i8o2i = dnnl_ABcde8b8a2b,
+    dnnl_OdhwI8i8o2i = dnnl_AcdeB8b8a2b,
+    dnnl_OIdhw8i16o2i = dnnl_ABcde8b16a2b,
+    dnnl_OdhwI8i16o2i = dnnl_AcdeB8b16a2b,
+    dnnl_OIdhw8i32o2i = dnnl_ABcde8b32a2b,
+    dnnl_OdhwI8i32o2i = dnnl_AcdeB8b32a2b,
+    dnnl_OIdhw8i24o2i = dnnl_ABcde8b24a2b,
+    dnnl_OdhwI8i24o2i = dnnl_AcdeB8b24a2b,
+    dnnl_OIdhw8i64o2i = dnnl_ABcde8b64a2b,
+    dnnl_OdhwI8i64o2i = dnnl_AcdeB8b64a2b,
+    dnnl_OIdhw8i8o = dnnl_ABcde8b8a,
+    dnnl_OdhwI8i8o = dnnl_AcdeB8b8a,
+    dnnl_OIdhw8o16i2o = dnnl_ABcde8a16b2a,
+    dnnl_IOdhw8o16i2o = dnnl_BAcde8a16b2a,
+    dnnl_OIdhw4i8o4i = dnnl_ABcde4b8a4b,
+    dnnl_OdhwI4i8o4i = dnnl_AcdeB4b8a4b,
+    dnnl_OIdhw4i16o4i = dnnl_ABcde4b16a4b,
+    dnnl_OdhwI4i16o4i = dnnl_AcdeB4b16a4b,
+    dnnl_OIdhw4i24o4i = dnnl_ABcde4b24a4b,
+    dnnl_OdhwI4i24o4i = dnnl_AcdeB4b24a4b,
+    dnnl_OIdhw4i32o4i = dnnl_ABcde4b32a4b,
+    dnnl_OdhwI4i32o4i = dnnl_AcdeB4b32a4b,
+    dnnl_OIdhw4i64o4i = dnnl_ABcde4b64a4b,
+    dnnl_OdhwI4i64o4i = dnnl_AcdeB4b64a4b,
+    dnnl_OIdhw16i16o4i = dnnl_ABcde16b16a4b,
+    dnnl_OIdhw16i16o2i = dnnl_ABcde16b16a2b,
+    dnnl_OIdhw2i8o4i = dnnl_ABcde2b8a4b,
+    dnnl_OIdhw8o8i = dnnl_ABcde8a8b,
+    dnnl_OIdhw8o4i = dnnl_ABcde8a4b,
+    dnnl_IOdhw16i16o = dnnl_BAcde16b16a,
+    dnnl_OIdhw4o8i8o4i = dnnl_ABcde4a8b8a4b,
+    dnnl_IOdhw8o8i = dnnl_BAcde8a8b,
+    dnnl_IOdhw16o16i = dnnl_BAcde16a16b,
+    dnnl_OIdhw16o16i2o = dnnl_ABcde16a16b2a,
+    dnnl_OIdhw8i32o = dnnl_ABcde8b32a,
+    dnnl_OdhwI8i32o = dnnl_AcdeB8b32a,
+    dnnl_OIdhw8i24o = dnnl_ABcde8b24a,
+    dnnl_OdhwI8i24o = dnnl_AcdeB8b24a,
+    dnnl_OIdhw8i16o = dnnl_ABcde8b16a,
+    dnnl_OdhwI8i16o = dnnl_AcdeB8b16a,
+
+    // weights w/ groups, 3D (4D incl. the groups dimension)
+    dnnl_Goiw16g = dnnl_Abcd16a,
+    dnnl_Goiw8g = dnnl_Abcd8a,
+    dnnl_Goiw4g = dnnl_Abcd4a,
+    dnnl_gIOw8o8i = dnnl_aCBd8b8c,
+    dnnl_gIOw16o16i = dnnl_aCBd16b16c,
+    dnnl_gIOw16i16o = dnnl_aCBd16c16b,
+    dnnl_gOIw16i16o = dnnl_aBCd16c16b,
+    dnnl_gOIw16o16i = dnnl_aBCd16b16c,
+    dnnl_gOiw16o = dnnl_aBcd16b,
+    dnnl_gOIw4i16o4i = dnnl_aBCd4c16b4c,
+    dnnl_gOIw2i8o4i = dnnl_aBCd2c8b4c,
+    dnnl_gOIw16i16o4i = dnnl_aBCd16c16b4c,
+    dnnl_gOIw16i16o2i = dnnl_aBCd16c16b2c,
+    dnnl_gOIw16o16i2o = dnnl_aBCd16b16c2b,
+    dnnl_gOIw4i4o = dnnl_aBCd4c4b,
+    dnnl_gOIw4o4i = dnnl_aBCd4b4c,
+    dnnl_gOiw4o = dnnl_aBcd4b,
+    dnnl_gOIw8i16o2i = dnnl_aBCd8c16b2c,
+    dnnl_gOIw8i8o = dnnl_aBCd8c8b,
+    dnnl_gOIw8o16i2o = dnnl_aBCd8b16c2b,
+    dnnl_gIOw8o16i2o = dnnl_aCBd8b16c2b,
+    dnnl_gOIw8o8i = dnnl_aBCd8b8c,
+    dnnl_gOIw8o4i = dnnl_aBCd8b4c,
+    dnnl_gOwi16o = dnnl_aBdc16b,
+    dnnl_gOwI16o2i = dnnl_aBdC16b2c,
+    dnnl_gOwI16o4i = dnnl_aBdC16b4c,
+    dnnl_gIwo8i = dnnl_aCdb8c,
+    dnnl_gIwO8i2o = dnnl_aCdB8c2b,
+    dnnl_gIwO8i4o = dnnl_aCdB8c4b,
+    dnnl_gIwo16i = dnnl_aCdb16c,
+    dnnl_gIwO16i2o = dnnl_aCdB16c2b,
+    dnnl_gIwO16i4o = dnnl_aCdB16c4b,
+    dnnl_gIwo24i = dnnl_aCdb24c,
+    dnnl_gIwO24i2o = dnnl_aCdB24c2b,
+    dnnl_gIwO24i4o = dnnl_aCdB24c4b,
+    dnnl_gOwi4o = dnnl_aBdc4b,
+    dnnl_gOwi8o = dnnl_aBdc8b,
+    dnnl_gOwI8o2i = dnnl_aBdC8b2c,
+    dnnl_gOwI8o4i = dnnl_aBdC8b4c,
+    dnnl_Goiw32g = dnnl_Abcd32a,
+    dnnl_gOIw2i4o2i = dnnl_aBCd2c4b2c,
+    dnnl_gOIw2o4i2o = dnnl_aBCd2b4c2b,
+    dnnl_gOIw4i8o2i = dnnl_aBCd4c8b2c,
+    dnnl_gOIw4o8i2o = dnnl_aBCd4b8c2b,
+    dnnl_goIw4i = dnnl_abCd4c,
+    dnnl_goIw32i = dnnl_abCd32c,
+
+    // weights w/ groups, 4D (5D incl. the groups dimension)
+    dnnl_gIOhw16i16o = dnnl_aCBde16c16b,
+    dnnl_gIOhw8o8i = dnnl_aCBde8b8c,
+    dnnl_gIOhw16o16i = dnnl_aCBde16b16c,
+    dnnl_gOhwi16o = dnnl_aBdec16b,
+    dnnl_gOhwI16o2i = dnnl_aBdeC16b2c,
+    dnnl_gOhwI16o4i = dnnl_aBdeC16b4c,
+    dnnl_gIhwo8i = dnnl_aCdeb8c,
+    dnnl_gIhwO8i2o = dnnl_aCdeB8c2b,
+    dnnl_gIhwO8i4o = dnnl_aCdeB8c4b,
+    dnnl_gIhwo16i = dnnl_aCdeb16c,
+    dnnl_gIhwO16i2o = dnnl_aCdeB16c2b,
+    dnnl_gIhwO16i4o = dnnl_aCdeB16c4b,
+    dnnl_gIhwo24i = dnnl_aCdeb24c,
+    dnnl_gIhwO24i2o = dnnl_aCdeB24c2b,
+    dnnl_gIhwO24i4o = dnnl_aCdeB24c4b,
+    dnnl_gOhwi32o = dnnl_aBdec32b,
+    dnnl_gOhwi24o = dnnl_aBdec24b,
+    dnnl_gOhwI24o2i = dnnl_aBdeC24b2c,
+    dnnl_gOhwI24o4i = dnnl_aBdeC24b4c,
+    dnnl_gOhwi4o = dnnl_aBdec4b,
+    dnnl_gOhwi8o = dnnl_aBdec8b,
+    dnnl_gOhwI8o2i = dnnl_aBdeC8b2c,
+    dnnl_gOhwI8o4i = dnnl_aBdeC8b4c,
+    dnnl_Goihw16g = dnnl_Abcde16a,
+    dnnl_gOIhw16i16o = dnnl_aBCde16c16b,
+    dnnl_gOIhw16o16i = dnnl_aBCde16b16c,
+    dnnl_gOihw16o = dnnl_aBcde16b,
+    dnnl_gOIhw2i8o4i = dnnl_aBCde2c8b4c,
+    dnnl_gOIhw4i16o4i = dnnl_aBCde4c16b4c,
+    dnnl_gOIhw16i16o4i = dnnl_aBCde16c16b4c,
+    dnnl_gOIhw16i16o2i = dnnl_aBCde16c16b2c,
+    dnnl_gOIhw16o16i2o = dnnl_aBCde16b16c2b,
+    dnnl_gOIhw4i4o = dnnl_aBCde4c4b,
+    dnnl_gOIhw4o4i = dnnl_aBCde4b4c,
+    dnnl_gOihw4o = dnnl_aBcde4b,
+    dnnl_Goihw8g = dnnl_Abcde8a,
+    dnnl_Goihw4g = dnnl_Abcde4a,
+    dnnl_gOIhw8i16o2i = dnnl_aBCde8c16b2c,
+    dnnl_gOIhw8i8o = dnnl_aBCde8c8b,
+    dnnl_gOIhw8o16i2o = dnnl_aBCde8b16c2b,
+    dnnl_gIOhw8o16i2o = dnnl_aCBde8b16c2b,
+    dnnl_gOIhw8o8i = dnnl_aBCde8b8c,
+    dnnl_gOIhw8o4i = dnnl_aBCde8b4c,
+    dnnl_Goihw32g = dnnl_Abcde32a,
+    dnnl_gOwhi16o = dnnl_aBedc16b,
+    dnnl_goIhw4i = dnnl_abCde4c,
+    dnnl_goIhw32i = dnnl_abCde32c,
+
+    dnnl_OIw4o8i8o4i = dnnl_ABc4a8b8a4b,
+    dnnl_OIhw4o8i8o4i = dnnl_ABcd4a8b8a4b,
+    dnnl_IOw4i8o8i4o = dnnl_BAc4b8a8b4a,
+    dnnl_IOhw4i8o8i4o = dnnl_BAcd4b8a8b4a,
+    dnnl_IOdhw4i8o8i4o = dnnl_BAcde4b8a8b4a,
+
+    dnnl_OIhw2o8i8o2i = dnnl_ABcd2a8b8a2b,
+    dnnl_gOIw4o8i8o4i = dnnl_aBCd4b8c8b4c,
+    dnnl_gOIhw4o8i8o4i = dnnl_aBCde4b8c8b4c,
+    dnnl_gOIdhw4o8i8o4i = dnnl_aBCdef4b8c8b4c,
+    dnnl_gIOw4i8o8i4o = dnnl_aCBd4c8b8c4b,
+    dnnl_gIOhw4i8o8i4o = dnnl_aCBde4c8b8c4b,
+    dnnl_gIOdhw4i8o8i4o = dnnl_aCBdef4c8b8c4b,
+    dnnl_gOIhw2o8i8o2i = dnnl_aBCde2b8c8b2c,
+    dnnl_gOIhw2i4o2i = dnnl_aBCde2c4b2c,
+    dnnl_gOIhw2o4i2o = dnnl_aBCde2b4c2b,
+    dnnl_gOIhw4i8o2i = dnnl_aBCde4c8b2c,
+    dnnl_gOIhw4o8i2o = dnnl_aBCde4b8c2b,
+
+    // weights w/ groups, 6D
+    dnnl_gIOdhw16i16o = dnnl_aCBdef16c16b,
+    dnnl_gIOdhw8o8i = dnnl_aCBdef8b8c,
+    dnnl_gIOdhw16o16i = dnnl_aCBdef16b16c,
+    dnnl_gOdhwi16o = dnnl_aBdefc16b,
+    dnnl_gOdhwI16o2i = dnnl_aBdefC16b2c,
+    dnnl_gOdhwI16o4i = dnnl_aBdefC16b4c,
+    dnnl_gIdhwo8i = dnnl_aCdefb8c,
+    dnnl_gIdhwO8i2o = dnnl_aCdefB8c2b,
+    dnnl_gIdhwO8i4o = dnnl_aCdefB8c4b,
+    dnnl_gIdhwo16i = dnnl_aCdefb16c,
+    dnnl_gIdhwO16i2o = dnnl_aCdefB16c2b,
+    dnnl_gIdhwO16i4o = dnnl_aCdefB16c4b,
+    dnnl_gIdhwo24i = dnnl_aCdefb24c,
+    dnnl_gIdhwO24i2o = dnnl_aCdefB24c2b,
+    dnnl_gIdhwO24i4o = dnnl_aCdefB24c4b,
+    dnnl_gOdhwi4o = dnnl_aBdefc4b,
+    dnnl_gOdhwi8o = dnnl_aBdefc8b,
+    dnnl_gOdhwI8o2i = dnnl_aBdefC8b2c,
+    dnnl_gOdhwI8o4i = dnnl_aBdefC8b4c,
+    dnnl_gOdwhi16o = dnnl_aBdfec16b,
+    dnnl_gOIdhw16i16o = dnnl_aBCdef16c16b,
+    dnnl_gOIdhw4i16o4i = dnnl_aBCdef4c16b4c,
+    dnnl_gOIdhw16i16o4i = dnnl_aBCdef16c16b4c,
+    dnnl_gOIdhw2i8o4i = dnnl_aBCdef2c8b4c,
+    dnnl_gOIdhw16i16o2i = dnnl_aBCdef16c16b2c,
+    dnnl_gOIdhw16o16i = dnnl_aBCdef16b16c,
+    dnnl_gOIdhw16o16i2o = dnnl_aBCdef16b16c2b,
+    dnnl_gOidhw16o = dnnl_aBcdef16b,
+    dnnl_gOIdhw4i4o = dnnl_aBCdef4c4b,
+    dnnl_gOIdhw4o4i = dnnl_aBCdef4b4c,
+    dnnl_gOidhw4o = dnnl_aBcdef4b,
+    dnnl_gOIdhw8i16o2i = dnnl_aBCdef8c16b2c,
+    dnnl_gOIdhw8i8o = dnnl_aBCdef8c8b,
+    dnnl_gOIdhw8o16i2o = dnnl_aBCdef8b16c2b,
+    dnnl_gIOdhw8o16i2o = dnnl_aCBdef8b16c2b,
+    dnnl_gOIdhw8o8i = dnnl_aBCdef8b8c,
+    dnnl_gOIdhw8o4i = dnnl_aBCdef8b4c,
+    dnnl_Goidhw16g = dnnl_Abcdef16a,
+    dnnl_Goidhw32g = dnnl_Abcdef32a,
+    dnnl_gOIdhw2i4o2i = dnnl_aBCdef2c4b2c,
+    dnnl_gOIdhw4i8o2i = dnnl_aBCdef4c8b2c,
+    dnnl_gOIdhw2o4i2o = dnnl_aBCdef2b4c2b,
+    dnnl_gOIdhw4o8i2o = dnnl_aBCdef4b8c2b,
+    dnnl_goIdhw4i = dnnl_abCdef4c,
+    dnnl_goIdhw32i = dnnl_abCdef32c,
+
+    // weights, 3D
+    dnnl_Owi24o = dnnl_Acb24a,
+    dnnl_OwI24o2i = dnnl_AcB24a2b,
+    dnnl_OwI24o4i = dnnl_AcB24a4b,
+    dnnl_Owi32o = dnnl_Acb32a,
+    dnnl_OwI32o2i = dnnl_AcB32a2b,
+    dnnl_OwI32o4i = dnnl_AcB32a4b,
+    dnnl_Owi48o = dnnl_Acb48a,
+    dnnl_OwI48o2i = dnnl_AcB48a2b,
+    dnnl_OwI48o4i = dnnl_AcB48a4b,
+    dnnl_Owi64o = dnnl_Acb64a,
+    dnnl_OwI64o2i = dnnl_AcB64a2b,
+    dnnl_OwI64o4i = dnnl_AcB64a4b,
+    dnnl_Iwo32i = dnnl_Bca32b,
+    dnnl_IwO32i2o = dnnl_BcA32b2a,
+    dnnl_IwO32i4o = dnnl_BcA32b4a,
+    dnnl_Iwo48i = dnnl_Bca48b,
+    dnnl_IwO48i2o = dnnl_BcA48b2a,
+    dnnl_IwO48i4o = dnnl_BcA48b4a,
+    dnnl_Iwo64i = dnnl_Bca64b,
+    dnnl_IwO64i2o = dnnl_BcA64b2a,
+    dnnl_IwO64i4o = dnnl_BcA64b4a,
+    dnnl_wIo2i = dnnl_cBa2b,
+    dnnl_wIo4i = dnnl_cBa4b,
+    dnnl_gOwi24o = dnnl_aBdc24b,
+    dnnl_gOwI24o2i = dnnl_aBdC24b2c,
+    dnnl_gOwI24o4i = dnnl_aBdC24b4c,
+    dnnl_gOwi32o = dnnl_aBdc32b,
+    dnnl_gOwI32o2i = dnnl_aBdC32b2c,
+    dnnl_gOwI32o4i = dnnl_aBdC32b4c,
+    dnnl_gOwi48o = dnnl_aBdc48b,
+    dnnl_gOwI48o2i = dnnl_aBdC48b2c,
+    dnnl_gOwI48o4i = dnnl_aBdC48b4c,
+    dnnl_gOwi64o = dnnl_aBdc64b,
+    dnnl_gOwI64o2i = dnnl_aBdC64b2c,
+    dnnl_gOwI64o4i = dnnl_aBdC64b4c,
+    dnnl_gIwo32i = dnnl_aCdb32c,
+    dnnl_gIwO32i2o = dnnl_aCdB32c2b,
+    dnnl_gIwO32i4o = dnnl_aCdB32c4b,
+    dnnl_gIwo48i = dnnl_aCdb48c,
+    dnnl_gIwO48i2o = dnnl_aCdB48c2b,
+    dnnl_gIwO48i4o = dnnl_aCdB48c4b,
+    dnnl_gIwo64i = dnnl_aCdb64c,
+    dnnl_gIwO64i2o = dnnl_aCdB64c2b,
+    dnnl_gIwO64i4o = dnnl_aCdB64c4b,
+    dnnl_gwio = dnnl_adcb,
+    dnnl_gwIo2i = dnnl_adCb2c,
+    dnnl_gwIo4i = dnnl_adCb4c,
+    // weights, 4D
+    dnnl_OhwI24o = dnnl_Acdb24a,
+    dnnl_OhwI24o2i = dnnl_AcdB24a2b,
+    dnnl_OhwI24o4i = dnnl_AcdB24a4b,
+    dnnl_OhwI32o = dnnl_Acdb32a,
+    dnnl_OhwI32o2i = dnnl_AcdB32a2b,
+    dnnl_OhwI32o4i = dnnl_AcdB32a4b,
+    dnnl_Ohwi48o = dnnl_Acdb48a,
+    dnnl_OhwI48o2i = dnnl_AcdB48a2b,
+    dnnl_OhwI48o4i = dnnl_AcdB48a4b,
+    dnnl_Ohwi64o = dnnl_Acdb64a,
+    dnnl_OhwI64o2i = dnnl_AcdB64a2b,
+    dnnl_OhwI64o4i = dnnl_AcdB64a4b,
+    dnnl_Ihwo32i = dnnl_Bcda32b,
+    dnnl_IhwO32i2o = dnnl_BcdA32b2a,
+    dnnl_IhwO32i4o = dnnl_BcdA32b4a,
+    dnnl_Ihwo48i = dnnl_Bcda48b,
+    dnnl_IhwO48i2o = dnnl_BcdA48b2a,
+    dnnl_IhwO48i4o = dnnl_BcdA48b4a,
+    dnnl_Ihwo64i = dnnl_Bcda64b,
+    dnnl_IhwO64i2o = dnnl_BcdA64b2a,
+    dnnl_IhwO64i4o = dnnl_BcdA64b4a,
+    dnnl_hwIo2i = dnnl_cdBa2b,
+    dnnl_hwIo4i = dnnl_cdBa4b,
+    dnnl_gOhwI24o = dnnl_aBdec24b,
+    dnnl_gOhwI32o = dnnl_aBdec32b,
+    dnnl_gOhwI32o2i = dnnl_aBdeC32b2c,
+    dnnl_gOhwI32o4i = dnnl_aBdeC32b4c,
+    dnnl_gOhwi48o = dnnl_aBdec48b,
+    dnnl_gOhwI48o2i = dnnl_aBdeC48b2c,
+    dnnl_gOhwI48o4i = dnnl_aBdeC48b4c,
+    dnnl_gOhwi64o = dnnl_aBdec64b,
+    dnnl_gOhwI64o2i = dnnl_aBdeC64b2c,
+    dnnl_gOhwI64o4i = dnnl_aBdeC64b4c,
+    dnnl_gIhwo32i = dnnl_aCdeb32c,
+    dnnl_gIhwO32i2o = dnnl_aCdeB32c2b,
+    dnnl_gIhwO32i4o = dnnl_aCdeB32c4b,
+    dnnl_gIhwo48i = dnnl_aCdeb48c,
+    dnnl_gIhwO48i2o = dnnl_aCdeB48c2b,
+    dnnl_gIhwO48i4o = dnnl_aCdeB48c4b,
+    dnnl_gIhwo64i = dnnl_aCdeb64c,
+    dnnl_gIhwO64i2o = dnnl_aCdeB64c2b,
+    dnnl_gIhwO64i4o = dnnl_aCdeB64c4b,
+    dnnl_ghwio = dnnl_adecb,
+    dnnl_ghwIo2i = dnnl_adeCb2c,
+    dnnl_ghwIo4i = dnnl_adeCb4c,
+    // weights, 5D
+    dnnl_Odhwi24o = dnnl_Acdeb24a,
+    dnnl_OdhwI24o2i = dnnl_AcdeB24a2b,
+    dnnl_OdhwI24o4i = dnnl_AcdeB24a4b,
+    dnnl_Odhwi32o = dnnl_Acdeb32a,
+    dnnl_OdhwI32o2i = dnnl_AcdeB32a2b,
+    dnnl_OdhwI32o4i = dnnl_AcdeB32a4b,
+    dnnl_Odhwi48o = dnnl_Acdeb48a,
+    dnnl_OdhwI48o2i = dnnl_AcdeB48a2b,
+    dnnl_OdhwI48o4i = dnnl_AcdeB48a4b,
+    dnnl_Odhwi64o = dnnl_Acdeb64a,
+    dnnl_OdhwI64o2i = dnnl_AcdeB64a2b,
+    dnnl_OdhwI64o4i = dnnl_AcdeB64a4b,
+    dnnl_Idhwo32i = dnnl_Bcdea32b,
+    dnnl_IdhwO32i2o = dnnl_BcdeA32b2a,
+    dnnl_IdhwO32i4o = dnnl_BcdeA32b4a,
+    dnnl_Idhwo48i = dnnl_Bcdea48b,
+    dnnl_IdhwO48i2o = dnnl_BcdeA48b2a,
+    dnnl_IdhwO48i4o = dnnl_BcdeA48b4a,
+    dnnl_Idhwo64i = dnnl_Bcdea64b,
+    dnnl_IdhwO64i2o = dnnl_BcdeA64b2a,
+    dnnl_IdhwO64i4o = dnnl_BcdeA64b4a,
+    dnnl_dhwIo2i = dnnl_cdeBa2b,
+    dnnl_dhwIo4i = dnnl_cdeBa4b,
+    dnnl_gOdhwi24o = dnnl_aBdefc24b,
+    dnnl_gOdhwI24o2i = dnnl_aBdefC24b2c,
+    dnnl_gOdhwI24o4i = dnnl_aBdefC24b4c,
+    dnnl_gOdhwi32o = dnnl_aBdefc32b,
+    dnnl_gOdhwI32o2i = dnnl_aBdefC32b2c,
+    dnnl_gOdhwI32o4i = dnnl_aBdefC32b4c,
+    dnnl_gOdhwi48o = dnnl_aBdefc48b,
+    dnnl_gOdhwI48o2i = dnnl_aBdefC48b2c,
+    dnnl_gOdhwI48o4i = dnnl_aBdefC48b4c,
+    dnnl_gOdhwi64o = dnnl_aBdefc64b,
+    dnnl_gOdhwI64o2i = dnnl_aBdefC64b2c,
+    dnnl_gOdhwI64o4i = dnnl_aBdefC64b4c,
+    dnnl_gIdhwo32i = dnnl_aCdefb32c,
+    dnnl_gIdhwO32i2o = dnnl_aCdefB32c2b,
+    dnnl_gIdhwO32i4o = dnnl_aCdefB32c4b,
+    dnnl_gIdhwo48i = dnnl_aCdefb48c,
+    dnnl_gIdhwO48i2o = dnnl_aCdefB48c2b,
+    dnnl_gIdhwO48i4o = dnnl_aCdefB48c4b,
+    dnnl_gIdhwo64i = dnnl_aCdefb64c,
+    dnnl_gIdhwO64i2o = dnnl_aCdefB64c2b,
+    dnnl_gIdhwO64i4o = dnnl_aCdefB64c4b,
+    dnnl_gdhwio = dnnl_adefcb,
+    dnnl_gdhwIo2i = dnnl_adefCb2c,
+    dnnl_gdhwIo4i = dnnl_adefCb4c,
+    dnnl_OI16i32o4i = dnnl_AB16b32a4b,
+    dnnl_OI16i48o4i = dnnl_AB16b48a4b,
+    dnnl_OI16i64o4i = dnnl_AB16b64a4b,
+    dnnl_OI16i16o2i = dnnl_AB16b16a2b,
+    dnnl_OI16i32o2i = dnnl_AB16b32a2b,
+    dnnl_OI16i48o2i = dnnl_AB16b48a2b,
+    dnnl_OI16i64o2i = dnnl_AB16b64a2b,
+    dnnl_OIw16i32o4i = dnnl_ABc16b32a4b,
+    dnnl_OIw16i48o4i = dnnl_ABc16b48a4b,
+    dnnl_OIw16i64o4i = dnnl_ABc16b64a4b,
+    dnnl_OIw16i32o2i = dnnl_ABc16b32a2b,
+    dnnl_OIw16i48o2i = dnnl_ABc16b48a2b,
+    dnnl_OIw16i64o2i = dnnl_ABc16b64a2b,
+    dnnl_OIhw16i32o4i = dnnl_ABcd16b32a4b,
+    dnnl_OIhw16i48o4i = dnnl_ABcd16b48a4b,
+    dnnl_OIhw16i64o4i = dnnl_ABcd16b64a4b,
+    dnnl_OIhw16i32o2i = dnnl_ABcd16b32a2b,
+    dnnl_OIhw16i48o2i = dnnl_ABcd16b48a2b,
+    dnnl_OIhw16i64o2i = dnnl_ABcd16b64a2b,
+    dnnl_OIdhw16i32o4i = dnnl_ABcde16b32a4b,
+    dnnl_OIdhw16i48o4i = dnnl_ABcde16b48a4b,
+    dnnl_OIdhw16i64o4i = dnnl_ABcde16b64a4b,
+    dnnl_OIdhw16i32o2i = dnnl_ABcde16b32a2b,
+    dnnl_OIdhw16i48o2i = dnnl_ABcde16b48a2b,
+    dnnl_OIdhw16i64o2i = dnnl_ABcde16b64a2b,
+    dnnl_OwI16i16o2i = dnnl_AcB16b16a2b,
+    dnnl_OwI16i16o4i = dnnl_AcB16b16a4b,
+    dnnl_OhwI16i16o2i = dnnl_AcdB16b16a2b,
+    dnnl_OhwI16i16o4i = dnnl_AcdB16b16a4b,
+    dnnl_OdhwI16i16o2i = dnnl_AcdeB16b16a2b,
+    dnnl_OdhwI16i16o4i = dnnl_AcdeB16b16a4b,
+    dnnl_IwO16o16i2o = dnnl_BcA16a16b2a,
+    dnnl_IwO16o16i4o = dnnl_BcA16a16b4a,
+    dnnl_IhwO16o16i2o = dnnl_BcdA16a16b2a,
+    dnnl_IhwO16o16i4o = dnnl_BcdA16a16b4a,
+    dnnl_IdhwO16o16i2o = dnnl_BcdeA16a16b2a,
+    dnnl_IdhwO16o16i4o = dnnl_BcdeA16a16b4a,
+    dnnl_gOwI16i16o2i = dnnl_aBdC16c16b2c,
+    dnnl_gOwI16i16o4i = dnnl_aBdC16c16b4c,
+    dnnl_gOhwI16i16o2i = dnnl_aBdeC16c16b2c,
+    dnnl_gOhwI16i16o4i = dnnl_aBdeC16c16b4c,
+    dnnl_gOdhwI16i16o2i = dnnl_aBdefC16c16b2c,
+    dnnl_gOdhwI16i16o4i = dnnl_aBdefC16c16b4c,
+    dnnl_gIwO16o16i2o = dnnl_aCdB16b16c2b,
+    dnnl_gIwO16o16i4o = dnnl_aCdB16b16c4b,
+    dnnl_gIhwO16o16i2o = dnnl_aCdeB16b16c2b,
+    dnnl_gIhwO16o16i4o = dnnl_aCdeB16b16c4b,
+    dnnl_gIdhwO16o16i2o = dnnl_aCdefB16b16c2b,
+    dnnl_gIdhwO16o16i4o = dnnl_aCdefB16b16c4b,
+    dnnl_OwI16i32o2i = dnnl_AcB16b32a2b,
+    dnnl_OwI16i32o4i = dnnl_AcB16b32a4b,
+    dnnl_OwI16i48o2i = dnnl_AcB16b48a2b,
+    dnnl_OwI16i48o4i = dnnl_AcB16b48a4b,
+    dnnl_OwI16i64o2i = dnnl_AcB16b64a2b,
+    dnnl_OwI16i64o4i = dnnl_AcB16b64a4b,
+    dnnl_IwO16o32i2o = dnnl_BcA16a32b2a,
+    dnnl_IwO16o32i4o = dnnl_BcA16a32b4a,
+    dnnl_IwO16o48i2o = dnnl_BcA16a48b2a,
+    dnnl_IwO16o48i4o = dnnl_BcA16a48b4a,
+    dnnl_IwO16o64i2o = dnnl_BcA16a64b2a,
+    dnnl_IwO16o64i4o = dnnl_BcA16a64b4a,
+    dnnl_gOwI16i32o2i = dnnl_aBdC16c32b2c,
+    dnnl_gOwI16i32o4i = dnnl_aBdC16c32b4c,
+    dnnl_gOwI16i48o2i = dnnl_aBdC16c48b2c,
+    dnnl_gOwI16i48o4i = dnnl_aBdC16c48b4c,
+    dnnl_gOwI16i64o2i = dnnl_aBdC16c64b2c,
+    dnnl_gOwI16i64o4i = dnnl_aBdC16c64b4c,
+    dnnl_gIwO16o32i2o = dnnl_aCdB16b32c2b,
+    dnnl_gIwO16o32i4o = dnnl_aCdB16b32c4b,
+    dnnl_gIwO16o48i2o = dnnl_aCdB16b48c2b,
+    dnnl_gIwO16o48i4o = dnnl_aCdB16b48c4b,
+    dnnl_gIwO16o64i2o = dnnl_aCdB16b64c2b,
+    dnnl_gIwO16o64i4o = dnnl_aCdB16b64c4b,
+    dnnl_OhwI16i32o2i = dnnl_AcdB16b32a2b,
+    dnnl_OhwI16i32o4i = dnnl_AcdB16b32a4b,
+    dnnl_OhwI16i48o2i = dnnl_AcdB16b48a2b,
+    dnnl_OhwI16i48o4i = dnnl_AcdB16b48a4b,
+    dnnl_OhwI16i64o2i = dnnl_AcdB16b64a2b,
+    dnnl_OhwI16i64o4i = dnnl_AcdB16b64a4b,
+    dnnl_IhwO16o32i2o = dnnl_BcdA16a32b2a,
+    dnnl_IhwO16o32i4o = dnnl_BcdA16a32b4a,
+    dnnl_IhwO16o48i2o = dnnl_BcdA16a48b2a,
+    dnnl_IhwO16o48i4o = dnnl_BcdA16a48b4a,
+    dnnl_IhwO16o64i2o = dnnl_BcdA16a64b2a,
+    dnnl_IhwO16o64i4o = dnnl_BcdA16a64b4a,
+    dnnl_gOhwI16i32o2i = dnnl_aBdeC16c32b2c,
+    dnnl_gOhwI16i32o4i = dnnl_aBdeC16c32b4c,
+    dnnl_gOhwI16i48o2i = dnnl_aBdeC16c48b2c,
+    dnnl_gOhwI16i48o4i = dnnl_aBdeC16c48b4c,
+    dnnl_gOhwI16i64o2i = dnnl_aBdeC16c64b2c,
+    dnnl_gOhwI16i64o4i = dnnl_aBdeC16c64b4c,
+    dnnl_gIhwO16o32i2o = dnnl_aCdeB16b32c2b,
+    dnnl_gIhwO16o32i4o = dnnl_aCdeB16b32c4b,
+    dnnl_gIhwO16o48i2o = dnnl_aCdeB16b48c2b,
+    dnnl_gIhwO16o48i4o = dnnl_aCdeB16b48c4b,
+    dnnl_gIhwO16o64i2o = dnnl_aCdeB16b64c2b,
+    dnnl_gIhwO16o64i4o = dnnl_aCdeB16b64c4b,
+    dnnl_OdhwI16i32o2i = dnnl_AcdeB16b32a2b,
+    dnnl_OdhwI16i32o4i = dnnl_AcdeB16b32a4b,
+    dnnl_OdhwI16i48o2i = dnnl_AcdeB16b48a2b,
+    dnnl_OdhwI16i48o4i = dnnl_AcdeB16b48a4b,
+    dnnl_OdhwI16i64o2i = dnnl_AcdeB16b64a2b,
+    dnnl_OdhwI16i64o4i = dnnl_AcdeB16b64a4b,
+    dnnl_IdhwO16o32i2o = dnnl_BcdeA16a32b2a,
+    dnnl_IdhwO16o32i4o = dnnl_BcdeA16a32b4a,
+    dnnl_IdhwO16o48i2o = dnnl_BcdeA16a48b2a,
+    dnnl_IdhwO16o48i4o = dnnl_BcdeA16a48b4a,
+    dnnl_IdhwO16o64i2o = dnnl_BcdeA16a64b2a,
+    dnnl_IdhwO16o64i4o = dnnl_BcdeA16a64b4a,
+    dnnl_gOdhwI16i32o2i = dnnl_aBdefC16c32b2c,
+    dnnl_gOdhwI16i32o4i = dnnl_aBdefC16c32b4c,
+    dnnl_gOdhwI16i48o2i = dnnl_aBdefC16c48b2c,
+    dnnl_gOdhwI16i48o4i = dnnl_aBdefC16c48b4c,
+    dnnl_gOdhwI16i64o2i = dnnl_aBdefC16c64b2c,
+    dnnl_gOdhwI16i64o4i = dnnl_aBdefC16c64b4c,
+    dnnl_gIdhwO16o32i2o = dnnl_aCdefB16b32c2b,
+    dnnl_gIdhwO16o32i4o = dnnl_aCdefB16b32c4b,
+    dnnl_gIdhwO16o48i2o = dnnl_aCdefB16b48c2b,
+    dnnl_gIdhwO16o48i4o = dnnl_aCdefB16b48c4b,
+    dnnl_gIdhwO16o64i2o = dnnl_aCdefB16b64c2b,
+    dnnl_gIdhwO16o64i4o = dnnl_aCdefB16b64c4b,
+    dnnl_hwioG16g = dnnl_decbA16a,
+    dnnl_hwioG8g = dnnl_decbA8a,
+    dnnl_dhwioG16g = dnnl_defcbA16a,
+    dnnl_dhwioG8g = dnnl_defcbA8a,
+    dnnl_NCdhw40n16c = dnnl_ABcde40a16b,
+    dnnl_NCw40n16c = dnnl_ABc40a16b,
+    dnnl_NChw40n16c = dnnl_ABcd40a16b,
+    dnnl_NCw40n32c = dnnl_ABc40a32b,
+    dnnl_NChw40n32c = dnnl_ABcd40a32b,
+    dnnl_NCdhw40n32c = dnnl_ABcde40a32b,
+    dnnl_OIdhw4o8i8o2i = dnnl_ABcde4a8b8a2b,
+    dnnl_OIhw4o8i8o2i = dnnl_ABcd4a8b8a2b,
+    dnnl_OIw4o8i8o2i = dnnl_ABc4a8b8a2b,
+    dnnl_gOIdhw4o8i8o2i = dnnl_aBCdef4b8c8b2c,
+    dnnl_gOIhw4o8i8o2i = dnnl_aBCde4b8c8b2c,
+    dnnl_gOIw4o8i8o2i = dnnl_aBCd4b8c8b2c,
+    dnnl_IOdhw4i8o8i2o = dnnl_BAcde4b8a8b2a,
+    dnnl_IOhw4i8o8i2o = dnnl_BAcd4b8a8b2a,
+    dnnl_IOw4i8o8i2o = dnnl_BAc4b8a8b2a,
+    dnnl_gIOdhw4i8o8i2o = dnnl_aCBdef4c8b8c2b,
+    dnnl_gIOhw4i8o8i2o = dnnl_aCBde4c8b8c2b,
+    dnnl_gIOw4i8o8i2o = dnnl_aCBd4c8b8c2b,
+    dnnl_NCw2c32n8c = dnnl_ABc2b32a8b,
+    dnnl_NChw2c32n8c = dnnl_ABcd2b32a8b,
+    dnnl_NCdhw2c32n8c = dnnl_ABcde2b32a8b,
+    dnnl_OIw2i8o16i4o = dnnl_ABc2b8a16b4a,
+    dnnl_OIhw2i8o16i4o = dnnl_ABcd2b8a16b4a,
+    dnnl_OIdhw2i8o16i4o = dnnl_ABcde2b8a16b4a,
+    dnnl_OIw2o8i16o4i = dnnl_ABc2a8b16a4b,
+    dnnl_OIw2o8i16o2i = dnnl_ABc2a8b16a2b,
+    dnnl_IOw2i8o16i4o = dnnl_BAc2b8a16b4a,
+    dnnl_IOw2i8o16i2o = dnnl_BAc2b8a16b2a,
+    dnnl_OIhw2o8i16o4i = dnnl_ABcd2a8b16a4b,
+    dnnl_OIhw2o8i16o2i = dnnl_ABcd2a8b16a2b,
+    dnnl_IOhw2i8o16i4o = dnnl_BAcd2b8a16b4a,
+    dnnl_IOhw2i8o16i2o = dnnl_BAcd2b8a16b2a,
+    dnnl_OIdhw2o8i16o4i = dnnl_ABcde2a8b16a4b,
+    dnnl_OIdhw2o8i16o2i = dnnl_ABcde2a8b16a2b,
+    dnnl_IOdhw2i8o16i4o = dnnl_BAcde2b8a16b4a,
+    dnnl_IOdhw2i8o16i2o = dnnl_BAcde2b8a16b2a,
+    dnnl_gOIw2o8i16o2i = dnnl_aBCd2b8c16b2c,
+    dnnl_gIOw2i8o16i2o = dnnl_aCBd2c8b16c2b,
+    dnnl_gIOhw2i8o16i2o = dnnl_aBCde2c8b16c2b,
+    dnnl_gIOdhw2i8o16i2o = dnnl_aBCdef2c8b16c2b,
+    dnnl_gOIhw2o8i16o2i = dnnl_aBCde2b8c16b2c,
+    dnnl_gOIdhw2o8i16o2i = dnnl_aBCdef2b8c16b2c,
+    dnnl_gOIw2o8i16o4i = dnnl_aBCd2b8c16b4c,
+    dnnl_gOIhw2o8i16o4i = dnnl_aBCde2b8c16b4c,
+} dnnl_format_tag_t;
+
+/// @} dnnl_api_memory
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Kinds of propagation (forward or backward pass performed by a primitive).
+typedef enum {
+    // TODO: suggest renames
+    /// Undefined propagation type.
+    dnnl_prop_kind_undef = 0,
+    /// Forward data propagation (training mode). In this mode primitives
+    /// perform computations necessary for subsequent backward propagation.
+    dnnl_forward_training = 64,
+    /// Forward data propagation (inference mode). In this mode primitives
+    /// perform only computations that are necessary for inference and omit
+    /// computations that are necessary only for backward propagation.
+    dnnl_forward_inference = 96,
+    /// Forward data propagation (alias for @c dnnl_forward_training).
+    dnnl_forward = dnnl_forward_training,
+    /// Backward propagation (with respect to all parameters).
+    dnnl_backward = 128,
+    /// Backward data propagation.
+    dnnl_backward_data = 160,
+    /// Backward weights propagation.
+    dnnl_backward_weights = 192,
+    /// Backward bias propagation.
+    dnnl_backward_bias = 193,
+} dnnl_prop_kind_t;
+
+/// Kinds of primitives. Used to implement a way to extend the library with new
+/// primitives without changing the ABI.
+typedef enum {
+    /// Undefined primitive.
+    dnnl_undefined_primitive,
+    /// A reorder primitive.
+    dnnl_reorder,
+    /// A shuffle primitive.
+    dnnl_shuffle,
+    /// An (out-of-place) concat primitive.
+    dnnl_concat,
+    /// A sum primitive.
+    dnnl_sum,
+    /// A convolution primitive.
+    dnnl_convolution,
+    /// A deconvolution primitive.
+    dnnl_deconvolution,
+    /// An element-wise primitive.
+    dnnl_eltwise,
+    /// An LRN primitive.
+    dnnl_lrn,
+    /// A batch normalization primitive.
+    dnnl_batch_normalization,
+    /// An inner product primitive.
+    dnnl_inner_product,
+    /// An RNN primitive.
+    dnnl_rnn,
+    /// A matrix multiplication primitive (internal).
+    dnnl_gemm,
+    /// A binary primitive.
+    dnnl_binary,
+    /// A matrix multiplication primitive.
+    dnnl_matmul,
+    /// A resampling primitive.
+    dnnl_resampling,
+    /// A pooling primitive.
+    dnnl_pooling,
+    /// A reduction primitive.
+    dnnl_reduction,
+    /// A PReLU primitive.
+    dnnl_prelu,
+    /// A softmax primitive.
+    dnnl_softmax,
+    /// A layer normalization primitive.
+    dnnl_layer_normalization,
+    /// A group normalization primitive.
+    dnnl_group_normalization,
+
+    /// Parameter to allow internal only primitives without undefined behavior.
+    /// This parameter is chosen to be valid for so long as sizeof(int) >= 2.
+    dnnl_primitive_kind_max = 0x7fff,
+} dnnl_primitive_kind_t;
+
+/// Kinds of algorithms.
+typedef enum {
+    dnnl_alg_kind_undef,
+    /// Direct convolution
+    dnnl_convolution_direct = 0x1,
+    /// Winograd convolution
+    dnnl_convolution_winograd = 0x2,
+    /// Convolution algorithm (either direct or Winograd) is chosen just in time
+    dnnl_convolution_auto = 0x3,
+    /// Direct deconvolution
+    dnnl_deconvolution_direct = 0xa,
+    /// Winograd deconvolution
+    dnnl_deconvolution_winograd = 0xb,
+    /// Eltwise: ReLU
+    dnnl_eltwise_relu = 0x20,
+    /// Eltwise: hyperbolic tangent non-linearity (tanh)
+    dnnl_eltwise_tanh,
+    /// Eltwise: exponential linear unit (elu)
+    dnnl_eltwise_elu,
+    /// Eltwise: square
+    dnnl_eltwise_square,
+    /// Eltwise: abs
+    dnnl_eltwise_abs,
+    /// Eltwise: square root
+    dnnl_eltwise_sqrt,
+    /// Eltwise: linear
+    dnnl_eltwise_linear,
+    /// Eltwise: soft_relu
+    dnnl_eltwise_soft_relu,
+    /// Eltwise: hardsigmoid
+    dnnl_eltwise_hardsigmoid,
+    /// Eltwise: logistic
+    dnnl_eltwise_logistic,
+    /// Eltwise: exponent
+    dnnl_eltwise_exp,
+    /// Eltwise: gelu
+    ///
+    /// @note Tanh approximation formula is used to approximate
+    /// the cumulative distribution function of a Gaussian here
+    dnnl_eltwise_gelu_tanh,
+    /// Eltwise: swish
+    dnnl_eltwise_swish,
+    /// Eltwise: natural logarithm
+    dnnl_eltwise_log,
+    /// Eltwise: clip
+    dnnl_eltwise_clip,
+    /// Eltwise: clip version 2
+    dnnl_eltwise_clip_v2,
+    /// Eltwise: pow
+    dnnl_eltwise_pow,
+    /// Eltwise: erf-based gelu
+    dnnl_eltwise_gelu_erf,
+    /// Eltwise: round
+    dnnl_eltwise_round,
+    /// Eltwise: mish
+    dnnl_eltwise_mish,
+    /// Eltwise: hardswish
+    dnnl_eltwise_hardswish,
+    /// Eltwise: ReLU (dst for backward)
+    dnnl_eltwise_relu_use_dst_for_bwd = 0x100,
+    /// Eltwise: hyperbolic tangent non-linearity (tanh) (dst for backward)
+    dnnl_eltwise_tanh_use_dst_for_bwd,
+    /// Eltwise: exponential linear unit (elu) (dst for backward)
+    dnnl_eltwise_elu_use_dst_for_bwd,
+    /// Eltwise: square root (dst for backward)
+    dnnl_eltwise_sqrt_use_dst_for_bwd,
+    /// Eltwise: logistic (dst for backward)
+    dnnl_eltwise_logistic_use_dst_for_bwd,
+    /// Eltwise: exp (dst for backward)
+    dnnl_eltwise_exp_use_dst_for_bwd,
+    /// Eltwise: clip version 2 (dst for backward)
+    dnnl_eltwise_clip_v2_use_dst_for_bwd,
+    /// Max pooling
+    dnnl_pooling_max = 0x1ff,
+    /// Average pooling include padding
+    dnnl_pooling_avg_include_padding = 0x2ff,
+    /// Average pooling exclude padding
+    dnnl_pooling_avg_exclude_padding = 0x3ff,
+    /// Local response normalization (LRN) across multiple channels
+    dnnl_lrn_across_channels = 0xaff,
+    /// LRN within a single channel
+    dnnl_lrn_within_channel = 0xbff,
+    /// RNN cell
+    dnnl_vanilla_rnn = 0x1fff,
+    /// LSTM cell
+    dnnl_vanilla_lstm = 0x2fff,
+    /// GRU cell
+    dnnl_vanilla_gru = 0x3fff,
+    /// GRU cell with linear before reset
+    ///
+    /// Modification of original GRU cell. Differs from #dnnl_vanilla_gru
+    /// in how the new memory gate is calculated:
+    /// \f[ c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f]
+    /// Primitive expects 4 biases on input:
+    /// \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$
+    dnnl_lbr_gru = 0x4fff,
+    /// AUGRU cell
+    dnnl_vanilla_augru = 0x5fff,
+    /// AUGRU cell with linear before reset
+    dnnl_lbr_augru = 0x6fff,
+    /// Binary add
+    dnnl_binary_add = 0x1fff0,
+    /// Binary mul
+    dnnl_binary_mul = 0x1fff1,
+    /// Binary max
+    dnnl_binary_max = 0x1fff2,
+    /// Binary min
+    dnnl_binary_min = 0x1fff3,
+    /// Binary div
+    dnnl_binary_div = 0x1fff4,
+    /// Binary sub
+    dnnl_binary_sub = 0x1fff5,
+    /// Binary greater or equal
+    dnnl_binary_ge = 0x1fff6,
+    /// Binary greater than
+    dnnl_binary_gt = 0x1fff7,
+    /// Binary less or equal
+    dnnl_binary_le = 0x1fff8,
+    /// Binary less than
+    dnnl_binary_lt = 0x1fff9,
+    /// Binary equal
+    dnnl_binary_eq = 0x1fffa,
+    /// Binary not equal
+    dnnl_binary_ne = 0x1fffb,
+    /// Binary select
+    dnnl_binary_select = 0x1fffc,
+    /// Nearest Neighbor Resampling Method
+    dnnl_resampling_nearest = 0x2fff0,
+    /// Linear Resampling Method
+    dnnl_resampling_linear = 0x2fff1,
+    /// Reduction using max
+    dnnl_reduction_max,
+    /// Reduction using min
+    dnnl_reduction_min,
+    /// Reduction using sum
+    dnnl_reduction_sum,
+    /// Reduction using mul
+    dnnl_reduction_mul,
+    /// Reduction using mean
+    dnnl_reduction_mean,
+    /// Reduction using lp norm (max variant)
+    dnnl_reduction_norm_lp_max,
+    /// Reduction using lp norm (sum variant)
+    dnnl_reduction_norm_lp_sum,
+    /// Reduction using lp norm without final pth-root (max variant)
+    dnnl_reduction_norm_lp_power_p_max,
+    /// Reduction using lp norm without final pth-root (sum variant)
+    dnnl_reduction_norm_lp_power_p_sum,
+    /// Softmax
+    dnnl_softmax_accurate = 0x30000,
+    /// Logsoftmax
+    dnnl_softmax_log,
+} dnnl_alg_kind_t;
+
+/// Flags for normalization primitives.
+typedef enum {
+    /// Use no normalization flags
+    ///
+    /// If specified:
+    ///  - on forward training propagation mean and variance are computed and
+    ///    stored as output
+    ///  - on backward propagation compute full derivative wrt data
+    ///  - on backward propagation prop_kind == #dnnl_backward_data has the same
+    ///    behavior as prop_kind == #dnnl_backward
+    dnnl_normalization_flags_none = 0x0U,
+
+    /// Use global statistics
+    ///
+    /// If specified:
+    ///  - on forward propagation use mean and variance provided by user (input)
+    ///  - on backward propagation reduces the amount of computations, since
+    ///    mean and variance are considered as constants
+    ///
+    /// If not specified:
+    ///  - on forward propagation mean and variance are computed and stored as
+    ///    output
+    ///  - on backward propagation compute full derivative wrt data
+    dnnl_use_global_stats = 0x1U,
+
+    /// Use scale parameter
+    ///
+    /// If specified:
+    ///  - on forward propagation use scale for the normalization results
+    ///  - on backward propagation (for prop_kind == #dnnl_backward) compute
+    ///    diff wrt scale (hence one extra output used)
+    dnnl_use_scale = 0x2U,
+
+    /// Use shift parameter
+    ///
+    /// If specified:
+    ///  - on forward propagation use shift (aka bias) for the normalization
+    ///    results
+    ///  - on backward propagation (for prop_kind == #dnnl_backward) compute
+    ///    diff wrt shift (hence one extra output used)
+    dnnl_use_shift = 0x4U,
+
+    /// Fuse with ReLU
+    ///
+    /// The flag implies negative slope being 0. On training this is the only
+    /// configuration supported. For inference, to use non-zero negative slope
+    /// consider using @ref dev_guide_attributes_post_ops.
+    ///
+    /// If specified:
+    ///  - on inference this option behaves the same as if the primitive were
+    ///    fused with ReLU using post ops API with zero negative slope.
+    ///  - on training primitive requires workspace (required to be able to
+    ///    perform backward pass)
+    dnnl_fuse_norm_relu = 0x8U,
+
+    /// Fuse with Add and then fuse with ReLU
+    ///
+    /// If specified:
+    ///
+    ///  - on forward propagation apply element-wise binary Add operation to
+    ///    the normalization results with an additional input tensor and then
+    ///    apply ReLU with negative slope being 0.
+    ///  - on training primitive requires workspace (required to be able to
+    ///    perform backward pass).
+    ///  - on backward propagation save the result of backward ReLU operation
+    ///    with input tensor and workspace from forward pass to extra output
+    ///    tensor and then perform backward normalization.
+    dnnl_fuse_norm_add_relu = 0x10U,
+
+} dnnl_normalization_flags_t;
+
+/// @} dnnl_api_primitives_common
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_memory
+/// @{
+
+/// A wildcard value for dimensions that are unknown at a primitive creation
+/// time.
+#define DNNL_RUNTIME_DIM_VAL INT64_MIN
+
+/// A `size_t` counterpart of the DNNL_RUNTIME_DIM_VAL.
+/// For instance, this value is returned by dnnl_memory_desc_get_size() if
+/// any of the dimensions or strides is equal to #DNNL_RUNTIME_DIM_VAL.
+#define DNNL_RUNTIME_SIZE_VAL ((size_t)DNNL_RUNTIME_DIM_VAL)
+
+/// @cond DO_NOT_DOCUMENT_THIS
+/// Hex representation for a **special** quiet NAN (!= NAN from math.h)
+static const union {
+    unsigned u;
+    float f;
+} DNNL_RUNTIME_F32_VAL_REP = {0x7fc000d0};
+/// @endcond
+
+/// A wildcard value for floating point values that are unknown at a primitive
+/// creation time.
+#define DNNL_RUNTIME_F32_VAL (DNNL_RUNTIME_F32_VAL_REP.f)
+
+/// @cond DO_NOT_DOCUMENT_THIS
+static const int DNNL_RUNTIME_S32_VAL_REP = INT32_MIN;
+/// @endcond
+
+/// A wildcard value for int32_t values that are unknown at a primitive creation
+/// time.
+#define DNNL_RUNTIME_S32_VAL DNNL_RUNTIME_S32_VAL_REP
+
+/// @struct dnnl_memory_desc
+/// An opaque structure to describe a memory descriptor.
+struct dnnl_memory_desc;
+
+/// A memory descriptor handle.
+typedef struct dnnl_memory_desc *dnnl_memory_desc_t;
+
+/// A constant memory descriptor handle.
+typedef const struct dnnl_memory_desc *const_dnnl_memory_desc_t;
+
+/// @struct dnnl_memory
+/// An opaque structure to describe a memory.
+struct dnnl_memory;
+
+/// A memory handle.
+typedef struct dnnl_memory *dnnl_memory_t;
+
+/// A constant memory handle.
+typedef const struct dnnl_memory *const_dnnl_memory_t;
+
+/// @} dnnl_api_memory
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+
+/// @addtogroup dnnl_api_rnn
+/// @{
+
+/// Flags for RNN cell.
+typedef enum {
+    /// Undefined RNN flags.
+    dnnl_rnn_flags_undef = 0x0,
+    /// Overwrite diff_weights memory instead of adding weights gradient to it.
+    dnnl_rnn_flags_diff_weights_overwrite = 0x1,
+} dnnl_rnn_flags_t;
+
+/// A direction of RNN primitive execution (unidirectional or bidirectional).
+typedef enum {
+    /// Undefined RNN direction.
+    dnnl_rnn_direction_undef = 0,
+    /// Unidirectional execution of RNN primitive from left to right.
+    dnnl_unidirectional_left2right,
+    /// Unidirectional execution of RNN primitive from right to left.
+    dnnl_unidirectional_right2left,
+    /// Bidirectional execution of RNN primitive with concatenation of the
+    /// results.
+    dnnl_bidirectional_concat,
+    /// Bidirectional execution of RNN primitive with summation of the
+    /// results.
+    dnnl_bidirectional_sum,
+} dnnl_rnn_direction_t;
+
+/// @} dnnl_api_rnn
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_primitives
+/// @{
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// @struct dnnl_primitive_desc
+/// @brief An opaque structure to describe a primitive descriptor.
+struct dnnl_primitive_desc;
+
+/// @brief A primitive descriptor handle.
+typedef struct dnnl_primitive_desc *dnnl_primitive_desc_t;
+
+/// @brief A constant primitive descriptor handle.
+typedef const struct dnnl_primitive_desc *const_dnnl_primitive_desc_t;
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_attributes
+/// @{
+
+/// Scratchpad mode: who manages the scratchpad allocation (library or user).
+typedef enum {
+    /// The library manages the scratchpad allocation according to the policy
+    /// specified by the `DNNL_ENABLE_CONCURRENT_EXEC`
+    /// [build option](@ref dev_guide_build_options) (default).
+    ///
+    /// When `DNNL_ENABLE_CONCURRENT_EXEC=OFF` (default), the library
+    /// scratchpad is common to all primitives to reduce the memory footprint.
+    /// This configuration comes with limited thread-safety properties, namely
+    /// primitives can be created and executed in parallel but cannot migrate
+    /// between threads (in other words, each primitive should be executed in
+    /// the same thread it was created in).
+    ///
+    /// When `DNNL_ENABLE_CONCURRENT_EXEC=ON`, the library scratchpad is
+    /// private to each primitive. The memory footprint is larger than when
+    /// using `DNNL_ENABLE_CONCURRENT_EXEC=OFF` but different primitives can be
+    /// created and run concurrently (the same primitive cannot be run
+    /// concurrently from two different threads though).
+    dnnl_scratchpad_mode_library,
+    /// The user manages the scratchpad allocation by querying and providing
+    /// the scratchpad memory to primitives. This mode is thread-safe as long
+    /// as the scratchpad buffers are not used concurrently by two primitive
+    /// executions.
+    dnnl_scratchpad_mode_user,
+} dnnl_scratchpad_mode_t;
+
+/// Rounding mode.
+typedef enum {
+    /// Rounding mode dictated by the floating-point environment.
+    dnnl_rounding_mode_environment,
+    /// Stochastic rounding mode where a random bias is added to the
+    /// trailing mantissa bits before conversion.
+    dnnl_rounding_mode_stochastic,
+} dnnl_rounding_mode_t;
+
+/// @struct dnnl_primitive_attr
+/// @brief An opaque structure for primitive descriptor attributes.
+///
+/// Attributes may contain:
+///  - output scales (to scale the result prior to storing it to the memory)
+struct dnnl_primitive_attr;
+
+/// @brief A primitive descriptor attributes handle that controls primitive
+/// behavior.
+typedef struct dnnl_primitive_attr *dnnl_primitive_attr_t;
+
+/// @brief A constant primitive descriptor attributes handle.
+typedef const struct dnnl_primitive_attr *const_dnnl_primitive_attr_t;
+
+/// @struct dnnl_post_ops
+/// @brief An opaque structure for a chain of post operations.
+///
+/// dnnl_post_ops can be used to perform some (trivial) operations like
+/// accumulation or eltwise after certain primitives like convolution.
+///
+/// Post operations might be combined together, making a chain of post
+/// operations. For instance one can configure convolution followed by
+/// accumulation followed by eltwise. This might be especially beneficial
+/// for residual learning blocks.
+///
+/// @warning
+///      Of course not all combinations are supported, so the user should handle
+///      errors accordingly.
+///
+/// Supported post operations:
+///  - accumulation (base primitive: convolution)
+///  - eltwise (base primitive: convolution)
+struct dnnl_post_ops;
+
+/// @brief A post operation chain handle.
+typedef struct dnnl_post_ops *dnnl_post_ops_t;
+
+/// @brief A constant post operation chain handle.
+typedef const struct dnnl_post_ops *const_dnnl_post_ops_t;
+
+/// @} dnnl_api_attributes
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// @struct dnnl_primitive
+/// An opaque structure to describe a primitive.
+struct dnnl_primitive;
+/// A primitive handle.
+typedef struct dnnl_primitive *dnnl_primitive_t;
+/// A constant primitive handle.
+typedef const struct dnnl_primitive *const_dnnl_primitive_t;
+
+/// Undefined argument.
+#define DNNL_ARG_UNDEF 0
+/// Source argument #0.
+#define DNNL_ARG_SRC_0 1
+/// A special mnemonic for source argument for primitives that have a
+/// single source. An alias for #DNNL_ARG_SRC_0.
+#define DNNL_ARG_SRC DNNL_ARG_SRC_0
+/// A special mnemonic for RNN input vector. An alias for
+/// #DNNL_ARG_SRC_0.
+#define DNNL_ARG_SRC_LAYER DNNL_ARG_SRC_0
+/// A special mnemonic for reorder source argument. An alias for
+/// #DNNL_ARG_SRC_0.
+#define DNNL_ARG_FROM DNNL_ARG_SRC_0
+
+/// Source argument #1.
+#define DNNL_ARG_SRC_1 2
+/// A special mnemonic for RNN input recurrent hidden state vector. An alias
+/// for #DNNL_ARG_SRC_1.
+#define DNNL_ARG_SRC_ITER DNNL_ARG_SRC_1
+
+/// Source argument #2.
+#define DNNL_ARG_SRC_2 3
+/// A special mnemonic for RNN input recurrent cell state vector. An alias for
+/// #DNNL_ARG_SRC_2.
+#define DNNL_ARG_SRC_ITER_C DNNL_ARG_SRC_2
+
+/// Source argument #3.
+#define DNNL_ARG_SRC_3 4
+/// A special mnemonic for RNN input recurrent cell attention vector. An alias for
+/// #DNNL_ARG_SRC_3.
+#define DNNL_ARG_AUGRU_ATTENTION DNNL_ARG_SRC_3
+
+/// Destination argument #0.
+#define DNNL_ARG_DST_0 17
+/// A special mnemonic for destination argument for primitives that have a
+/// single destination. An alias for #DNNL_ARG_DST_0.
+#define DNNL_ARG_DST DNNL_ARG_DST_0
+/// A special mnemonic for reorder destination argument. An alias for
+/// #DNNL_ARG_DST_0.
+#define DNNL_ARG_TO DNNL_ARG_DST_0
+/// A special mnemonic for RNN output vector. An alias for #DNNL_ARG_DST_0.
+#define DNNL_ARG_DST_LAYER DNNL_ARG_DST_0
+
+/// Destination argument #1.
+#define DNNL_ARG_DST_1 18
+/// A special mnemonic for RNN output recurrent hidden state vector. An
+/// alias for #DNNL_ARG_DST_1.
+#define DNNL_ARG_DST_ITER DNNL_ARG_DST_1
+
+/// Destination argument #2.
+#define DNNL_ARG_DST_2 19
+/// A special mnemonic for LSTM output recurrent cell state vector. An
+/// alias for #DNNL_ARG_DST_2.
+#define DNNL_ARG_DST_ITER_C DNNL_ARG_DST_2
+
+/// Weights argument #0.
+#define DNNL_ARG_WEIGHTS_0 33
+/// A special mnemonic for primitives that have a single weights
+/// argument. Alias for #DNNL_ARG_WEIGHTS_0.
+#define DNNL_ARG_WEIGHTS DNNL_ARG_WEIGHTS_0
+/// A special mnemonic for RNN weights applied to the layer input. An
+/// alias for #DNNL_ARG_WEIGHTS_0.
+#define DNNL_ARG_WEIGHTS_LAYER DNNL_ARG_WEIGHTS_0
+
+/// Weights argument #1.
+#define DNNL_ARG_WEIGHTS_1 34
+/// A special mnemonic for RNN weights applied to the recurrent input.
+/// An alias for #DNNL_ARG_WEIGHTS_1.
+#define DNNL_ARG_WEIGHTS_ITER DNNL_ARG_WEIGHTS_1
+
+/// Weights argument #2.
+#define DNNL_ARG_WEIGHTS_2 35
+/// A special mnemonic for RNN weights applied to the peephole weights.
+/// An alias for #DNNL_ARG_WEIGHTS_2.
+#define DNNL_ARG_WEIGHTS_PEEPHOLE DNNL_ARG_WEIGHTS_2
+
+/// Weights argument #3.
+#define DNNL_ARG_WEIGHTS_3 36
+/// A special mnemonic for RNN weights applied to the projection weights.
+/// An alias for #DNNL_ARG_WEIGHTS_3.
+#define DNNL_ARG_WEIGHTS_PROJECTION DNNL_ARG_WEIGHTS_3
+
+/// Bias tensor argument.
+#define DNNL_ARG_BIAS 41
+
+/// Mean values tensor argument.
+#define DNNL_ARG_MEAN 49
+/// Variance values tensor argument.
+#define DNNL_ARG_VARIANCE 50
+
+/// A special mnemonic for scale argument of normalization primitives.
+#define DNNL_ARG_SCALE 51
+/// A special mnemonic for shift argument of normalization primitives.
+#define DNNL_ARG_SHIFT 52
+
+/// Workspace tensor argument. Workspace is used to pass information
+/// from forward propagation to backward propagation computations.
+#define DNNL_ARG_WORKSPACE 64
+/// Scratchpad (temporary storage) tensor argument.
+#define DNNL_ARG_SCRATCHPAD 80
+
+/// Gradient (diff) of the source argument #0.
+#define DNNL_ARG_DIFF_SRC_0 129
+/// A special mnemonic for primitives that have a single diff source argument.
+/// An alias for #DNNL_ARG_DIFF_SRC_0.
+#define DNNL_ARG_DIFF_SRC DNNL_ARG_DIFF_SRC_0
+/// A special mnemonic for gradient (diff) of RNN input vector. An alias for
+/// #DNNL_ARG_DIFF_SRC_0.
+#define DNNL_ARG_DIFF_SRC_LAYER DNNL_ARG_DIFF_SRC_0
+
+/// Gradient (diff) of the source argument #1.
+#define DNNL_ARG_DIFF_SRC_1 130
+/// A special mnemonic for gradient (diff) of RNN input recurrent hidden state
+/// vector. An alias for #DNNL_ARG_DIFF_SRC_1.
+#define DNNL_ARG_DIFF_SRC_ITER DNNL_ARG_DIFF_SRC_1
+
+/// Gradient (diff) of the source argument #2.
+#define DNNL_ARG_DIFF_SRC_2 131
+/// A special mnemonic for gradient (diff) of RNN input recurrent cell state
+/// vector. An alias for #DNNL_ARG_DIFF_SRC_2.
+#define DNNL_ARG_DIFF_SRC_ITER_C DNNL_ARG_DIFF_SRC_2
+
+/// Gradient (diff) of the source argument #3.
+#define DNNL_ARG_DIFF_SRC_3 132
+/// A special mnemonic for gradient (diff) of RNN input recurrent cell attention
+/// vector. An alias for #DNNL_ARG_DIFF_SRC_3.
+#define DNNL_ARG_DIFF_AUGRU_ATTENTION DNNL_ARG_DIFF_SRC_3
+
+/// Gradient (diff) of the destination argument #0.
+#define DNNL_ARG_DIFF_DST_0 145
+/// A special mnemonic for primitives that have a single diff destination
+/// argument. An alias for #DNNL_ARG_DIFF_DST_0.
+#define DNNL_ARG_DIFF_DST DNNL_ARG_DIFF_DST_0
+/// A special mnemonic for gradient (diff) of RNN output vector. An alias for
+/// #DNNL_ARG_DIFF_DST_0.
+#define DNNL_ARG_DIFF_DST_LAYER DNNL_ARG_DIFF_DST_0
+
+/// Gradient (diff) of the destination argument #1.
+#define DNNL_ARG_DIFF_DST_1 146
+/// A special mnemonic for gradient (diff) of RNN output recurrent hidden state
+/// vector. An alias for #DNNL_ARG_DIFF_DST_1.
+#define DNNL_ARG_DIFF_DST_ITER DNNL_ARG_DIFF_DST_1
+
+/// Gradient (diff) of the destination argument #2.
+#define DNNL_ARG_DIFF_DST_2 147
+/// A special mnemonic for gradient (diff) of RNN output recurrent cell state
+/// vector. An alias for #DNNL_ARG_DIFF_DST_2.
+#define DNNL_ARG_DIFF_DST_ITER_C DNNL_ARG_DIFF_DST_2
+
+/// Gradient (diff) of the weights argument #0.
+#define DNNL_ARG_DIFF_WEIGHTS_0 161
+/// A special mnemonic for primitives that have a single diff weights
+/// argument. Alias for #DNNL_ARG_DIFF_WEIGHTS_0.
+#define DNNL_ARG_DIFF_WEIGHTS DNNL_ARG_DIFF_WEIGHTS_0
+/// A special mnemonic for diff of RNN weights applied to the layer input. An
+/// alias for #DNNL_ARG_DIFF_WEIGHTS_0.
+#define DNNL_ARG_DIFF_WEIGHTS_LAYER DNNL_ARG_DIFF_WEIGHTS_0
+
+/// Gradient (diff) of the weights argument #1.
+#define DNNL_ARG_DIFF_WEIGHTS_1 162
+/// A special mnemonic for diff of RNN weights applied to the recurrent input.
+/// An alias for #DNNL_ARG_DIFF_WEIGHTS_1.
+#define DNNL_ARG_DIFF_WEIGHTS_ITER DNNL_ARG_DIFF_WEIGHTS_1
+
+/// Gradient (diff) of the weights argument #2.
+#define DNNL_ARG_DIFF_WEIGHTS_2 163
+/// A special mnemonic for diff of RNN weights applied to the peephole weights.
+/// An alias for #DNNL_ARG_DIFF_WEIGHTS_2.
+#define DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE DNNL_ARG_DIFF_WEIGHTS_2
+
+/// Gradient (diff) of the weights argument #3.
+#define DNNL_ARG_DIFF_WEIGHTS_3 164
+/// A special mnemonic for diff of RNN weights applied to the projection
+/// weights. An alias for #DNNL_ARG_DIFF_WEIGHTS_3.
+#define DNNL_ARG_DIFF_WEIGHTS_PROJECTION DNNL_ARG_DIFF_WEIGHTS_3
+
+/// Gradient (diff) of the bias tensor argument.
+#define DNNL_ARG_DIFF_BIAS 169
+
+/// A special mnemonic for gradient (diff) of the scale argument of
+/// normalization primitives.
+#define DNNL_ARG_DIFF_SCALE 255
+/// A special mnemonic for gradient (diff) of the shift argument of
+/// normalization primitives.
+#define DNNL_ARG_DIFF_SHIFT 256
+
+/// Rounding mode seed for stochastic rounding. A single seed is needed
+/// regardless of how many arguments need stochastic rounding.
+#define DNNL_ARG_ATTR_ROUNDING_SEED 508
+
+/// Dropout mask output buffer.
+#define DNNL_ARG_ATTR_DROPOUT_MASK 509
+
+/// Dropout probability value passed via a buffer.
+#define DNNL_ARG_ATTR_DROPOUT_PROBABILITY 510
+
+/// Dropout RNG seed value passed via a buffer.
+#define DNNL_ARG_ATTR_DROPOUT_SEED 511
+
+/// Output scaling factors provided at execution time.
+#define DNNL_ARG_ATTR_OUTPUT_SCALES 513
+
+/// Starting index for source arguments for primitives that take a variable
+/// number of source arguments.
+#define DNNL_ARG_MULTIPLE_SRC 1024
+/// Starting index for destination arguments for primitives that produce a
+/// variable number of destination arguments.
+#define DNNL_ARG_MULTIPLE_DST 2048
+
+/// Scaling factors provided at execution time.
+#define DNNL_ARG_ATTR_SCALES 4096
+
+/// Zero points provided at execution time.
+#define DNNL_ARG_ATTR_ZERO_POINTS 8192
+
+/// Arguments for fused depthwise convolution.
+/// See @ref dev_guide_attributes_post_ops_depthwise_fusion
+#define DNNL_ARG_ATTR_POST_OP_DW 16384
+
+/// Starting point for a binary post operation.
+#define DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE 32768
+
+/// Arguments for a binary post operation. Up to 32 arguments are supported.
+/// See @ref dev_guide_attributes_post_ops_binary_fusion
+#define DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) \
+    (DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE * ((idx) + 1))
+
+/// A structure that pairs an argument index with a memory object. It is used
+/// to pass arguments to dnnl_primitive_execute().
+typedef struct {
+    int arg; ///< An argument index (a DNNL_ARG_* value), e.g. DNNL_ARG_SRC
+    dnnl_memory_t memory; ///< Input/output memory bound to that index
+} dnnl_exec_arg_t;
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Primitive descriptor query specification
+///
+/// For generic function dnnl_primitive_desc_query(), the type of result must
+/// agree with the queried argument. The correspondence table:
+///
+/// Query kind                      | Type of query result
+/// --------------------------------|-----------------------------
+/// dnnl_query_*_engine             | #dnnl_engine_t *
+/// #dnnl_query_primitive_kind      | #dnnl_primitive_kind_t *
+/// dnnl_query_*_s32                | int *
+/// dnnl_query_*_s64                | #dnnl_dim_t * (same as int64_t *)
+/// dnnl_query_*_f32                | float *
+/// dnnl_query_*_f64                | double *
+/// dnnl_query_*_str                | const char **
+/// dnnl_query_*_md                 | #const_dnnl_memory_desc_t *
+/// dnnl_query_*_pd                 | #const_dnnl_primitive_desc_t *
+/// dnnl_query_cache_blob_id        | const uint8_t **
+/// dnnl_query_strides              | const #dnnl_dims_t **
+/// dnnl_query_dilations            | const #dnnl_dims_t **
+/// dnnl_query_padding_l            | const #dnnl_dims_t **
+/// dnnl_query_padding_r            | const #dnnl_dims_t **
+/// dnnl_query_flags                | unsigned *
+/// dnnl_query_alg_kind             | #dnnl_alg_kind_t *
+/// dnnl_query_factors              | const float **
+/// dnnl_query_cell_kind            | #dnnl_alg_kind_t *
+/// dnnl_query_direction            | #dnnl_rnn_direction_t *
+/// dnnl_query_activation_kind      | #dnnl_alg_kind_t *
+/// dnnl_query_kernel               | const #dnnl_dims_t **
+/// dnnl_query_dims                 | const #dnnl_dims_t **
+/// dnnl_query_data_type            | #dnnl_data_type_t *
+/// dnnl_query_padded_dims          | const #dnnl_dims_t **
+/// dnnl_query_padded_offsets       | const #dnnl_dims_t **
+/// dnnl_query_format_kind          | #dnnl_format_kind_t *
+/// dnnl_query_inner_blks           | const #dnnl_dims_t **
+/// dnnl_query_inner_idxs           | const #dnnl_dims_t **
+/// dnnl_query_sparse_encoding      | #dnnl_sparse_encoding_t *
+///
+/// @note
+///     Rule of thumb: all opaque types and structures are returned by
+///     reference. All numbers are returned by value.
+///
+/// @warning
+///     All returned references point to constant objects and are valid only
+///     during the lifetime of the queried primitive descriptor. Returned objects
+///     must not be destroyed by the user. If you need to keep the object longer
+///     than the lifetime of the queried primitive descriptor, use
+///     dnnl_primitive_desc_clone() to make a copy.
+typedef enum {
+    dnnl_query_undef = 0, ///< no query
+
+    dnnl_query_engine, ///< execution engine
+    dnnl_query_primitive_kind, ///< primitive kind
+
+    dnnl_query_num_of_inputs_s32, ///< number of inputs expected
+    dnnl_query_num_of_outputs_s32, ///< number of outputs expected
+
+    dnnl_query_time_estimate_f64, ///< runtime estimation (seconds)
+    dnnl_query_memory_consumption_s64, ///< memory consumption -- extra
+    ///  (scratch) memory, additional to
+    ///  all inputs and outputs memory
+    ///  (bytes)
+
+    dnnl_query_scratchpad_engine, ///< scratchpad engine -- engine to be used
+    ///  for creating scratchpad memory
+
+    dnnl_query_impl_info_str, ///< implementation name
+
+    dnnl_query_reorder_src_engine, ///< source engine
+    dnnl_query_reorder_dst_engine, ///< destination engine
+
+    dnnl_query_prop_kind, ///< propagation kind
+
+    dnnl_query_cache_blob_id_size_s64, ///< size of cache blob ID in bytes
+    dnnl_query_cache_blob_id, ///< cache blob ID (pointer to array)
+
+    dnnl_query_strides, ///< strides
+    dnnl_query_dilations, ///< dilations
+    dnnl_query_padding_l, ///< left padding
+    dnnl_query_padding_r, ///< right padding
+    dnnl_query_epsilon_f32, ///< epsilon
+    dnnl_query_flags, ///< flags
+    dnnl_query_alg_kind, ///< algorithm kind
+    dnnl_query_alpha_f32, ///< alpha
+    dnnl_query_beta_f32, ///< beta
+    dnnl_query_axis_s32, ///< axis
+    dnnl_query_local_size_s64, ///< LRN parameter local size
+    dnnl_query_k_f32, ///< LRN parameter K
+    dnnl_query_p_f32, ///< Reduction parameter P
+    dnnl_query_factors, ///< Resampling parameter factors
+    dnnl_query_cell_kind, ///< RNN parameter cell kind
+    dnnl_query_direction, ///< RNN parameter direction
+    dnnl_query_activation_kind, ///< RNN parameter activation kind
+    dnnl_query_kernel, ///< Pooling parameter kernel
+    dnnl_query_group_size_s64, ///< Shuffle parameter group size
+
+    // memory descriptor section
+    dnnl_query_some_md = 128, ///< stub
+    dnnl_query_src_md, ///< source memory desc
+    dnnl_query_diff_src_md, ///< source gradient memory desc
+    dnnl_query_weights_md, ///< weights memory desc
+    dnnl_query_diff_weights_md, ///< weights grad. memory desc
+    dnnl_query_dst_md, ///< destination memory desc
+    dnnl_query_diff_dst_md, ///< destination grad. memory desc
+    dnnl_query_workspace_md, ///< workspace memory desc
+    dnnl_query_scratchpad_md, ///< scratchpad memory desc
+    dnnl_query_exec_arg_md = 255, ///< memory desc of an execute argument
+
+    dnnl_query_ndims_s32, ///< number of dimensions
+    dnnl_query_dims, ///< vector of dimensions
+    dnnl_query_data_type, ///< data type
+    dnnl_query_submemory_offset_s64, ///< submemory offset
+    dnnl_query_padded_dims, ///< vector of padded dimensions
+    dnnl_query_padded_offsets, ///< vector of padded offsets
+    dnnl_query_format_kind, ///< format kind
+    dnnl_query_inner_nblks_s32, ///< number of innermost blocks
+    dnnl_query_inner_blks, ///< vector of sizes of the innermost blocks
+    dnnl_query_inner_idxs, ///< vector of logical indices of the blocks
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    dnnl_query_sparse_encoding, ///< Sparse encoding
+    dnnl_query_nnz_s64, ///< Number of non-zero entries
+    dnnl_query_num_handles_s32, ///< Number of buffers required for a memory
+///  descriptor
+#endif
+    // Max value of dnnl_query_t; for internal use only, present to prevent UB.
+    dnnl_query_max = 0x7fff,
+} dnnl_query_t;
+
+/// @} dnnl_api_primitives_common
+
+/// @} dnnl_api_primitives
+
+/// @addtogroup dnnl_api_service
+/// @{
+
+/// Disable profiling completely
+#define DNNL_JIT_PROFILE_NONE 0u
+
+/// Enable VTune Profiler integration
+#define DNNL_JIT_PROFILE_VTUNE 1u
+
+/// Enable Linux perf integration via perfmap files
+#define DNNL_JIT_PROFILE_LINUX_PERFMAP 2u
+
+/// Enable Linux perf integration via jitdump files
+#define DNNL_JIT_PROFILE_LINUX_JITDUMP 4u
+
+/// Instruct Linux perf integration via jitdump files to use TSC. @ref
+/// DNNL_JIT_PROFILE_LINUX_JITDUMP must be set too for this to take effect.
+#define DNNL_JIT_PROFILE_LINUX_JITDUMP_USE_TSC 8u
+
+/// Enable Linux perf integration (both jitdump and perfmap)
+#define DNNL_JIT_PROFILE_LINUX_PERF \
+    (DNNL_JIT_PROFILE_LINUX_JITDUMP | DNNL_JIT_PROFILE_LINUX_PERFMAP)
+
+/// CPU instruction set flags.
+///
+/// The `dnnl_cpu_isa_avx512_core_fp16`, `dnnl_cpu_isa_avx512_core_amx` and
+/// `dnnl_cpu_isa_avx512_core_amx_fp16` enumerators are aliases: each one is
+/// defined to the value of the matching `dnnl_cpu_isa_avx10_1_512*`
+/// enumerator.
+typedef enum {
+    /// Library choice of ISA (excepting those listed as initial support)
+    dnnl_cpu_isa_default = 0x0,
+
+    /// Intel Streaming SIMD Extensions 4.1 (Intel SSE4.1)
+    dnnl_cpu_isa_sse41 = 0x1,
+
+    /// Intel Advanced Vector Extensions (Intel AVX)
+    dnnl_cpu_isa_avx = 0x3,
+
+    /// Intel Advanced Vector Extensions 2 (Intel AVX2)
+    dnnl_cpu_isa_avx2 = 0x7,
+
+    /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost) support
+    dnnl_cpu_isa_avx2_vnni = 0xf,
+
+    /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost)
+    /// with 8-bit integer, float16 and bfloat16 support
+    dnnl_cpu_isa_avx2_vnni_2 = 0x1f,
+
+    /// Intel AVX-512 subset for Intel Xeon Scalable processor family
+    /// and Intel Core processor family.
+    dnnl_cpu_isa_avx512_core = 0x27,
+
+    /// Intel AVX-512 and Intel Deep Learning Boost (Intel DL Boost) support
+    /// for Intel Xeon Scalable processor family
+    /// and Intel Core processor family.
+    dnnl_cpu_isa_avx512_core_vnni = 0x67,
+
+    /// Intel AVX-512, Intel DL Boost and bfloat16 support
+    /// for Intel Xeon Scalable processor family
+    /// and Intel Core processor family.
+    dnnl_cpu_isa_avx512_core_bf16 = 0xe7,
+
+    /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support
+    /// for Intel Xeon Scalable processor family
+    /// and Intel Core processor family.
+    // TODO: Align avx10_1 values to internal representation.
+    dnnl_cpu_isa_avx10_1_512 = 0x1ef,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512
+    dnnl_cpu_isa_avx512_core_fp16 = dnnl_cpu_isa_avx10_1_512,
+
+    /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and
+    /// Intel AMX with 8-bit integer and bfloat16 support
+    // TODO: Align avx10_1 values to internal representation.
+    dnnl_cpu_isa_avx10_1_512_amx = 0xfef,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512_amx
+    dnnl_cpu_isa_avx512_core_amx = dnnl_cpu_isa_avx10_1_512_amx,
+
+    /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and
+    /// Intel AMX with 8-bit integer, bfloat16 and float16 support
+    // TODO: Align avx10_1 values to internal representation.
+    dnnl_cpu_isa_avx10_1_512_amx_fp16 = 0x1fef,
+    /// @copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16
+    dnnl_cpu_isa_avx512_core_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16,
+} dnnl_cpu_isa_t;
+
+/// CPU ISA hints flags.
+typedef enum {
+    /// No hints (use default features). This is the zero value.
+    dnnl_cpu_isa_no_hints = 0x0,
+
+    /// Prefer to exclusively use Ymm registers for computations
+    dnnl_cpu_isa_prefer_ymm = 0x1,
+} dnnl_cpu_isa_hints_t;
+
+/// @} dnnl_api_service
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ONEAPI_DNNL_TYPES_H */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..43b4713a603e8cba86ed803e90f1b6375482d3a0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h
@@ -0,0 +1,342 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// ukernel C API
+
+#ifndef ONEAPI_DNNL_DNNL_UKERNEL_H
+#define ONEAPI_DNNL_DNNL_UKERNEL_H
+
+#include "oneapi/dnnl/dnnl.h"
+#include "oneapi/dnnl/dnnl_ukernel_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_ukernel
+/// @{
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// Creates a ukernel attributes memory storage.
+///
+/// @param attr_params Output ukernel attributes memory storage.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_create(
+        dnnl_ukernel_attr_params_t *attr_params);
+
+/// Sets post-operations arguments to a storage.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param post_ops_args A pointer to pointers of post_ops storages. Expected to
+///     be packed together.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_post_ops_args(
+        dnnl_ukernel_attr_params_t attr_params, const void **post_ops_args);
+
+/// Sets tensor A scales argument to a storage.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param a_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_A_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *a_scales);
+
+/// Sets tensor B scales argument to a storage.
+///
+/// If `dnnl_brgemm_set_B_scales` used mask of 2, then at least N values of
+/// selected data type are expected.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param b_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_B_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *b_scales);
+
+/// Sets tensor D scales argument to a storage.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param d_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_D_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *d_scales);
+
+/// Destroys a ukernel attributes memory storage.
+///
+/// @param attr_params Memory pointers storage object to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_destroy(
+        dnnl_ukernel_attr_params_t attr_params);
+
+/// @addtogroup dnnl_api_ukernel_brgemm
+/// @{
+
+/// Creates a BRGeMM ukernel object. Operates by the following formula:
+/// `C = [A x B]`.
+///
+/// @param brgemm Output BRGeMM ukernel object.
+/// @param M Dimension M of tensor A.
+/// @param N Dimension N of tensor B.
+/// @param K Dimension K of tensors A and B.
+/// @param batch_size Number of batches to process.
+/// @param lda Leading dimension of tensor A.
+/// @param ldb Leading dimension of tensor B.
+/// @param ldc Leading dimension of tensor C.
+/// @param a_dt Data type of tensor A.
+/// @param b_dt Data type of tensor B.
+/// @param c_dt Data type of tensor C. Must be dnnl_f32.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_create(dnnl_brgemm_t *brgemm, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t batch_size, dnnl_dim_t lda,
+        dnnl_dim_t ldb, dnnl_dim_t ldc, dnnl_data_type_t a_dt,
+        dnnl_data_type_t b_dt, dnnl_data_type_t c_dt);
+
+/// Sets adding an intermediate result to the output tensor C instead of
+/// writing: `C += [A x B]`.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param add_C Value to indicate addition. Can be `0` to skip addition, and
+///     `1` to apply addition.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_add_C(dnnl_brgemm_t brgemm, int add_C);
+
+/// Sets post-operations to a BRGeMM ukernel object: `D = post-operations(C)`.
+///
+/// Post-operations apply if one of the following holds:
+/// * Non-empty attributes are specified.
+/// * Output data type `d_dt` is different from accumulation data type `c_dt`.
+///
+/// If any of these conditions holds, the final call of the accumulation chain
+/// must be `dnnl_brgemm_execute_postops`; otherwise, it must be
+/// `dnnl_brgemm_execute`.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param ldd Leading dimension of tensor D.
+/// @param d_dt Data type of tensor D.
+/// @param post_ops Primitive post operations attribute to extend the kernel
+///     operations.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_post_ops(dnnl_brgemm_t brgemm,
+        dnnl_dim_t ldd, dnnl_data_type_t d_dt, const_dnnl_post_ops_t post_ops);
+
+/// Sets tensor A scales mask to a BRGeMM ukernel object.
+///
+/// For quantization flavor tensor A scales apply to accumulation buffer once C
+/// is ready.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param a_scale_mask Tensor A scale mask. Can be `0` only.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_A_scales(
+        dnnl_brgemm_t brgemm, int a_scale_mask);
+
+/// Sets tensor B scales mask to a BRGeMM ukernel object.
+///
+/// For quantization flavor tensor B scales apply to accumulation buffer once C
+/// is ready.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param b_scale_mask Tensor B scale mask. Can be `0` and `2` only.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_B_scales(
+        dnnl_brgemm_t brgemm, int b_scale_mask);
+
+/// Sets tensor D scales mask to a BRGeMM ukernel object.
+///
+/// For quantization flavor tensor D scales apply after all post-ops are
+/// applied.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param d_scale_mask Tensor D scale mask. Can be `0` only.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_D_scales(
+        dnnl_brgemm_t brgemm, int d_scale_mask);
+
+/// Finalizes initialization of a BRGeMM ukernel object.
+///
+/// This step is mandatory to query information from the object.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_finalize(dnnl_brgemm_t brgemm);
+
+/// Returns the packing type expected by a tensor B of a BRGeMM ukernel object.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param pack_type Output packing type. Can be `dnnl_brgemm_no_pack` if
+///     packing is not expected, and `dnnl_brgemm_pack_32`, otherwise.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type(
+        const_dnnl_brgemm_t brgemm, dnnl_pack_type_t *pack_type);
+
+/// Returns the size of a scratchpad memory needed for the BRGeMM ukernel
+/// object.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param size Output size of a buffer required for the BRGeMM ukernel object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_get_scratchpad_size(
+        const_dnnl_brgemm_t brgemm, size_t *size);
+
+/// Returns the flag indicating when the call to `dnnl_brgemm_execute_postops`
+/// is valid.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param valid The flag indicating if `dnnl_brgemm_execute_postops` is valid
+///     for a given ukernel object. `1` is for valid and `0`, otherwise.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_is_execute_postops_valid(
+        const_dnnl_brgemm_t brgemm, int *valid);
+
+/// Initializes the hardware-specific context. If no initialization required,
+/// returns the success status.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_hw_context(const_dnnl_brgemm_t brgemm);
+
+/// Releases the hardware-specific context. Must be used after all the execution
+/// calls to BRGeMM ukernel objects.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_release_hw_context();
+
+/// Generates an executable part of BRGeMM ukernel object.
+/// @param brgemm BRGeMM ukernel object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_generate(dnnl_brgemm_t brgemm);
+
+/// Executes a BRGeMM ukernel object.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param A_ptr Base pointer to a tensor A.
+/// @param B_ptr Base pointer to a tensor B.
+/// @param A_B_offsets Pointer to the set of tensor A and tensor B offsets for
+///     each batch; the set must be contiguous in memory. Single batch should
+///     supply offsets for both tensors A and B simultaneously. The number of
+///     batches must coincide with the `batch_size` value passed at the creation
+///     stage.
+/// @param C_ptr Pointer to a tensor C (accumulation buffer).
+/// @param scratchpad_ptr Pointer to a scratchpad buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm,
+        const void *A_ptr, const void *B_ptr, const dnnl_dim_t *A_B_offsets,
+        void *C_ptr, void *scratchpad_ptr);
+
+/// Executes a BRGeMM ukernel object with post operations.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param A Base pointer to a tensor A.
+/// @param B Base pointer to a tensor B.
+/// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for
+///     each batch. A set must be contiguous in memory. A single batch should
+///     supply offsets for both tensors A and B simultaneously. The number of
+///     batches must coincide with the `batch_size` value passed at the creation
+///     stage.
+/// @param C_ptr Pointer to a tensor C (accumulation buffer).
+/// @param D_ptr Pointer to a tensor D (output buffer).
+/// @param scratchpad_ptr Pointer to a scratchpad buffer.
+/// @param attr_params Ukernel attributes memory storage.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_execute_postops(const_dnnl_brgemm_t brgemm,
+        const void *A, const void *B, const dnnl_dim_t *A_B_offsets,
+        const void *C_ptr, void *D_ptr, void *scratchpad_ptr,
+        const_dnnl_ukernel_attr_params_t attr_params);
+
+/// Destroys a BRGeMM ukernel object.
+///
+/// @param brgemm BRGeMM ukernel object to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_destroy(dnnl_brgemm_t brgemm);
+
+/// Creates a transform object.
+///
+/// @param transform Output transform object.
+/// @param K Dimension K.
+/// @param N Dimension N.
+/// @param in_pack_type Input packing type. Must be one of
+///     `dnnl_pack_type_no_trans`, or `dnnl_pack_type_trans`.
+/// @param in_ld Input leading dimension.
+/// @param out_ld Output leading dimension. When packing data, it specifies a
+///     block by N dimension.
+/// @param in_dt Input data type.
+/// @param out_dt Output data type.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_transform_create(dnnl_transform_t *transform,
+        dnnl_dim_t K, dnnl_dim_t N, dnnl_pack_type_t in_pack_type,
+        dnnl_dim_t in_ld, dnnl_dim_t out_ld, dnnl_data_type_t in_dt,
+        dnnl_data_type_t out_dt);
+
+/// Generates an executable part of transform object.
+/// @param transform Transform object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_transform_generate(dnnl_transform_t transform);
+
+/// Executes a transform object.
+///
+/// @param transform Transform object.
+/// @param in_ptr Pointer to an input buffer.
+/// @param out_ptr Pointer to an output buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_transform_execute(
+        const_dnnl_transform_t transform, const void *in_ptr, void *out_ptr);
+
+/// Destroys a transform object.
+///
+/// @param transform Transform object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_transform_destroy(dnnl_transform_t transform);
+
+/// @} dnnl_api_ukernel_brgemm
+
+#endif
+
+/// @} dnnl_api_ukernel
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ONEAPI_DNNL_DNNL_UKERNEL_H */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f2a8ccf57ccb3be88d7902a57908f17b304330b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp
@@ -0,0 +1,470 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// ukernel C++ API
+
+#ifndef ONEAPI_DNNL_DNNL_UKERNEL_HPP
+#define ONEAPI_DNNL_DNNL_UKERNEL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_ukernel.h"
+
+/// @addtogroup dnnl_api oneDNN API
+/// @{
+
+/// oneDNN namespace
+namespace dnnl {
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// @addtogroup dnnl_api_utils
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+template <>
+struct handle_traits<dnnl_brgemm_t> { // Traits specialization for BRGeMM ukernel handles.
+    static dnnl_status_t destructor(dnnl_brgemm_t p) {
+        return dnnl_brgemm_destroy(p); // Forwards to the C API destroy call and returns its status.
+    }
+};
+
+template <>
+struct handle_traits<dnnl_transform_t> { // Traits specialization for transform handles.
+    static dnnl_status_t destructor(dnnl_transform_t p) {
+        return dnnl_transform_destroy(p); // Forwards to the C API destroy call and returns its status.
+    }
+};
+
+template <>
+struct handle_traits<dnnl_ukernel_attr_params_t> { // Traits specialization for ukernel attribute-parameter handles.
+    static dnnl_status_t destructor(dnnl_ukernel_attr_params_t p) {
+        return dnnl_ukernel_attr_params_destroy(p); // Forwards to the C API destroy call and returns its status.
+    }
+};
+
+/// @endcond
+
+/// @} dnnl_api_utils
+
+#endif
+
+/// @addtogroup dnnl_api_ukernel Ukernels
+/// Collection of ukernels
+/// @{
+
+/// ukernel namespace
+namespace ukernel {
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// @addtogroup dnnl_api_ukernel_utils ukernel utils
+/// ukernel utility functions
+/// @{
+
+/// Packing specification; each enumerator equals its `dnnl_pack_type_*` C counterpart.
+enum class pack_type {
+    /// Undefined pack type. A guard value.
+    undef = dnnl_pack_type_undef,
+    /// Plain, not transposed layout. Similar to format_tag::ab.
+    no_trans = dnnl_pack_type_no_trans,
+    /// Plain, transposed layout. Similar to format_tag::ba.
+    trans = dnnl_pack_type_trans,
+    /// Layout packed by 32 bits along the K dimension.
+    pack32 = dnnl_pack_type_pack32,
+};
+
+/// Ukernel attributes memory storage (C++ wrapper over `dnnl_ukernel_attr_params_t`).
+struct attr_params : public handle<dnnl_ukernel_attr_params_t> {
+    /// Constructs a ukernel attributes memory storage.
+    attr_params() {
+        dnnl_ukernel_attr_params_t c_params = nullptr;
+        dnnl_status_t status = dnnl_ukernel_attr_params_create(&c_params);
+        error::wrap_c_api(
+                status, "could not create an attributes memory storage");
+        reset(c_params); // Store the newly created C handle in this wrapper.
+    }
+
+    /// Sets post-operations arguments to a storage.
+    ///
+    /// @param post_ops_args Pointer to pointers of post_ops storages.
+    ///     Expected to be packed together.
+    void set_post_ops_args(const void **post_ops_args) {
+        dnnl_status_t status = dnnl_ukernel_attr_params_set_post_ops_args(
+                get(), post_ops_args);
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not set post operations arguments");
+    }
+
+    /// Sets tensor A scales arguments to a storage.
+    ///
+    /// @param a_scales Pointer to scales storage.
+    void set_A_scales(const void *a_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_A_scales(get(), a_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set A scales argument");
+    }
+
+    /// Sets tensor B scales arguments to a storage.
+    ///
+    /// If @ref brgemm::set_B_scales was called with a mask of 2, then at
+    /// least N values of the selected data type are expected.
+    ///
+    /// @param b_scales Pointer to scales storage.
+    void set_B_scales(const void *b_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_B_scales(get(), b_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set B scales argument");
+    }
+
+    /// Sets tensor D scales arguments to a storage.
+    ///
+    /// @param d_scales Pointer to scales storage.
+    void set_D_scales(const void *d_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_D_scales(get(), d_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set D scales argument");
+    }
+};
+/// @} dnnl_api_ukernel_utils
+
+/// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel
+/// BRGeMM ukernel routines
+/// @{
+
+/// BRGeMM ukernel (C++ wrapper over the `dnnl_brgemm_t` C handle).
+struct brgemm : public handle<dnnl_brgemm_t> {
+    /// Default constructor. Produces an empty object.
+    brgemm() = default;
+
+    /// Constructs a BRGeMM ukernel object. Operates by the following formula:
+    /// `C = [A x B]`.
+    ///
+    /// @param M Dimension M of tensor A.
+    /// @param N Dimension N of tensor B.
+    /// @param K Dimension K of tensors A and B.
+    /// @param batch_size Number of batches to process.
+    /// @param lda Leading dimension of tensor A.
+    /// @param ldb Leading dimension of tensor B.
+    /// @param ldc Leading dimension of tensor C.
+    /// @param a_dt Data type of tensor A.
+    /// @param b_dt Data type of tensor B.
+    /// @param c_dt Data type of tensor C.
+    /// @param allow_empty A flag signifying whether construction is
+    ///     allowed to fail without throwing an exception. In this case an
+    ///     empty object will be produced. This flag is optional and
+    ///     defaults to false.
+    brgemm(memory::dim M, memory::dim N, memory::dim K, memory::dim batch_size,
+            memory::dim lda, memory::dim ldb, memory::dim ldc,
+            memory::data_type a_dt, memory::data_type b_dt,
+            memory::data_type c_dt, bool allow_empty = false) {
+
+        dnnl_brgemm_t brgemm = nullptr;
+        dnnl_status_t status = dnnl_brgemm_create(&brgemm, M, N, K, batch_size,
+                lda, ldb, ldc, memory::convert_to_c(a_dt),
+                memory::convert_to_c(b_dt), memory::convert_to_c(c_dt));
+
+        if (!allow_empty) // With allow_empty set, the creation status is not checked here.
+            error::wrap_c_api(
+                    status, "could not create a BRGeMM ukernel object");
+        reset(brgemm); // Stores the handle; it is still nullptr if creation did not set it.
+    }
+
+    /// Sets adding an intermediate result to the output tensor C instead of
+    /// writing: `C += [A x B]`.
+    ///
+    /// @param add_C Value to indicate addition. `false` to skip addition, and
+    ///     `true` to apply addition.
+    void set_add_C(bool add_C) {
+        dnnl_status_t status
+                = dnnl_brgemm_set_add_C(get(), static_cast<int>(add_C));
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set add_C attribute");
+    }
+
+    /// Sets post-operations to a BRGeMM ukernel object:
+    /// `D = post-operations(C)`.
+    ///
+    /// Post-operations apply if one of the following holds:
+    /// * Non-empty post-operations are specified.
+    /// * Output data type `d_dt` is different from accumulation data type
+    ///     `c_dt`.
+    ///
+    /// @param ldd Leading dimension of tensor D.
+    /// @param d_dt Data type of tensor D.
+    /// @param po Primitive post-operation attributes to extend the kernel
+    ///     operations.
+    void set_post_ops(memory::dim ldd, memory::data_type d_dt,
+            const post_ops &po = default_post_ops()) {
+        dnnl_status_t status = dnnl_brgemm_set_post_ops(
+                get(), ldd, memory::convert_to_c(d_dt), po.get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set post operations");
+    }
+
+    /// Sets tensor A scales mask to a BRGeMM ukernel object.
+    ///
+    /// For quantization flavor tensor A scales apply to accumulation buffer
+    /// once C is ready.
+    ///
+    /// @param a_scale_mask Tensor A scale mask. Can be `0` only.
+    void set_A_scales(int a_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_A_scales(get(), a_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set A scales");
+    }
+
+    /// Sets tensor B scales mask to a BRGeMM ukernel object.
+    ///
+    /// For quantization flavor tensor B scales apply to accumulation buffer
+    /// once C is ready.
+    ///
+    /// @param b_scale_mask Tensor B scale mask. Can be `0` or `2` only.
+    void set_B_scales(int b_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_B_scales(get(), b_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set B scales");
+    }
+
+    /// Sets tensor D scales mask to a BRGeMM ukernel object.
+    ///
+    /// For quantization flavor tensor D scales apply after all post-ops are
+    /// applied.
+    ///
+    /// @param d_scale_mask Tensor D scale mask. Can be `0` only.
+    void set_D_scales(int d_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_D_scales(get(), d_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set D scales");
+    }
+
+    /// Finalizes initialization of a BRGeMM ukernel object.
+    ///
+    /// This step must be performed prior to querying information from the
+    /// object.
+    void finalize() {
+        dnnl_status_t status = dnnl_brgemm_finalize(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not finalize an object");
+    }
+
+    /// Returns the packing type expected by a tensor B of a BRGeMM ukernel
+    /// object.
+    pack_type get_B_pack_type() const {
+        dnnl_pack_type_t c_pack_type;
+        dnnl_status_t status = dnnl_brgemm_get_B_pack_type(get(), &c_pack_type);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not query B pack type");
+
+        return static_cast<pack_type>(c_pack_type); // Converts the C enum value to the C++ enum class.
+    }
+
+    /// Returns the size of a scratchpad memory needed for the BRGeMM ukernel
+    /// object.
+    size_t get_scratchpad_size() const {
+        size_t size;
+        dnnl_status_t status = dnnl_brgemm_get_scratchpad_size(get(), &size);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not query a scratchpad size from a BRGeMM ukernel "
+                    "object");
+        return size;
+    }
+
+    /// Returns the flag indicating whether the call to execute with post
+    /// operations is valid.
+    ///
+    /// Returns `true` for a valid call and `false` otherwise.
+    bool is_execute_postops_valid() const {
+        int valid;
+        dnnl_status_t status
+                = dnnl_brgemm_is_execute_postops_valid(get(), &valid);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not query a flag for execute postops from a BRGeMM "
+                    "ukernel object");
+        return static_cast<bool>(valid); // The C API reports the flag as an int.
+    }
+
+    /// Initializes the hardware-specific context. Affects the global state for
+    /// all BRGeMM ukernel objects. If no initialization is required, returns.
+    void set_hw_context() const {
+        dnnl_status_t status = dnnl_brgemm_set_hw_context(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set hardware context");
+    }
+
+    /// Releases the hardware-specific context. Affects the global state for
+    /// all BRGeMM ukernel objects. Must be used after all the execution calls
+    /// to BRGeMM ukernel objects.
+    static void release_hw_context() {
+        dnnl_status_t status = dnnl_brgemm_release_hw_context();
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not release hardware context");
+    }
+
+    /// Generates the executable part of a BRGeMM ukernel object.
+    void generate() {
+        dnnl_status_t status = dnnl_brgemm_generate(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not generate a kernel");
+    }
+
+    /// Executes a BRGeMM ukernel object.
+    ///
+    /// @param A Base pointer to a tensor A.
+    /// @param B Base pointer to a tensor B.
+    /// @param A_B_offsets Vector of pairs of tensors A and B offsets for
+    ///     each batch. The number of batches must coincide with the
+    ///     `batch_size` value passed at object construction stage.
+    /// @param C Pointer to a tensor C (accumulation buffer).
+    /// @param scratchpad Pointer to a scratchpad buffer.
+    void execute(const void *A, const void *B,
+            const std::vector<std::pair<memory::dim, memory::dim>> &A_B_offsets,
+            void *C, void *scratchpad) const {
+        // TODO: export batch_element to C API later for user to fill it and
+        // pass directly to the call.
+        dnnl_status_t status = dnnl_brgemm_execute(get(), A, B,
+                (const dnnl_dim_t *)A_B_offsets.data(), C, scratchpad); // The pair storage is passed reinterpreted as a flat dnnl_dim_t array.
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not execute a BRGeMM ukernel object");
+    }
+
+    /// Executes a BRGeMM ukernel object with post operations.
+    ///
+    /// @param A Base pointer to a tensor A.
+    /// @param B Base pointer to a tensor B.
+    /// @param A_B_offsets Vector of pairs of tensors A and B offsets for
+    ///     each batch. The number of batches must coincide with the
+    ///     `batch_size` value passed at object construction stage.
+    /// @param C Pointer to a tensor C (accumulation buffer).
+    /// @param D Pointer to a tensor D (output buffer).
+    /// @param scratchpad Pointer to a scratchpad buffer.
+    /// @param params Post-op memory arguments. Must be passed if binary
+    ///     post-op or scales were set.
+    void execute(const void *A, const void *B,
+            const std::vector<std::pair<memory::dim, memory::dim>> &A_B_offsets,
+            const void *C, void *D, void *scratchpad,
+            const attr_params &params = default_attr_params()) const {
+        // TODO: export batch_element to C API later for user to fill it and
+        // pass directly to the call.
+        dnnl_status_t status = dnnl_brgemm_execute_postops(get(), A, B,
+                (const dnnl_dim_t *)A_B_offsets.data(), C, D, scratchpad, // The pair storage is passed reinterpreted as a flat dnnl_dim_t array.
+                params.get());
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not execute a BRGeMM ukernel object");
+    }
+
+    /// Returns a constant reference to a static instance of default constructed
+    /// primitive post-operations attribute.
+    static const post_ops &default_post_ops() {
+        static const post_ops po; // Function-local static: one shared instance, built on first call.
+        return po;
+    }
+
+    /// Returns a constant reference to a static instance of default constructed
+    /// ukernel attributes parameters.
+    static const attr_params &default_attr_params() {
+        static const attr_params ap; // Function-local static: one shared instance, built on first call.
+        return ap;
+    }
+};
+/// @} dnnl_api_ukernel_brgemm
+
+/// @addtogroup dnnl_api_ukernel_transform Transform ukernel
+/// Transform routines
+/// @{
+
+/// Transform ukernel (C++ wrapper over the `dnnl_transform_t` C handle).
+struct transform : public handle<dnnl_transform_t> {
+    /// Default constructor. Produces an empty object.
+    transform() = default;
+
+    /// Constructs a transform object.
+    ///
+    /// @param K Dimension K.
+    /// @param N Dimension N.
+    /// @param in_pack_type Input packing type. Must be one of
+    ///     `pack_type::no_trans` or `pack_type::trans`.
+    /// @param in_ld Input leading dimension.
+    /// @param out_ld Output leading dimension. Specifies a block by N dimension
+    ///     during data packing.
+    /// @param in_dt Input data type.
+    /// @param out_dt Output data type.
+    /// @param allow_empty A flag signifying whether construction is
+    ///     allowed to fail without throwing an exception. In this case an
+    ///     empty object will be produced. This flag is optional and
+    ///     defaults to false.
+    transform(memory::dim K, memory::dim N, pack_type in_pack_type,
+            memory::dim in_ld, memory::dim out_ld, memory::data_type in_dt,
+            memory::data_type out_dt, bool allow_empty = false) {
+
+        dnnl_transform_t transform = nullptr;
+        dnnl_status_t status = dnnl_transform_create(&transform, K, N,
+                static_cast<dnnl_pack_type_t>(in_pack_type), in_ld, out_ld,
+                memory::convert_to_c(in_dt), memory::convert_to_c(out_dt));
+
+        if (!allow_empty) // With allow_empty set, the creation status is not checked here.
+            error::wrap_c_api(status,
+                    "could not create a BRGeMM ukernel packing B object");
+        reset(transform); // Stores the handle; it is still nullptr if creation did not set it.
+    }
+
+    /// Generates the executable part of a transform object.
+    void generate() {
+        dnnl_status_t status = dnnl_transform_generate(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not generate a BRGeMM ukernel packing B object");
+    }
+
+    /// Executes a transform object.
+    ///
+    /// @param in Pointer to an input buffer.
+    /// @param out Pointer to an output buffer.
+    void execute(const void *in, void *out) const {
+        dnnl_status_t status = dnnl_transform_execute(get(), in, out);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not execute a BRGeMM ukernel packing B object");
+    }
+};
+
+/// @} dnnl_api_ukernel_transform
+
+#endif
+
+} // namespace ukernel
+
+/// @} dnnl_api_ukernel
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif /* ONEAPI_DNNL_DNNL_UKERNEL_HPP */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1588d4dcf2347dd9752e9435b343f4a17c98923
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h
@@ -0,0 +1,98 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// ukernel C API types definitions
+
+#ifndef ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H
+#define ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "oneapi/dnnl/dnnl_types.h"
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_ukernel
+/// @{
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// Packing specification. Values are numbered consecutively starting from 0.
+typedef enum {
+    /// Undefined pack type. A guard value.
+    dnnl_pack_type_undef = 0,
+    /// Plain, not transposed layout. Similar to format_tag::ab.
+    dnnl_pack_type_no_trans,
+    /// Plain, transposed layout. Similar to format_tag::ba.
+    dnnl_pack_type_trans,
+    /// Layout packed by 32 bits along the K dimension.
+    dnnl_pack_type_pack32,
+} dnnl_pack_type_t;
+
+/// @struct dnnl_ukernel_attr_params
+/// An opaque structure to describe ukernel attributes memory storage.
+struct dnnl_ukernel_attr_params;
+
+/// A ukernel attributes memory storage handle.
+typedef struct dnnl_ukernel_attr_params *dnnl_ukernel_attr_params_t;
+
+/// A constant ukernel attributes memory storage handle.
+typedef const struct dnnl_ukernel_attr_params *const_dnnl_ukernel_attr_params_t;
+
+/// @addtogroup dnnl_api_ukernel_brgemm
+/// @{
+
+/// @struct dnnl_brgemm
+/// An opaque structure to describe a brgemm ukernel.
+struct dnnl_brgemm;
+
+/// A brgemm ukernel handle.
+typedef struct dnnl_brgemm *dnnl_brgemm_t;
+
+/// A constant brgemm ukernel handle.
+typedef const struct dnnl_brgemm *const_dnnl_brgemm_t;
+
+/// @struct dnnl_transform
+/// An opaque structure to describe a transform routine.
+struct dnnl_transform;
+
+/// A transform routine handle.
+typedef struct dnnl_transform *dnnl_transform_t;
+
+/// A constant transform routine handle.
+typedef const struct dnnl_transform *const_dnnl_transform_t;
+
+/// @} dnnl_api_ukernel_brgemm
+#endif
+
+/// @} dnnl_api_ukernel
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H */
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b7c394258523af0fb6c11ae74deb36d3dfe138a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version.h
@@ -0,0 +1,38 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2019-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_VERSION_H
+#define ONEAPI_DNNL_DNNL_VERSION_H
+
+// clang-format off
+
+/// Major version
+#define DNNL_VERSION_MAJOR 3
+
+/// Minor version
+#define DNNL_VERSION_MINOR 7
+
+/// Patch version
+#define DNNL_VERSION_PATCH 1
+
+// clang-format on
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bfe42dbda1bf80004de23cc501e7874d87178ca
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h
@@ -0,0 +1,36 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_VERSION_HASH_H
+#define ONEAPI_DNNL_DNNL_VERSION_HASH_H
+
+// clang-format off
+
+/// Note: this macro and header file were moved to a separate instance to avoid
+/// incremental build issues as moving from commit to commit would trigger a
+/// complete library rebuild. Including a generated header file in a single
+/// translation unit makes this problem go away.
+/// Git commit hash
+#define DNNL_VERSION_HASH  "8d263e693366ef8db40acc569cc7d8edf644556d"
+
+// clang-format on
+
+#endif
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h
new file mode 100644
index 0000000000000000000000000000000000000000..78db14a6384f3d54e6c70420fda861b395bbe105
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/cpp_conduit.h
@@ -0,0 +1,80 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2024 The pybind Community.
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+#include "internals.h"
+
+#include <typeinfo>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Forward declaration needed here: Refactoring opportunity.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *);
+
+inline bool type_is_managed_by_our_internals(PyTypeObject *type_obj) { // Reports whether `type_obj` is recognized as a type created through these pybind11 internals.
+#if defined(PYPY_VERSION)
+    auto &internals = get_internals(); // PyPy branch: look the type up in the registry of Python types.
+    return bool(internals.registered_types_py.find(type_obj)
+                != internals.registered_types_py.end());
+#else
+    return bool(type_obj->tp_new == pybind11_object_new); // Other interpreters: identify the type by its tp_new slot.
+#endif
+}
+
+inline bool is_instance_method_of_type(PyTypeObject *type_obj, PyObject *attr_name) { // True if the type-level lookup of `attr_name` yields an instance-method object.
+    PyObject *descr = _PyType_Lookup(type_obj, attr_name); // Lookup on the type; the result is not decref'd in this function.
+    return bool((descr != nullptr) && PyInstanceMethod_Check(descr));
+}
+
+inline object try_get_cpp_conduit_method(PyObject *obj) { // Returns obj's `_pybind11_conduit_v1_` attribute, or an empty object if unavailable.
+    if (PyType_Check(obj)) { // Type objects themselves are never queried for the conduit method.
+        return object();
+    }
+    PyTypeObject *type_obj = Py_TYPE(obj);
+    str attr_name("_pybind11_conduit_v1_");
+    bool assumed_to_be_callable = false;
+    if (type_is_managed_by_our_internals(type_obj)) {
+        if (!is_instance_method_of_type(type_obj, attr_name.ptr())) { // For our own types, only an instance method is accepted.
+            return object();
+        }
+        assumed_to_be_callable = true; // Lets the PyCallable_Check below be skipped.
+    }
+    PyObject *method = PyObject_GetAttr(obj, attr_name.ptr());
+    if (method == nullptr) {
+        PyErr_Clear(); // A failed attribute lookup is not an error here; discard the Python exception.
+        return object();
+    }
+    if (!assumed_to_be_callable && PyCallable_Check(method) == 0) {
+        Py_DECREF(method); // Drop the reference from PyObject_GetAttr, since the attribute is not returned.
+        return object();
+    }
+    return reinterpret_steal<object>(method); // The returned object takes over the reference from PyObject_GetAttr.
+}
+
+inline void *try_raw_pointer_ephemeral_from_cpp_conduit(handle src, // Returns the pointer from src's conduit capsule, or nullptr if none is obtained.
+                                                        const std::type_info *cpp_type_info) {
+    object method = try_get_cpp_conduit_method(src.ptr());
+    if (method) {
+        capsule cpp_type_info_capsule(const_cast<void *>(static_cast<const void *>(cpp_type_info)), // Wraps the type_info pointer in a capsule to pass it to the method.
+                                      typeid(std::type_info).name());
+        object cpp_conduit = method(bytes(PYBIND11_PLATFORM_ABI_ID), // Call arguments: platform ABI id, type_info capsule, request kind.
+                                    cpp_type_info_capsule,
+                                    bytes("raw_pointer_ephemeral"));
+        if (isinstance<capsule>(cpp_conduit)) { // Any non-capsule result falls through to the nullptr return.
+            return reinterpret_borrow<capsule>(cpp_conduit).get_pointer();
+        }
+    }
+    return nullptr;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h
new file mode 100644
index 0000000000000000000000000000000000000000..908aa703741a1f127efa8918c67a5d74064010e3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h
@@ -0,0 +1,44 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2021 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+#pragma once
+
+#include "common.h"
+
+#include <type_traits>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename To, typename From, typename SFINAE = void>
+struct dynamic_raw_ptr_cast_is_possible : std::false_type {}; // Primary template: the cast is not possible by default.
+
+template <typename To, typename From>
+struct dynamic_raw_ptr_cast_is_possible<
+    To,
+    From,
+    detail::enable_if_t<!std::is_same<To, void>::value && std::is_polymorphic<From>::value>> // Selected when To is not void and From is polymorphic.
+    : std::true_type {};
+
+template <typename To,
+          typename From,
+          detail::enable_if_t<!dynamic_raw_ptr_cast_is_possible<To, From>::value, int> = 0> // Overload chosen when the trait is false.
+To *dynamic_raw_ptr_cast_if_possible(From * /*ptr*/) {
+    return nullptr; // No dynamic_cast is attempted; the argument is ignored.
+}
+
+template <typename To,
+          typename From,
+          detail::enable_if_t<dynamic_raw_ptr_cast_is_possible<To, From>::value, int> = 0> // Overload chosen when the trait is true.
+To *dynamic_raw_ptr_cast_if_possible(From *ptr) {
+    return dynamic_cast<To *>(ptr); // Returns whatever dynamic_cast yields for `ptr`.
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee9149931607cf8d448ff75432b456090add8eb4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/function_record_pyobject.h
@@ -0,0 +1,196 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2024-2025 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+// For background see the description of PR google/pybind11clif#30099.
+
+#pragma once
+
+#include <pybind11/attr.h>
+#include <pybind11/conduit/pybind11_platform_abi_id.h>
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+
+#include <cstring>
+#include <utility>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+struct function_record_PyObject { // Python object layout that carries a function_record pointer.
+    PyObject_HEAD // Standard CPython object header; must come first.
+    function_record *cpp_func_rec; // Set to nullptr by function_record_PyObject_New().
+};
+
+PYBIND11_NAMESPACE_BEGIN(function_record_PyTypeObject_methods)
+
+PyObject *tp_new_impl(PyTypeObject *type, PyObject *args, PyObject *kwds); // Py_tp_new slot.
+PyObject *tp_alloc_impl(PyTypeObject *type, Py_ssize_t nitems);            // Py_tp_alloc slot.
+int tp_init_impl(PyObject *self, PyObject *args, PyObject *kwds);          // Py_tp_init slot.
+void tp_dealloc_impl(PyObject *self);                                      // Py_tp_dealloc slot.
+void tp_free_impl(void *self);                                             // Py_tp_free slot.
+
+static PyObject *reduce_ex_impl(PyObject *self, PyObject *, PyObject *); // Backs __reduce_ex__.
+
+static PyMethodDef tp_methods_impl[]
+    = {{"__reduce_ex__",
+        // reduce_ex_impl is a PyCFunctionWithKeywords, but PyMethodDef
+        // requires a PyCFunction. The cast through void* is safe and
+        // idiomatic with METH_KEYWORDS, and it successfully sidesteps
+        // unhelpful compiler warnings.
+        // NOLINTNEXTLINE(bugprone-casting-through-void)
+        reinterpret_cast<PyCFunction>(reinterpret_cast<void *>(reduce_ex_impl)),
+        METH_VARARGS | METH_KEYWORDS,
+        nullptr}, // ml_doc: no docstring.
+       {nullptr, nullptr, 0, nullptr}}; // Sentinel entry terminating the method table.
+
+// Python 3.12+ emits a DeprecationWarning for heap types whose tp_name does
+// not contain a dot ('.') and that lack a __module__ attribute. For pybind11's
+// internal function_record type, we do not have an actual module object to
+// attach, so we cannot use PyType_FromModuleAndSpec (introduced in Python 3.9)
+// to set __module__ automatically.
+//
+// As a workaround, we define a "qualified" type name that includes a dummy
+// module name (PYBIND11_DUMMY_MODULE_NAME). This is non-idiomatic but avoids
+// the deprecation warning, and results in reprs like
+//
+//     <class 'pybind11_builtins.pybind11_detail_function_record_...'>
+//
+// even though no real pybind11_builtins module exists. If pybind11 gains an
+// actual module object in the future, this code should switch to
+// PyType_FromModuleAndSpec for Python 3.9+ and drop the dummy module
+// workaround.
+//
+// Note that this name is versioned.
+#define PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME                                              \
+    "pybind11_detail_function_record_" PYBIND11_DETAIL_FUNCTION_RECORD_ABI_ID                     \
+    "_" PYBIND11_PLATFORM_ABI_ID
+constexpr char tp_plainname_impl[] = PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME;
+constexpr char tp_qualname_impl[]
+    = PYBIND11_DUMMY_MODULE_NAME "." PYBIND11_DETAIL_FUNCTION_RECORD_TP_PLAINNAME;
+
+PYBIND11_NAMESPACE_END(function_record_PyTypeObject_methods)
+
+static PyType_Slot function_record_PyType_Slots[] = { // Slot table consumed by PyType_FromSpec.
+    {Py_tp_dealloc,
+     reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_dealloc_impl)},
+    {Py_tp_methods,
+     reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_methods_impl)},
+    {Py_tp_init, reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_init_impl)},
+    {Py_tp_alloc, reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_alloc_impl)},
+    {Py_tp_new, reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_new_impl)},
+    {Py_tp_free, reinterpret_cast<void *>(function_record_PyTypeObject_methods::tp_free_impl)},
+    {0, nullptr}}; // Sentinel terminating the slot array.
+
+static PyType_Spec function_record_PyType_Spec
+    = {function_record_PyTypeObject_methods::tp_qualname_impl, // name (dummy-module qualified)
+       sizeof(function_record_PyObject),                       // basicsize
+       0,                                                      // itemsize
+       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE,               // flags
+       function_record_PyType_Slots};                          // slots
+
+inline PyTypeObject *get_function_record_PyTypeObject() {
+    PyTypeObject *&cached_type = detail::get_local_internals().function_record_py_type;
+    if (cached_type != nullptr) {
+        return cached_type; // Reuse the type object already stored in the local internals.
+    }
+    PyObject *new_type = PyType_FromSpec(&function_record_PyType_Spec);
+    if (new_type == nullptr) {
+        throw error_already_set(); // Propagate the Python error raised by PyType_FromSpec.
+    }
+    return cached_type = reinterpret_cast<PyTypeObject *>(new_type); // Store, then hand out.
+}
+
+inline bool is_function_record_PyObject(PyObject *obj) {
+    // A type object is never itself treated as a function_record instance.
+    if (PyType_Check(obj) != 0) {
+        return false;
+    }
+    PyTypeObject *const actual_type = Py_TYPE(obj);
+    const char *const actual_name = actual_type->tp_name;
+    namespace methods = function_record_PyTypeObject_methods;
+
+    // Fast path (pointer comparison).
+    if (actual_type == get_function_record_PyTypeObject()) {
+        return true;
+    }
+    // Slow path: compare type names, which works across extension modules.
+    // Note that tp_name is versioned; the qualified and the plain spelling
+    // are both accepted.
+    const bool qualname_matches = strcmp(actual_name, methods::tp_qualname_impl) == 0;
+    const bool plainname_matches = strcmp(actual_name, methods::tp_plainname_impl) == 0;
+    return qualname_matches || plainname_matches;
+}
+
+inline function_record *function_record_ptr_from_PyObject(PyObject *obj) {
+    if (!is_function_record_PyObject(obj)) {
+        return nullptr; // Not a function_record wrapper object.
+    }
+    return reinterpret_cast<detail::function_record_PyObject *>(obj)->cpp_func_rec;
+}
+
+inline object function_record_PyObject_New() {
+    auto *wrapper = PyObject_New(function_record_PyObject, get_function_record_PyTypeObject());
+    if (!wrapper) {
+        throw error_already_set(); // Allocation failed; a Python error is expected to be set.
+    }
+    wrapper->cpp_func_rec = nullptr; // For clarity/purity. Redundant in practice.
+    return reinterpret_steal<object>(reinterpret_cast<PyObject *>(wrapper));
+}
+
+PYBIND11_NAMESPACE_BEGIN(function_record_PyTypeObject_methods)
+
+// Guard against accidents & oversights, in particular when porting to future Python versions:
+inline PyObject *tp_new_impl(PyTypeObject *, PyObject *, PyObject *) { // All arguments ignored.
+    pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_new_impl");
+    // Deliberately no `return nullptr;` here: it would be unreachable.
+}
+
+inline PyObject *tp_alloc_impl(PyTypeObject *, Py_ssize_t) { // All arguments ignored.
+    pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_alloc_impl");
+    // Deliberately no `return nullptr;` here: it would be unreachable.
+}
+
+inline int tp_init_impl(PyObject *, PyObject *, PyObject *) { // All arguments ignored.
+    pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_init_impl");
+    // Deliberately no `return -1;` here: it would be unreachable.
+}
+
+inline void tp_free_impl(void *) { // Argument ignored.
+    pybind11_fail("UNEXPECTED CALL OF function_record_PyTypeObject_methods::tp_free_impl");
+}
+
+inline PyObject *reduce_ex_impl(PyObject *self, PyObject *, PyObject *) {
+    // Deliberately ignoring the arguments for simplicity (expected is `protocol: int`).
+    const function_record *rec = function_record_ptr_from_PyObject(self);
+    if (rec == nullptr) { // self is not a function_record object, or carries no record.
+        pybind11_fail(
+            "FATAL: function_record_PyTypeObject reduce_ex_impl(): cannot obtain cpp_func_rec.");
+    }
+    if (rec->name != nullptr && rec->name[0] != '\0' && rec->scope
+        && PyModule_Check(rec->scope.ptr()) != 0) { // Needs a non-empty name and a module scope.
+        object scope_module = get_scope_module(rec->scope);
+        if (scope_module) {
+            auto builtins = reinterpret_borrow<dict>(PyEval_GetBuiltins());
+            auto builtins_eval = builtins["eval"];
+            auto reconstruct_args = make_tuple(str("__import__('importlib').import_module('")
+                                               + scope_module + str("')"));
+            return make_tuple(std::move(builtins_eval), std::move(reconstruct_args)) // (eval, (expr,))
+                .release() // Give up ownership so the reference survives this scope.
+                .ptr();
+        }
+    }
+    set_error(PyExc_RuntimeError, repr(self) + str(" is not pickleable."));
+    return nullptr; // Signals failure; the Python error was set on the line above.
+}
+
+PYBIND11_NAMESPACE_END(function_record_PyTypeObject_methods)
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h
new file mode 100644
index 0000000000000000000000000000000000000000..384902ccbc2c58505936d77ccbccb06da83e4d32
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/init.h
@@ -0,0 +1,543 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+    pybind11/detail/init.h: init factory function implementation and support code.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "class.h"
+#include "using_smart_holder.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <>
+class type_caster<value_and_holder> { // Caster for the value_and_holder & lambda parameters below.
+public:
+    bool load(handle h, bool) { // The second (unnamed) argument is ignored.
+        value = reinterpret_cast<value_and_holder *>(h.ptr()); // Pointer is reinterpreted as-is.
+        return true; // Always reports success; no checking is performed.
+    }
+
+    template <typename>
+    using cast_op_type = value_and_holder &;
+    explicit operator value_and_holder &() { return *value; } // Requires a prior load().
+    static constexpr auto name = const_name<value_and_holder>();
+
+private:
+    value_and_holder *value = nullptr; // Populated by load().
+};
+
+PYBIND11_NAMESPACE_BEGIN(initimpl)
+
+inline void no_nullptr(const void *ptr) {
+    if (ptr == nullptr) { // A factory must hand back a real object.
+        throw type_error("pybind11::init(): factory function returned nullptr");
+    }
+}
+
+// Implementing functions for all forms of py::init<...> and py::init(...)
+template <typename Class>
+using Cpp = typename Class::type; // Shorthand for Class::type.
+template <typename Class>
+using Alias = typename Class::type_alias; // Shorthand for Class::type_alias.
+template <typename Class>
+using Holder = typename Class::holder_type; // Shorthand for Class::holder_type.
+
+template <typename Class> // True when Alias<Class> is constructible from a Cpp<Class> rvalue.
+using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
+
+// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0> // Only if Class has an alias.
+bool is_alias(Cpp<Class> *ptr) {
+    return dynamic_cast<Alias<Class> *>(ptr) != nullptr; // Runtime type check.
+}
+// Failing fallback version of the above for a no-alias class (always returns false)
+template <typename /*Class*/>
+constexpr bool is_alias(const void *) { // The pointer is never inspected.
+    return false;
+}
+
+// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall
+// back to brace aggregate initialization so that aggregate initialization can be used with
+// py::init, e.g.  `py::init<int, int>` to initialize a `struct T { int a; int b; }`.  For
+// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
+// works, but will not do the expected thing when `T` has an `initializer_list<T>` constructor).
+template <typename Class,
+          typename... Args,
+          detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { // Chosen when Class(Args...) is valid.
+    return new Class(std::forward<Args>(args)...); // Heap-allocated; returned as a raw pointer.
+}
+template <typename Class,
+          typename... Args,
+          detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { // Chosen when Class(Args...) is invalid.
+    return new Class{std::forward<Args>(args)...}; // Brace initialization fallback.
+}
+
+// Attempts to construct an alias using an `Alias(Cpp &&)` constructor.  This allows types with
+// an alias to provide only a single Cpp factory function as long as the Alias can be
+// constructed from an rvalue reference of the base Cpp type.  This means that Alias classes
+// can, when appropriate, simply define an `Alias(Cpp &&)` constructor rather than needing to
+// inherit all the base class constructors.
+template <typename Class>
+void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
+                              value_and_holder &v_h,
+                              Cpp<Class> &&base) {
+    v_h.value_ptr() = new Alias<Class>(std::move(base)); // Alias is move-constructed from base.
+}
+template <typename Class>
+[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
+                                           value_and_holder &,
+                                           Cpp<Class> &&) { // Tag-dispatched failure case.
+    throw type_error("pybind11::init(): unable to convert returned instance to required "
+                     "alias class: no `Alias<Class>(Class &&)` constructor available");
+}
+
+// Error-generating fallback for factories that don't match one of the below construction
+// mechanisms.
+template <typename Class>
+void construct(...) { // C-style variadic: selected only when no other overload matches.
+    static_assert(!std::is_same<Class, Class>::value /* always false */,
+                  "pybind11::init(): init function must return a compatible pointer, "
+                  "holder, or value"); // Condition depends on Class, so it fires on instantiation.
+}
+
+// Pointer return v1: the factory function returns a class pointer for a registered class.
+// If we don't need an alias (because this class doesn't have one, or because the final type is
+// inherited on the Python side) we can simply take over ownership.  Otherwise we need to try to
+// construct an Alias from the returned base instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias);
+    no_nullptr(ptr); // Throws type_error when the factory returned nullptr.
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        // We're going to try to construct an alias by moving the cpp type.  Whether or not
+        // that succeeds, we still need to destroy the original cpp pointer (either the
+        // moved away leftover, if the alias construction works, or the value itself if we
+        // throw an error), but we can't just call `delete ptr`: it might have a special
+        // deleter, or might be shared_from_this.  So we construct a holder around it as if
+        // it was a normal instance, then steal the holder away into a local variable; thus
+        // the holder's destruction happens when we leave the C++ scope, and the holder
+        // class gets to handle the destruction however it likes.
+        v_h.value_ptr() = ptr;
+        v_h.set_instance_registered(true); // Trick to prevent init_instance from registering it
+        // DANGER ZONE BEGIN: exceptions will leave v_h in an invalid state.
+        v_h.type->init_instance(v_h.inst, nullptr);                        // Set up the holder
+        Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
+        v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
+        v_h.set_instance_registered(false);
+        // DANGER ZONE END.
+
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
+    } else {
+        // Otherwise the type isn't inherited, so we don't need an Alias
+        v_h.value_ptr() = ptr; // Take over the factory's pointer directly.
+    }
+}
+
+// Pointer return v2: a factory that always returns an alias instance ptr.  We simply take over
+// ownership of the pointer.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) { // need_alias is ignored.
+    no_nullptr(alias_ptr); // Throws type_error when the factory returned nullptr.
+    v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr); // Stored as a Cpp<Class> pointer.
+}
+
+// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
+// holder.  This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
+// derived type (through those holder's implicit conversion from derived class holder
+// constructors).
+template <typename Class, detail::enable_if_t<!is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias);
+    auto *ptr = holder_helper<Holder<Class>>::get(holder); // Raw pointer held by the holder.
+    no_nullptr(ptr); // Throws type_error for an empty holder.
+    // If we need an alias, check that the held pointer is actually an alias instance
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
+                         "is not an alias instance");
+    }
+
+    // Cast away constness to store in void* storage.
+    // The value_and_holder storage is fundamentally untyped (void**), so we lose
+    // const-correctness here by design. The const qualifier will be restored
+    // when the pointer is later retrieved and cast back to the original type.
+    // This explicit const_cast makes the const-removal clearly visible.
+    v_h.value_ptr() = const_cast<void *>(static_cast<const void *>(ptr));
+    v_h.type->init_instance(v_h.inst, &holder); // The returned holder is passed to init_instance.
+}
+
+// return-by-value version 1: returning a cpp class by value.  If the class has an alias and an
+// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
+// the alias from the base when needed (i.e. because of Python-side inheritance).  When we don't
+// need it, we simply move-construct the cpp value into a new instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias);
+    static_assert(is_move_constructible<Cpp<Class>>::value,
+                  "pybind11::init() return-by-value factory function requires a movable class");
+    if (Class::has_alias && need_alias) { // Alias required: build it from the returned value.
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
+    } else {
+        v_h.value_ptr() = new Cpp<Class>(std::move(result)); // Plain move into a new heap object.
+    }
+}
+
+// return-by-value version 2: returning a value of the alias type itself.  We move-construct an
+// Alias instance (even if no Python-side inheritance is involved).  This is intended for
+// cases where Alias initialization is always desired.
+template <typename Class>
+void construct(value_and_holder &v_h, Alias<Class> &&result, bool) { // need_alias is ignored.
+    static_assert(
+        is_move_constructible<Alias<Class>>::value,
+        "pybind11::init() return-by-alias-value factory function requires a movable alias class");
+    v_h.value_ptr() = new Alias<Class>(std::move(result));
+}
+
+template <typename T, typename D> // Wraps a unique_ptr (with any deleter D) in a smart_holder.
+smart_holder init_smart_holder_from_unique_ptr(std::unique_ptr<T, D> &&unq_ptr,
+                                               bool void_cast_raw_ptr) {
+    void *void_ptr = void_cast_raw_ptr ? static_cast<void *>(unq_ptr.get()) : nullptr; // Else null.
+    return smart_holder::from_unique_ptr(std::move(unq_ptr), void_ptr); // unq_ptr is moved from.
+}
+
+template <typename Class,
+          typename D = std::default_delete<Cpp<Class>>,
+          detail::enable_if_t<is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h, std::unique_ptr<Cpp<Class>, D> &&unq_ptr, bool need_alias) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(need_alias);
+    auto *ptr = unq_ptr.get(); // Read before unq_ptr is moved from below.
+    no_nullptr(ptr);           // Throws type_error for an empty unique_ptr.
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        throw type_error("pybind11::init(): construction failed: returned std::unique_ptr pointee "
+                         "is not an alias instance");
+    }
+    // Here and below: if the new object is a trampoline, the shared_from_this mechanism needs
+    // to be prevented from accessing the smart_holder vptr, because it does not keep the
+    // trampoline Python object alive. For types that don't inherit from enable_shared_from_this
+    // it does not matter if void_cast_raw_ptr is true or false, therefore it's not necessary
+    // to also inspect the type.
+    auto smhldr = init_smart_holder_from_unique_ptr(
+        std::move(unq_ptr), /*void_cast_raw_ptr*/ Class::has_alias && is_alias<Class>(ptr));
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &smhldr); // The smart_holder is passed to init_instance.
+}
+
+template <typename Class,
+          typename D = std::default_delete<Alias<Class>>,
+          detail::enable_if_t<is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h,
+               std::unique_ptr<Alias<Class>, D> &&unq_ptr,
+               bool /*need_alias*/) {
+    auto *ptr = unq_ptr.get(); // Read before unq_ptr is moved from below.
+    no_nullptr(ptr);           // Throws type_error for an empty unique_ptr.
+    auto smhldr
+        = init_smart_holder_from_unique_ptr(std::move(unq_ptr), /*void_cast_raw_ptr*/ true);
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &smhldr); // The smart_holder is passed to init_instance.
+}
+
+template <typename PtrType, typename Class> // Shared by the (const) Cpp shared_ptr overloads.
+void construct_from_shared_ptr(value_and_holder &v_h,
+                               std::shared_ptr<PtrType> &&shd_ptr,
+                               bool need_alias) {
+    static_assert(std::is_same<PtrType, Cpp<Class>>::value
+                      || std::is_same<PtrType, const Cpp<Class>>::value,
+                  "Expected (const) Cpp<Class> as shared_ptr pointee");
+    auto *ptr = shd_ptr.get(); // Read before shd_ptr is moved from below.
+    no_nullptr(ptr);           // Throws type_error for an empty shared_ptr.
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        throw type_error("pybind11::init(): construction failed: returned std::shared_ptr pointee "
+                         "is not an alias instance");
+    }
+    // Cast to non-const if needed, consistent with internal design
+    auto smhldr
+        = smart_holder::from_shared_ptr(std::const_pointer_cast<Cpp<Class>>(std::move(shd_ptr)));
+    v_h.value_ptr() = const_cast<Cpp<Class> *>(ptr);
+    v_h.type->init_instance(v_h.inst, &smhldr); // The smart_holder is passed to init_instance.
+}
+
+template <typename Class, detail::enable_if_t<is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h, std::shared_ptr<Cpp<Class>> &&shd_ptr, bool need_alias) {
+    construct_from_shared_ptr<Cpp<Class>, Class>(v_h, std::move(shd_ptr), need_alias); // Forwards.
+}
+
+template <typename Class, detail::enable_if_t<is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h,
+               std::shared_ptr<const Cpp<Class>> &&shd_ptr,
+               bool need_alias) { // Same as above for a shared_ptr to const.
+    construct_from_shared_ptr<const Cpp<Class>, Class>(v_h, std::move(shd_ptr), need_alias);
+}
+
+template <typename Class, detail::enable_if_t<is_smart_holder<Holder<Class>>::value, int> = 0>
+void construct(value_and_holder &v_h,
+               std::shared_ptr<Alias<Class>> &&shd_ptr,
+               bool /*need_alias*/) {
+    auto *ptr = shd_ptr.get();
+    no_nullptr(ptr); // Throws type_error for an empty shared_ptr.
+    auto smhldr = smart_holder::from_shared_ptr(shd_ptr); // shd_ptr is passed as an lvalue here.
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &smhldr); // The smart_holder is passed to init_instance.
+}
+
+// Implementing class for py::init<...>()
+template <typename... Args>
+struct constructor {
+    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
+    static void execute(Class &cl, const Extra &...extra) { // No alias: always build Cpp<Class>.
+        cl.def(
+            "__init__",
+            [](value_and_holder &v_h,
+               Args... args) { // NOLINT(performance-unnecessary-value-param)
+                v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+
+    template <
+        typename Class,
+        typename... Extra,
+        enable_if_t<Class::has_alias && std::is_constructible<Cpp<Class>, Args...>::value, int>
+        = 0>
+    static void execute(Class &cl, const Extra &...extra) { // Alias exists, Cpp is constructible.
+        cl.def(
+            "__init__",
+            [](value_and_holder &v_h, Args... args) {
+                if (Py_TYPE(v_h.inst) == v_h.type->type) { // Instance is exactly the bound type.
+                    v_h.value_ptr()
+                        = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+                } else { // Different Python type: build the alias instead.
+                    v_h.value_ptr()
+                        = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+                }
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+
+    template <
+        typename Class,
+        typename... Extra,
+        enable_if_t<Class::has_alias && !std::is_constructible<Cpp<Class>, Args...>::value, int>
+        = 0>
+    static void execute(Class &cl, const Extra &...extra) { // Cpp not constructible from Args.
+        cl.def(
+            "__init__",
+            [](value_and_holder &v_h, Args... args) {
+                v_h.value_ptr() // Always the alias, whatever the Python type of the instance.
+                    = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+};
+
+// Implementing class for py::init_alias<...>()
+template <typename... Args>
+struct alias_constructor {
+    template <
+        typename Class,
+        typename... Extra,
+        enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int>
+        = 0>
+    static void execute(Class &cl, const Extra &...extra) { // Only overload: needs a usable alias.
+        cl.def(
+            "__init__",
+            [](value_and_holder &v_h, Args... args) {
+                v_h.value_ptr() // Always the alias, whatever the Python type of the instance.
+                    = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+};
+
+// Implementation class for py::init(Func) and py::init(Func, AliasFunc)
+template <typename CFunc,
+          typename AFunc = void_type (*)(),
+          typename = function_signature_t<CFunc>,
+          typename = function_signature_t<AFunc>>
+struct factory; // Declared only; the two partial specializations below provide the definitions.
+
+// Specialization for py::init(Func)
+template <typename Func, typename Return, typename... Args>
+struct factory<Func, void_type (*)(), Return(Args...)> {
+    remove_reference_t<Func> class_factory; // The factory callable, stored by value.
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    factory(Func &&f) : class_factory(std::forward<Func>(f)) {}
+
+    // The given class either has no alias or has no separate alias factory;
+    // this always constructs the class itself.  If the class is registered with an alias
+    // type and an alias instance is needed (i.e. because the final type is a Python class
+    // inheriting from the C++ type) the returned value needs to either already be an alias
+    // instance, or the alias needs to be constructible from a `Class &&` argument.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && { // Rvalue-qualified: call on a temporary.
+#if defined(PYBIND11_CPP14)
+        cl.def(
+            "__init__",
+            [func = std::move(class_factory)] // C++14 init-capture: the factory is moved in.
+#else
+        auto &func = class_factory;
+        cl.def(
+            "__init__",
+            [func] // Without C++14 init-capture the factory is captured by copy.
+#endif
+            (value_and_holder &v_h, Args... args) {
+                construct<Class>( // The last argument below is need_alias.
+                    v_h, func(std::forward<Args>(args)...), Py_TYPE(v_h.inst) != v_h.type->type);
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+};
+
+// Specialization for py::init(Func, AliasFunc)
+template <typename CFunc,
+          typename AFunc,
+          typename CReturn,
+          typename... CArgs,
+          typename AReturn,
+          typename... AArgs>
+struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
+    static_assert(sizeof...(CArgs) == sizeof...(AArgs), // Same number of parameters ...
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value, // ... and the same types.
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+
+    remove_reference_t<CFunc> class_factory; // Used when the instance is exactly the bound type.
+    remove_reference_t<AFunc> alias_factory; // Used for any other Python type of the instance.
+
+    factory(CFunc &&c, AFunc &&a)
+        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) {}
+
+    // The class factory is called when the `self` type passed to `__init__` is the direct
+    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && { // Rvalue-qualified: call on a temporary.
+        static_assert(Class::has_alias,
+                      "The two-argument version of `py::init()` can "
+                      "only be used if the class has an alias");
+#if defined(PYBIND11_CPP14)
+        cl.def(
+            "__init__",
+            [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
+#else
+        auto &class_func = class_factory;
+        auto &alias_func = alias_factory;
+        cl.def(
+            "__init__",
+            [class_func, alias_func] // Without C++14 init-capture both are captured by copy.
+#endif
+            (value_and_holder &v_h, CArgs... args) {
+                if (Py_TYPE(v_h.inst) == v_h.type->type) {
+                    // If the instance type equals the registered type we don't have inheritance,
+                    // so don't need the alias and can construct using the class function:
+                    construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
+                } else { // Otherwise use the alias factory and request an alias (need_alias).
+                    construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
+                }
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+};
+
+/// Set just the C++ state. Same as `__init__`.
+template <typename Class, typename T>
+void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
+    construct<Class>(v_h, std::forward<T>(result), need_alias); // Plain forward to construct().
+}
+
+/// Set both the C++ and Python states
+template <typename Class,
+          typename T,
+          typename O,
+          enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
+void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
+    construct<Class>(v_h, std::move(result.first), need_alias); // C++ state: first pair element.
+    auto d = handle(result.second);                             // Python state: second element.
+    if (PyDict_Check(d.ptr()) && PyDict_Size(d.ptr()) == 0) {
+        // Skipping setattr below, to not force use of py::dynamic_attr() for Class unnecessarily.
+        // See PR #2972 for details.
+        return;
+    }
+    // Our tests never run into an unset dict, but being careful here for now (see #5658)
+    auto dict = getattr((PyObject *) v_h.inst, "__dict__", none()); // none() is the default.
+    if (dict.is_none()) {
+        setattr((PyObject *) v_h.inst, "__dict__", d); // No usable __dict__: assign d itself.
+    } else {
+        // Keep the original object dict and just update it
+        if (PyDict_Update(dict.ptr(), d.ptr()) < 0) {
+            throw error_already_set(); // Negative return: propagate the Python error.
+        }
+    }
+}
+
+/// Implementation for py::pickle(GetState, SetState)
+template <typename Get,
+          typename Set,
+          typename = function_signature_t<Get>,
+          typename = function_signature_t<Set>>
+struct pickle_factory; // Declared only; the partial specialization below provides the definition.
+
+template <typename Get,
+          typename Set,
+          typename RetState,
+          typename Self,
+          typename NewInstance,
+          typename ArgState>
+struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
+    static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
+                  "The type returned by `__getstate__` must be the same "
+                  "as the argument accepted by `__setstate__`");
+
+    remove_reference_t<Get> get; // Callable registered as __getstate__.
+    remove_reference_t<Set> set; // Callable invoked from the __setstate__ lambda.
+
+    pickle_factory(Get get, Set set) : get(std::forward<Get>(get)), set(std::forward<Set>(set)) {}
+
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && { // Rvalue-qualified: call on a temporary.
+        cl.def("__getstate__", std::move(get), pos_only()); // Note: `extra...` is not passed here.
+
+#if defined(PYBIND11_CPP14)
+        cl.def(
+            "__setstate__",
+            [func = std::move(set)] // C++14 init-capture: the setter is moved in.
+#else
+        auto &func = set;
+        cl.def(
+            "__setstate__",
+            [func] // Without C++14 init-capture the setter is captured by copy.
+#endif
+            (value_and_holder &v_h, ArgState state) {
+                setstate<Class>( // The last argument below is need_alias.
+                    v_h, func(std::forward<ArgState>(state)), Py_TYPE(v_h.inst) != v_h.type->type);
+            },
+            is_new_style_constructor(),
+            extra...);
+    }
+};
+
+PYBIND11_NAMESPACE_END(initimpl)
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..f076e79e73ebc7be7bf009ac3c711765a934d9fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/native_enum_data.h
@@ -0,0 +1,214 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2022-2025 The pybind Community.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+#pragma once
+
+#include "../pytypes.h"
+#include "common.h"
+#include "internals.h"
+
+#include <cassert>
+#include <sstream>
+#include <string>
+#include <typeindex>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// This is a separate function only to enable easy unit testing.
+inline std::string native_enum_missing_finalize_error_message(const std::string &enum_name_encoded) {
+    // Builds the text used to report a native_enum that is missing its .finalize() call.
+    return std::string("pybind11::native_enum<...>(\"") + enum_name_encoded + "\", ...): MISSING .finalize()";
+}
+
+class native_enum_data { // Holds the pieces of a native enum until finalize() creates it.
+public:
+    native_enum_data(const object &parent_scope,
+                     const char *enum_name,
+                     const char *native_type_name,
+                     const char *class_doc,
+                     const std::type_index &enum_type_index)
+        : enum_name_encoded{enum_name}, native_type_name_encoded{native_type_name},
+          enum_type_index{enum_type_index}, parent_scope(parent_scope), enum_name{enum_name},
+          native_type_name{native_type_name}, class_doc(class_doc), export_values_flag{false},
+          finalize_needed{false} {} // Starts disarmed; arm_finalize_check() turns the check on.
+
+    void finalize(); // Defined out of line, further down in this header.
+
+    native_enum_data(const native_enum_data &) = delete; // Copying is disabled.
+    native_enum_data &operator=(const native_enum_data &) = delete;
+
+#if !defined(NDEBUG) // The destructor check below exists only when NDEBUG is not defined.
+    // This dtor cannot easily be unit tested because it terminates the process.
+    ~native_enum_data() {
+        if (finalize_needed) { // Still armed: finalize() was never run on this object.
+            pybind11_fail(native_enum_missing_finalize_error_message(enum_name_encoded));
+        }
+    }
+#endif
+
+protected:
+    void disarm_finalize_check(const char *error_context) { // Fails unless the check is armed.
+        if (!finalize_needed) {
+            pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded
+                          + "\"): " + error_context);
+        }
+        finalize_needed = false;
+    }
+
+    void arm_finalize_check() {
+        assert(!finalize_needed); // Catch redundant calls.
+        finalize_needed = true;
+    }
+
+    std::string enum_name_encoded;        // std::string copy of enum_name, used in error text.
+    std::string native_type_name_encoded; // std::string copy of native_type_name.
+    std::type_index enum_type_index;      // Key used for native_enum_type_map in finalize().
+
+private:
+    object parent_scope;  // Object that finalize() attaches the enum (and exported values) to.
+    str enum_name;        // Attribute name used on parent_scope in finalize().
+    str native_type_name; // Dotted name that finalize() resolves with import_or_getattr().
+    std::string class_doc; // Written to the enum's __doc__ in finalize() when non-empty.
+
+protected:
+    list members;     // Passed to the resolved Python type as its second argument in finalize().
+    list member_docs; // finalize() reads element 0 as the member key and element 1 as its doc.
+    bool export_values_flag : 1; // Attention: It is best to keep the bools together.
+
+private:
+    bool finalize_needed : 1; // Set by arm_finalize_check(), cleared by disarm_finalize_check().
+};
+
+inline void global_internals_native_enum_type_map_set_item(const std::type_index &enum_type_index,
+                                                           PyObject *py_enum) {
+    // Insert or overwrite the native_enum_type_map entry keyed by this enum's type_index.
+    with_internals([&](internals &state) { state.native_enum_type_map[enum_type_index] = py_enum; });
+}
+
+inline handle
+global_internals_native_enum_type_map_get_item(const std::type_index &enum_type_index) {
+    // Yields a handle to the object stored for this enum type, or a default-constructed handle
+    // when native_enum_type_map has no entry for it.
+    return with_internals([&](internals &state) {
+        const auto &type_map = state.native_enum_type_map;
+        auto entry = type_map.find(enum_type_index);
+        return entry == type_map.end() ? handle() : handle(entry->second);
+    });
+}
+
+inline bool
+global_internals_native_enum_type_map_contains(const std::type_index &enum_type_index) {
+    return with_internals([&](internals &state) { // true when the map holds this type_index
+        return state.native_enum_type_map.find(enum_type_index) != state.native_enum_type_map.end();
+    });
+}
+
+// Resolves a dotted name such as `pkg.mod.attr` to a Python object.
+//
+// The first component is imported as a top-level module. Every later component is first looked
+// up with getattr on the object found so far; if that fails, the dotted path up to and including
+// that component is imported instead. `append_to_exception_message` is appended to the headline
+// of each error message built here.
+//
+// Throws value_error when a component read from the name is empty, error_already_set when the
+// top-level import fails, and import_error when both getattr and import fail for a component.
+inline object import_or_getattr(const std::string &fully_qualified_name,
+                                const std::string &append_to_exception_message) {
+    const auto invalid_name_message = [&]() {
+        return "Invalid fully-qualified name `" + fully_qualified_name + "`"
+               + append_to_exception_message;
+    };
+
+    std::istringstream components(fully_qualified_name);
+    std::string component;
+    // A name that yields no component at all is rejected the same way as an empty component.
+    const bool have_first = static_cast<bool>(std::getline(components, component, '.'));
+    if (!have_first || component.empty()) {
+        throw value_error(invalid_name_message());
+    }
+
+    auto scope = reinterpret_steal<object>(PyImport_ImportModule(component.c_str()));
+    if (!scope) {
+        // Pass our message to raise_from() as an ImportError, then propagate the Python error.
+        const std::string msg
+            = "Failed to import top-level module `" + component + "`" + append_to_exception_message;
+        raise_from(PyExc_ImportError, msg.c_str());
+        throw error_already_set();
+    }
+
+    // Walk the remaining components, tracking the dotted path resolved so far.
+    std::string scope_path = component;
+    while (std::getline(components, component, '.')) {
+        if (component.empty()) {
+            throw value_error(invalid_name_message());
+        }
+        const std::string child_path = scope_path + "." + component;
+        auto child
+            = reinterpret_steal<object>(PyObject_GetAttrString(scope.ptr(), component.c_str()));
+        if (!child) {
+            // Capture the getattr failure before trying the import fallback.
+            error_fetch_and_normalize stored_getattr_error("getattr");
+            child = reinterpret_steal<object>(PyImport_ImportModule(child_path.c_str()));
+            if (!child) {
+                error_fetch_and_normalize stored_import_error("import");
+                std::string msg = "Failed to import or getattr `" + component + "` from `"
+                                  + scope_path + "`" + append_to_exception_message;
+                msg += "\n-------- getattr exception --------\n";
+                msg += stored_getattr_error.error_string();
+                msg += "\n-------- import exception --------\n";
+                msg += stored_import_error.error_string();
+                throw import_error(msg.c_str());
+            }
+        }
+        scope = child;
+        scope_path = child_path;
+    }
+    return scope;
+}
+
+inline void native_enum_data::finalize() { // Creates the Python enum and attaches it to parent_scope.
+    disarm_finalize_check("DOUBLE finalize"); // Fails here if the finalize check is not armed.
+    if (hasattr(parent_scope, enum_name)) { // Refuse to overwrite an existing attribute.
+        pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded
+                      + "\"): an object with that name is already defined");
+    }
+    auto py_enum_type = import_or_getattr(native_type_name, " (native_type_name)"); // Resolve the type by dotted name.
+    auto py_enum = py_enum_type(enum_name, members); // Call the resolved type with (name, members).
+    object module_name = get_module_name_if_available(parent_scope);
+    if (module_name) { // Only set __module__ when a module name was obtained.
+        py_enum.attr("__module__") = module_name;
+    }
+    if (hasattr(parent_scope, "__qualname__")) { // Prefix the enum's qualname with the parent's.
+        const auto parent_qualname = parent_scope.attr("__qualname__").cast<std::string>();
+        py_enum.attr("__qualname__") = str(parent_qualname + "." + enum_name.cast<std::string>());
+    }
+    parent_scope.attr(enum_name) = py_enum;
+    if (export_values_flag) { // Also expose each member directly on parent_scope.
+        for (auto member : members) {
+            auto member_name = member[int_(0)]; // Element 0 of each entry is used as the name.
+            if (hasattr(parent_scope, member_name)) {
+                pybind11_fail("pybind11::native_enum<...>(\"" + enum_name_encoded + "\").value(\""
+                              + member_name.cast<std::string>()
+                              + "\"): an object with that name is already defined");
+            }
+            parent_scope.attr(member_name) = py_enum[member_name];
+        }
+    }
+    if (!class_doc.empty()) { // An empty class_doc leaves __doc__ untouched.
+        py_enum.attr("__doc__") = class_doc.c_str();
+    }
+    for (auto doc : member_docs) { // Each entry: element 0 selects the member, element 1 is its doc.
+        py_enum[doc[int_(0)]].attr("__doc__") = doc[int_(1)];
+    }
+    global_internals_native_enum_type_map_set_item(enum_type_index, py_enum.release().ptr()); // NOTE(review): release() presumably hands the reference to the map - confirm.
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..1272e1fb2275abd469797e8bf81ee49650a31cca
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/pybind11_namespace_macros.h
@@ -0,0 +1,87 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2016-2025 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+#pragma once
+
+// PLEASE DO NOT ADD ANY INCLUDES HERE
+
+// Define some generic pybind11 helper macros for warning management.
+//
+// Note that compiler-specific push/pop pairs are baked into the
+// PYBIND11_NAMESPACE_BEGIN/PYBIND11_NAMESPACE_END pair of macros. Therefore manual
+// PYBIND11_WARNING_PUSH/PYBIND11_WARNING_POP are usually only needed in `#include` sections.
+//
+// If you find you need to suppress a warning, please try to make the suppression as local as
+// possible using these macros. Please also be sure to push/pop with the pybind11 macros. Please
+// only use compiler specifics if you need to check specific versions, e.g. Apple Clang vs. vanilla
+// Clang.
+#if defined(_MSC_VER) // Checked first; this branch uses the __pragma keyword form.
+#    define PYBIND11_COMPILER_MSVC
+#    define PYBIND11_PRAGMA(...) __pragma(__VA_ARGS__)
+#    define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(warning(push))
+#    define PYBIND11_WARNING_POP PYBIND11_PRAGMA(warning(pop))
+#elif defined(__INTEL_COMPILER) // Checked before __clang__ and __GNUC__.
+#    define PYBIND11_COMPILER_INTEL
+#    define PYBIND11_PRAGMA(...) _Pragma(#__VA_ARGS__) // Stringizes the arguments for _Pragma.
+#    define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(warning push)
+#    define PYBIND11_WARNING_POP PYBIND11_PRAGMA(warning pop)
+#elif defined(__clang__) // Checked before __GNUC__.
+#    define PYBIND11_COMPILER_CLANG
+#    define PYBIND11_PRAGMA(...) _Pragma(#__VA_ARGS__)
+#    define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(clang diagnostic push)
+#    define PYBIND11_WARNING_POP PYBIND11_PRAGMA(clang diagnostic pop)
+#elif defined(__GNUC__)
+#    define PYBIND11_COMPILER_GCC
+#    define PYBIND11_PRAGMA(...) _Pragma(#__VA_ARGS__)
+#    define PYBIND11_WARNING_PUSH PYBIND11_PRAGMA(GCC diagnostic push)
+#    define PYBIND11_WARNING_POP PYBIND11_PRAGMA(GCC diagnostic pop)
+#endif // NOTE(review): no #else branch; with any other compiler the macros above stay undefined.
+
+#ifdef PYBIND11_COMPILER_MSVC
+#    define PYBIND11_WARNING_DISABLE_MSVC(name) PYBIND11_PRAGMA(warning(disable : name))
+#else
+#    define PYBIND11_WARNING_DISABLE_MSVC(name) // Expands to nothing when MSVC was not detected.
+#endif
+
+#ifdef PYBIND11_COMPILER_CLANG
+#    define PYBIND11_WARNING_DISABLE_CLANG(name) PYBIND11_PRAGMA(clang diagnostic ignored name)
+#else
+#    define PYBIND11_WARNING_DISABLE_CLANG(name) // Expands to nothing when Clang was not detected.
+#endif
+
+#ifdef PYBIND11_COMPILER_GCC
+#    define PYBIND11_WARNING_DISABLE_GCC(name) PYBIND11_PRAGMA(GCC diagnostic ignored name)
+#else
+#    define PYBIND11_WARNING_DISABLE_GCC(name) // Expands to nothing when GCC was not detected.
+#endif
+
+#ifdef PYBIND11_COMPILER_INTEL
+#    define PYBIND11_WARNING_DISABLE_INTEL(name) PYBIND11_PRAGMA(warning disable name)
+#else
+#    define PYBIND11_WARNING_DISABLE_INTEL(name) // Expands to nothing when Intel was not detected.
+#endif
+
+#define PYBIND11_NAMESPACE_BEGIN(name)                                                            \
+    namespace name {                                                                              \
+    PYBIND11_WARNING_PUSH // Opening a namespace this way also pushes the warning state.
+
+#define PYBIND11_NAMESPACE_END(name)                                                              \
+    PYBIND11_WARNING_POP                                                                          \
+    } // Pops the warning state, then closes the namespace; `name` is not used in the expansion.
+
+// Robust support for some features and loading modules compiled against different pybind versions
+// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute
+// on the main `pybind11` namespace.
+#if !defined(PYBIND11_NAMESPACE) // A definition made before this point is left untouched.
+#    if defined(__GNUG__) && !defined(_WIN32)
+#        define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
+#    else
+#        define PYBIND11_NAMESPACE pybind11 // No visibility attribute on this branch.
+#    endif
+#endif // !defined(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e7bf659193938643e4d72c80408b4e707d9992a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/pybind11/detail/value_and_holder.h
@@ -0,0 +1,95 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) 2016-2024 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+#pragma once
+
+#include "common.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <typeinfo>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+struct value_and_holder {
+    instance *inst = nullptr;
+    size_t index = 0u;
+    const detail::type_info *type = nullptr;
+    void **vh = nullptr;
+
+    // Main constructor for a found value/holder: `vh` is the instance's inline slot array for
+    // the simple layout, otherwise the address of entry `vpos` in the non-simple array.
+    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index)
+        : inst{i}, index{index}, type{type},
+          vh{i->simple_layout ? i->simple_value_holder : &i->nonsimple.values_and_holders[vpos]} {}
+
+    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
+    value_and_holder() = default;
+
+    // Used for past-the-end iterator
+    explicit value_and_holder(size_t index) : index{index} {}
+
+    // The first slot of `vh`, viewed as a reference to `V *`.
+    template <typename V = void>
+    V *&value_ptr() const { return reinterpret_cast<V *&>(*vh); }
+    // True if this `value_and_holder` has a non-null value pointer
+    explicit operator bool() const { return value_ptr() != nullptr; }
+
+    // The storage beginning at the second slot of `vh`, viewed as a reference to `H`.
+    template <typename H>
+    H &holder() const { return reinterpret_cast<H &>(*(vh + 1)); }
+    bool holder_constructed() const {
+        if (inst->simple_layout) {
+            return inst->simple_holder_constructed;
+        }
+        return (inst->nonsimple.status[index] & instance::status_holder_constructed) != 0u;
+    }
+    // NOLINTNEXTLINE(readability-make-member-function-const)
+    void set_holder_constructed(bool v = true) {
+        if (!inst->simple_layout && v) {
+            inst->nonsimple.status[index] |= instance::status_holder_constructed;
+        } else if (!inst->simple_layout) {
+            inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_holder_constructed;
+        } else {
+            inst->simple_holder_constructed = v;
+        }
+    }
+    bool instance_registered() const {
+        if (inst->simple_layout) {
+            return inst->simple_instance_registered;
+        }
+        return (inst->nonsimple.status[index] & instance::status_instance_registered) != 0;
+    }
+    // NOLINTNEXTLINE(readability-make-member-function-const)
+    void set_instance_registered(bool v = true) {
+        if (!inst->simple_layout && v) {
+            inst->nonsimple.status[index] |= instance::status_instance_registered;
+        } else if (!inst->simple_layout) {
+            inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_instance_registered;
+        } else {
+            inst->simple_instance_registered = v;
+        }
+    }
+};
+
+// This is a semi-public API to check if the corresponding instance has been constructed with a
+// holder. That is, if the instance has been constructed with a holder, the `__init__` method is
+// called and the C++ object is valid. Otherwise, the C++ object might only be allocated, but not
+// initialized. This will lead to **SEGMENTATION FAULTS** if the C++ object is used in any way.
+// Example usage: https://pybind11.readthedocs.io/en/stable/advanced/classes.html#custom-type-setup
+//                for `tp_traverse` and `tp_clear` implementations.
+// WARNING: The caller is responsible for ensuring that the `reinterpret_cast` is valid.
+inline bool is_holder_constructed(PyObject *obj) {
+    using pybind_instance = pybind11::detail::instance; // `obj` is reinterpreted without any check
+    return reinterpret_cast<pybind_instance *>(obj)->get_value_and_holder().holder_constructed();
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..27c149f5f8ffb98d6693df21db5c93f3f870e5d7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/basic/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace basic {
+
+std::shared_ptr<Context> create(); // Declaration only: yields a shared channel Context; body is not in this header.
+
+} // namespace basic
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..868b7d823a3d788907f69127654cebacf1e22358
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cma/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cma {
+
+std::shared_ptr<Context> create(); // Declaration only: yields a shared channel Context; body is not in this header.
+
+} // namespace cma
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..8affe24eae78be35640f3e5f4a9ea238e1dc4522
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/context.h
@@ -0,0 +1,113 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <tensorpipe/common/buffer.h>
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+namespace channel {
+
+enum class Endpoint : bool { kConnect, kListen }; // Passed as the last argument of Context::createChannel().
+
+class Channel;
+
+// Abstract base class for channel context classes.
+//
+// Instances of these classes are expected to be registered with a
+// context. All registered instances are assumed to be eligible
+// channels for all pairs.
+//
+class Context {
+ public:
+  // Return whether the context is able to operate correctly.
+  //
+  // Some channel types may be unable to perform as intended under some
+  // circumstances (e.g., specialized hardware unavailable, lack of
+  // permissions). They can report it through this method in order for
+  // the core context to avoid registering them in the first place.
+  //
+  virtual bool isViable() const = 0;
+
+  // Return the number of control connections needed to create an instance of
+  // this channel.
+  //
+  // Most channels require only one, but some require more (cuda_basic), and
+  // some might require none.
+  //
+  virtual size_t numConnectionsNeeded() const = 0;
+
+  // Return a map from supported devices to strings describing the device from
+  // the channel's perspective.
+  //
+  // Two processes with a channel context of the same type can leverage this
+  // channel to make two devices communicate if one side's device descriptor is
+  // "accepted" by the other one, using the canCommunicateWithRemote method
+  // below. That method must be symmetric, and unless overridden defaults to
+  // string comparison.
+  //
+  virtual const std::unordered_map<Device, std::string>& deviceDescriptors()
+      const = 0;
+
+  // Compare local and remote device descriptors for compatibility.
+  //
+  // Determine whether a channel can be opened between a local device and
+  // a remote one that has the given device descriptor. This function
+  // needs to be symmetric: if we called this method on the remote
+  // context with the local descriptor we should get the same answer.
+  // Unless overridden it defaults to string comparison.
+  // NOTE(review): pure virtual in this class, so that default must live in a subclass - confirm.
+  virtual bool canCommunicateWithRemote(
+      const std::string& localDeviceDescriptor,
+      const std::string& remoteDeviceDescriptor) const = 0;
+
+  // Return newly created channel using the specified connections.
+  //
+  // It is up to the channel to either use these connections for further
+  // initialization, or use them directly. Either way, the returned
+  // channel should be immediately usable. If the channel isn't fully
+  // initialized yet, take care to queue these operations to execute
+  // as soon as initialization has completed.
+  //
+  virtual std::shared_ptr<Channel> createChannel(
+      std::vector<std::shared_ptr<transport::Connection>>,
+      Endpoint) = 0;
+
+  // Tell the context what its identifier is.
+  //
+  // This is only supposed to be called from the high-level context. It will
+  // only be used for logging and debugging purposes.
+  virtual void setId(std::string id) = 0;
+
+  // Put the channel context in a terminal state, in turn closing all of its
+  // channels, and release its resources. This may be done asynchronously, in
+  // background.
+  virtual void close() = 0;
+
+  // Wait for all resources to be released and all background activity to stop.
+  virtual void join() = 0;
+
+  virtual ~Context() = default;
+
+ private:
+  std::string name_; // NOTE(review): not referenced anywhere else in this header.
+};
+
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..715d476a2618e54b6d828168edf6f4aa57ca41ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_basic/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cuda_basic {
+
+std::shared_ptr<Context> create(std::shared_ptr<Context> cpuContext); // Declaration only; takes another channel Context by shared_ptr.
+
+} // namespace cuda_basic
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cb6e8b1ed9088d261ef8fd56d0172bb8a2d670f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/error.h
@@ -0,0 +1,38 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/channel/error.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cuda_gdr {
+
+class IbvError final : public BaseError {
+ public:
+  // Build the error from a caller-supplied message; the text is stored as given.
+  explicit IbvError(std::string error) : error_{error} {}
+  // Report the stored message unchanged.
+  std::string what() const override { return error_; }
+
+ private:
+  // Message handed to the constructor.
+  std::string error_;
+};
+
+} // namespace cuda_gdr
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..32695bedbd095ba759433ce409426904b9308f07
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_gdr/factory.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <tensorpipe/channel/context.h>
+#include <tensorpipe/common/optional.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cuda_gdr {
+
+std::shared_ptr<Context> create(
+    optional<std::vector<std::string>> gpuIdxToNicName = nullopt); // Declaration only. NOTE(review): presumably indexed by GPU, giving a NIC name - confirm.
+
+} // namespace cuda_gdr
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..673ff4b9189164f7bc646ea6133a140dd7ee1d99
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_ipc/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cuda_ipc {
+
+std::shared_ptr<Context> create(); // Declaration only: yields a shared channel Context; body is not in this header.
+
+} // namespace cuda_ipc
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9ea4a4f2682cafc0d6ce28d583e2f9bb3334a86
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/cuda_xth/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace cuda_xth {
+
+std::shared_ptr<Context> create(); // Declaration only: yields a shared channel Context; body is not in this header.
+
+} // namespace cuda_xth
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..edacbdc2aa9007117ac4afb69f76eeeb47f7883b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/error.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/common/error.h>
+
+namespace tensorpipe {
+namespace channel {
+
+class ContextClosedError final : public BaseError {
+ public:
+  ContextClosedError() {} // No data members to initialize.
+
+  std::string what() const override; // Declared here; the definition is not in this header.
+};
+
+class ChannelClosedError final : public BaseError {
+ public:
+  ChannelClosedError() {} // No data members to initialize.
+
+  std::string what() const override; // Declared here; the definition is not in this header.
+};
+
+class ContextNotViableError final : public BaseError {
+ public:
+  ContextNotViableError() {} // No data members to initialize.
+
+  std::string what() const override; // Declared here; the definition is not in this header.
+};
+
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1ad01173834d1b4810ac8fc7408e1a3542ef496
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/mpt/factory.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <tensorpipe/channel/context.h>
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace mpt {
+
+std::shared_ptr<Context> create( // factory for the "mpt" channel context; declared only, defined outside this header
+    std::vector<std::shared_ptr<transport::Context>> contexts, // taken by value
+    std::vector<std::shared_ptr<transport::Listener>> listeners); // taken by value; how it pairs with `contexts` is not visible here
+
+} // namespace mpt
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..07a0f5f27d925c96fa4a151bcb3db3e9c0fdae01
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/channel/xth/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+namespace channel {
+namespace xth {
+
+std::shared_ptr<Context> create(); // factory for the "xth" channel context; declared only, defined outside this header
+
+} // namespace xth
+} // namespace channel
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c250c27e810659ddf1635cff80f77f8d14fb2d2a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/buffer.h
@@ -0,0 +1,140 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+#include <tensorpipe/common/cpu_buffer.h>
+#include <tensorpipe/common/device.h>
+
+namespace tensorpipe {
+
+class Buffer { // Type-erased value holder for a device buffer struct (e.g. CpuBuffer), stored inline in raw_.
+  class AbstractBufferWrapper { // Virtual interface used to reach the wrapper living inside raw_.
+   public:
+    virtual Device device() const = 0;
+    virtual void copyConstructInto(void* ptr) const = 0; // placement-copy *this into ptr
+    virtual void moveConstructInto(void* ptr) = 0; // placement-move *this into ptr
+    virtual ~AbstractBufferWrapper() = default;
+  };
+
+  template <typename TBuffer>
+  class BufferWrapper : public AbstractBufferWrapper { // Concrete wrapper holding one TBuffer by value.
+    static_assert(
+        std::is_trivially_copyable<TBuffer>::value,
+        "wrapping non-trivially copyable class");
+
+   public:
+    TBuffer buffer;
+
+    explicit BufferWrapper(TBuffer buffer) : buffer(std::move(buffer)) {}
+
+    Device device() const override { // forwards to TBuffer::getDevice()
+      return buffer.getDevice();
+    }
+
+    void copyConstructInto(void* ptr) const override {
+      new (ptr) BufferWrapper(*this);
+    }
+
+    void moveConstructInto(void* ptr) override {
+      new (ptr) BufferWrapper(std::move(*this));
+    }
+  };
+
+ public:
+  template <typename TBuffer>
+  /* implicit */ Buffer(TBuffer b) { // wraps any TBuffer whose wrapper fits the inline storage
+    static_assert(
+        sizeof(BufferWrapper<TBuffer>) <= kStructSize, "kStructSize too small");
+    static_assert(
+        alignof(BufferWrapper<TBuffer>) <= kStructAlign,
+        "kStructAlign too small");
+    new (&raw_) BufferWrapper<TBuffer>(std::move(b)); // construct the wrapper in place inside raw_
+  }
+
+  Buffer() : Buffer(CpuBuffer{}) {} // default state wraps a CpuBuffer with a null ptr
+
+  Buffer(const Buffer& other) {
+    other.ptr()->copyConstructInto(&raw_);
+  }
+
+  Buffer& operator=(const Buffer& other) {
+    if (this != &other) { // self-assignment guard: the destroy below would otherwise kill the source
+      ptr()->~AbstractBufferWrapper(); // destroy the current wrapper before reusing the storage
+      other.ptr()->copyConstructInto(&raw_);
+    }
+    return *this;
+  }
+
+  Buffer(Buffer&& other) noexcept { // `other` still holds a wrapper afterwards and is destroyed normally
+    other.ptr()->moveConstructInto(&raw_);
+  }
+
+  Buffer& operator=(Buffer&& other) { // NOTE(review): not noexcept, unlike the move constructor
+    if (this != &other) {
+      ptr()->~AbstractBufferWrapper(); // destroy the current wrapper before reusing the storage
+      other.ptr()->moveConstructInto(&raw_);
+    }
+    return *this;
+  }
+
+  ~Buffer() {
+    ptr()->~AbstractBufferWrapper(); // explicit destructor call matching the placement new
+  }
+
+  template <typename TBuffer>
+  TBuffer& unwrap() { // returns the stored TBuffer; throws std::runtime_error if a different type is stored
+    BufferWrapper<TBuffer>* wrapperPtr =
+        dynamic_cast<BufferWrapper<TBuffer>*>(ptr());
+    if (wrapperPtr == nullptr) {
+      throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer");
+    }
+    return wrapperPtr->buffer;
+  }
+
+  template <typename TBuffer>
+  const TBuffer& unwrap() const { // const overload of unwrap(); same failure behaviour
+    const BufferWrapper<TBuffer>* wrapperPtr =
+        dynamic_cast<const BufferWrapper<TBuffer>*>(ptr());
+    if (wrapperPtr == nullptr) {
+      throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer");
+    }
+    return wrapperPtr->buffer;
+  }
+
+  Device device() const { // device reported by the stored buffer
+    return ptr()->device();
+  }
+
+ private:
+  static constexpr int kStructSize = 32; // inline storage size; checked by the static_asserts in the templated constructor
+  static constexpr int kStructAlign = 8; // inline storage alignment; checked the same way
+  std::aligned_storage<kStructSize, kStructAlign>::type raw_{};
+
+  const AbstractBufferWrapper* ptr() const { // view raw_ as the wrapper constructed in it
+    // FIXME: Once we go C++17, use std::launder on the returned pointer.
+    return reinterpret_cast<const AbstractBufferWrapper*>(&raw_);
+  }
+
+  AbstractBufferWrapper* ptr() { // mutable overload of the accessor above
+    // FIXME: Once we go C++17, use std::launder on the returned pointer.
+    return reinterpret_cast<AbstractBufferWrapper*>(&raw_);
+  }
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f34eb829ca08673643b615147b44931ad062e2e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cpu_buffer.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <tensorpipe/common/device.h>
+
+namespace tensorpipe {
+
+struct CpuBuffer { // Plain descriptor for a buffer on the "cpu" device.
+  void* ptr{nullptr}; // raw address; this struct declares no destructor, so nothing is freed here
+
+  Device getDevice() const { // always reports type kCpuDeviceType with index 0
+    return Device{kCpuDeviceType, 0};
+  }
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f377784914c96f30071fe2cb7b720d6e8ee2d80b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/cuda_buffer.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <tensorpipe/common/device.h>
+
+namespace tensorpipe {
+
+struct CudaBuffer { // Plain descriptor pairing a pointer with a CUDA stream.
+  void* ptr{nullptr}; // presumably a device pointer — confirm with callers
+  cudaStream_t stream{cudaStreamDefault}; // defaults to the default stream
+
+  Device getDevice() const; // declared only; defined outside this header
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4a814563b585ddd437b4b1d86fe526e420abf81
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/device.h
@@ -0,0 +1,69 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace tensorpipe {
+
+const std::string kCpuDeviceType{"cpu"}; // Device::type string for CPU; namespace-scope const, so each translation unit gets its own copy
+const std::string kCudaDeviceType{"cuda"}; // Device::type string for CUDA; same linkage note applies
+
+struct Device { // Identifies a device by a type string (e.g. "cpu", "cuda") and an integer index.
+  std::string type;
+  int index;
+
+  // A user-provided default constructor is needed to work around a bug in GCC
+  // 5.5 (and possibly other versions). It appears to be needed in the nop
+  // types that are used inside nop::Optional.
+  Device() : index(0) {} // zero the index so ==, toString() and hashing never read an indeterminate int
+
+  Device(std::string type, int index) : type(std::move(type)), index(index) {}
+
+  std::string toString() const { // renders "<type>:<index>", e.g. "cuda:1"
+    std::stringstream ss;
+    ss << type << ":" << index;
+    return ss.str();
+  }
+
+  bool operator==(const Device& other) const { // equal only when both type and index match
+    return type == other.type && index == other.index;
+  }
+};
+
+} // namespace tensorpipe
+
+namespace std {
+
+template <>
+struct hash<::tensorpipe::Device> { // std::hash specialization for Device
+  size_t operator()(const ::tensorpipe::Device& device) const noexcept {
+    return std::hash<std::string>{}(device.toString()); // hashes the "type:index" text form, building a string on every call
+  }
+};
+
+template <>
+struct hash<std::pair<::tensorpipe::Device, ::tensorpipe::Device>> { // std::hash specialization for an ordered pair of Devices
+  size_t operator()(const std::pair<::tensorpipe::Device, ::tensorpipe::Device>&
+                        p) const noexcept {
+    size_t h1 = std::hash<::tensorpipe::Device>{}(p.first);
+    size_t h2 = std::hash<::tensorpipe::Device>{}(p.second);
+    // Shifting one hash to avoid collisions between (a, b) and (b, a).
+    return h1 ^ (h2 << 1); // order-sensitive combine; the top bit of h2 is dropped by the shift
+  }
+};
+
+} // namespace std
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..22d56522a1563d03ab471402a91ff24748a2a9af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/error.h
@@ -0,0 +1,132 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+namespace tensorpipe {
+
+// Base class for actual errors. Concrete error types derive from it and override what().
+class BaseError {
+ public:
+  virtual ~BaseError() = default; // virtual so derived errors can be destroyed through a BaseError pointer
+
+  // Returns an explanatory string.
+  // Like `std::exception` but returns a `std::string`.
+  virtual std::string what() const = 0;
+};
+
+// Wrapper class for errors.
+//
+// Background: we wish to not use exceptions yet need an error
+// representation that can propagate across function and thread
+// boundaries. This representation must be copyable (so we can store
+// and return it at a later point in time) and retain downstream type
+// information. This implies a heap allocation because it's the
+// easiest way to deal with variable size objects (barring a union of
+// all downstream error classes and a lot of custom code). Instead of
+// passing a shared_ptr around directly, we use this wrapper class to
+// keep implementation details hidden from calling code.
+//
+class Error final {
+ public:
+  // Constant instance that indicates success.
+  static const Error kSuccess;
+
+  // Default constructor for error that is not an error.
+  Error() {}
+
+  Error(std::shared_ptr<BaseError> error, std::string file, int line)
+      : error_(std::move(error)), file_(std::move(file)), line_(line) {}
+
+  ~Error() = default;
+
+  // Converting to boolean means checking if there is an error. This
+  // means we don't need to use an `std::optional` and allows for a
+  // snippet like the following:
+  //
+  //   if (error) {
+  //     // Deal with it.
+  //   }
+  //
+  operator bool() const {
+    return static_cast<bool>(error_);
+  }
+
+  template <typename T>
+  std::shared_ptr<T> castToType() const {
+    return std::dynamic_pointer_cast<T>(error_);
+  }
+
+  template <typename T>
+  bool isOfType() const {
+    return castToType<T>() != nullptr;
+  }
+
+  // Like `std::exception` but returns a `std::string`.
+  std::string what() const;
+
+ private:
+  std::shared_ptr<BaseError> error_;
+  std::string file_;
+  int line_;
+};
+
+class SystemError final : public BaseError { // Error carrying a syscall name and an integer error code.
+ public:
+  explicit SystemError(const char* syscall, int error) // `error` is presumably an errno value — confirm with callers
+      : syscall_(syscall), error_(error) {}
+
+  std::string what() const override; // defined outside this header
+
+  int errorCode() const; // declared only; presumably returns error_ — confirm in the implementation
+
+ private:
+  const char* syscall_; // only the pointer is stored, so the pointed-to string must outlive this object
+  const int error_;
+};
+
+class ShortReadError final : public BaseError { // Error recording an expected and an actual size for a read.
+ public:
+  ShortReadError(ssize_t expected, ssize_t actual) // NOTE(review): ssize_t is used but this header includes no header that declares it
+      : expected_(expected), actual_(actual) {}
+
+  std::string what() const override; // defined outside this header
+
+ private:
+  const ssize_t expected_; // units are presumably bytes — confirm with callers
+  const ssize_t actual_;
+};
+
+class ShortWriteError final : public BaseError { // Error recording an expected and an actual size for a write.
+ public:
+  ShortWriteError(ssize_t expected, ssize_t actual) // NOTE(review): ssize_t is used but this header includes no header that declares it
+      : expected_(expected), actual_(actual) {}
+
+  std::string what() const override; // defined outside this header
+
+ private:
+  const ssize_t expected_; // units are presumably bytes — confirm with callers
+  const ssize_t actual_;
+};
+
+class EOFError final : public BaseError { // Stateless error type; presumably signals end-of-file on a read — confirm in the implementation
+ public:
+  EOFError() {} // carries no state
+
+  std::string what() const override; // defined outside this header
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h
new file mode 100644
index 0000000000000000000000000000000000000000..14f2ef1a80cad8ea5b1599751895dc69daa6121b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/common/optional.h
@@ -0,0 +1,16 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+#include <optional>
+
+namespace tensorpipe {
+
+using std::optional; // exposes std::optional as tensorpipe::optional
+using std::nullopt; // exposes std::nullopt as tensorpipe::nullopt
+
+} // namespace tensorpipe
+
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..8564d6b6685d10ce6f7b55e8dca1c003def24ae1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/context.h
@@ -0,0 +1,101 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <tensorpipe/transport/context.h>
+
+#include <tensorpipe/channel/context.h>
+
+namespace tensorpipe {
+
+class ContextImpl;
+class Listener;
+class Pipe;
+
+class ContextOptions { // Builder-style options passed to Context's constructor.
+ public:
+  // The name should be a semantically meaningful description of this context.
+  // It will only be used for logging and debugging purposes, to identify the
+  // endpoints of a pipe.
+  ContextOptions&& name(std::string name) && { // rvalue-qualified: callable only on a temporary or std::move'd object
+    name_ = std::move(name);
+    return std::move(*this); // returns the same object as an rvalue so calls can be chained
+  }
+
+ private:
+  std::string name_; // empty unless name() was called
+
+  friend ContextImpl; // gives ContextImpl access to the private field
+};
+
+class PipeOptions { // Builder-style options passed to Context::connect().
+ public:
+  // The name should be a semantically meaningful description of the context
+  // that the pipe is connecting to. It will only be used for logging and
+  // debugging purposes, to identify the endpoints of a pipe.
+  PipeOptions&& remoteName(std::string remoteName) && { // rvalue-qualified: callable only on a temporary or std::move'd object
+    remoteName_ = std::move(remoteName);
+    return std::move(*this); // returns the same object as an rvalue so calls can be chained
+  }
+
+ private:
+  std::string remoteName_; // empty unless remoteName() was called
+
+  friend ContextImpl; // gives ContextImpl access to the private field
+};
+
+class Context final { // Public entry point: registers transports and channels, creates listeners and pipes.
+ public:
+  explicit Context(ContextOptions opts = ContextOptions());
+
+  void registerTransport( // NOTE(review): int64_t is used but <cstdint> is not included directly by this header
+      int64_t priority, // ordering semantics are not visible in this header — see the implementation
+      std::string transport,
+      std::shared_ptr<transport::Context> context);
+
+  void registerChannel(
+      int64_t priority, // ordering semantics are not visible in this header — see the implementation
+      std::string channel,
+      std::shared_ptr<channel::Context> context);
+
+  std::shared_ptr<Listener> listen(const std::vector<std::string>& urls);
+
+  std::shared_ptr<Pipe> connect(
+      const std::string& url,
+      PipeOptions opts = PipeOptions());
+
+  // Put the context in a terminal state, in turn closing all of its pipes and
+  // listeners, and release its resources. This may be done asynchronously, in
+  // background.
+  void close();
+
+  // Wait for all resources to be released and all background activity to stop.
+  void join();
+
+  ~Context();
+
+ private:
+  // The implementation is managed by a shared_ptr because each child object
+  // will also hold a shared_ptr to it. However, its lifetime is tied to the one
+  // of this public object since when the latter is destroyed the implementation
+  // is closed and joined.
+  const std::shared_ptr<ContextImpl> impl_;
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..b20c9cadbc58959e81ff7b97577000f146f067a4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/error.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/common/error.h>
+
+namespace tensorpipe {
+
+class LogicError final : public BaseError { // Error carrying a free-form reason string.
+ public:
+  explicit LogicError(std::string reason) : reason_(std::move(reason)) {}
+
+  std::string what() const override; // defined outside this header; presumably includes reason_ — confirm
+
+ private:
+  const std::string reason_;
+};
+
+class ContextClosedError final : public BaseError { // declared directly in namespace tensorpipe; stateless error type
+ public:
+  explicit ContextClosedError() {} // carries no state
+
+  std::string what() const override; // defined outside this header
+};
+
+class ListenerClosedError final : public BaseError { // stateless error type; presumably reported when a listener is closed — confirm in the implementation
+ public:
+  explicit ListenerClosedError() {} // carries no state
+
+  std::string what() const override; // defined outside this header
+};
+
+class PipeClosedError final : public BaseError { // stateless error type; presumably reported when a pipe is closed — confirm in the implementation
+ public:
+  explicit PipeClosedError() {} // carries no state
+
+  std::string what() const override; // defined outside this header
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h
new file mode 100644
index 0000000000000000000000000000000000000000..122de98c9d7b8e8ee12463a431dec4e3a82177a2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/listener.h
@@ -0,0 +1,101 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <tensorpipe/common/error.h>
+
+namespace tensorpipe {
+
+class ContextImpl;
+class ListenerImpl;
+class Pipe;
+
+// The listener.
+//
+// Listeners are used to produce pipes. Depending on the type of the
+// context, listeners may use a variety of addresses to listen on. For
+// example, for TCP/IP sockets they listen on an IPv4 or IPv6 address,
+// for Unix domain sockets they listen on a path, etcetera.
+//
+// A pipe can only be accepted from this listener after it has been
+// fully established. This means that both its connection and all its
+// side channels have been established.
+//
+class Listener final {
+  // Use the passkey idiom to allow make_shared to call what should be a private
+  // constructor. See https://abseil.io/tips/134 for more information.
+  struct ConstructorToken {}; // private type, so only this class and its friend ContextImpl can name it
+
+ public:
+  Listener(
+      ConstructorToken token,
+      std::shared_ptr<ContextImpl> context,
+      std::string id,
+      const std::vector<std::string>& urls);
+
+  //
+  // Entry points for user code
+  //
+
+  using accept_callback_fn =
+      std::function<void(const Error&, std::shared_ptr<Pipe>)>;
+
+  void accept(accept_callback_fn fn); // fn receives an Error and a Pipe; invocation timing is not visible in this header
+
+  // Returns map with the materialized address of listeners by transport.
+  //
+  // If you don't bind a transport listener to a specific port or address, it
+  // may generate its address automatically. Then, in order to connect to the
+  // listener, the user must use a separate mechanism to communicate the
+  // materialized address to whoever wants to connect.
+  //
+  const std::map<std::string, std::string>& addresses() const;
+
+  // Returns materialized address for specific transport.
+  //
+  // See `addresses()` for more information.
+  //
+  const std::string& address(const std::string& transport) const;
+
+  // Returns URL with materialized address for specific transport.
+  //
+  // See `addresses()` for more information.
+  //
+  std::string url(const std::string& transport) const;
+
+  // Put the listener in a terminal state, aborting its pending operations and
+  // rejecting future ones, and release its resources. This may be carried out
+  // asynchronously, in background. Since the pipes may occasionally use the
+  // listener to open new connections, closing a listener may trigger errors
+  // in the pipes.
+  void close();
+
+  ~Listener();
+
+ private:
+  // Using a shared_ptr allows us to detach the lifetime of the implementation
+  // from the public object's one and perform the destruction asynchronously.
+  const std::shared_ptr<ListenerImpl> impl_;
+
+  // Allow context to access constructor token.
+  friend ContextImpl;
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h
new file mode 100644
index 0000000000000000000000000000000000000000..87106638ca97509d7ba4cbfb767ce8634046f527
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/message.h
@@ -0,0 +1,109 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include <tensorpipe/common/buffer.h>
+#include <tensorpipe/common/optional.h>
+
+namespace tensorpipe {
+
+// Messages consist of a primary buffer and zero or more separate
+// buffers. The primary buffer is always a host-side memory region that
+// contains a serialized version of the message we're dealing with. This
+// serialized message, in turn, may have references to the separate
+// buffers that accompany the primary buffer. These separate buffers may
+// point to any type of memory, host-side or device-side.
+//
+class Message final { // Plain aggregate of metadata, payloads and tensors; every member is public.
+ public:
+  std::string metadata; // message-level metadata string
+
+  struct Payload {
+    void* data{nullptr}; // raw pointer; this struct declares no destructor, so nothing is freed here
+    size_t length{0}; // presumably the size of `data` in bytes — confirm with callers
+
+    // Users may include arbitrary metadata in the following fields.
+    // This may contain allocation hints for the receiver, for example.
+    std::string metadata;
+  };
+
+  // Holds the payloads that are transferred over the primary connection.
+  std::vector<Payload> payloads;
+
+  struct Tensor {
+    tensorpipe::Buffer buffer; // type-erased buffer; a default Buffer wraps a CpuBuffer with a null ptr
+    size_t length{0}; // presumably the size of `buffer` in bytes — confirm with callers
+
+    // Users may optionally specify the target device, on which the receiver
+    // should allocate memory for this tensor. If left unset, the receiver will
+    // choose one at their convenience.
+    optional<Device> targetDevice;
+
+    // Users may include arbitrary metadata in the following field.
+    // This may contain allocation hints for the receiver, for example.
+    std::string metadata;
+  };
+
+  // Holds the tensors that are offered to the side channels.
+  std::vector<Tensor> tensors;
+};
+
+// Descriptors consist of metadata required by the receiver to allocate memory
+// for an incoming message.
+class Descriptor final { // Like Message but without data pointers or buffers: only lengths, devices and metadata.
+ public:
+  std::string metadata; // message-level metadata string
+
+  struct Payload {
+    size_t length{0}; // size of the incoming payload; presumably bytes — confirm with callers
+    std::string metadata;
+  };
+  std::vector<Payload> payloads; // presumably index-aligned with Allocation::payloads — confirm
+
+  struct Tensor {
+    size_t length{0}; // size of the incoming tensor; presumably bytes — confirm with callers
+
+    // This is the sender-side device from which this tensor is being sent.
+    Device sourceDevice;
+
+    // The sender may optionally specify a target device, in which case the
+    // receiver must allocate memory for this tensor on the specified device.
+    optional<Device> targetDevice;
+
+    std::string metadata;
+  };
+  std::vector<Tensor> tensors; // presumably index-aligned with Allocation::tensors — confirm
+};
+
+// Allocations consist of actual memory allocations provided by the receiver for
+// an incoming message. They must match the length and target devices specified
+// in the corresponding Descriptor.
+class Allocation final { // Plain aggregate of destination pointers and buffers; every member is public.
+ public:
+  struct Payload {
+    void* data{nullptr}; // raw destination pointer; this struct declares no destructor, so nothing is freed here
+  };
+  std::vector<Payload> payloads; // presumably one entry per Descriptor payload, in the same order — confirm
+
+  struct Tensor {
+    tensorpipe::Buffer buffer; // destination buffer; a default Buffer wraps a CpuBuffer with a null ptr
+  };
+  std::vector<Tensor> tensors; // presumably one entry per Descriptor tensor, in the same order — confirm
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7192bf2155033c84a2577dae9c48951a06dbbf6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/core/pipe.h
@@ -0,0 +1,103 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include <tensorpipe/common/error.h>
+#include <tensorpipe/core/message.h>
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+
+class ContextImpl;
+class ListenerImpl;
+class PipeImpl;
+
+// The pipe.
+//
+// Pipes represent a set of connections between a pair of processes.
+// Unlike POSIX pipes, they are message oriented instead of byte
+// oriented. Messages that are sent through the pipe may use whatever
+// channels are at their disposal to make it happen. If the pair of
+// processes happen to be colocated on the same machine, they may
+// leverage a region of shared memory to communicate the primary
+// buffer of a message. Secondary buffers may use shared memory as
+// well, if they're located in CPU memory, or use a CUDA device to
+// device copy if they're located in NVIDIA GPU memory. If the pair is
+// located across the world, they may simply use a set of TCP
+// connections to communicate.
+//
+class Pipe final {
+  // Use the passkey idiom to allow make_shared to call what should be a private
+  // constructor. See https://abseil.io/tips/134 for more information.
+  struct ConstructorToken {}; // private type, so only this class and its friends can name it
+
+ public:
+  //
+  // Initialization
+  //
+
+  Pipe(
+      ConstructorToken token,
+      std::shared_ptr<ContextImpl> context,
+      std::string id,
+      std::string remoteName,
+      const std::string& url);
+
+  Pipe(ConstructorToken token, std::shared_ptr<PipeImpl> impl); // wraps an already-created implementation
+
+  //
+  // Entry points for user code
+  //
+
+  using read_descriptor_callback_fn =
+      std::function<void(const Error&, Descriptor)>;
+
+  void readDescriptor(read_descriptor_callback_fn fn); // fn receives an Error and a Descriptor
+
+  using read_callback_fn = std::function<void(const Error&)>;
+
+  void read(Allocation allocation, read_callback_fn fn); // allocation is taken by value
+
+  using write_callback_fn = std::function<void(const Error&)>;
+
+  void write(Message message, write_callback_fn fn); // message is taken by value
+
+  // Retrieve the user-defined name that was given to the constructor of the
+  // context on the remote side, if any (if not, this will be the empty string).
+  // This is intended to help in logging and debugging only.
+  const std::string& getRemoteName(); // NOTE(review): non-const accessor, so not callable on a const Pipe
+
+  // Put the pipe in a terminal state, aborting its pending operations and
+  // rejecting future ones, and release its resources. This may be carried out
+  // asynchronously, in background.
+  void close();
+
+  ~Pipe();
+
+ private:
+  // Using a shared_ptr allows us to detach the lifetime of the implementation
+  // from the public object's one and perform the destruction asynchronously.
+  const std::shared_ptr<PipeImpl> impl_;
+
+  // Allow context to access constructor token.
+  friend ContextImpl;
+  // Allow listener to access constructor token.
+  friend ListenerImpl;
+};
+
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cbecc4880b08343ca91c43217412476aeeee806
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/context.h
@@ -0,0 +1,83 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+namespace tensorpipe {
+namespace transport {
+
+class Connection;
+class Listener;
+
+class Context {
+ public:
+  virtual std::shared_ptr<Connection> connect(std::string addr) = 0;
+
+  virtual std::shared_ptr<Listener> listen(std::string addr) = 0;
+
+  // Return whether the context is able to operate correctly.
+  //
+  // Some transport types may be unable to perform as intended under
+  // some circumstances (e.g., specialized hardware unavailable, lack
+  // of permissions). They can report it through this method in order
+  // for the core context to avoid registering them in the first place.
+  //
+  virtual bool isViable() const = 0;
+
+  // Return string to describe the domain for this context.
+  //
+  // Two processes with a context of the same type can connect to each
+  // other if one side's domain descriptor is "accepted" by the other
+  // one, using the canCommunicateWithRemote method below. That method
+  // must be symmetric, and unless overridden defaults to string
+  // comparison.
+  //
+  // For example, for a transport that leverages TCP/IP, this may be
+  // as simple as the address family (assuming we can route between
+  // any two processes). For a transport that leverages shared memory,
+  // this descriptor must uniquely identify the machine, such that
+  // only co-located processes generate the same domain descriptor.
+  //
+  virtual const std::string& domainDescriptor() const = 0;
+
+  // Compare local and remote domain descriptor for compatibility.
+  //
+  // Determine whether a connection can be opened between this context
+  // and a remote one that has the given domain descriptor. This
+  // function needs to be symmetric: if we called this method on the
+  // remote context with the local descriptor we should get the same
+  // answer. Unless overridden it defaults to string comparison.
+  //
+  virtual bool canCommunicateWithRemote(
+      const std::string& remoteDomainDescriptor) const {
+    return domainDescriptor() == remoteDomainDescriptor;
+  }
+
+  // Tell the context what its identifier is.
+  //
+  // This is only supposed to be called from the high-level context or from
+  // channel contexts. It will only be used for logging and debugging purposes.
+  virtual void setId(std::string id) = 0;
+
+  virtual void close() = 0;
+
+  virtual void join() = 0;
+
+  virtual ~Context() = default;
+};
+
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f7c866850144dfe9c0f465d733ea472336c95de
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/error.h
@@ -0,0 +1,52 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/common/error.h>
+
+namespace tensorpipe {
+namespace transport {
+
+class ContextClosedError final : public BaseError {
+ public:
+  ContextClosedError() {}
+
+  std::string what() const override;
+};
+
+class ListenerClosedError final : public BaseError {
+ public:
+  ListenerClosedError() {}
+
+  std::string what() const override;
+};
+
+class ConnectionClosedError final : public BaseError {
+ public:
+  ConnectionClosedError() {}
+
+  std::string what() const override;
+};
+
+class ContextNotViableError final : public BaseError {
+ public:
+  ContextNotViableError() {}
+
+  std::string what() const override;
+};
+
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..de0bb2fc3516dde57ac71961136322b85a3c63fa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/error.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/transport/error.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace ibv {
+
+class IbvError final : public BaseError {
+ public:
+  explicit IbvError(std::string error) : error_(error) {}
+
+  std::string what() const override;
+
+ private:
+  std::string error_;
+};
+
+class GetaddrinfoError final : public BaseError {
+ public:
+  explicit GetaddrinfoError(int error) : error_(error) {}
+
+  std::string what() const override;
+
+ private:
+  int error_;
+};
+
+class NoAddrFoundError final : public BaseError {
+ public:
+  NoAddrFoundError() {}
+
+  std::string what() const override;
+};
+
+} // namespace ibv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..b708a78b38c40c77122b1d0fc6f3389adec61725
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace ibv {
+
+std::shared_ptr<Context> create();
+
+} // namespace ibv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbaac69ca4162d2db739311b029be5868037a156
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/ibv/utility.h
@@ -0,0 +1,31 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+#include <tuple>
+
+#include <tensorpipe/common/error.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace ibv {
+
+std::tuple<Error, std::string> lookupAddrForIface(std::string iface);
+
+std::tuple<Error, std::string> lookupAddrForHostname();
+
+} // namespace ibv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1124fba3072ae66c1a77dcd881a7fbfbbed4da8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/shm/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace shm {
+
+std::shared_ptr<Context> create();
+
+} // namespace shm
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..efb26e52877aea08aacf4f03b37fa3c5bae1c55a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/error.h
@@ -0,0 +1,43 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+
+#include <tensorpipe/transport/error.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace uv {
+
+class UVError final : public BaseError {
+ public:
+  explicit UVError(int error) : error_(error) {}
+
+  std::string what() const override;
+
+ private:
+  int error_;
+};
+
+class NoAddrFoundError final : public BaseError {
+ public:
+  NoAddrFoundError() {}
+
+  std::string what() const override;
+};
+
+} // namespace uv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a4ec2cabc45da9a481e836c1d07cb18a38ae5e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/factory.h
@@ -0,0 +1,28 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <tensorpipe/transport/context.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace uv {
+
+std::shared_ptr<Context> create();
+
+} // namespace uv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c3b757b1e8c2b27327abb6274d5b3a0d60a2085
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/tensorpipe/transport/uv/utility.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <string>
+#include <tuple>
+
+#include <sys/socket.h>
+
+#include <tensorpipe/common/error.h>
+#include <tensorpipe/common/optional.h>
+
+namespace tensorpipe {
+namespace transport {
+namespace uv {
+
+std::tuple<Error, std::string> lookupAddrForIface(std::string iface);
+
+std::tuple<Error, std::string> lookupAddrForHostname();
+
+// Try to replicate the same logic used by NCCL to find a node's own address.
+// Roughly, it returns the "first" usable address it can find, and prioritizes
+// the interfaces with an `ib` prefix and de-prioritizes those with a `docker`
+// or `lo` prefix. It can optionally return only IPv4 or only IPv6 addresses.
+std::tuple<Error, std::string> lookupAddrLikeNccl(
+    optional<sa_family_t> familyFilter = nullopt);
+
+} // namespace uv
+} // namespace transport
+} // namespace tensorpipe
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53156bfaf0fa8488f6fbee5cdb9bb230b8b7fe57
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecf2fd803f71d966944d3bb18bb5ca5c6356a297
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/_reduction.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4319a31810762cca4ef656caf92dcf9278cdca53
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/common_types.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60932c45cc966cc136a13e6e70a69f75cf2769a8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/cpp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e47580790177768defa324d74b29797c256014e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/grad.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1f75dec49dee7b1fc70e7821b1d05400f109742
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/init.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64385e60d07b87b2859775340ef274861bcb3286
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/__pycache__/parameter.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..438a8bc55caf0b73780288496a943848d7a71191
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/__init__.py
@@ -0,0 +1,187 @@
+# mypy: allow-untyped-defs
+"""This module contains functions and classes that alter the behavior of torch.nn.functional.scaled_dot_product_attention"""
+
+import contextlib
+from collections.abc import Iterable
+from typing import Union
+from warnings import warn
+
+import torch.backends.cuda
+from torch._C import _SDPBackend as SDPBackend
+from torch.backends.cuda import (
+    can_use_efficient_attention,
+    can_use_flash_attention,
+    SDPAParams,
+)
+
+
+__all__: list[str] = [
+    "SDPBackend",
+    "sdpa_kernel",
+    "WARN_FOR_UNFUSED_KERNELS",
+    "register_flash_attention_impl",
+    "activate_flash_attention_impl",
+    "list_flash_attention_impls",
+    "current_flash_attention_impl",
+]
+
+
+# Note: [SDPA warnings]
+# TODO: Consider using this for sdpa regardless of subclasses
+# This only affects users of bias subclasses
+# If this is set to True, we will warn the user if they are not using the fused kernels
+# As well, it will raise warnings for all the reasons why the fused kernels can't be run.
+# To set this to True, run
+# torch.nn.attention.WARN_FOR_UNFUSED_KERNELS = True
+WARN_FOR_UNFUSED_KERNELS = False
+
+
+r"""An enum-like class that contains the different backends for scaled dot product attention.
+    This backend class is designed to be used with the sdpa_kernel context manager.
+
+    The following Enums are available:
+        - ERROR: An error occurred when trying to determine the backend.
+        - MATH: The math backend for scaled dot product attention.
+        - FLASH_ATTENTION: The flash attention backend for scaled dot product attention.
+        - EFFICIENT_ATTENTION: The efficient attention backend for scaled dot product attention.
+        - CUDNN_ATTENTION: The cuDNN backend for scaled dot product attention.
+        - OVERRIDEABLE: The overridable backend for extension.
+
+    See :func:`torch.nn.attention.sdpa_kernel` for more details.
+
+    .. warning:: This class is in beta and subject to change.
+"""
+SDPBackend.__module__ = __name__
+SDPBackend.__name__ = "SDPBackend"
+
+
+def _raise_kernel_warnings(params: SDPAParams) -> None:
+    """
+    If WARN_FOR_UNFUSED_KERNELS is set to True, this will raise warnings
+    for all the reasons why the fused kernels can't be run (see Note: [SDPA warnings]).
+    """
+    if WARN_FOR_UNFUSED_KERNELS:
+        if not can_use_efficient_attention(params):
+            warn("Efficient attention can't be used because:", stacklevel=2)
+            can_use_efficient_attention(params, True)
+        if not can_use_flash_attention(params):
+            warn("Flash attention can't be used because:", stacklevel=2)
+            can_use_flash_attention(params, True)
+
+
+_backend_names = {
+    "cudnn": "CUDNN_ATTENTION",
+    "flash": "FLASH_ATTENTION",
+    "mem_efficient": "EFFICIENT_ATTENTION",
+    "math": "MATH",
+    "overrideable": "OVERRIDEABLE",
+}
+
+
+def _backend_from_string(name: str):
+    return getattr(SDPBackend, name)
+
+
+def _cur_sdpa_kernel_backends(with_priority: bool = False):
+    backends = []
+    for name, val in _backend_names.items():
+        if getattr(torch._C, f"_get_{name}_sdp_enabled")():
+            backends.append(getattr(SDPBackend, val))
+    if with_priority:
+        curr_priority = torch._C._get_sdp_priority_order()
+        backends = sorted(
+            backends, key=lambda backend: curr_priority.index(int(backend))
+        )
+    return backends
+
+
+def _sdpa_kernel(backends: Iterable, set_priority: bool = False) -> None:
+    for name, val in _backend_names.items():
+        enabled = getattr(SDPBackend, val) in backends
+        getattr(torch._C, f"_set_sdp_use_{name}")(enabled)
+    if set_priority:
+        # backends should be a unique list
+        user_priority = [int(backend) for backend in backends]
+        previous_priority = torch._C._get_sdp_priority_order()
+        for backend in previous_priority:
+            if backend not in user_priority:
+                user_priority.append(int(backend))
+        torch._C._set_sdp_priority_order(user_priority)
+
+
+@contextlib.contextmanager
+def sdpa_kernel(backends: list[SDPBackend] | SDPBackend, set_priority: bool = False):
+    r"""
+    Context manager to select which backend to use for scaled dot product attention.
+
+    .. warning:: This function is beta and subject to change.
+
+    Args:
+        backends (Union[List[SDPBackend], SDPBackend]): A backend or list of backends for scaled dot product attention.
+        set_priority (bool=False): Whether the ordering of the backends is interpreted as their priority order.
+
+    Example:
+
+    .. code-block:: python
+
+        from torch.nn.functional import scaled_dot_product_attention
+        from torch.nn.attention import SDPBackend, sdpa_kernel
+
+        # Only enable flash attention backend
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            scaled_dot_product_attention(...)
+
+        # Enable the Math or Efficient attention backends
+        with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]):
+            scaled_dot_product_attention(...)
+
+    This context manager can be used to select which backend to use for scaled dot product attention.
+    Upon exiting the context manager, the set of backends that was enabled before entering it is restored.
+    """
+    assert isinstance(backends, (list, SDPBackend)), (
+        "Backend must be an instance of SDPBackend or a list of SDPBackend instances"
+    )
+
+    if isinstance(backends, SDPBackend):
+        backends = [backends]
+
+    backends = list(dict.fromkeys(backends))
+
+    previous_backends = _cur_sdpa_kernel_backends(with_priority=set_priority)
+    try:
+        _sdpa_kernel(backends, set_priority)
+        yield {}
+    finally:
+        _sdpa_kernel(previous_backends, set_priority)
+
+
+# variadic version of sdpa_kernel for dynamo to use while reconstructing
+@contextlib.contextmanager
+def _sdpa_kernel_variadic(*backends: SDPBackend):
+    with sdpa_kernel(list(backends)):
+        yield
+
+
+def _get_flash_version() -> str:
+    """This returns the closest matching tag for the flash attention backend"""
+    return "2.5.7"
+
+
+from . import _registry
+
+
+# Re-export registry types and functions for public API
+_FlashAttentionImpl = _registry._FlashAttentionImpl
+_RegisterFn = _registry._RegisterFn
+register_flash_attention_impl = _registry.register_flash_attention_impl
+activate_flash_attention_impl = _registry.activate_flash_attention_impl
+list_flash_attention_impls = _registry.list_flash_attention_impls
+current_flash_attention_impl = _registry.current_flash_attention_impl
+
+register_flash_attention_impl.__module__ = __name__
+activate_flash_attention_impl.__module__ = __name__
+list_flash_attention_impls.__module__ = __name__
+current_flash_attention_impl.__module__ = __name__
+
+# Import built-in implementations to trigger self-registration
+from . import _fa4  # noqa: F401
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be960ee53218e4fe01ac1f16c7416ecc0ff3822
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.py
@@ -0,0 +1,456 @@
+"""UBER PROTOTYPE!!!"""
+# mypy: allow-untyped-defs
+
+from __future__ import annotations
+
+import importlib
+from dataclasses import dataclass
+from functools import cache
+from typing import Any, TYPE_CHECKING
+from typing_extensions import TypeVarTuple, Unpack
+
+from . import _registry
+
+
+if TYPE_CHECKING:
+    from types import ModuleType
+
+import torch
+from torch.library import Library
+
+
+__all__ = [
+    "register_flash_attention_fa4",
+]
+
+
+_FA4_MODULE_PATH: str | None = None
+
+
+@dataclass
+class _FA4Handle:
+    library: Library | None
+
+    def remove(self) -> None:
+        self.library = None
+
+
+@cache
+def _get_device_major(device: torch.device) -> int:
+    major, _ = torch.cuda.get_device_capability(device)
+    return major
+
+
+def register_flash_attention_fa4(
+    module_path: str = "flash_attn.cute.interface",
+) -> _FA4Handle:
+    """
+    Register FA4 flash attention kernels with the PyTorch dispatcher.
+
+    Args:
+        module_path: Python module path to the FA4 implementation.
+    """
+    global _FA4_MODULE_PATH
+    _ = _fa4_import_module(module_path)
+    _FA4_MODULE_PATH = module_path
+    return _FA4Handle(_fa4_register_kernels())
+
+
+@cache
+def _fa4_import_module(module_path: str) -> ModuleType:
+    module = importlib.import_module(module_path)
+    if not hasattr(module, "_flash_attn_fwd") or not hasattr(module, "_flash_attn_bwd"):
+        raise RuntimeError(f"Module '{module_path}' does not expose FA4 kernels")
+    return module
+
+
+def _fa4_register_kernels() -> Library:
+    lib = Library("aten", "IMPL", "CUDA")  # noqa: TOR901
+    lib.impl("_flash_attention_forward", _fa4_flash_attention_forward_impl, "CUDA")
+    lib.impl("_flash_attention_backward", _fa4_flash_attention_backward_impl, "CUDA")
+    lib.impl(
+        "_scaled_dot_product_flash_attention",
+        _fa4_scaled_dot_product_flash_attention_forward_impl,
+        "CUDA",
+    )
+    lib.impl(
+        "_scaled_dot_product_flash_attention_backward",
+        _fa4_scaled_dot_product_flash_attention_backward_impl,
+        "CUDA",
+    )
+    return lib
+
+
+def _fa4_common_support_error(
+    query: torch.Tensor,
+    tensors: tuple[torch.Tensor, ...],
+    cum_seq_q: torch.Tensor | None,
+    require_fp32: tuple[tuple[str, torch.Tensor], ...] = (),
+) -> str | None:
+    if not all(t.is_cuda for t in tensors):
+        return "inputs must be CUDA tensors"
+    if len({t.device for t in tensors}) != 1:
+        return "inputs must share device"
+    if query.dtype not in (torch.float16, torch.bfloat16):
+        return "query dtype must be float16 or bfloat16"
+    for name, tensor in require_fp32:
+        if tensor.dtype != torch.float32:
+            return f"{name} dtype must be float32"
+    if cum_seq_q is None and query.dim() != 4:
+        return "dense query must be 4D"
+    if cum_seq_q is not None and query.dim() != 3:
+        return "ragged query must be 3D"
+    if not torch.cuda.is_available():
+        return "CUDA not available"
+    if _get_device_major(query.device) not in (9, 10):
+        return "FA4 requires compute capability 9.0 or 10.0"
+    return None
+
+
+def _fa4_forward_support_error(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    dropout_p: float,
+    return_debug_mask: bool,
+    alibi_slopes: torch.Tensor | None,
+    seqused_k: torch.Tensor | None,
+    cum_seq_q: torch.Tensor | None,
+) -> str | None:
+    if dropout_p != 0.0:
+        return "dropout_p must be 0"
+    if return_debug_mask:
+        return "return_debug_mask must be False"
+    if alibi_slopes is not None:
+        return "alibi_slopes not supported"
+    if seqused_k is not None:
+        if seqused_k.dtype != torch.int32:
+            return "seqused_k must be int32"
+        if not seqused_k.is_cuda:
+            return "seqused_k must be CUDA"
+    error = _fa4_common_support_error(
+        query,
+        (query, key, value),
+        cum_seq_q,
+    )
+    if error is not None:
+        if error == "inputs must share device":
+            return "query, key, value must be on same device"
+        return error
+    return None
+
+
+def _fa4_backward_support_error(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    logsumexp: torch.Tensor,
+    dropout_p: float,
+    cum_seq_q: torch.Tensor | None,
+    window_size_left: int | None,
+    window_size_right: int | None,
+) -> str | None:
+    if dropout_p != 0.0:
+        return "dropout_p must be 0"
+    if window_size_left is not None or window_size_right is not None:
+        return "windowed attention not supported"
+    error = _fa4_common_support_error(
+        query,
+        (grad_out, query, key, value, out, logsumexp),
+        cum_seq_q,
+        require_fp32=(("logsumexp", logsumexp),),
+    )
+    if error is not None:
+        return error
+    return None
+
+
+Ts = TypeVarTuple("Ts")
+
+
+def _transpose_dense(*tensors: Unpack[Ts]) -> tuple[Unpack[Ts]]:
+    return tuple(t.transpose(1, 2) for t in tensors)  # type: ignore[attr-defined]
+
+
+def _fa4_run_forward(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cu_seq_q: torch.Tensor | None,
+    cu_seq_k: torch.Tensor | None,
+    scale: float | None,
+    is_causal: bool,
+    window_size_left: int | None,
+    window_size_right: int | None,
+    seqused_k: torch.Tensor | None,
+    out: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if _FA4_MODULE_PATH is None:
+        raise RuntimeError("FA4 not registered")
+    module = _fa4_import_module(_FA4_MODULE_PATH)
+
+    kwargs: dict[str, Any] = {
+        "softmax_scale": scale,
+        "causal": is_causal,
+        "window_size_left": window_size_left,
+        "window_size_right": window_size_right,
+        "return_lse": True,
+        "cu_seqlens_q": cu_seq_q,
+        "cu_seqlens_k": cu_seq_k,
+        "seqused_k": seqused_k.contiguous() if seqused_k is not None else None,
+    }
+    if out is not None:
+        kwargs["out"] = out
+    out, lse = module._flash_attn_fwd(query, key, value, **kwargs)
+    return out, lse.contiguous()
+
+
+def _fa4_run_backward(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    logsumexp: torch.Tensor,
+    cu_seq_q: torch.Tensor | None,
+    cu_seq_k: torch.Tensor | None,
+    scale: float | None,
+    is_causal: bool,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if _FA4_MODULE_PATH is None:
+        raise RuntimeError("FA4 not registered")
+    module = _fa4_import_module(_FA4_MODULE_PATH)
+    dq, dk, dv = module._flash_attn_bwd(
+        query,
+        key,
+        value,
+        out,
+        grad_out,
+        logsumexp.contiguous(),
+        softmax_scale=scale,
+        causal=is_causal,
+        cu_seqlens_q=cu_seq_q,
+        cu_seqlens_k=cu_seq_k,
+    )
+    return dq, dk, dv
+
+
+def _fa4_flash_attention_forward_impl(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cum_seq_q: torch.Tensor | None,
+    cum_seq_k: torch.Tensor | None,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    return_debug_mask: bool,
+    *,
+    scale: float | None = None,
+    window_size_left: int | None = None,
+    window_size_right: int | None = None,
+    seqused_k: torch.Tensor | None = None,
+    alibi_slopes: torch.Tensor | None = None,
+    out: torch.Tensor | None = None,
+):
+    error = _fa4_forward_support_error(
+        query,
+        key,
+        value,
+        dropout_p,
+        return_debug_mask,
+        alibi_slopes,
+        seqused_k,
+        cum_seq_q,
+    )
+    if error is not None:
+        raise RuntimeError(f"FA4 flash_attention forward unsupported: {error}")
+    out, lse = _fa4_run_forward(
+        query,
+        key,
+        value,
+        cum_seq_q,
+        cum_seq_k,
+        scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        seqused_k,
+        out,
+    )
+    rng_state = torch.zeros((2,), dtype=torch.uint64, device=query.device)
+    philox_offset = torch.zeros((), dtype=torch.uint64, device=query.device)
+    debug_mask = torch.empty(0, dtype=query.dtype, device=query.device)
+    return out, lse, rng_state, philox_offset, debug_mask
+
+
+def _fa4_flash_attention_backward_impl(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    logsumexp: torch.Tensor,
+    cum_seq_q: torch.Tensor | None,
+    cum_seq_k: torch.Tensor | None,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    rng_state: torch.Tensor,
+    unused: torch.Tensor,
+    *,
+    scale: float | None = None,
+    window_size_left: int | None = None,
+    window_size_right: int | None = None,
+):
+    error = _fa4_backward_support_error(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        logsumexp,
+        dropout_p,
+        cum_seq_q,
+        window_size_left,
+        window_size_right,
+    )
+    if error is not None:
+        raise RuntimeError(f"FA4 flash_attention backward unsupported: {error}")
+    dq, dk, dv = _fa4_run_backward(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        logsumexp,
+        cum_seq_q,
+        cum_seq_k,
+        scale,
+        is_causal,
+    )
+    return dq, dk, dv
+
+
+def _fa4_scaled_dot_product_flash_attention_forward_impl(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    return_debug_mask: bool = False,
+    *,
+    scale: float | None = None,
+):
+    error = _fa4_forward_support_error(
+        query,
+        key,
+        value,
+        dropout_p,
+        return_debug_mask,
+        None,
+        None,
+        None,
+    )
+    if error is not None:
+        raise RuntimeError(f"FA4 SDPA forward unsupported: {error}")
+    q, k, v = _transpose_dense(query, key, value)
+
+    # Pre-allocate output with query's strides (BHSD layout), then create
+    # a BSHD view for the kernel. This ensures the returned output has
+    # the same memory layout as the input query.
+    out_bhsd = torch.empty_like(query)
+    out_bshd = out_bhsd.transpose(1, 2)
+
+    max_q_flash = q.size(1)
+    max_k_flash = k.size(1)
+    _, lse, rng_state, philox_offset, debug_mask = _fa4_flash_attention_forward_impl(
+        q,
+        k,
+        v,
+        None,
+        None,
+        max_q_flash,
+        max_k_flash,
+        dropout_p,
+        is_causal,
+        return_debug_mask,
+        scale=scale,
+        out=out_bshd,
+    )
+    max_q = query.size(2)
+    max_k = key.size(2)
+    return (
+        out_bhsd,
+        lse,
+        None,
+        None,
+        max_q,
+        max_k,
+        rng_state,
+        philox_offset,
+        debug_mask,
+    )
+
+
+def _fa4_scaled_dot_product_flash_attention_backward_impl(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    logsumexp: torch.Tensor,
+    cum_seq_q: torch.Tensor | None,
+    cum_seq_k: torch.Tensor | None,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    philox_seed: torch.Tensor,
+    philox_offset: torch.Tensor,
+    *,
+    scale: float | None = None,
+):
+    error = _fa4_backward_support_error(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        logsumexp,
+        dropout_p,
+        None,
+        None,
+        None,
+    )
+    if error is not None:
+        raise RuntimeError(f"FA4 SDPA backward unsupported: {error}")
+    q, k, v, o, go = _transpose_dense(query, key, value, out, grad_out)
+    max_q = query.size(2)
+    max_k = key.size(2)
+    dq, dk, dv = _fa4_flash_attention_backward_impl(
+        go,
+        q,
+        k,
+        v,
+        o,
+        logsumexp,
+        None,
+        None,
+        max_q,
+        max_k,
+        dropout_p,
+        is_causal,
+        philox_seed,
+        philox_offset,
+        scale=scale,
+    )
+    dq, dk, dv = _transpose_dense(dq, dk, dv)
+    return dq, dk, dv
+
+
+_registry.register_flash_attention_impl("FA4", register_fn=register_flash_attention_fa4)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..883252d56f8b65cfa258d9d77ed463b374fd77ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_registry.py
@@ -0,0 +1,109 @@
+# mypy: allow-untyped-defs
+"""Registry for flash attention implementations.
+
+This module contains the registration system for flash attention implementations.
+It has no torch dependencies to avoid circular imports during initialization.
+"""
+
+from collections.abc import Callable
+from typing import Literal, Protocol
+
+
+class FlashAttentionHandle(Protocol):
+    def remove(self) -> None: ...
+
+
+_RegisterFn = Callable[..., FlashAttentionHandle | None]
+_FlashAttentionImpl = Literal["FA4"]
+
+_FLASH_ATTENTION_IMPLS: dict[str, _RegisterFn] = {}
+
+_FLASH_ATTENTION_ACTIVE: str | None = None
+_FLASH_ATTENTION_HANDLES: dict[str, FlashAttentionHandle] = {}
+
+
+def register_flash_attention_impl(
+    impl: str | _FlashAttentionImpl,
+    *,
+    register_fn: _RegisterFn,
+) -> None:
+    """
+    Register the callable that activates a flash attention impl.
+
+    .. note::
+        This function is intended for SDPA backend providers to register their
+        implementations. End users should use :func:`activate_flash_attention_impl`
+        to activate a registered implementation.
+
+    Args:
+        impl: Implementation identifier (e.g., ``"FA4"``).
+        register_fn: Callable that performs the actual dispatcher registration.
+            This function will be invoked by :func:`activate_flash_attention_impl`
+            and should register custom kernels with the PyTorch dispatcher.
+            It may optionally return a handle implementing
+            :class:`FlashAttentionHandle` to keep any necessary state alive.
+
+    Example:
+        >>> def my_impl_register(module_path: str = "my_flash_impl"):
+        ...     # Register custom kernels with torch dispatcher
+        ...     pass  # doctest: +SKIP
+        >>> register_flash_attention_impl(
+        ...     "MyImpl", register_fn=my_impl_register
+        ... )  # doctest: +SKIP
+    """
+    _FLASH_ATTENTION_IMPLS[impl] = register_fn
+
+
+def activate_flash_attention_impl(
+    impl: str | _FlashAttentionImpl,
+) -> None:
+    """
+    Activate into the dispatcher a previously registered flash attention impl.
+
+    .. note::
+        Backend providers should NOT automatically activate their implementation
+        on import. Users should explicitly opt-in by calling this function or via
+        environment variables to ensure multiple provider libraries can coexist.
+
+    Args:
+        impl: Implementation identifier to activate. See
+            :func:`~torch.nn.attention.list_flash_attention_impls` for available
+            implementations.
+            If the backend's :func:`register_flash_attention_impl` callable
+            returns a :class:`FlashAttentionHandle`, the registry keeps that
+            handle alive for the lifetime of the process (until explicit
+            uninstall support exists).
+
+    Example:
+        >>> activate_flash_attention_impl("FA4")  # doctest: +SKIP
+    """
+    global _FLASH_ATTENTION_ACTIVE
+    register_fn = _FLASH_ATTENTION_IMPLS.get(impl)
+    if register_fn is None:
+        raise ValueError(
+            f"Unknown flash attention impl '{impl}'. "
+            f"Available implementations: {list_flash_attention_impls()}"
+        )
+    # TODO: The only way to actually register a new impl is to unregister the current impl,
+    # reinstall the default impl, and then register the new impl.
+    if _FLASH_ATTENTION_ACTIVE == impl:
+        return
+
+    handle = register_fn()
+    if handle is not None:
+        _FLASH_ATTENTION_HANDLES[impl] = handle
+    _FLASH_ATTENTION_ACTIVE = impl
+
+
+def list_flash_attention_impls() -> list[str]:
+    """Return the names of all available flash attention implementations."""
+    return sorted(_FLASH_ATTENTION_IMPLS.keys())
+
+
+def current_flash_attention_impl() -> str | None:
+    """
+    Return the currently activated flash attention impl name, if any.
+
+    ``None`` indicates that no custom impl has been activated.
+    """
+    return _FLASH_ATTENTION_ACTIVE
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd530bb675e8fce9164b7de0d75fd9dce90edec8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/_utils.py
@@ -0,0 +1,59 @@
+# mypy: allow-untyped-defs
+"""Defines utilities for interacting with scaled_dot_product_attention"""
+
+import math
+
+import torch
+
+
+__all__: list[str] = []
+
+
+def _input_requires_grad(*tensors: torch.Tensor) -> bool:
+    """Returns True if any of the tensors requires grad"""
+    return any(t.requires_grad for t in tensors)
+
+
+def _postprocess_flash_output(inpt_tensor: torch.Tensor, og_size: int) -> torch.Tensor:
+    """Handles the unpad of the last dimension"""
+    if inpt_tensor.size(-1) != og_size:
+        return inpt_tensor[..., :og_size]
+    return inpt_tensor
+
+
+def _calculate_scale(head_dim_size: int, scale: float | None) -> float:
+    """
+    For FlashAttention we pad the head dimension to be a multiple of 8, so we need to scale the output
+    by the original head size and not the padded one.
+    """
+    if scale is not None:
+        return scale
+    return 1.0 / math.sqrt(head_dim_size)
+
+
+def _validate_sdpa_input(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: torch.Tensor | None = None,
+    dropout_p=0.0,
+    is_causal=False,
+    scale=None,
+) -> None:
+    if query.dtype != key.dtype or query.dtype != value.dtype:
+        raise ValueError(
+            f"Expected query, key, and value to have the same dtype, "
+            f"but got query.dtype: {query.dtype}, key.dtype: {key.dtype}, "
+            f"and value.dtype: {value.dtype} instead."
+        )
+    if query.device != key.device or query.device != value.device:
+        raise ValueError(
+            f"Expected query, key, and value to have the same device type, "
+            f"but got query.device: {query.device}, key.device: {key.device}, "
+            f"and value.device: {value.device} instead."
+        )
+    if query.dim() < 2 or key.dim() < 2 or value.dim() < 2:
+        raise ValueError(
+            f"Expected query, key, and value to all be  at least 2 dimensional, but got query.dim: "
+            f"{query.dim()}, key.dim: {key.dim()} and value.dim: {value.dim()} instead."
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..746e04c01f3d571fc61a06a62332f229ada4e6c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/bias.py
@@ -0,0 +1,371 @@
+# mypy: allow-untyped-defs
+"""Defines bias subclasses that work with scaled_dot_product_attention"""
+
+from enum import auto, IntEnum
+from warnings import warn
+
+import torch
+import torch.nn.functional as F
+from torch.backends.cuda import (
+    can_use_efficient_attention,
+    can_use_flash_attention,
+    is_flash_attention_available,
+    SDPAParams,
+)
+from torch.nn.attention import _raise_kernel_warnings
+from torch.nn.attention._utils import (
+    _calculate_scale,
+    _input_requires_grad,
+    _postprocess_flash_output,
+    _validate_sdpa_input,
+)
+
+
+__all__ = ["causal_upper_left", "causal_lower_right", "CausalVariant", "CausalBias"]
+
+
+torch._dynamo.allow_in_graph(is_flash_attention_available)
+torch._dynamo.allow_in_graph(can_use_flash_attention)
+torch._dynamo.allow_in_graph(can_use_efficient_attention)
+torch._dynamo.allow_in_graph(SDPAParams)
+
+
+class CausalVariant(IntEnum):
+    r"""
+    Enum for causal variants used in attention mechanisms.
+
+    Defines two types of causal biases:
+
+    ``UPPER_LEFT``: Represents upper-left triangular bias for standard causal attention.
+    The equivalent pytorch code for constructing this bias is:
+
+    .. code-block:: python
+
+        torch.tril(torch.ones(size, dtype=torch.bool))
+
+    For instance, with ``shape=(3,4)``, the materialized bias tensor will be:
+
+    .. code-block:: text
+
+        [[1, 0, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 0]]
+
+
+    ``LOWER_RIGHT``: Represents lower-right triangular bias; the included values are aligned to the lower
+    right corner of the matrix.
+
+    The equivalent pytorch code for constructing this bias is:
+
+    .. code-block:: python
+
+        diagonal_offset = size[1] - size[0]
+        torch.tril(
+            torch.ones(size, dtype=torch.bool),
+            diagonal=diagonal_offset,
+        )
+
+    For instance, with ``shape=(3,4)``, the materialized bias tensor will be:
+
+    .. code-block:: text
+
+        [[1, 1, 0, 0],
+         [1, 1, 1, 0],
+         [1, 1, 1, 1]]
+
+    Note that these variants are equivalent to each other when the sequence lengths of the query and key/value
+    tensors are equal since the triangular matrix is square.
+
+    .. warning:: This enum is a prototype and subject to change.
+    """
+
+    UPPER_LEFT = auto()
+    LOWER_RIGHT = auto()
+
+
+class CausalBias(torch.Tensor):
+    """
+    A bias representing causal attention patterns. For an overview of the bias structure, see the :class:`CausalVariant` enum.
+
+    This class is used for defining causal (triangular) attention biases. For constructing the bias, there exist
+    two factory functions: :func:`causal_upper_left` and :func:`causal_lower_right`.
+
+    Example:
+
+    .. code-block:: python
+
+        from torch.nn.attention.bias import causal_lower_right
+
+        bsz, num_heads, seqlen_q, seqlen_kv, head_dim = 32, 8, 4, 12, 8
+
+        # Create a lower-right causal bias
+        attn_bias = causal_lower_right(seqlen_q, seqlen_kv)
+
+        q = torch.randn(
+            bsz, num_heads, seqlen_q, head_dim, device="cuda", dtype=torch.float16
+        )
+        k = torch.randn(
+            bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
+        )
+        v = torch.randn(
+            bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
+        )
+
+        out = F.scaled_dot_product_attention(q, k, v, attn_bias)
+
+    .. warning:: This class is a prototype and subject to change.
+    """
+
+    def __init__(self, variant: CausalVariant, seq_len_q: int, seq_len_kv: int) -> None:
+        """
+        Initializes the CausalBias instance with a specified variant and sequence lengths.
+
+        Args:
+            variant (CausalVariant): The type of causal bias to use (either UPPER_LEFT or LOWER_RIGHT).
+            seq_len_q (int): The sequence length of the query tensor.
+            seq_len_kv (int): The sequence length of the key/value tensor.
+
+        Raises a warning if the LOWER_RIGHT variant is used with seq_len_q > seq_len_kv, as it may produce NaNs.
+        """
+        assert isinstance(variant, CausalVariant)
+        super().__init__()
+        self.variant = variant
+        self.seq_len_q = seq_len_q
+        self.seq_len_kv = seq_len_kv
+        if seq_len_q > seq_len_kv and variant == CausalVariant.LOWER_RIGHT:
+            warn(
+                "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!",
+                stacklevel=2,
+            )
+
+    def _upper_left(self, device: torch.device) -> torch.Tensor:
+        """Upper left causal bias"""
+        return torch.tril(
+            torch.ones(self.seq_len_q, self.seq_len_kv, device=device, dtype=torch.bool)
+        )
+
+    def _lower_right(self, device: torch.device) -> torch.Tensor:
+        """Lower right causal bias"""
+        diagonal_offset = self.seq_len_kv - self.seq_len_q
+        return torch.tril(
+            torch.ones(
+                self.seq_len_q, self.seq_len_kv, device=device, dtype=torch.bool
+            ),
+            diagonal=diagonal_offset,
+        )
+
+    # pyrefly: ignore [bad-return]
+    def _materialize(self, device: torch.device | None = None) -> torch.Tensor:
+        """
+        Materializes the causal bias into a tensor form.
+
+        Depending on the variant, this method generates either an upper-left or lower-right
+        triangular matrix to represent the causal bias.
+
+        Args:
+            device (Optional[torch.device]): The device on which to create the tensor. Defaults to CPU.
+
+        Returns:
+            torch.Tensor: The materialized bias tensor.
+        """
+        if device is None:
+            device = torch.device("cpu")
+        if self.variant == CausalVariant.UPPER_LEFT:
+            return self._upper_left(device)
+        elif self.variant == CausalVariant.LOWER_RIGHT:
+            return self._lower_right(device)
+
+    @staticmethod
+    def _dispatch(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_mask: "CausalBias",
+        dropout_p: float = 0.0,
+        is_causal: bool = False,
+        scale: float | None = None,
+        enable_gqa: bool = False,
+    ) -> torch.Tensor:
+        r"""
+        Handles the logic for computing attention with the specified causal bias.
+
+        Args:
+            query (Tensor): Query tensor; shape :math:`(N, ..., L, E)`.
+            key (Tensor): Key tensor; shape :math:`(N, ..., S, E)`.
+            value (Tensor): Value tensor; shape :math:`(N, ..., S, Ev)`.
+            attn_mask (CausalBias): The type of causal attention to apply.
+                A boolean mask where a value of True indicates that the element *should* take part in attention.
+                A float mask of the same type as query, key, value that is added to the attention score.
+            dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied
+            is_causal (bool): If true, assumes upper left causal attention masking and errors if both attn_mask and is_causal
+                are set.
+            scale (optional float): Scaling factor applied prior to softmax. If None, the default value is set
+                to :math:`\frac{1}{\sqrt{E}}`.
+            enable_gqa (optional bool): If set to True, Grouped Query Attention (GQA) is enabled, by default it is set to False.
+
+        Returns:
+            output (Tensor): Attention output; shape :math:`(N, ..., L, Ev)`.
+
+        Raises:
+            ValueError: If ``is_causal`` is True, or if the causal bias variant is not a CausalVariant type.
+
+        """
+        if is_causal:
+            raise ValueError("CausalBias should not be used with causal=True")
+
+        if (
+            attn_mask.seq_len_q == attn_mask.seq_len_kv
+            or attn_mask.variant == CausalVariant.UPPER_LEFT
+        ):
+            return F.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask=None,
+                dropout_p=dropout_p,
+                is_causal=True,
+                scale=scale,
+                enable_gqa=enable_gqa,
+            )
+        elif attn_mask.variant == CausalVariant.LOWER_RIGHT:
+            _validate_sdpa_input(query, key, value, None, dropout_p, is_causal, scale)
+            sdpa_params = SDPAParams(
+                query, key, value, None, dropout_p, is_causal, enable_gqa
+            )
+            if can_use_flash_attention(sdpa_params):
+                alignment = 64 if query.device.type == "xpu" else 8
+                og_head_size = query.size(-1)
+                og_scale = _calculate_scale(og_head_size, scale)
+                needs_padding = og_head_size % alignment != 0
+                if needs_padding:
+                    pad_len = alignment - (og_head_size % alignment)
+                    query = torch.nn.functional.pad(query, (0, pad_len))
+                    key = torch.nn.functional.pad(key, (0, pad_len))
+                    value = torch.nn.functional.pad(value, (0, pad_len))
+                out = torch.ops.aten._scaled_dot_product_flash_attention(
+                    query,
+                    key,
+                    value,
+                    dropout_p,
+                    is_causal=True,  # TODO: Flash accepts causal = True and for this particular op it means lower right
+                    return_debug_mask=False,
+                    scale=og_scale,
+                )[0]
+                return _postprocess_flash_output(out, og_head_size)
+            if can_use_efficient_attention(sdpa_params):
+                compute_log_sumexp = False
+                if _input_requires_grad(query, key, value):
+                    compute_log_sumexp = True
+                return torch.ops.aten._efficient_attention_forward(
+                    query.transpose(1, 2),
+                    key.transpose(1, 2),
+                    value.transpose(1, 2),
+                    bias=None,
+                    cu_seqlens_q=None,
+                    cu_seqlens_k=None,
+                    max_seqlen_q=None,
+                    max_seqlen_k=None,
+                    dropout_p=dropout_p,
+                    custom_mask_type=int(attn_mask.variant),
+                    compute_log_sumexp=compute_log_sumexp,
+                    scale=scale,
+                    seqlen_k=None,
+                )[0].transpose(1, 2)
+            else:
+                _raise_kernel_warnings(sdpa_params)
+                # We can't use efficient attention; the only support for lower right is via materialization
+                return F.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attn_mask._materialize(query.device),
+                    dropout_p=dropout_p,
+                    is_causal=False,
+                    scale=scale,
+                    enable_gqa=enable_gqa,
+                )
+        else:
+            raise ValueError(
+                f"CausalBias.variant must be a CausalVariant type, but found: {attn_mask.variant}"
+            )
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        """Defines the behavior of torch.nn.functional.scaled_dot_product_attention when the attn_bias is an AttnBias"""
+        if kwargs is None:
+            kwargs = {}
+        if func is torch.nn.functional.scaled_dot_product_attention:
+            return cls._dispatch(*args, **kwargs)
+        return super().__torch_function__(func, types, args, kwargs)
+
+    def __repr__(self) -> str:  # type:ignore[override]
+        return self._materialize().__repr__()
+
+
+def causal_upper_left(*size) -> CausalBias:
+    """
+    Creates an upper-left triangular causal bias.
+
+    This function generates an upper-left triangular matrix to represent causal attention bias with a
+    diagonal offset set so that the inclusive values are aligned to the upper left corner of the matrix.
+    This is equivalent to the `is_causal=True` argument in `scaled_dot_product_attention`.
+
+    The equivalent pytorch code for constructing this bias is:
+
+    .. code-block:: python
+
+        torch.tril(torch.ones(size, dtype=torch.bool))
+
+    For instance, with `shape=(3,4)`, the materialized bias tensor will be:
+
+    .. code-block:: text
+
+        [[1, 0, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 0]]
+
+    Args:
+        size: The size of the bias matrix.
+
+    Returns:
+        CausalBias: The UPPER_LEFT triangular causal bias variant.
+    """
+    assert len(size) == 2, "causal_upper_left only supports 2D tensors"
+    seq_len_q, seq_len_kv = size
+    return CausalBias(CausalVariant.UPPER_LEFT, seq_len_q, seq_len_kv)
+
+
+def causal_lower_right(*size) -> CausalBias:
+    """
+    Creates a lower-right triangular causal bias.
+
+    This function generates a lower-right triangular matrix to represent causal attention bias with a
+    diagonal offset set so that the inclusive values are aligned to the lower right corner of the matrix.
+
+    The equivalent pytorch code for constructing this bias is:
+
+    .. code-block:: python
+
+        diagonal_offset = size[1] - size[0]
+        torch.tril(
+            torch.ones(size, dtype=torch.bool),
+            diagonal=diagonal_offset,
+        )
+
+    For instance, with `shape=(3,4)`, the materialized bias tensor will be:
+
+    .. code-block:: text
+
+        [[1, 1, 0, 0],
+         [1, 1, 1, 0],
+         [1, 1, 1, 1]]
+
+    Args:
+        size: The size of the bias matrix.
+
+    Returns:
+        CausalBias: The LOWER_RIGHT triangular causal bias variant.
+    """
+    assert len(size) == 2, "causal_lower_right only supports 2D tensors"
+    seq_len_q, seq_len_kv = size
+    return CausalBias(CausalVariant.LOWER_RIGHT, seq_len_q, seq_len_kv)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad922227ccff80de42fcefe74c52ea861124add4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py
@@ -0,0 +1,1676 @@
+# mypy: allow-untyped-defs
+# flake8: noqa: B950
+"""This module implements the user facing API for flex_attention in PyTorch."""
+
+import functools
+import inspect
+import itertools
+import math
+import operator
+import typing
+import warnings
+from collections.abc import Callable
+from enum import Enum
+from typing import Any, Literal, NamedTuple, TypeAlias
+
+import torch
+from torch import Tensor
+
+
+try:
+    from typing import TypedDict
+except ImportError:
+    from typing_extensions import TypedDict
+
+try:
+    from typing import NotRequired
+except ImportError:
+    from typing_extensions import NotRequired
+
+from torch._higher_order_ops.flex_attention import flex_attention as flex_attention_hop
+from torch._higher_order_ops.utils import _set_compilation_env
+from torch._prims_common import DeviceLikeType
+from torch.fx.experimental.proxy_tensor import (
+    _temp_remove_metadata_torch_function_mode,
+    _temp_remove_pre_dispatch_torch_function_mode,
+)
+from torch.nn.attention._utils import _validate_sdpa_input
+from torch.utils._pytree import GetAttrKey, tree_map_only
+
+
+# Private debug flag to disable internal compilation wrapping for debugging purposes.
+# WARNING: This is intended ONLY for debugging score_mod and mask_mod functions.
+# When enabled, this bypasses the required internal compilation that ensures correctness
+# and performance. Only use this temporarily when you need to set breakpoints
+# in your score_mod/mask_mod functions during development.
+#
+# This flag only affects the internal compilation when flex_attention is called directly.
+# If you have already wrapped flex_attention in torch.compile(), this flag has no effect
+# and the user's compilation will still occur.
+#
+# Usage:
+#   import torch.nn.attention.flex_attention as fa
+#   fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True
+#   # Now you can set breakpoints in your score_mod/mask_mod
+#   output = fa.flex_attention(q, k, v, score_mod=my_score_mod)
+#
+_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = False
+
+# Ids of the warnings that have already been emitted in this process.
+_WARNINGS_SHOWN: set[str] = set()
+
+
+def _warn_once(
+    warning_id: str, message: str, category: type[Warning] = UserWarning
+) -> None:
+    """Emit ``message`` as a warning the first time ``warning_id`` is seen.
+
+    Args:
+        warning_id: Key used to remember that this warning was already shown.
+        message: Text passed to ``warnings.warn``.
+        category: Warning class to emit; defaults to ``UserWarning``.
+    """
+    if warning_id in _WARNINGS_SHOWN:
+        return
+    # Record the id only after warn() returns, so a warning that is escalated
+    # to an exception is not marked as shown.
+    warnings.warn(message, category, stacklevel=2)
+    _WARNINGS_SHOWN.add(warning_id)
+
+
+# Names exported by ``from torch.nn.attention.flex_attention import *``.
+__all__ = [
+    "BlockMask",
+    "flex_attention",
+    "AuxOutput",
+    "AuxRequest",
+    "FlexKernelOptions",
+    "create_block_mask",
+    "create_mask",
+    "or_masks",
+    "and_masks",
+    "noop_mask",
+]
+
+# A score_mod takes five tensors -- (score, batch, head, token_q, token_kv),
+# see ``_identity`` -- and returns a tensor.
+_score_mod_signature = Callable[[Tensor, Tensor, Tensor, Tensor, Tensor], Tensor]
+# A mask_mod takes four tensors -- (batch, head, token_q, token_kv),
+# see ``noop_mask`` -- and returns a tensor.
+_mask_mod_signature = Callable[[Tensor, Tensor, Tensor, Tensor], Tensor]
+# Accepted values for the ``BACKEND`` entry of ``FlexKernelOptions``.
+_Backend: TypeAlias = Literal["AUTO", "TRITON", "FLASH", "TRITON_DECODE"]
+
+
+# pyrefly: ignore [invalid-inheritance]
+class FlexKernelOptions(TypedDict, total=False):
+    """Options for controlling the behavior of FlexAttention kernels.
+
+    These options are passed to the underlying Triton kernels to control performance
+    and numerical behavior. Most users will not need to specify these options as the
+    default autotuning provides good performance.
+
+    The options can be prefixed with ``fwd_`` or ``bwd_`` to apply only to forward or
+    backward pass respectively. For example: ``fwd_BLOCK_M`` and ``bwd_BLOCK_M1``.
+
+    Note:
+      We currently do not provide any backward compatibility guarantees for these options.
+      That being said, most of these have remained pretty stable since their introduction,
+      but we do not consider this part of the public API just yet. We think that some
+      documentation is better than secret hidden flags, but we may change these options
+      in the future.
+
+    Example Usage:
+        .. code-block:: python
+
+            # Using dictionary (backward compatible)
+            kernel_opts = {"BLOCK_M": 64, "BLOCK_N": 64, "PRESCALE_QK": True}
+            output = flex_attention(q, k, v, kernel_options=kernel_opts)
+
+            # Using TypedDict (recommended for type safety)
+            from torch.nn.attention.flex_attention import FlexKernelOptions
+
+            kernel_opts: FlexKernelOptions = {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "PRESCALE_QK": True,
+            }
+            output = flex_attention(q, k, v, kernel_options=kernel_opts)
+
+            # Forward/backward specific options
+            kernel_opts: FlexKernelOptions = {
+                "fwd_BLOCK_M": 64,
+                "bwd_BLOCK_M1": 32,
+                "PRESCALE_QK": False,
+            }
+            output = flex_attention(q, k, v, kernel_options=kernel_opts)
+    """
+
+    # Performance tuning options
+    # pyrefly: ignore [invalid-annotation]
+    num_warps: NotRequired[int]
+    """Number of warps to use in the CUDA kernel. Higher values may improve performance
+    but increase register pressure. Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    num_stages: NotRequired[int]
+    """Number of pipeline stages in the CUDA kernel. Higher values may improve performance
+    but increase shared memory usage. Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_M: NotRequired[int]
+    """Thread block size for the sequence length dimension of Q in forward pass.
+    Must be a power of 2. Common values: 16, 32, 64, 128. Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_N: NotRequired[int]
+    """Thread block size for the sequence length dimension of K/V in forward pass.
+    Must be a power of 2. Common values: 16, 32, 64, 128. Default is determined by autotuning."""
+
+    # Backward-specific block sizes (when prefixed with 'bwd_')
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_M1: NotRequired[int]
+    """Thread block size for Q dimension in backward pass. Use as 'bwd_BLOCK_M1'.
+    Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_N1: NotRequired[int]
+    """Thread block size for K/V dimension in backward pass. Use as 'bwd_BLOCK_N1'.
+    Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_M2: NotRequired[int]
+    """Thread block size for second Q dimension in backward pass. Use as 'bwd_BLOCK_M2'.
+    Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCK_N2: NotRequired[int]
+    """Thread block size for second K/V dimension in backward pass. Use as 'bwd_BLOCK_N2'.
+    Default is determined by autotuning."""
+
+    # pyrefly: ignore [invalid-annotation]
+    PRESCALE_QK: NotRequired[bool]
+    """Whether to pre-scale QK by 1/sqrt(d) and change of base. This is slightly faster but
+    may have more numerical error. Default: False."""
+
+    # pyrefly: ignore [invalid-annotation]
+    ROWS_GUARANTEED_SAFE: NotRequired[bool]
+    """If True, guarantees that at least one value in each row is not masked out.
+    Allows skipping safety checks for better performance. Only set this if you are certain
+    your mask guarantees this property. For example, causal attention is guaranteed safe
+    because each query has at least 1 key-value to attend to. Default: False."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BLOCKS_ARE_CONTIGUOUS: NotRequired[bool]
+    """If True, guarantees that all blocks in the mask are contiguous.
+    Allows optimizing block traversal. For example, causal masks would satisfy this,
+    but prefix_lm + sliding window would not. Default: False."""
+
+    # pyrefly: ignore [invalid-annotation]
+    WRITE_DQ: NotRequired[bool]
+    """Controls whether gradient scatters are done in the DQ iteration loop of the backward pass.
+    Setting this to False will force this to happen in the DK loop which depending on your
+    specific score_mod and mask_mod might be faster. Default: True."""
+
+    # pyrefly: ignore [invalid-annotation]
+    FORCE_USE_FLEX_ATTENTION: NotRequired[bool]
+    """If True, forces the use of the flex attention kernel instead of potentially using
+    the more optimized flex-decoding kernel for short sequences. This can be a helpful
+    option for debugging. Default: False."""
+
+    # pyrefly: ignore [invalid-annotation]
+    USE_TMA: NotRequired[bool]
+    """Whether to use Tensor Memory Accelerator (TMA) on supported hardware.
+    This is experimental and may not work on all hardware, currently specific
+    to NVIDIA GPUs Hopper+. Default: False."""
+
+    # ROCm-specific options
+    # pyrefly: ignore [invalid-annotation]
+    kpack: NotRequired[int]
+    """ROCm-specific kernel packing parameter."""
+
+    # pyrefly: ignore [invalid-annotation]
+    matrix_instr_nonkdim: NotRequired[int]
+    """ROCm-specific matrix instruction non-K dimension."""
+
+    # pyrefly: ignore [invalid-annotation]
+    waves_per_eu: NotRequired[int]
+    """ROCm-specific waves per execution unit."""
+
+    # pyrefly: ignore [invalid-annotation]
+    BACKEND: NotRequired[_Backend]
+    """Selects a specific kernel backend.
+
+    Options:
+        - "AUTO": Use current heuristics (typically Triton-based kernels with
+          automatic selection between flex_attention and flex_decoding)
+        - "TRITON": Standard Triton flex_attention kernel
+        - "TRITON_DECODE": Triton flex_decoding kernel, only available for short sequence lengths with specific configurations
+        - "FLASH": Experimental: Flash Attention kernel (cute-dsl), user needs to have flash installed
+
+    This option cannot be combined with legacy knobs such as ``FORCE_USE_FLEX_ATTENTION``.
+    Raises an error if the requested backend cannot be used. Default: "AUTO"
+    """
+
+
+class AuxRequest(NamedTuple):
+    """Request which auxiliary outputs to compute from flex_attention.
+
+    Each field is a boolean indicating whether that auxiliary output should be computed.
+    Every field defaults to ``False``, so ``AuxRequest()`` requests nothing.
+    """
+
+    # NOTE(review): presumably the log-sum-exp of the attention scores -- confirm
+    # against the flex_attention implementation.
+    lse: bool = False
+    # NOTE(review): presumably the per-row maximum attention score -- confirm
+    # against the flex_attention implementation.
+    max_scores: bool = False
+
+
+class AuxOutput(NamedTuple):
+    """Auxiliary outputs from flex_attention operation.
+
+    Fields will be None if not requested, or contain the tensor if requested.
+    The field names mirror those of ``AuxRequest``.
+    """
+
+    # Tensor for the ``lse`` request, or None (the default).
+    lse: Tensor | None = None
+    # Tensor for the ``max_scores`` request, or None (the default).
+    max_scores: Tensor | None = None
+
+
+class _ModificationType(Enum):
+    """Enum for the type of modification function.
+
+    - SCORE_MOD: score_mod function which accepts a score as the first argument
+    - MASK_MOD: mask function which does not accept a score and is only used for
+      generating block mask
+    - UNKNOWN: the function matched neither of the two shapes above
+    """
+
+    SCORE_MOD = 1
+    MASK_MOD = 2
+    UNKNOWN = 3
+
+
+def _get_mod_type(fn: Callable) -> _ModificationType:
+    """Get the type of modification function.
+
+    This function inspects the number of required positional arguments of ``fn``
+    (arguments without a default value) to determine the type of modification
+    function. If the function has 5 of them, it is considered a score_mod
+    function. If it has 4, it is considered a mask function.
+
+    Args:
+        fn: The callable to classify. Plain functions, lambdas and bound
+            methods are inspected through ``__code__``; any other callable
+            falls back to ``inspect.signature``.
+
+    Returns:
+        ``_ModificationType.SCORE_MOD`` or ``_ModificationType.MASK_MOD``.
+
+    Raises:
+        AssertionError: If ``fn`` has neither 4 nor 5 required arguments.
+    """
+    if hasattr(fn, "__code__"):
+        code = fn.__code__
+        num_positional_total = code.co_argcount
+        # A bound method forwards ``__code__`` from its underlying function, so
+        # ``co_argcount`` still counts the already-bound ``self``/``cls``.
+        # Without this correction a 4-argument mask_mod method would be
+        # classified as a score_mod.
+        if inspect.ismethod(fn) and num_positional_total > 0:
+            num_positional_total -= 1
+        defaults = ()
+        if hasattr(fn, "__defaults__"):
+            defaults = fn.__defaults__ or ()
+        num_defaults = len(defaults)
+        num_positional_args = num_positional_total - num_defaults
+    else:
+        num_positional_args = sum(
+            1
+            for param in inspect.signature(fn).parameters.values()
+            if param.default is inspect.Parameter.empty
+        )
+    assert num_positional_args == 5 or num_positional_args == 4, (
+        "Expected a score_mod with 5 required arguments or a mask_mod with 4 "
+        "required arguments"
+    )
+    if num_positional_args == 5:
+        return _ModificationType.SCORE_MOD
+    elif num_positional_args == 4:
+        return _ModificationType.MASK_MOD
+    else:
+        # Only reachable when assertions are disabled (``python -O``).
+        return _ModificationType.UNKNOWN
+
+
+# Need to define it here so that Dynamo doesn't skip it
+def _vmap_for_bhqkv(
+    fn: Callable,
+    prefix: tuple[int | None, ...],
+    suffix: tuple[int | None, ...] = (),
+    out_dims: int | list[int | None] = 0,
+    group_dim: bool = False,
+):
+    """Used to vmap both score_mods and mask_mods over 4-dimensional/5-dimension inputs.
+    Mapping over the [b, hq, q_idx, kv_idx] or [b, hkv, g, q_idx, kv_idx] dimensions.
+
+    Args:
+        fn (callable): The function to vmap.
+        prefix (tuple): The prefix of the vmap. For score mod functions,
+                        this should be set to (0,). For mask_mods = ()
+        suffix (tuple): We need to add (0,) if gradOut is being mapped over,
+                        and (None,) * len(other_buffers).
+        out_dims (int or list): For forward cases, keep this as the default 0 since
+                          we are only returning 1 output. For backwards, the joint
+                          graph returns grads for B, H, Q_idx, KV_idx and other_buffers,
+                          so we set this to (0, None, None, None, None) + (None,) * len(other_buffers).
+        group_dim (bool): If True, the head argument is vmapped one additional
+                          time, giving five nested vmaps instead of four.
+
+    Returns:
+        callable: The vmapped function.
+    """
+    over_head = (None, 0, None, None)
+    # Ordered from the innermost vmap (kv_idx) to the outermost one (b); each
+    # tuple broadcasts exactly one of the [b, h, q_idx, kv_idx] arguments.
+    dimensions: list[tuple[None | int, None | int, None | int, None | int]] = [
+        (None, None, None, 0),
+        (None, None, 0, None),
+        over_head,
+    ]
+    if group_dim:
+        dimensions.append(over_head)
+    dimensions.append((0, None, None, None))
+
+    vmapped = fn
+    for dims in dimensions:
+        vmapped = torch.vmap(vmapped, in_dims=prefix + dims + suffix, out_dims=out_dims)  # type: ignore[arg-type]
+    return vmapped
+
+
+def _identity(
+    score: Tensor,
+    batch: Tensor,
+    head: Tensor,
+    token_q: Tensor,
+    token_kv: Tensor,
+) -> Tensor:
+    """score_mod that leaves the score untouched: returns ``score`` and ignores the indices."""
+    return score
+
+
+def noop_mask(
+    batch: Tensor,
+    head: Tensor,
+    token_q: Tensor,
+    token_kv: Tensor,
+) -> Tensor:
+    """Returns a noop mask_mod.
+
+    The result is a zero-dimensional boolean tensor holding ``True``, created on
+    the same device as ``batch``; the other arguments are ignored.
+    """
+    return batch.new_ones(size=(), dtype=torch.bool, device=batch.device)
+
+
+def _sliced_mask_mod_error(
+    batch: Tensor,
+    head: Tensor,
+    token_q: Tensor,
+    token_kv: Tensor,
+) -> Tensor:
+    """
+    Placeholder mask_mod installed on sliced BlockMasks; it always raises.
+
+    Slicing a BlockMask resets its mask_mod to this function, so calling it
+    means the caller forgot to reassign mask_mod from the original (unsliced)
+    BlockMask. The arguments are ignored.
+
+    Raises:
+        RuntimeError: Always, with a message showing the correct usage.
+    """
+    message = (
+        "Cannot use mask_mod from a sliced BlockMask. "
+        "When you slice a BlockMask using [], the mask_mod attribute is reset. "
+        "You must set it from the original BlockMask's mask_mod."
+        "\n\nIncorrect usage:"
+        "\n  base_mask = create_block_mask(my_mask_fn, ...)"
+        "\n  sliced_mask = base_mask[:, :, block_idx]"
+        "\n  sliced_mask.mask_mod = apply_offset(sliced_mask.mask_mod, offset)  # WRONG!"
+        "\n\nCorrect usage:"
+        "\n  base_mask = create_block_mask(my_mask_fn, ...)"
+        "\n  sliced_mask = base_mask[:, :, block_idx]"
+        "\n  sliced_mask.mask_mod = apply_offset(base_mask.mask_mod, offset)  # Use base_mask!"
+    )
+    raise RuntimeError(message)
+
+
+# Default block edge length; used as the default ``BLOCK_SIZE`` of
+# ``BlockMask.from_kv_blocks``.
+_DEFAULT_SPARSE_BLOCK_SIZE = 128
+# 2**30. NOTE(review): presumably a block size large enough to span any
+# sequence -- confirm at the use sites.
+_LARGE_SPARSE_BLOCK_SIZE = 1 << 30
+
+
+def _ordered_to_dense(num_blocks_in_row: Tensor, col_indices: Tensor):
+    """Expand the ordered (count, indices) representation into a dense 0/1 mask.
+
+    Args:
+        num_blocks_in_row: Per-row number of valid entries in ``col_indices``;
+            its leading dimensions are treated as batch dimensions.
+        col_indices: Column positions for each row; only the first
+            ``num_blocks_in_row`` entries of a row are used.
+
+    Returns:
+        An int32 tensor with the same shape as ``col_indices`` holding 1 at
+        every (row, column) named by a valid entry and 0 elsewhere.
+    """
+    num_rows, num_cols = col_indices.shape[-2], col_indices.shape[-1]
+    device = num_blocks_in_row.device
+    num_batch_dims = len(num_blocks_in_row.shape[:-1])
+
+    def create_dense_one(kv_num_blocks, kv_indices):
+        # One extra trailing column serves as a dump slot for invalid entries.
+        dense_mask = kv_indices.new_zeros(num_rows, num_cols + 1, dtype=torch.int32)
+
+        rows = torch.arange(num_rows, dtype=torch.int, device=device).unsqueeze(-1)
+        cols = torch.arange(num_cols, dtype=torch.int, device=device)
+        is_valid = cols < kv_num_blocks.unsqueeze(-1)
+
+        # Entries beyond the per-row count are redirected to the dump column.
+        targets = torch.where(is_valid, kv_indices, num_cols)
+
+        dense_mask[rows, targets] = dense_mask.new_ones(())
+        # Drop the dump column before returning.
+        return dense_mask[:, :num_cols].contiguous()
+
+    # Wrap the single-matrix routine in one vmap per batch dimension.
+    dense_fn = create_dense_one
+    for _ in range(num_batch_dims):
+        dense_fn = torch.vmap(dense_fn, in_dims=(0, 0))
+
+    return dense_fn(num_blocks_in_row, col_indices)
+
+
+def _dense_to_ordered(dense_mask) -> tuple[Tensor, Tensor]:
+    """Convert a dense 0/1 mask into the ordered (count, indices) representation.
+
+    Args:
+        dense_mask: Tensor whose last dimension is scanned per row.
+
+    Returns:
+        A pair of contiguous int32 tensors: the sum of each row, and the column
+        positions of each row stably sorted by descending mask value (so the
+        columns holding 1 come first, in their original order).
+    """
+    as_int = dense_mask.to(dtype=torch.int32)
+    row_counts = as_int.sum(dim=-1)
+    ordered_cols = torch.argsort(as_int, dim=-1, descending=True, stable=True)
+    row_counts = row_counts.to(torch.int32, memory_format=torch.contiguous_format)
+    ordered_cols = ordered_cols.to(torch.int32, memory_format=torch.contiguous_format)
+    return (row_counts, ordered_cols)
+
+
+def _transpose_ordered(num_blocks_in_row: Tensor, col_indices: Tensor):
+    """Transpose an ordered (count, indices) mask by round-tripping through its dense form.
+
+    The mask is densified, its last two dimensions are swapped, and the result
+    is converted back to the ordered representation.
+    """
+    transposed = _ordered_to_dense(num_blocks_in_row, col_indices).transpose(-2, -1)
+    return _dense_to_ordered(transposed)
+
+
+def _adjust_num_blocks_and_indices(
+    num_blocks: Tensor,
+    indices: Tensor,
+    new_num_rows: int,
+    new_num_cols: int,
+):
+    """Crop a 4-D ordered mask to ``new_num_rows`` x ``new_num_cols`` blocks.
+
+    Args:
+        num_blocks: Per-row block counts, indexed as ``[b, h, row]``.
+        indices: Block indices, indexed as ``[b, h, row, col]``.
+        new_num_rows: Number of rows to keep.
+        new_num_cols: Number of columns to keep; also the cap applied to the counts.
+
+    Returns:
+        ``(num_blocks, indices)`` where ``indices`` is the cropped view and
+        ``num_blocks`` is an int32 recount: for each row, the number of kept
+        index entries that are smaller than that row's capped count.
+    """
+    kept_indices = indices[:, :, :new_num_rows, :new_num_cols]
+    kept_counts = num_blocks[:, :, :new_num_rows]
+    # Cap the counts at the new column budget.
+    capped_counts = torch.where(kept_counts < new_num_cols, kept_counts, new_num_cols)
+    below_count = kept_indices < capped_counts[:, :, :, None]
+    recounted = torch.sum(below_count, dim=-1).to(torch.int32)
+    return recounted, kept_indices
+
+
+class BlockMask:
+    r"""
+    BlockMask is our format for representing a block-sparse attention mask.
+    It is somewhat of a cross in-between BCSR and a non-sparse format.
+
+    **Basics**
+
+    A block-sparse mask means that instead of representing the sparsity of
+    individual elements in the mask, a KV_BLOCK_SIZE x Q_BLOCK_SIZE block is
+    considered sparse only if every element within that block is sparse.
+    This aligns well with hardware, which generally expects to perform
+    contiguous loads and computation.
+
+    This format is primarily optimized for 1. simplicity, and 2. kernel
+    efficiency. Notably, it is *not* optimized for size, as this mask is always
+    reduced by a factor of KV_BLOCK_SIZE * Q_BLOCK_SIZE. If the size is a
+    concern, the tensors can be reduced in size by increasing the block size.
+
+    The essentials of our format are:
+
+    num_blocks_in_row: Tensor[ROWS]:
+    Describes the number of blocks present in each row.
+
+    col_indices: Tensor[ROWS, MAX_BLOCKS_IN_COL]:
+    `col_indices[i]` is the sequence of block positions for row i. The values of
+    this row after `col_indices[i][num_blocks_in_row[i]]` are undefined.
+
+    For example, to reconstruct the original tensor from this format:
+
+    .. code-block:: python
+
+        dense_mask = torch.zeros(ROWS, COLS)
+        for row in range(ROWS):
+            for block_idx in range(num_blocks_in_row[row]):
+                dense_mask[row, col_indices[row, block_idx]] = 1
+
+    Notably, this format makes it easier to implement a reduction along the
+    *rows* of the mask.
+
+    **Details**
+
+    The basics of our format require only kv_num_blocks and kv_indices. But, we
+    have up to 8 tensors on this object. This represents 4 pairs:
+
+    1. (kv_num_blocks, kv_indices): Used for the forwards pass of attention, as
+    we reduce along the KV dimension.
+
+    2. [OPTIONAL] (full_kv_num_blocks, full_kv_indices): This is optional and
+    purely an optimization. As it turns out, applying masking to every block
+    is quite expensive! If we specifically know which blocks are "full" and
+    don't require masking at all, then we can skip applying mask_mod to these
+    blocks. This requires the user to split out a separate mask_mod from the
+    score_mod. For causal masks, this is about a 15% speedup.
+
+    3. [GENERATED] (q_num_blocks, q_indices): Required for the backwards pass,
+    as computing dKV requires iterating along the mask along the Q dimension. These are autogenerated from 1.
+
+    4. [GENERATED] (full_q_num_blocks, full_q_indices): Same as above, but for
+    the backwards pass. These are autogenerated from 2.
+    """
+
+    seq_lengths: tuple[int, int]
+    kv_num_blocks: Tensor
+    kv_indices: Tensor
+    full_kv_num_blocks: Tensor | None
+    full_kv_indices: Tensor | None
+    q_num_blocks: Tensor | None
+    q_indices: Tensor | None
+    full_q_num_blocks: Tensor | None
+    full_q_indices: Tensor | None
+    BLOCK_SIZE: tuple[int, int]
+    mask_mod: _mask_mod_signature
+
+    # Attribute lists for pytree flatten/unflatten
+    _TENSOR_ATTRS = [
+        "kv_num_blocks",
+        "kv_indices",
+        "full_kv_num_blocks",
+        "full_kv_indices",
+        "q_num_blocks",
+        "q_indices",
+        "full_q_num_blocks",
+        "full_q_indices",
+    ]
+
+    _CONTEXT_ATTRS = [
+        "seq_lengths",
+        "BLOCK_SIZE",
+        "mask_mod",
+    ]
+
+    def __init__(
+        self,
+        seq_lengths: tuple[int, int],
+        kv_num_blocks: Tensor,
+        kv_indices: Tensor,
+        full_kv_num_blocks: Tensor | None,
+        full_kv_indices: Tensor | None,
+        q_num_blocks: Tensor | None,
+        q_indices: Tensor | None,
+        full_q_num_blocks: Tensor | None,
+        full_q_indices: Tensor | None,
+        BLOCK_SIZE: tuple[int, int],
+        mask_mod: _mask_mod_signature,
+    ) -> None:
+        """Validate the block tensors and store every argument on the instance.
+
+        Args:
+            seq_lengths: Pair of sequence lengths; ``shape`` appends it to the
+                batch dimensions of ``kv_indices``.
+            kv_num_blocks: Required block counts.
+            kv_indices: Required block indices; must have at least 2 dimensions.
+            full_kv_num_blocks: Optional; must be given together with ``full_kv_indices``.
+            full_kv_indices: Optional; must be given together with ``full_kv_num_blocks``.
+            q_num_blocks: Optional block counts along the Q dimension.
+            q_indices: Optional block indices along the Q dimension.
+            full_q_num_blocks: Optional; must be given together with ``full_q_indices``.
+            full_q_indices: Optional; must be given together with ``full_q_num_blocks``.
+            BLOCK_SIZE: Pair of block sizes.
+            mask_mod: The mask_mod function associated with this mask.
+
+        Raises:
+            AssertionError: If a required tensor is None or only one tensor of a
+                ``full_*`` pair is provided.
+            RuntimeError: If ``kv_indices`` has fewer than 2 dimensions.
+        """
+        # The None checks must run before ``kv_indices.dim()``; otherwise a
+        # missing tensor fails with an AttributeError and these messages can
+        # never be shown.
+        assert kv_num_blocks is not None, "kv_num_blocks must be provided"
+        assert kv_indices is not None, "kv_indices must be provided"
+        if kv_indices.dim() < 2:
+            raise RuntimeError("BlockMask must have at least 2 dimensions")
+        assert (full_kv_num_blocks is None) == (full_kv_indices is None), (
+            "full_kv_num_blocks and full_kv_indices must be both provided or omitted"
+        )
+        assert (full_q_num_blocks is None) == (full_q_indices is None), (
+            "full_q_num_blocks and full_q_indices must be both provided or omitted"
+        )
+
+        self.seq_lengths = seq_lengths
+        self.kv_num_blocks = kv_num_blocks
+        self.kv_indices = kv_indices
+        self.full_kv_num_blocks = full_kv_num_blocks
+        self.full_kv_indices = full_kv_indices
+        self.q_num_blocks = q_num_blocks
+        self.q_indices = q_indices
+        self.full_q_num_blocks = full_q_num_blocks
+        self.full_q_indices = full_q_indices
+        self.BLOCK_SIZE = BLOCK_SIZE
+        self.mask_mod = mask_mod
+
+    @classmethod
+    def from_kv_blocks(
+        cls,
+        kv_num_blocks: Tensor,
+        kv_indices: Tensor,
+        full_kv_num_blocks: Tensor | None = None,
+        full_kv_indices: Tensor | None = None,
+        BLOCK_SIZE: int | tuple[int, int] = _DEFAULT_SPARSE_BLOCK_SIZE,
+        mask_mod: _mask_mod_signature | None = None,
+        seq_lengths: tuple[int, int] | None = None,
+        compute_q_blocks: bool = True,
+    ):
+        """
+        Creates a BlockMask instance from key-value block information.
+
+        Args:
+            kv_num_blocks (Tensor): Number of kv_blocks in each Q_BLOCK_SIZE row tile.
+            kv_indices (Tensor): Indices of key-value blocks in each Q_BLOCK_SIZE row tile.
+            full_kv_num_blocks (Optional[Tensor]): Number of full kv_blocks in each Q_BLOCK_SIZE row tile.
+            full_kv_indices (Optional[Tensor]): Indices of full key-value blocks in each Q_BLOCK_SIZE row tile.
+            BLOCK_SIZE (Union[int, tuple[int, int]]): Block sizes. An int is used for both
+                entries. When ``seq_lengths`` is derived, entry 0 scales the second-to-last
+                dimension of ``kv_indices`` and entry 1 scales its last dimension.
+            mask_mod (Optional[Callable]): Function to modify the mask; ``noop_mask`` when omitted.
+            seq_lengths (Optional[tuple[int, int]]): Sequence lengths to store. When omitted they
+                are derived from the last two dimensions of ``kv_indices`` and ``BLOCK_SIZE``.
+            compute_q_blocks (bool): If True, the ``q_*`` and ``full_q_*`` tensors are generated
+                via ``_transpose_ordered``; if False they are all left as None.
+
+        Returns:
+            BlockMask: The new instance.
+
+        Raises:
+            RuntimeError: If kv_indices has < 2 dimensions.
+            AssertionError: If only one of full_kv_* args is provided.
+        """
+        if kv_indices.dim() < 2:
+            raise RuntimeError("BlockMask must have at least 2 dimensions")
+
+        has_full_blocks = full_kv_num_blocks is not None
+        assert has_full_blocks == (full_kv_indices is not None), (
+            "full_kv_num_blocks and full_kv_indices must be both provided or omitted"
+        )
+
+        # The Q-side tensors stay None unless their generation is requested.
+        q_num_blocks = q_indices = None
+        full_q_num_blocks = full_q_indices = None
+        if compute_q_blocks:
+            q_num_blocks, q_indices = _transpose_ordered(kv_num_blocks, kv_indices)
+            if full_kv_num_blocks is not None:
+                assert full_kv_indices is not None
+                full_q_num_blocks, full_q_indices = _transpose_ordered(
+                    full_kv_num_blocks, full_kv_indices
+                )
+
+        if isinstance(BLOCK_SIZE, int):
+            BLOCK_SIZE = (BLOCK_SIZE, BLOCK_SIZE)
+
+        if mask_mod is None:
+            mask_mod = noop_mask
+        if seq_lengths is None:
+            seq_lengths = (
+                kv_indices.shape[-2] * BLOCK_SIZE[0],
+                kv_indices.shape[-1] * BLOCK_SIZE[1],
+            )
+
+        return cls(
+            seq_lengths=seq_lengths,
+            kv_num_blocks=kv_num_blocks,
+            kv_indices=kv_indices,
+            full_kv_num_blocks=full_kv_num_blocks,
+            full_kv_indices=full_kv_indices,
+            q_num_blocks=q_num_blocks,
+            q_indices=q_indices,
+            full_q_num_blocks=full_q_num_blocks,
+            full_q_indices=full_q_indices,
+            BLOCK_SIZE=BLOCK_SIZE,
+            mask_mod=mask_mod,
+        )
+
+    def as_tuple(self, flatten: bool = True):
+        """
+        Returns a tuple of the attributes of the BlockMask.
+
+        The order is: sequence lengths, the eight block tensors, block sizes,
+        and finally ``mask_mod``.
+
+        Args:
+            flatten (bool): If True, ``seq_lengths`` and ``BLOCK_SIZE`` each contribute their
+                two entries as separate items; if False, each contributes one tuple item.
+        """
+        if flatten:
+            seq_part = (self.seq_lengths[0], self.seq_lengths[1])  # type: ignore[assignment]
+            block_part = (self.BLOCK_SIZE[0], self.BLOCK_SIZE[1])  # type: ignore[assignment]
+        else:
+            seq_part = (self.seq_lengths,)  # type: ignore[assignment]
+            block_part = (self.BLOCK_SIZE,)  # type: ignore[assignment]
+
+        block_tensors = (
+            self.kv_num_blocks,
+            self.kv_indices,
+            self.full_kv_num_blocks,
+            self.full_kv_indices,
+            self.q_num_blocks,
+            self.q_indices,
+            self.full_q_num_blocks,
+            self.full_q_indices,
+        )
+        # pyrefly: ignore [not-iterable]
+        return (*seq_part, *block_tensors, *block_part, self.mask_mod)
+
+    @property
+    def shape(self):
+        """Batch dimensions of ``kv_indices`` followed by ``seq_lengths``."""
+        # The last two dimensions of kv_indices are replaced by the sequence lengths.
+        *leading_dims, _rows, _cols = self.kv_indices.shape
+        return tuple(leading_dims) + self.seq_lengths
+
+    def __str__(self) -> str:
+        """Return a header with shape and sparsity followed by the ``to_string`` rendering."""
+        header = f"BlockMask(shape={self.shape}, sparsity={self.sparsity():.2f}%, \n"
+        rendering = self.to_string().strip()
+        return header + rendering + "\n)"
+
+    def __getitem__(self, index) -> "BlockMask":
+        """
+        Returns a new BlockMask instance by getting the mask for the given index position.
+
+        Up to three indices are applied, to the first three dimensions of the kv
+        block tensors. Integer indices are converted to length-1 slices, so the
+        indexed dimension is kept with size 1. The returned mask's ``mask_mod``
+        is reset to a placeholder that raises; reassign it from the original
+        BlockMask before use.
+
+        Args:
+            index: Index to apply to all attributes.
+
+        Example Usage:
+            .. code-block:: python
+
+                def causal_mask(b, h, q_idx, kv_idx):
+                    return q_idx >= kv_idx
+
+
+                block_mask = create_block_mask(
+                    causal_mask, 4, 2, 512, 512, device="cuda"
+                )
+                assert block_mask.kv_num_blocks.shape == (4, 2, 4)
+                assert block_mask.kv_indices.shape == (4, 2, 4, 4)
+
+                # Index on batch dimension (the dimension is kept with size 1)
+                new_block_mask = block_mask[0]
+                assert new_block_mask.kv_num_blocks.shape == (1, 2, 4)
+                assert new_block_mask.kv_indices.shape == (1, 2, 4, 4)
+
+                # Index on batch and head dimension
+                new_block_mask = block_mask[0, 1]
+                assert new_block_mask.kv_num_blocks.shape == (1, 1, 4)
+                assert new_block_mask.kv_indices.shape == (1, 1, 4, 4)
+
+                # slicing on batch and head dimension
+                new_block_mask = block_mask[0:2, 1:2]
+                assert new_block_mask.kv_num_blocks.shape == (2, 1, 4)
+                assert new_block_mask.kv_indices.shape == (2, 1, 4, 4)
+
+                # slicing on batch, head, and query dimension
+                new_block_mask = block_mask[
+                    0:2, 1:2, torch.tensor([1], dtype=torch.int32)
+                ]
+                assert new_block_mask.kv_num_blocks.shape == (2, 1, 1)
+                assert new_block_mask.kv_indices.shape == (2, 1, 1, 4)
+        """
+
+        def keep_dim(i, n):
+            # Integers become length-1 slices so that indexing never drops a
+            # dimension; negative integers within range are wrapped first.
+            if not isinstance(i, int):
+                return i
+            if -n <= i < 0:
+                return slice(i + n, i + n + 1)
+            return slice(i, i + 1)
+
+        if not isinstance(index, tuple):
+            index = (index,)
+        # Pad with full slices, then keep exactly three index entries.
+        padded = (*index, slice(None), slice(None), slice(None))[:3]
+        sizes = self.kv_num_blocks.shape[:3]
+        index = tuple(keep_dim(i, n) for i, n in zip(padded, sizes, strict=True))
+
+        new_kv_num_blocks = self.kv_num_blocks[index]
+        new_kv_indices = self.kv_indices[index]
+        new_full_kv_num_blocks = None
+        new_full_kv_indices = None
+        if self.full_kv_num_blocks is not None:
+            assert self.full_kv_indices is not None
+            new_full_kv_num_blocks = self.full_kv_num_blocks[index]
+            new_full_kv_indices = self.full_kv_indices[index]
+        return BlockMask.from_kv_blocks(
+            new_kv_num_blocks,
+            new_kv_indices,
+            new_full_kv_num_blocks,
+            new_full_kv_indices,
+            BLOCK_SIZE=self.BLOCK_SIZE,
+            mask_mod=_sliced_mask_mod_error,
+            seq_lengths=self.seq_lengths,
+            compute_q_blocks=self.q_indices is not None,
+        )
+
+    def __repr__(self) -> str:
+        """Return a multi-line summary: tensor shapes, BLOCK_SIZE, shape, sparsity and mask_mod.
+
+        Tensors are summarized by their shape only (``None`` for absent optional
+        tensors); ``mask_mod`` is shown by ``__name__`` when it has one.
+        """
+
+        # Optional tensors may be None, in which case there is no shape to print.
+        def shape_or_none(x: torch.Tensor | None):
+            return x.shape if x is not None else None
+
+        return (
+            f"BlockMask(\n"
+            f"    kv_num_blocks={self.kv_num_blocks.shape},\n"
+            f"    kv_indices={self.kv_indices.shape},\n"
+            f"    full_kv_num_blocks={shape_or_none(self.full_kv_num_blocks)},\n"
+            f"    full_kv_indices={shape_or_none(self.full_kv_indices)},\n"
+            f"    q_num_blocks={shape_or_none(self.q_num_blocks)},\n"
+            f"    q_indices={shape_or_none(self.q_indices)},\n"
+            f"    full_q_num_blocks={shape_or_none(self.full_q_num_blocks)},\n"
+            f"    full_q_indices={shape_or_none(self.full_q_indices)},\n"
+            f"    BLOCK_SIZE={self.BLOCK_SIZE},\n"
+            f"    shape={self.shape},\n"
+            f"    sparsity={self.sparsity():.2f}%,\n"
+            f"    mask_mod={self.mask_mod.__name__ if hasattr(self.mask_mod, '__name__') else self.mask_mod}\n"
+            f")"
+        )
+
+    def _adjust(self, new_q_len: int, new_kv_len: int):
+        """Return a new BlockMask cropped to cover ``new_q_len`` x ``new_kv_len``.
+
+        The lengths are converted to block counts by ceiling division with
+        ``BLOCK_SIZE``; the kv tensors (and the full kv tensors, when present)
+        are cropped accordingly and a new mask is built with the same
+        ``BLOCK_SIZE`` and ``mask_mod``.
+        """
+        q_block, kv_block = self.BLOCK_SIZE[0], self.BLOCK_SIZE[1]
+        # Ceiling division: a partially covered block still counts as a block.
+        new_num_rows = (new_q_len + q_block - 1) // q_block
+        new_num_cols = (new_kv_len + kv_block - 1) // kv_block
+
+        new_kv_num_blocks, new_kv_indices = _adjust_num_blocks_and_indices(
+            self.kv_num_blocks, self.kv_indices, new_num_rows, new_num_cols
+        )
+
+        new_full_kv_num_blocks = None
+        new_full_kv_indices = None
+        if self.full_kv_num_blocks is not None:
+            assert self.full_kv_indices is not None
+            new_full_kv_num_blocks, new_full_kv_indices = _adjust_num_blocks_and_indices(
+                self.full_kv_num_blocks,
+                self.full_kv_indices,
+                new_num_rows,
+                new_num_cols,
+            )
+
+        return self.from_kv_blocks(
+            new_kv_num_blocks,
+            new_kv_indices,
+            new_full_kv_num_blocks,
+            new_full_kv_indices,
+            self.BLOCK_SIZE,
+            self.mask_mod,
+        )
+
+    def numel(self):
+        """Returns the number of elements (not accounting for sparsity) in the mask."""
+        shape = self.shape
+
+        def _prod(xs):
+            return functools.reduce(operator.mul, xs, 1)
+
+        return _prod(shape)
+
+    def sparsity(self) -> float:
+        """Computes the percentage of blocks that are sparse (i.e. not computed)"""
+        total_size = self.numel()
+        computed_blocks = self.kv_num_blocks.sum()
+        if self.full_kv_num_blocks is not None:
+            computed_blocks += self.full_kv_num_blocks.sum()
+
+        computed_size = computed_blocks.item() * self.BLOCK_SIZE[0] * self.BLOCK_SIZE[1]
+        dense_ratio = computed_size / total_size
+        return 100 * (1 - dense_ratio)
+
+    def to_dense(self) -> Tensor:
+        """Returns a dense block that is equivalent to the block mask."""
+        partial_dense = _ordered_to_dense(self.kv_num_blocks, self.kv_indices)
+        if self.full_kv_num_blocks is not None:
+            assert self.full_kv_indices is not None
+            # pyrefly: ignore [bad-return]
+            return partial_dense | _ordered_to_dense(
+                self.full_kv_num_blocks, self.full_kv_indices
+            )
+        return partial_dense
+
+    def to_string(self, grid_size=(20, 20), limit=4):
+        """Returns a string representation of the block mask. Quite nifty.
+
+        If grid_size is -1, prints out an uncompressed version. Warning, it can be quite big!
+        """
+        dense_mask = self.to_dense()
+        *batch_dims, num_rows, num_cols = dense_mask.shape
+        if isinstance(grid_size, int):
+            max_rows = grid_size
+            max_cols = grid_size
+        elif grid_size == -1:
+            max_rows = num_rows
+            max_cols = num_cols
+        else:
+            max_rows, max_cols = grid_size
+
+        def create_block_vis(*batch_idx):
+            descriptors = []
+
+            descriptors.append(f"{batch_idx}")
+
+            vis = ", ".join(reversed(descriptors)) + "\n"
+
+            def summarize_section(section) -> str:
+                percentage = section.float().mean().item()
+                if percentage == 1:
+                    return "█"
+                elif percentage == 0:
+                    return " "
+                else:
+                    return "░"
+
+            def cdiv(a, b):
+                return (a + (b - 1)) // b
+
+            row_step = max(1, cdiv(num_rows, max_rows))
+            col_step = max(1, cdiv(num_cols, max_cols))
+
+            for r in range(0, num_rows, row_step):
+                for c in range(0, num_cols, col_step):
+                    cur_mask = dense_mask
+                    for idx in batch_idx:
+                        cur_mask = cur_mask[idx]
+                    char = summarize_section(
+                        cur_mask[r : r + row_step, c : c + col_step]
+                    )
+                    vis += char * 2
+                vis += "\n"
+            return vis
+
+        total_vis = []
+        for idx, batch_idx in enumerate(
+            itertools.product(*[range(i) for i in batch_dims])
+        ):
+            if idx == limit:
+                total_vis.append("...")
+                total_vis.append("To print out more, set BlockMask.to_string(limit=N)")
+                total_vis.append(
+                    "You can also index (BlockMask[batch, head]) to choose a specific batch or head"
+                )
+                break
+            block_vis = create_block_vis(*batch_idx)
+            total_vis.append(block_vis)
+
+        return "\n".join(total_vis)
+
+    def to(self, device: torch.device | str) -> "BlockMask":
+        """Moves the BlockMask to the specified device.
+
+        Args:
+            device (torch.device or str): The target device to move the BlockMask to.
+                Can be a torch.device object or a string (e.g., 'cpu', 'cuda:0').
+
+        Returns:
+            BlockMask: A new BlockMask instance with all tensor components moved
+            to the specified device.
+
+        Note:
+            This method does not modify the original BlockMask in-place.
+            Instead, it returns a new BlockMask instance where individual tensor attributes
+            may or may not be moved to the specified device, depending on their
+            current device placement.
+        """
+        mapped_attributes = tree_map_only(
+            torch.Tensor,
+            lambda x: x.to(device),
+            self.as_tuple(flatten=False),
+        )
+        return BlockMask(*mapped_attributes)
+
+    def _flatten(self):
+        """Flatten BlockMask into a list of tensors and context."""
+        tensors = tuple(getattr(self, attr) for attr in self._TENSOR_ATTRS)
+        context = tuple(getattr(self, attr) for attr in self._CONTEXT_ATTRS)
+        return tensors, context
+
+    @classmethod
+    def _unflatten(cls, tensors, context):
+        """Unflatten tensors and context back into a BlockMask."""
+        kwargs = {
+            **dict(zip(cls._CONTEXT_ATTRS, context)),
+            **dict(zip(cls._TENSOR_ATTRS, tensors)),
+        }
+        # pyrefly: ignore [bad-argument-type]
+        return cls(**kwargs)
+
+    def _flatten_with_keys(self):
+        """Flatten BlockMask with keys for better tracing."""
+        tensors = tuple(
+            (GetAttrKey(attr), getattr(self, attr)) for attr in self._TENSOR_ATTRS
+        )
+        context = tuple(
+            (GetAttrKey(attr), getattr(self, attr)) for attr in self._CONTEXT_ATTRS
+        )
+        return tensors, context
+
+
+def _broadcast_to_dim(x, dim):
+    while x.dim() < dim:
+        x = x.unsqueeze(0)
+    return x
+
+
+def _round_up_to_multiple(x, multiple):
+    return (x + multiple - 1) // multiple * multiple
+
+
+def _convert_mask_to_block_mask(
+    mask: Tensor,
+    Q_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE,
+    KV_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE,
+    separate_full_blocks: bool = False,
+) -> tuple[Tensor, Tensor | None]:
+    assert mask.dtype == torch.bool
+    mask = _broadcast_to_dim(mask, 4)
+
+    def padding_needed_for_multiple(x, multiple):
+        return _round_up_to_multiple(x, multiple) - x
+
+    mask = torch.nn.functional.pad(
+        mask,
+        (
+            0,
+            padding_needed_for_multiple(mask.shape[-1], KV_BLOCK_SIZE),
+            0,
+            padding_needed_for_multiple(mask.shape[-2], Q_BLOCK_SIZE),
+        ),
+    )
+    B, H, Q, KV = mask.shape
+    assert Q % Q_BLOCK_SIZE == 0
+    assert KV % KV_BLOCK_SIZE == 0
+    mask = mask.view(
+        B, H, Q // Q_BLOCK_SIZE, Q_BLOCK_SIZE, KV // KV_BLOCK_SIZE, KV_BLOCK_SIZE
+    )  # [B, H, Q//Q_BLOCK_SIZE, Q_BLOCK_SIZE, KV//KV_BLOCK_SIZE, KV_BLOCK_SIZE]
+    mask = mask.permute(
+        0, 1, 2, 4, 3, 5
+    )  # [B, H, Q//Q_BLOCK_SIZE, KV//KV_BLOCK_SIZE, Q_BLOCK_SIZE, KV_BLOCK_SIZE]
+    mask_block_sum = mask.sum(
+        dim=[-2, -1]
+    )  # [B, H, Q//Q_BLOCK_SIZE, KV//KV_BLOCK_SIZE]
+    if separate_full_blocks:
+        full_block_sum = Q_BLOCK_SIZE * KV_BLOCK_SIZE
+        full_blocks = mask_block_sum == full_block_sum
+        partial_blocks = (mask_block_sum > 0) & (mask_block_sum < full_block_sum)
+        partial_blocks = partial_blocks.to(dtype=torch.int8)
+        full_blocks = full_blocks.to(dtype=torch.int8)
+        return partial_blocks, full_blocks
+    else:
+        partial_blocks = mask_block_sum > 0
+        partial_blocks = partial_blocks.to(dtype=torch.int8)
+        return partial_blocks, None
+
+
+def or_masks(*mask_mods: _mask_mod_signature) -> _mask_mod_signature:
+    """Returns a mask_mod that's the union of provided mask_mods"""
+    if not all(callable(arg) for arg in mask_mods):
+        raise RuntimeError(f"All inputs should be callable mask_mods: {mask_mods}")
+
+    def or_mask(b, h, q_idx, kv_idx):
+        result = b.new_zeros((), dtype=torch.bool)
+        for mask in mask_mods:
+            result = result | mask(b, h, q_idx, kv_idx)
+        return result
+
+    return or_mask
+
+
+def and_masks(*mask_mods: _mask_mod_signature) -> _mask_mod_signature:
+    """Returns a mask_mod that's the intersection of provided mask_mods"""
+    if not all(callable(arg) for arg in mask_mods):
+        raise RuntimeError(f"All inputs should be callable mask_mods: {mask_mods}")
+
+    def and_mask(b, h, q_idx, kv_idx):
+        result = b.new_ones((), dtype=torch.bool)
+        for mask in mask_mods:
+            result = result & mask(b, h, q_idx, kv_idx)
+        return result
+
+    return and_mask
+
+
+def _convert_block_mask_to_mask(
+    block_mask,
+    KV_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE,
+    Q_BLOCK_SIZE=_DEFAULT_SPARSE_BLOCK_SIZE,
+) -> Tensor:
+    assert block_mask.dim() == 4
+    B, H, Q, KV = block_mask.shape
+    block_mask = block_mask.expand(Q_BLOCK_SIZE, KV_BLOCK_SIZE, *block_mask.shape)
+    block_mask = block_mask.permute(2, 3, 4, 0, 5, 1).reshape(
+        B, H, Q * Q_BLOCK_SIZE, KV * KV_BLOCK_SIZE
+    )
+    return block_mask
+
+
+def _create_sparse_block_from_block_mask(
+    block_mask: tuple[Tensor, Tensor | None],
+    mask_mod: Callable | None,
+    seq_lengths: tuple[int, int],
+    Q_BLOCK_SIZE: int = _DEFAULT_SPARSE_BLOCK_SIZE,
+    KV_BLOCK_SIZE: int = _DEFAULT_SPARSE_BLOCK_SIZE,
+) -> BlockMask:
+    partial_blocks, full_blocks = block_mask
+
+    partial_bm = _dense_to_ordered(partial_blocks)
+    if full_blocks is not None:
+        full_bm: tuple[Tensor | None, Tensor | None] = _dense_to_ordered(full_blocks)
+    else:
+        full_bm = (None, None)
+
+    return BlockMask.from_kv_blocks(
+        partial_bm[0],
+        partial_bm[1],
+        full_bm[0],
+        full_bm[1],
+        BLOCK_SIZE=(Q_BLOCK_SIZE, KV_BLOCK_SIZE),
+        mask_mod=mask_mod,
+        seq_lengths=seq_lengths,
+    )
+
+
+def create_mask(
+    mod_fn: _score_mod_signature | _mask_mod_signature,
+    B: int | None,
+    H: int | None,
+    Q_LEN: int,
+    KV_LEN: int,
+    device: DeviceLikeType | None = None,
+) -> Tensor:
+    r"""This function creates a mask tensor from a mod_fn function.
+
+    Args:
+        mod_fn (Union[_score_mod_signature, _mask_mod_signature]): Function to modify attention scores.
+        B (int): Batch size.
+        H (int): Number of query heads.
+        Q_LEN (int): Sequence length of query.
+        KV_LEN (int): Sequence length of key/value.
+        device (str): Device to run the mask creation on.
+
+    Returns:
+        mask (Tensor): A mask tensor with shape (B, H, M, N).
+    """
+    if device is None:
+        device = torch.accelerator.current_accelerator() or "cpu"
+    if B is None:
+        B = 1
+    if H is None:
+        H = 1
+    b = torch.arange(0, B, device=device)
+    h = torch.arange(0, H, device=device)
+    m = torch.arange(0, Q_LEN, device=device)
+    n = torch.arange(0, KV_LEN, device=device)
+    mod_type = _get_mod_type(mod_fn)
+
+    from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
+
+    with TransformGetItemToIndex():
+        if mod_type == _ModificationType.SCORE_MOD:
+            score_mod = mod_fn
+            score_mod = _vmap_for_bhqkv(score_mod, prefix=(0,))  # first input is score
+            out = score_mod(torch.zeros(B, H, Q_LEN, KV_LEN, device=device), b, h, m, n)
+            mask = torch.where(torch.isneginf(out), False, True)
+            return mask
+        elif mod_type == _ModificationType.MASK_MOD:
+            mask_mod = mod_fn
+            mask_mod = _vmap_for_bhqkv(mask_mod, prefix=())
+            mask = mask_mod(b, h, m, n)
+            return mask
+        else:
+            raise AssertionError
+
+
+def create_block_mask(
+    mask_mod: _mask_mod_signature,
+    B: int | None,
+    H: int | None,
+    Q_LEN: int,
+    KV_LEN: int,
+    device: DeviceLikeType | None = None,
+    BLOCK_SIZE: int | tuple[int, int] = _DEFAULT_SPARSE_BLOCK_SIZE,
+    _compile=False,
+) -> BlockMask:
+    r"""This function creates a block mask tuple from a mask_mod function.
+
+    Args:
+        mask_mod (Callable): mask_mod function. This is a callable that defines the
+            masking pattern for the attention mechanism. It takes four arguments:
+            b (batch size), h (number of heads), q_idx (query index), and kv_idx (key/value index).
+            It should return a boolean tensor indicating which attention connections are allowed (True)
+            or masked out (False).
+        B (int): Batch size.
+        H (int): Number of query heads.
+        Q_LEN (int): Sequence length of query.
+        KV_LEN (int): Sequence length of key/value.
+        device (str): Device to run the mask creation on.
+        BLOCK_SIZE (int or tuple[int, int]): Block size for the block mask. If a single int is provided it is used for both query and key/value.
+
+    Returns:
+        BlockMask:  A BlockMask object that contains the block mask information.
+
+    Example Usage:
+        .. code-block:: python
+
+            def causal_mask(b, h, q_idx, kv_idx):
+                return q_idx >= kv_idx
+
+
+            block_mask = create_block_mask(causal_mask, 1, 1, 8192, 8192, device="cuda")
+            query = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
+            key = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
+            value = torch.randn(1, 1, 8192, 64, device="cuda", dtype=torch.float16)
+            output = flex_attention(query, key, value, block_mask=block_mask)
+    """
+    if device is None:
+        device = torch.accelerator.current_accelerator() or "cpu"
+    mod_type = _get_mod_type(mask_mod)
+    assert mod_type == _ModificationType.MASK_MOD, (
+        f"create-block_mask requires a mask_mod function! Got {mask_mod}"
+    )
+    if B is None:
+        B = 1
+    if H is None:
+        H = 1
+    if isinstance(BLOCK_SIZE, int):
+        Q_BLOCK_SIZE = BLOCK_SIZE
+        KV_BLOCK_SIZE = BLOCK_SIZE
+    else:
+        Q_BLOCK_SIZE, KV_BLOCK_SIZE = BLOCK_SIZE
+
+    if _compile:
+        warnings.warn(
+            "_compile flag on create_block_mask was originally added to work around a torch.compile limitation. That limitation has since been addressed. So, to compile create_block_mask, we suggest doing torch.compile(create_block_mask). This still works for now, but will be removed in the future.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return torch.compile(create_block_mask)(
+            mask_mod, B, H, Q_LEN, KV_LEN, device, BLOCK_SIZE
+        )
+
+    mask_tensor = create_mask(mask_mod, B, H, Q_LEN, KV_LEN, device)
+    partial_block_mask, full_block_mask = _convert_mask_to_block_mask(
+        mask_tensor,
+        Q_BLOCK_SIZE=Q_BLOCK_SIZE,
+        KV_BLOCK_SIZE=KV_BLOCK_SIZE,
+        separate_full_blocks=True,
+    )
+    block_mask = _create_sparse_block_from_block_mask(
+        (partial_block_mask, full_block_mask),
+        mask_mod,
+        (Q_LEN, KV_LEN),
+        Q_BLOCK_SIZE,
+        KV_BLOCK_SIZE,
+    )
+    return block_mask
+
+
+def _create_empty_block_mask(query: Tensor, key: Tensor) -> BlockMask:
+    r"""Default block mask for flex attention.
+    If users don't specify any block sparse mask info, we create this
+    empty block sparse mask. Which creates a BlockMask with 1 block that is the full length
+    of the query and key tensors.
+    """
+    device = query.device
+    return BlockMask.from_kv_blocks(
+        kv_num_blocks=torch.ones([1, 1, 1], dtype=torch.int32, device=device),
+        kv_indices=torch.zeros([1, 1, 1, 1], dtype=torch.int32, device=device),
+        BLOCK_SIZE=_LARGE_SPARSE_BLOCK_SIZE,
+        seq_lengths=(1, 1),
+    )
+
+
+def _apply_kernel_options(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    return_lse: bool,
+    kernel_options,
+    return_aux: AuxRequest | None = None,
+):
+    kernel_options = {} if kernel_options is None else dict(kernel_options)
+
+    if "BACKEND" in kernel_options and kernel_options.get(
+        "FORCE_USE_FLEX_ATTENTION", False
+    ):
+        # TODO: remove FORCE_USE_FLEX_ATTENTION once BACKEND is fully adopted.
+        raise RuntimeError(
+            "BACKEND cannot be combined with legacy FORCE_USE_FLEX_ATTENTION. "
+            "BACKEND supersedes the legacy knob; please drop FORCE_USE_FLEX_ATTENTION "
+            "and only specify the desired BACKEND."
+        )
+
+    if "BACKEND" in kernel_options:
+        valid_backends = typing.get_args(_Backend)
+        if kernel_options["BACKEND"] not in valid_backends:
+            raise ValueError(
+                f"Invalid BACKEND value '{kernel_options['BACKEND']}'. "
+                f"Must be one of {valid_backends}"
+            )
+
+    kernel_options.setdefault("BACKEND", "AUTO")
+    kernel_options.setdefault("PRESCALE_QK", False)
+    kernel_options.setdefault("ROWS_GUARANTEED_SAFE", False)
+    kernel_options.setdefault("BLOCKS_ARE_CONTIGUOUS", False)
+    # This forces all biases grad scatters to be done in the DQ iteration loop of the backwards
+    kernel_options.setdefault("WRITE_DQ", True)
+
+    any_inputs_on_cpu_device = (
+        query.device.type == "cpu"
+        or key.device.type == "cpu"
+        or value.device.type == "cpu"
+    )
+
+    # Determine what auxiliary outputs are needed
+    output_lse = return_lse
+    output_max = False
+
+    if return_aux is not None:
+        # New API takes precedence over legacy parameters
+        output_lse = return_aux.lse
+        output_max = return_aux.max_scores
+
+    # If forward kernel needs to return logsumexp is decided by this rule internally.
+    assert "OUTPUT_LOGSUMEXP" not in kernel_options
+    kernel_options["OUTPUT_LOGSUMEXP"] = True
+    if not output_lse:
+        # We used to check if q,k,v required grads but since captured buffers can require grad
+        # we always write unless in no_grad
+        kernel_options["OUTPUT_LOGSUMEXP"] = torch.is_grad_enabled()
+        if any_inputs_on_cpu_device:
+            # CPU with torch.compile now supports inference, and will not return lse
+            # TODO: support CPU for training and return lse
+            kernel_options["OUTPUT_LOGSUMEXP"] = False
+
+    # If forward kernel needs to return max is decided by this rule internally.
+    assert "OUTPUT_MAX" not in kernel_options
+    kernel_options["OUTPUT_MAX"] = output_max
+    if any_inputs_on_cpu_device and output_max:
+        # CPU doesn't support returning max yet
+        # TODO: support CPU for returning max
+        raise NotImplementedError("Returning max scores is not supported on CPU.")
+        kernel_options["OUTPUT_MAX"] = False
+
+    return kernel_options
+
+
+def _validate_embed_dim(query: Tensor, key: Tensor, value: Tensor) -> None:
+    if query.size(-1) != key.size(-1):
+        raise ValueError(
+            f"Expect query and key/value to have the same embedding dimension "
+            f"but got E={query.size(-1)} and E={key.size(-1)}."
+        )
+
+
+def _validate_device(query: Tensor, key: Tensor, value: Tensor) -> None:
+    """TODO: Remove once non cuda/cpu devices support is added
+    We only need to check query since we have already that q,k,v are on the same device
+    """
+    supported_devices = {"cuda", "cpu", "xpu", "hpu"}
+    if query.device.type not in supported_devices:
+        raise ValueError(
+            "FlexAttention is only supported on CUDA, CPU or HPU devices. "
+            f"Found input tensors on {query.device.type} device."
+        )
+
+
+def _enforce_mem_layouts(
+    query: Tensor, key: Tensor, value: Tensor
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Enforce memory layouts for query, key, and value tensors.
+
+    For non-FP8 dtypes, no action is taken.
+
+    For FP8 dtypes, we enforce the following memory layouts:
+    - Query tensor must be in row-major memory layout, as it will be the left-operand in the FP8 GEMM `q @ k.T`.
+    - Key tensor must be in row-major memory layout, as it will be transposed when used as the right-operand
+      in the FP8 GEMM `q @ k.T`, meaning it will correctly be in column-major memory layout for the GEMM.
+    - Value tensor must be in column-major memory layout, as it will be the right-operand in the FP8 GEMM `softmax_scores @ v`.
+
+    Returns the query, key, and value tensors with the enforced memory layouts.
+    """
+
+    def is_row_major(tensor: Tensor) -> bool:
+        return tensor.stride()[-1] == 1
+
+    def is_col_major(tensor: Tensor) -> bool:
+        return tensor.stride()[-2] == 1
+
+    # These memory layout constraint are only for FP8 GEMMs on NVIDIA GPU architectures >= SM89 and < SM100.
+    # This is because GPU arch < SM89 does not not support FP8 GEMMs, and
+    # SM100 has support for TN, NT, TT, NN layouts for FP8 GEMMs
+    # (i.e., left and right operands can be in row or column major layouts)
+    # so this check is only needed for older architectures.
+    # See: https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/blackwell_functionality.md
+    fp8_dtypes = (
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+    )
+    gemm_precision = query.dtype
+
+    should_enforce_mem_layout = (
+        gemm_precision in fp8_dtypes
+        and torch.version.cuda is not None
+        and torch.cuda.get_device_capability("cuda") >= (8, 9)
+        and torch.cuda.get_device_capability("cuda") < (10, 0)
+    )
+    if not should_enforce_mem_layout:
+        return query, key, value
+
+    # Query must be in row-major memory layout as the left-operand in the FP8 GEMM `q @ k.T`
+    if not is_row_major(query):
+        query = query.contiguous()
+
+    # Key must be in row-major memory layout as it will be transposed when used as the right-operand
+    # in the FP8 GEMM `q @ k.T`, meaning it will correctly be in column-major memory layout for the GEMM.
+    if not is_row_major(key):
+        key = key.contiguous()
+
+    # Value must be in column-major memory layout as the right-operand in the FP8 GEMM `softmax_scores @ v`
+    if not is_col_major(value):
+        value = value.transpose(-2, -1).contiguous().transpose(-2, -1)
+    return query, key, value
+
+
+def flex_attention(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    score_mod: _score_mod_signature | None = None,
+    block_mask: BlockMask | None = None,
+    scale: float | None = None,
+    enable_gqa: bool = False,
+    return_lse: bool = False,
+    kernel_options: FlexKernelOptions | None = None,
+    *,
+    return_aux: AuxRequest | None = None,
+) -> Tensor | tuple[Tensor, Tensor] | tuple[Tensor, AuxOutput]:
+    r"""This function implements scaled dot product attention with an arbitrary attention score modification function.
+
+    This function computes the scaled dot product attention between query, key, and value tensors with a user-defined
+    attention score modification function. The attention score modification function will be applied after the attention
+    scores have been calculated between the query and key tensors. The attention scores are calculated as follows:
+
+    The ``score_mod`` function should have the following signature:
+
+    .. code-block:: python
+
+        def score_mod(
+            score: Tensor,
+            batch: Tensor,
+            head: Tensor,
+            q_idx: Tensor,
+            k_idx: Tensor
+        ) -> Tensor:
+
+    Where:
+        - ``score``: A scalar tensor representing the attention score,
+          with the same data type and device as the query, key, and value tensors.
+        - ``batch``, ``head``, ``q_idx``, ``k_idx``: Scalar tensors indicating
+          the batch index, query head index, query index, and key/value index, respectively.
+          These should have the ``torch.int`` data type and be located on the same device as the score tensor.
+
+    Args:
+        query (Tensor): Query tensor; shape :math:`(B, Hq, L, E)`. For FP8 dtypes, should be in row-major memory layout for optimal performance.
+        key (Tensor): Key tensor; shape :math:`(B, Hkv, S, E)`. For FP8 dtypes, should be in row-major memory layout for optimal performance.
+        value (Tensor): Value tensor; shape :math:`(B, Hkv, S, Ev)`. For FP8 dtypes, should be in column-major memory layout for optimal performance.
+        score_mod (Optional[Callable]): Function to modify attention scores. By default no score_mod is applied.
+        block_mask (Optional[BlockMask]): BlockMask object that controls the blocksparsity pattern of the attention.
+        scale (Optional[float]): Scaling factor applied prior to softmax. If none, the default value is set to :math:`\frac{1}{\sqrt{E}}`.
+        enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA) and broadcasts key/value heads to query heads.
+        return_lse (bool): Whether to return the logsumexp of the attention scores. Default is False. **Deprecated**: Use ``return_aux=AuxRequest(lse=True)`` instead.
+        kernel_options (Optional[FlexKernelOptions]):
+            Options to control the behavior of the underlying Triton kernels.
+            See :class:`FlexKernelOptions` for available options and usage examples.
+        return_aux (Optional[AuxRequest]): Specifies which auxiliary outputs to compute and return.
+            If None, only the attention output is returned. Use ``AuxRequest(lse=True, max_scores=True)``
+            to request both auxiliary outputs.
+
+    Returns:
+        output (Tensor): Attention output; shape :math:`(B, Hq, L, Ev)`.
+
+        When ``return_aux`` is not None:
+            aux (AuxOutput): Auxiliary outputs with requested fields populated.
+
+        When ``return_aux`` is None (deprecated paths):
+            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(B, Hq, L)`. Only returned if ``return_lse=True``.
+
+    Shape legend:
+        - :math:`N: \text{Batch size} ... : \text{Any number of other batch dimensions (optional)}`
+        - :math:`S: \text{Source sequence length}`
+        - :math:`L: \text{Target sequence length}`
+        - :math:`E: \text{Embedding dimension of the query and key}`
+        - :math:`Ev: \text{Embedding dimension of the value}`
+
+    .. warning::
+        `torch.nn.attention.flex_attention` is a prototype feature in PyTorch.
+        Please look forward to a more stable implementation in a future version of PyTorch.
+        Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype
+
+    """
+    # Some basic input validation
+    _validate_sdpa_input(query, key, value)
+    _validate_embed_dim(query, key, value)
+    _validate_device(query, key, value)
+    query, key, value = _enforce_mem_layouts(query, key, value)
+    if query.dim() != 4 or key.dim() != 4 or value.dim() != 4:
+        raise NotImplementedError("NYI: query, key, and value must be 4D tensors")
+    if (not enable_gqa) and query.size(-3) != key.size(-3):
+        raise ValueError(
+            f"Expect query and key/value to have the same number of heads "
+            f"but got Hq={query.size(-3)} and Hkv={key.size(-3)}. "
+            f"Try setting enable_gqa=True for GQA."
+        )
+    if enable_gqa:
+        Hq = query.size(1)
+        Hkv = key.size(1)
+        if Hq % Hkv != 0:
+            raise ValueError(
+                f"Expect number of query heads to be a multiple of kv heads for GQA "
+                f"but got Hq={Hq} and Hkv={Hkv}."
+            )
+    if query.size(0) != key.size(0):
+        if block_mask is None:
+            raise ValueError(
+                f"Expect query and key/value to have the same batch size, "
+                f"or non-none block_mask, "
+                f"but got block_mask=None, Bq={query.size(0)}, and Bkv={key.size(0)}."
+            )
+
+        if block_mask.kv_num_blocks.size(0) != query.size(0):
+            raise ValueError(
+                f"Expect query and key/value to have the same batch size, "
+                f"or block_mask and query to have the same batch size, "
+                f"but got Bq={query.size(0)}, Bkv={key.size(0)}, B_block_mask={block_mask.kv_num_blocks.size(0)}."
+            )
+
+    if score_mod is None:
+        score_mod = _identity
+
+    if block_mask is None:
+        block_mask = _create_empty_block_mask(query, key)
+
+    # If BlockMask was sliced, its mask_mod is intentionally replaced with an error-raising stub.
+    # This guard ensures we surface the intended error message before any shape-based checks.
+    if getattr(block_mask, "mask_mod", None) is _sliced_mask_mod_error:
+        raise RuntimeError("Cannot use mask_mod from a sliced BlockMask")
+
+    if (
+        block_mask.BLOCK_SIZE[0] == _LARGE_SPARSE_BLOCK_SIZE
+        and block_mask.BLOCK_SIZE[1] == _LARGE_SPARSE_BLOCK_SIZE
+    ):
+        # This corresponds to the case where we essentially have a "no-op" block mask.
+        pass
+    else:
+        block_mask_q_len = block_mask.shape[-2]
+        block_mask_kv_len = block_mask.shape[-1]
+        if query.size(-2) > block_mask_q_len or key.size(-2) > block_mask_kv_len:
+            raise ValueError(
+                f"block_mask was created for block_mask.shape={block_mask.shape} but got q_len={query.size(-2)} and kv_len={key.size(-2)}. "
+                "As the block mask was created for a smaller length than you're using it for, you likely need to create a new block mask."
+            )
+        elif (
+            query.size(-2) < block_mask_q_len and key.size(-2) <= block_mask_kv_len
+        ) or (query.size(-2) <= block_mask_q_len and key.size(-2) < block_mask_kv_len):
+            raise ValueError(
+                f"block_mask was created for block_mask.shape={block_mask.shape} but got q_len={query.size(-2)} and kv_len={key.size(-2)}. "
+                "As the block mask was created for a larger length than you're using it for, you can either 1. create a new block mask with the correct length, or 2. 'adjust' the existing block mask to the correct length by calling block_mask._adjust(q_len, kv_len). This essentially 'crops' the block mask to the upper left corner, which does not work for all mask_mods!"
+            )
+        assert query.size(-2) == block_mask_q_len
+        assert key.size(-2) == block_mask_kv_len
+
+    if scale is None:
+        scale = 1.0 / math.sqrt(query.size(-1))
+
+    if query.device != block_mask.kv_num_blocks.device:  # type: ignore[union-attr]
+        raise RuntimeError(
+            f"Expect q/k/v and block_mask to be on the same device "
+            f"but got {query.device} and {block_mask.kv_num_blocks.device}."  # type: ignore[union-attr]
+        )
+
+    # Handle deprecation warnings for old parameters
+    if return_lse and return_aux is not None:
+        raise ValueError(
+            "Cannot specify both return_lse and return_aux. "
+            "return_lse is deprecated, please use return_aux=AuxRequest(lse=True) instead."
+        )
+    elif return_lse and return_aux is None:
+        _warn_once(
+            "deprecated_return_lse",
+            "return_lse is deprecated and will be removed in v2.10. "
+            "Please use return_aux=AuxRequest(lse=True) instead.",
+            category=FutureWarning,
+        )
+
+    kernel_options = _apply_kernel_options(
+        query,
+        key,
+        value,
+        return_lse,
+        kernel_options,
+        return_aux,
+    )
+
+    def _finalize_outputs(
+        out,
+        lse,
+        max_scores,
+        *,
+        return_aux: AuxRequest | None,
+        return_lse: bool,
+    ):
+        """Normalize stats and build return value (aux-aware, legacy-compatible)."""
+        ln2 = math.log(2.0)
+        return_lse = return_lse or return_aux is not None and return_aux.lse
+        return_max = return_aux is not None and return_aux.max_scores
+
+        lse_scaled = lse * ln2 if (return_lse and lse.numel() > 0) else None
+        max_scaled = (
+            max_scores * ln2 if (return_max and max_scores.numel() > 0) else None
+        )
+
+        if return_aux is not None:
+            return out, AuxOutput(
+                lse=lse_scaled,
+                max_scores=max_scaled,
+            )
+
+        if return_lse:
+            return out, lse_scaled
+
+        return out
+
+    if torch.compiler.is_dynamo_compiling():
+        # mark head_dim and number of heads to be static
+        for x in [query, key, value]:
+            torch._dynamo.mark_static(x, -3)
+            torch._dynamo.mark_static(x, -1)
+
+        out, lse, max_scores = flex_attention_hop(
+            query,
+            key,
+            value,
+            score_mod,
+            block_mask.as_tuple(),
+            scale,
+            kernel_options,  # type: ignore[union-attr]
+        )
+        return _finalize_outputs(
+            out, lse, max_scores, return_aux=return_aux, return_lse=return_lse
+        )
+
+    if not _FLEX_ATTENTION_DISABLE_COMPILE_DEBUG:
+        _warn_once(
+            warning_id="flex_attention_performance",
+            message=(
+                "flex_attention called without torch.compile() - this will use an unfused implementation that materializes the full scores matrix instead of generating a fused kernel.\n\n"
+                "SOLUTION: Use torch.compile(flex_attention)(...)\n\n"
+                "If you want to debug your score_mod/mask_mod, you can set:\n"
+                "torch.nn.attention.flex_attention._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True\n\n"
+                "This will allow you to use print statements or breakpoints. Note: This doesn't work with the backwards pass and may produce incorrect results."
+            ),
+        )
+
+    if not torch._dynamo.is_dynamo_supported():
+        raise RuntimeError("flex_attention requires dynamo support")
+
+    from torch._dynamo.backends.debugging import (
+        make_eager_backend_with_torch_function_mode,
+    )
+
+    # Dynamo is expecting a callable with "__code__" attribute.
+    # We cannot directly pass hop to it. So we wrap it in a dummy function.
+    def _flex_attention_hop_wrapper(*args, **kwargs):
+        return flex_attention_hop(*args, **kwargs)
+
+    with _set_compilation_env():
+        with torch._dynamo.utils.disable_cache_limit():
+            with _temp_remove_pre_dispatch_torch_function_mode():
+                with _temp_remove_metadata_torch_function_mode() as metadata_mode:
+                    if metadata_mode:
+                        backend: str | Callable[..., Any] = (
+                            make_eager_backend_with_torch_function_mode(metadata_mode)
+                        )
+                    else:
+                        backend = "eager"
+
+                    if _FLEX_ATTENTION_DISABLE_COMPILE_DEBUG:
+                        flex_fn = _flex_attention_hop_wrapper
+                    else:
+                        flex_fn = torch.compile(
+                            _flex_attention_hop_wrapper, backend=backend, fullgraph=True
+                        )
+
+                    out, lse, max_scores = flex_fn(
+                        query,
+                        key,
+                        value,
+                        score_mod,
+                        block_mask.as_tuple(),  # type: ignore[union-attr]
+                        scale,
+                        kernel_options,
+                    )
+    return _finalize_outputs(
+        out, lse, max_scores, return_aux=return_aux, return_lse=return_lse
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py
new file mode 100644
index 0000000000000000000000000000000000000000..b20c1b4b2e49a37cf0e29603f20ef50e0caf6146
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py
@@ -0,0 +1,326 @@
+"""
+Variable-length attention implementation using Flash Attention.
+
+This module provides a high-level Python interface for variable-length attention
+that calls into the optimized Flash Attention kernels.
+"""
+
+import logging
+from functools import lru_cache
+from typing import Any, NamedTuple
+
+import torch
+
+
+log = logging.getLogger(__name__)
+
+__all__ = ["varlen_attn", "AuxRequest"]
+
+
+@lru_cache(maxsize=8)
+def _should_use_cudnn(device_index: int) -> bool:
+    """
+    Decide whether the cuDNN backend should serve varlen attention on a device.
+
+    The result is memoized per ``device_index`` by ``lru_cache``. The body is
+    currently hard-coded to return ``False``, so callers in this module always
+    fall through to their Flash Attention branch.
+    """
+    return False
+
+
+class AuxRequest(NamedTuple):
+    """
+    Request which auxiliary outputs to compute from varlen_attn.
+
+    Each field is a boolean indicating whether that auxiliary output should be computed.
+    Pass an instance as ``return_aux`` to ``varlen_attn``.
+    """
+
+    # When True, ``varlen_attn`` returns the logsumexp tensor alongside the output.
+    lse: bool = False
+
+
+@torch.library.custom_op("torch_attn::_varlen_attn", mutates_args={})
+def _varlen_attn(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cu_seq_q: torch.Tensor,
+    cu_seq_k: torch.Tensor,
+    max_q: int,
+    max_k: int,
+    is_causal: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Forward custom op backing the public ``varlen_attn`` function.
+
+    Dispatches to the cuDNN attention kernel when the query lives on a CUDA
+    device and ``_should_use_cudnn`` approves it, otherwise to the Flash
+    Attention kernel. Dropout is fixed at 0.0 on both paths.
+
+    Returns:
+        A ``(output, logsumexp, rng_state)`` tuple. ``rng_state`` is always a
+        freshly allocated zero tensor of shape ``(2,)`` and dtype ``uint64``;
+        the RNG state reported by the kernel is discarded because dropout is
+        never enabled.
+    """
+    on_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
+
+    if not on_cudnn:
+        log.info("Using Flash Attention backend for varlen_attn")
+        attn_out, lse, _rng, _, _ = torch.ops.aten._flash_attention_forward(
+            query,
+            key,
+            value,
+            cu_seq_q,
+            cu_seq_k,
+            max_q,
+            max_k,
+            0.0,  # dropout_p hardcoded to 0.0
+            is_causal,
+            return_debug_mask=False,
+        )
+    else:
+        log.info("Using cuDNN backend for varlen_attn")
+        fwd = torch.ops.aten._cudnn_attention_forward(
+            query,
+            key,
+            value,
+            None,  # attn_bias
+            cu_seq_q,
+            cu_seq_k,
+            max_q,
+            max_k,
+            True,  # compute_log_sumexp
+            0.0,  # dropout_p hardcoded to 0.0
+            is_causal,
+            False,  # return_debug_mask
+        )
+        # Result layout: (output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k,
+        # philox_seed, philox_offset, debug_attn_mask).
+        attn_out, lse, _rng = fwd[0], fwd[1], fwd[6]
+
+    # Dropout is never active, so a constant zero RNG state is handed back.
+    zero_rng_state = torch.zeros((2,), dtype=torch.uint64, device=query.device)
+    return attn_out, lse, zero_rng_state
+
+
+@_varlen_attn.register_fake
+def _varlen_attn_fake(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cu_seq_q: torch.Tensor,
+    cu_seq_k: torch.Tensor,
+    max_q: int,
+    max_k: int,
+    is_causal: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Shape-only stand-in for ``_varlen_attn`` used for meta tensors and tracing.
+
+    Mirrors the 3D varlen path of meta__flash_attention_forward:
+    - query shape: (total, num_heads, head_dim)
+    - logsumexp shape: (num_heads, total_q)
+
+    Returns uninitialized ``(output, logsumexp, rng_state)`` tensors on the
+    query's device.
+    """
+    total_q, num_heads = query.size(0), query.size(1)
+    target = query.device
+    return (
+        torch.empty_like(query),  # output has the same shape as query
+        torch.empty((num_heads, total_q), dtype=torch.float, device=target),
+        torch.empty((2,), dtype=torch.uint64, device=target),  # rng_state
+    )
+
+
+def varlen_attn(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cu_seq_q: torch.Tensor,
+    cu_seq_k: torch.Tensor,
+    max_q: int,
+    max_k: int,
+    is_causal: bool = False,
+    return_aux: AuxRequest | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run attention over a packed batch of variable-length sequences.
+
+    The interface resembles scaled_dot_product_attention, but sequences are
+    described by cumulative sequence position tensors. The computation is
+    delegated to the private ``torch_attn::_varlen_attn`` custom op.
+
+    Args:
+    - query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
+    - key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
+    - value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
+    - cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
+    - cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
+    - max_q (int): Maximum query sequence length in the batch.
+    - max_k (int): Maximum key/value sequence length in the batch.
+    - is_causal (bool, optional): Apply causal masking when True (default: False).
+    - return_aux (Optional[AuxRequest]): When not None and ``return_aux.lse`` is True,
+      the logsumexp tensor is returned as well.
+
+    Shape legend:
+    - :math:`N`: Batch size
+    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
+    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
+    - :math:`H`: Number of attention heads
+    - :math:`D`: Head dimension
+
+    Returns:
+    - Tensor: The attention output, when no logsumexp was requested.
+    - tuple[Tensor, Tensor]: ``(output, lse)`` when ``return_aux`` is not None and
+      ``return_aux.lse`` is True, where ``lse`` is the logsumexp.
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
+        >>> head_dim = embed_dim // num_heads
+        >>> seq_lengths = []
+        >>> for _ in range(batch_size):
+        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
+        ...     seq_lengths.append(min(length, max_seq_len))
+        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
+        >>> total_tokens = seq_lengths.sum().item()
+        >>>
+        >>> # Create packed query, key, value tensors
+        >>> query = torch.randn(
+        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
+        ... )
+        >>> key = torch.randn(
+        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
+        ... )
+        >>> value = torch.randn(
+        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
+        ... )
+        >>>
+        >>> # Build cumulative sequence tensor
+        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
+        >>> cu_seq[1:] = seq_lengths.cumsum(0)
+        >>> max_len = seq_lengths.max().item()
+        >>>
+        >>> # Call varlen_attn
+        >>> output = varlen_attn(
+        ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
+        ... )
+    """
+    # The op's third output (rng_state) is of no interest to callers.
+    attn_out, logsumexp, _ = torch.ops.torch_attn._varlen_attn(
+        query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal
+    )
+    if return_aux is None or not return_aux.lse:
+        return attn_out
+    return attn_out, logsumexp
+
+
+def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None:
+    """
+    Record on ``ctx`` everything the backward pass of ``_varlen_attn`` reads.
+
+    Args:
+        ctx: Autograd context; receives the saved tensors and scalar settings.
+        inputs: The eight forward arguments, in call order.
+        output: The ``(out, lse, rng_state)`` triple produced by the forward.
+    """
+    q, k, v, cu_q, cu_k, max_q, max_k, is_causal = inputs
+    attn_out, lse, rng_state = output
+
+    # Tensors go through save_for_backward; plain Python values are attributes.
+    ctx.save_for_backward(q, k, v, cu_q, cu_k, attn_out, lse, rng_state)
+    ctx.max_q, ctx.max_k, ctx.is_causal = max_q, max_k, is_causal
+
+
+@torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args={})
+def _varlen_attn_backward(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    lse: torch.Tensor,
+    cu_seq_q: torch.Tensor,
+    cu_seq_k: torch.Tensor,
+    max_q: int,
+    max_k: int,
+    is_causal: bool,
+    rng_state: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Backward custom op for variable-length attention.
+
+    Selects the cuDNN backward kernel when the query is on a CUDA device and
+    ``_should_use_cudnn`` approves it, otherwise the Flash Attention backward
+    kernel, and passes the same argument list to whichever one is chosen.
+
+    Returns:
+        ``(dq, dk, dv)``: gradients with respect to query, key and value.
+    """
+    # Empty tensor handed to the kernel as its trailing positional argument.
+    placeholder = torch.empty(0, device=query.device)
+
+    if query.is_cuda and _should_use_cudnn(query.device.index):
+        log.info("Using cuDNN backend for varlen_attn")
+        backward_kernel = torch.ops.aten._cudnn_attention_backward
+    else:
+        log.info("Using Flash Attention backend for varlen_attn")
+        backward_kernel = torch.ops.aten._flash_attention_backward
+
+    grad_q, grad_k, grad_v = backward_kernel(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        lse,
+        cu_seq_q,
+        cu_seq_k,
+        max_q,
+        max_k,
+        0.0,
+        is_causal,
+        rng_state,
+        placeholder,
+    )
+    return grad_q, grad_k, grad_v
+
+
+@_varlen_attn_backward.register_fake
+def _varlen_attn_backward_fake(
+    grad_out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    out: torch.Tensor,
+    lse: torch.Tensor,
+    cu_seq_q: torch.Tensor,
+    cu_seq_k: torch.Tensor,
+    max_q: int,
+    max_k: int,
+    is_causal: bool,
+    rng_state: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Shape-only stand-in for ``_varlen_attn_backward`` (meta tensors and tracing).
+
+    Each returned gradient is an uninitialized tensor with the layout of the
+    input it belongs to: ``(grad_query, grad_key, grad_value)``.
+    """
+    return (
+        torch.empty_like(query),
+        torch.empty_like(key),
+        torch.empty_like(value),
+    )
+
+
+def _backward(
+    ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor
+) -> tuple[torch.Tensor | None, ...]:
+    """
+    Autograd backward formula for ``_varlen_attn``.
+
+    Args:
+        ctx: Context populated by ``_setup_context``.
+        grad_out: Gradient with respect to the attention output.
+        grad_lse: Gradient with respect to the logsumexp output (not used).
+        grad_rng: Gradient with respect to the rng_state output (not used).
+
+    Returns:
+        One entry per forward input of ``_varlen_attn``, in the order
+        ``(query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal)``:
+        gradients for the first three, ``None`` for the remaining five.
+    """
+    query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state = ctx.saved_tensors
+
+    max_q = ctx.max_q
+    max_k = ctx.max_k
+    is_causal = ctx.is_causal
+
+    dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward(
+        grad_out,
+        query,
+        key,
+        value,
+        out,
+        lse,
+        cu_seq_q,
+        cu_seq_k,
+        max_q,
+        max_k,
+        is_causal,
+        rng_state,
+    )
+    # The forward op has eight inputs, so exactly eight gradients are returned;
+    # the five trailing inputs are not differentiable.
+    return dq, dk, dv, None, None, None, None, None
+
+
+# Attach the backward formula so autograd can differentiate through _varlen_attn.
+_varlen_attn.register_autograd(_backward, setup_context=_setup_context)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c56e923a84383a79c2a3f7ebddb3dfa1ce1f0953
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/backends/thnn.py
@@ -0,0 +1,6 @@
+# mypy: allow-untyped-defs
+# this is for historical pickle deserialization, it is not used otherwise
+
+
+def _get_thnn_function_backend() -> None:
+    """No-op placeholder kept so historical pickles that reference it still load."""
+    return None
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe9a09aa31464fd4e88b2e46b4210561a70e42e7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/intrinsic/__init__.py
@@ -0,0 +1,36 @@
+from torch.ao.nn.intrinsic import (
+    BNReLU2d,
+    BNReLU3d,
+    ConvBn1d,
+    ConvBn2d,
+    ConvBn3d,
+    ConvBnReLU1d,
+    ConvBnReLU2d,
+    ConvBnReLU3d,
+    ConvReLU1d,
+    ConvReLU2d,
+    ConvReLU3d,
+    LinearBn1d,
+    LinearReLU,
+)
+from torch.ao.nn.intrinsic.modules.fused import _FusedModule  # noqa: F401
+
+# Include the subpackages in case user imports from it directly
+from torch.nn.intrinsic import modules, qat, quantized  # noqa: F401
+
+
+__all__ = [
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBn3d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBnReLU3d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "LinearReLU",
+    "BNReLU2d",
+    "BNReLU3d",
+    "LinearBn1d",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..99260ad43fc477c36a9780c057824f57d4914719
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/__init__.py
@@ -0,0 +1,334 @@
+from .module import Module  # usort: skip
+from .linear import Bilinear, Identity, LazyLinear, Linear  # usort: skip
+from .activation import (
+    CELU,
+    ELU,
+    GELU,
+    GLU,
+    Hardshrink,
+    Hardsigmoid,
+    Hardswish,
+    Hardtanh,
+    LeakyReLU,
+    LogSigmoid,
+    LogSoftmax,
+    Mish,
+    MultiheadAttention,
+    PReLU,
+    ReLU,
+    ReLU6,
+    RReLU,
+    SELU,
+    Sigmoid,
+    SiLU,
+    Softmax,
+    Softmax2d,
+    Softmin,
+    Softplus,
+    Softshrink,
+    Softsign,
+    Tanh,
+    Tanhshrink,
+    Threshold,
+)
+from .adaptive import AdaptiveLogSoftmaxWithLoss
+from .batchnorm import (
+    BatchNorm1d,
+    BatchNorm2d,
+    BatchNorm3d,
+    LazyBatchNorm1d,
+    LazyBatchNorm2d,
+    LazyBatchNorm3d,
+    SyncBatchNorm,
+)
+from .channelshuffle import ChannelShuffle
+from .container import (
+    Container,
+    ModuleDict,
+    ModuleList,
+    ParameterDict,
+    ParameterList,
+    Sequential,
+)
+from .conv import (
+    Conv1d,
+    Conv2d,
+    Conv3d,
+    ConvTranspose1d,
+    ConvTranspose2d,
+    ConvTranspose3d,
+    LazyConv1d,
+    LazyConv2d,
+    LazyConv3d,
+    LazyConvTranspose1d,
+    LazyConvTranspose2d,
+    LazyConvTranspose3d,
+)
+from .distance import CosineSimilarity, PairwiseDistance
+from .dropout import (
+    AlphaDropout,
+    Dropout,
+    Dropout1d,
+    Dropout2d,
+    Dropout3d,
+    FeatureAlphaDropout,
+)
+from .flatten import Flatten, Unflatten
+from .fold import Fold, Unfold
+from .instancenorm import (
+    InstanceNorm1d,
+    InstanceNorm2d,
+    InstanceNorm3d,
+    LazyInstanceNorm1d,
+    LazyInstanceNorm2d,
+    LazyInstanceNorm3d,
+)
+from .loss import (
+    BCELoss,
+    BCEWithLogitsLoss,
+    CosineEmbeddingLoss,
+    CrossEntropyLoss,
+    CTCLoss,
+    GaussianNLLLoss,
+    HingeEmbeddingLoss,
+    HuberLoss,
+    KLDivLoss,
+    L1Loss,
+    MarginRankingLoss,
+    MSELoss,
+    MultiLabelMarginLoss,
+    MultiLabelSoftMarginLoss,
+    MultiMarginLoss,
+    NLLLoss,
+    NLLLoss2d,
+    PoissonNLLLoss,
+    SmoothL1Loss,
+    SoftMarginLoss,
+    TripletMarginLoss,
+    TripletMarginWithDistanceLoss,
+)
+from .normalization import (
+    CrossMapLRN2d,
+    GroupNorm,
+    LayerNorm,
+    LocalResponseNorm,
+    RMSNorm,
+)
+from .padding import (
+    CircularPad1d,
+    CircularPad2d,
+    CircularPad3d,
+    ConstantPad1d,
+    ConstantPad2d,
+    ConstantPad3d,
+    ReflectionPad1d,
+    ReflectionPad2d,
+    ReflectionPad3d,
+    ReplicationPad1d,
+    ReplicationPad2d,
+    ReplicationPad3d,
+    ZeroPad1d,
+    ZeroPad2d,
+    ZeroPad3d,
+)
+from .pixelshuffle import PixelShuffle, PixelUnshuffle
+from .pooling import (
+    AdaptiveAvgPool1d,
+    AdaptiveAvgPool2d,
+    AdaptiveAvgPool3d,
+    AdaptiveMaxPool1d,
+    AdaptiveMaxPool2d,
+    AdaptiveMaxPool3d,
+    AvgPool1d,
+    AvgPool2d,
+    AvgPool3d,
+    FractionalMaxPool2d,
+    FractionalMaxPool3d,
+    LPPool1d,
+    LPPool2d,
+    LPPool3d,
+    MaxPool1d,
+    MaxPool2d,
+    MaxPool3d,
+    MaxUnpool1d,
+    MaxUnpool2d,
+    MaxUnpool3d,
+)
+from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNN, RNNBase, RNNCell, RNNCellBase
+from .sparse import Embedding, EmbeddingBag
+from .transformer import (
+    Transformer,
+    TransformerDecoder,
+    TransformerDecoderLayer,
+    TransformerEncoder,
+    TransformerEncoderLayer,
+)
+from .upsampling import Upsample, UpsamplingBilinear2d, UpsamplingNearest2d
+
+
+__all__ = [
+    "AdaptiveAvgPool1d",
+    "AdaptiveAvgPool2d",
+    "AdaptiveAvgPool3d",
+    "AdaptiveLogSoftmaxWithLoss",
+    "AdaptiveMaxPool1d",
+    "AdaptiveMaxPool2d",
+    "AdaptiveMaxPool3d",
+    "AlphaDropout",
+    "AvgPool1d",
+    "AvgPool2d",
+    "AvgPool3d",
+    "BCELoss",
+    "BCEWithLogitsLoss",
+    "BatchNorm1d",
+    "BatchNorm2d",
+    "BatchNorm3d",
+    "Bilinear",
+    "CELU",
+    "CTCLoss",
+    "ChannelShuffle",
+    "CircularPad1d",
+    "CircularPad2d",
+    "CircularPad3d",
+    "ConstantPad1d",
+    "ConstantPad2d",
+    "ConstantPad3d",
+    "Container",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "CosineEmbeddingLoss",
+    "CosineSimilarity",
+    "CrossEntropyLoss",
+    "CrossMapLRN2d",
+    "Dropout",
+    "Dropout1d",
+    "Dropout2d",
+    "Dropout3d",
+    "ELU",
+    "Embedding",
+    "EmbeddingBag",
+    "FeatureAlphaDropout",
+    "Flatten",
+    "Fold",
+    "FractionalMaxPool2d",
+    "FractionalMaxPool3d",
+    "GELU",
+    "GLU",
+    "GRU",
+    "GRUCell",
+    "GaussianNLLLoss",
+    "GroupNorm",
+    "Hardshrink",
+    "Hardsigmoid",
+    "Hardswish",
+    "Hardtanh",
+    "HingeEmbeddingLoss",
+    "HuberLoss",
+    "Identity",
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+    "KLDivLoss",
+    "L1Loss",
+    "LPPool1d",
+    "LPPool2d",
+    "LPPool3d",
+    "LSTM",
+    "LSTMCell",
+    "LayerNorm",
+    "LazyBatchNorm1d",
+    "LazyBatchNorm2d",
+    "LazyBatchNorm3d",
+    "LazyConv1d",
+    "LazyConv2d",
+    "LazyConv3d",
+    "LazyConvTranspose1d",
+    "LazyConvTranspose2d",
+    "LazyConvTranspose3d",
+    "LazyInstanceNorm1d",
+    "LazyInstanceNorm2d",
+    "LazyInstanceNorm3d",
+    "LazyLinear",
+    "LeakyReLU",
+    "Linear",
+    "LocalResponseNorm",
+    "LogSigmoid",
+    "LogSoftmax",
+    "MSELoss",
+    "MarginRankingLoss",
+    "MaxPool1d",
+    "MaxPool2d",
+    "MaxPool3d",
+    "MaxUnpool1d",
+    "MaxUnpool2d",
+    "MaxUnpool3d",
+    "Mish",
+    "Module",
+    "ModuleDict",
+    "ModuleList",
+    "MultiLabelMarginLoss",
+    "MultiLabelSoftMarginLoss",
+    "MultiMarginLoss",
+    "MultiheadAttention",
+    "NLLLoss",
+    "NLLLoss2d",
+    "PReLU",
+    "PairwiseDistance",
+    "ParameterDict",
+    "ParameterList",
+    "PixelShuffle",
+    "PixelUnshuffle",
+    "PoissonNLLLoss",
+    "RMSNorm",
+    "RNN",
+    "RNNBase",
+    "RNNCell",
+    "RNNCellBase",
+    "RReLU",
+    "ReLU",
+    "ReLU6",
+    "ReflectionPad1d",
+    "ReflectionPad2d",
+    "ReflectionPad3d",
+    "ReplicationPad1d",
+    "ReplicationPad2d",
+    "ReplicationPad3d",
+    "SELU",
+    "Sequential",
+    "SiLU",
+    "Sigmoid",
+    "SmoothL1Loss",
+    "SoftMarginLoss",
+    "Softmax",
+    "Softmax2d",
+    "Softmin",
+    "Softplus",
+    "Softshrink",
+    "Softsign",
+    "SyncBatchNorm",
+    "Tanh",
+    "Tanhshrink",
+    "Threshold",
+    "Transformer",
+    "TransformerDecoder",
+    "TransformerDecoderLayer",
+    "TransformerEncoder",
+    "TransformerEncoderLayer",
+    "TripletMarginLoss",
+    "TripletMarginWithDistanceLoss",
+    "Unflatten",
+    "Unfold",
+    "Upsample",
+    "UpsamplingBilinear2d",
+    "UpsamplingNearest2d",
+    "ZeroPad1d",
+    "ZeroPad2d",
+    "ZeroPad3d",
+]
+
+# Please keep this list sorted
+assert __all__ == sorted(__all__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..408e6ef42f12843ddbfc38d540fc68e454c9e958
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/_functions.py
@@ -0,0 +1,319 @@
+# mypy: allow-untyped-defs
+import torch
+import torch.distributed as dist
+from torch.autograd.function import Function
+
+
+class SyncBatchNorm(Function):
+    """
+    Autograd ``Function`` holding the forward/backward math for synchronized
+    batch normalization: per-channel statistics are exchanged across ranks
+    with ``torch.distributed`` collectives.
+    """
+
+    @staticmethod
+    # pyrefly: ignore [bad-override]
+    def forward(
+        self,
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        eps,
+        momentum,
+        process_group,
+        world_size,
+    ):
+        """
+        Normalize ``input`` with statistics aggregated over ``process_group``.
+
+        ``self`` is the autograd context: the tensors needed by ``backward``
+        are saved on it, together with ``process_group``.
+
+        Local per-channel mean/invstd and the per-channel element count are
+        packed into one tensor, gathered from every rank, and reduced to global
+        statistics with ``torch.batch_norm_gather_stats_with_counts`` (which is
+        also handed ``running_mean``, ``running_var`` and ``momentum``).
+
+        Returns:
+            The result of ``torch.batch_norm_elemt``, or an uninitialized
+            tensor shaped like ``input`` when ``input`` has no elements.
+
+        Raises:
+            ValueError: if there is a single value per channel and
+                ``world_size`` is less than 2.
+        """
+        # Keep channels_last / channels_last_3d layouts as they are; anything
+        # else is made contiguous in the default memory format.
+        if not (
+            input.is_contiguous(memory_format=torch.channels_last)
+            or input.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
+            input = input.contiguous()
+        if weight is not None:
+            weight = weight.contiguous()
+
+        # Number of values per channel on this rank.
+        size = int(input.numel() // input.size(1))
+        if size == 1 and world_size < 2:
+            raise ValueError(
+                f"Expected more than 1 value per channel when training, got input size {size}"
+            )
+
+        num_channels = input.shape[1]
+        if input.numel() > 0:
+            # calculate mean/invstd for input.
+            mean, invstd = torch.batch_norm_stats(input, eps)
+
+            count = torch.full(
+                (1,),
+                input.numel() // input.size(1),
+                dtype=mean.dtype,
+                device=mean.device,
+            )
+
+            # C, C, 1 -> (2C + 1)
+            combined = torch.cat([mean, invstd, count], dim=0)
+        else:
+            # for empty input, set stats and the count to zero. The stats with
+            # zero count will be filtered out later when computing global mean
+            # & invstd, but they still need to participate in the all_gather
+            # collective communication to unblock other peer processes.
+            combined = torch.zeros(
+                2 * num_channels + 1, dtype=input.dtype, device=input.device
+            )
+
+        # Use allgather instead of allreduce because count could be different across
+        # ranks, simple all reduce op can not give correct results.
+        # batch_norm_gather_stats_with_counts calculates global mean & invstd based on
+        # all gathered mean, invstd and count.
+        # for nccl backend, use the optimized version of all gather.
+        # The Gloo backend does not support `all_gather_into_tensor`.
+        if process_group._get_backend_name() != "gloo":
+            # world_size * (2C + 1)
+            combined_size = combined.numel()
+            combined_flat = torch.empty(
+                1,
+                combined_size * world_size,
+                dtype=combined.dtype,
+                device=combined.device,
+            )
+            dist.all_gather_into_tensor(
+                combined_flat, combined, process_group, async_op=False
+            )
+            combined = torch.reshape(combined_flat, (world_size, combined_size))
+            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+        else:
+            # world_size * (2C + 1)
+            combined_list = [torch.empty_like(combined) for _ in range(world_size)]
+            dist.all_gather(combined_list, combined, process_group, async_op=False)
+            combined = torch.stack(combined_list, dim=0)
+            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+
+        if not (torch.cuda.is_available() and torch.cuda.is_current_stream_capturing()):
+            # The lines below force a synchronization between CUDA and CPU, because
+            # the shape of the result count_all depends on the values in mask tensor.
+            # Such synchronizations break CUDA Graph capturing.
+            # See https://github.com/pytorch/pytorch/issues/78549
+            # FIXME: https://github.com/pytorch/pytorch/issues/78656 describes
+            # a better longer-term solution.
+
+            # remove stats from empty inputs
+            mask = count_all.squeeze(-1) >= 1
+            count_all = count_all[mask]
+            mean_all = mean_all[mask]
+            invstd_all = invstd_all[mask]
+
+        # calculate global mean & invstd
+        counts = count_all.view(-1)
+        # Match the dtype of the running statistics when they are provided.
+        if running_mean is not None and counts.dtype != running_mean.dtype:
+            counts = counts.to(running_mean.dtype)
+        mean, invstd = torch.batch_norm_gather_stats_with_counts(
+            input,
+            mean_all,
+            invstd_all,
+            running_mean,
+            running_var,
+            momentum,
+            eps,
+            counts,
+        )
+
+        # The per-rank counts are saved as int32 for use in backward.
+        self.save_for_backward(input, weight, mean, invstd, count_all.to(torch.int32))
+        self.process_group = process_group
+
+        # apply element-wise normalization
+        if input.numel() > 0:
+            return torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps)
+        else:
+            return torch.empty_like(input)
+
+    @staticmethod
+    def backward(self, grad_output):
+        """
+        Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        The per-channel sums needed for the input gradient are all-reduced
+        (summed) over the process group saved in ``forward``. A rank whose
+        forward input was empty still joins that all-reduce, contributing zeros.
+
+        Returns:
+            A 9-tuple matching the arguments of ``forward``:
+            ``(grad_input, grad_weight, grad_bias)`` followed by six ``None``.
+        """
+        # Same layout rule as in forward: keep channels_last variants as they are.
+        if not (
+            grad_output.is_contiguous(memory_format=torch.channels_last)
+            or grad_output.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
+            grad_output = grad_output.contiguous()
+        saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
+        grad_input = grad_weight = grad_bias = None
+        process_group = self.process_group
+
+        if saved_input.numel() > 0:
+            # calculate local stats as well as grad_weight / grad_bias
+            (
+                sum_dy,
+                sum_dy_xmu,
+                grad_weight,
+                grad_bias,
+            ) = torch.batch_norm_backward_reduce(
+                grad_output,
+                saved_input,
+                mean,
+                invstd,
+                weight,
+                self.needs_input_grad[0],
+                self.needs_input_grad[1],
+                self.needs_input_grad[2],
+            )
+
+            if self.needs_input_grad[0]:
+                # synchronizing stats used to calculate input gradient.
+                num_channels = sum_dy.shape[0]
+                combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
+                torch.distributed.all_reduce(
+                    combined,
+                    torch.distributed.ReduceOp.SUM,
+                    process_group,
+                    async_op=False,
+                )
+                sum_dy, sum_dy_xmu = torch.split(combined, num_channels)
+
+                # backward pass for gradient calculation
+                if weight is not None and weight.dtype != mean.dtype:
+                    weight = weight.to(mean.dtype)
+                grad_input = torch.batch_norm_backward_elemt(
+                    grad_output,
+                    saved_input,
+                    mean,
+                    invstd,
+                    weight,
+                    sum_dy,
+                    sum_dy_xmu,
+                    count_tensor,
+                )
+            # synchronizing of grad_weight / grad_bias is not needed as distributed
+            # training would handle all reduce.
+            if weight is None or not self.needs_input_grad[1]:
+                grad_weight = None
+
+            if weight is None or not self.needs_input_grad[2]:
+                grad_bias = None
+        else:
+            # This process got an empty input tensor in the forward pass.
+            # Although this process can directly set grad_input as an empty
+            # tensor of zeros, it still needs to participate in the collective
+            # communication to unblock its peers, as other peer processes might
+            # have received non-empty inputs.
+            num_channels = saved_input.shape[1]
+            if self.needs_input_grad[0]:
+                # launch all_reduce to unblock other peer processes
+                combined = torch.zeros(
+                    2 * num_channels, dtype=saved_input.dtype, device=saved_input.device
+                )
+                torch.distributed.all_reduce(
+                    combined,
+                    torch.distributed.ReduceOp.SUM,
+                    process_group,
+                    async_op=False,
+                )
+
+            # Leave grad_input, grad_weight and grad_bias as None, which will be
+            # interpreted by the autograd engine as Tensors full of zeros.
+
+        return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
+
+
+class CrossMapLRN2d(Function):
+    """Cross-channel local response normalization for 4D inputs.
+
+    For every channel ``c`` the forward pass computes
+    ``scale[c] = k + alpha / size * sum(input[j] ** 2)`` over the channels
+    ``j`` within ``int((size - 1) / 2)`` of ``c`` (clipped to the valid
+    channel range) and returns ``input * scale ** -beta``.
+    """
+
+    @staticmethod
+    # pyrefly: ignore [bad-override]
+    def forward(ctx, input, size, alpha=1e-4, beta=0.75, k=1):
+        """Normalize ``input`` across dimension 1.
+
+        Args:
+            ctx: autograd context; receives ``size``, ``alpha``, ``beta``,
+                ``k`` and the computed ``scale`` tensor as attributes.
+            input: 4D tensor; dimension 1 is the one normalized over.
+            size: width of the channel window.
+            alpha: multiplier applied to the windowed sum of squares.
+            beta: exponent applied to the scale.
+            k: additive constant of the scale.
+
+        Returns:
+            A tensor shaped like ``input``.
+
+        Raises:
+            ValueError: if ``input`` is not 4-dimensional.
+        """
+        ctx.size = size
+        ctx.alpha = alpha
+        ctx.beta = beta
+        ctx.k = k
+        ctx.scale = None
+
+        if input.dim() != 4:
+            raise ValueError(
+                f"CrossMapLRN2d: Expected input to be 4D, got {input.dim()}D instead."
+            )
+
+        ctx.scale = input.new()
+        output = input.new()
+        channels = input.size(1)
+
+        output.resize_as_(input)
+        ctx.scale.resize_as_(input)
+
+        # use output storage as temporary buffer
+        input_square = output
+        torch.pow(input, 2, out=input_square)
+
+        # pre_pad - 1 is the window radius: channel c sees [c - r, c + r].
+        pre_pad = int((ctx.size - 1) / 2 + 1)
+        pre_pad_crop = min(pre_pad, channels)
+
+        scale_first = ctx.scale.select(1, 0)
+        scale_first.zero_()
+        # compute first feature map normalization
+        for c in range(pre_pad_crop):
+            scale_first.add_(input_square.select(1, c))
+
+        # reuse computations for next feature maps normalization
+        # by adding the next feature map and removing the previous
+        for c in range(1, channels):
+            scale_previous = ctx.scale.select(1, c - 1)
+            scale_current = ctx.scale.select(1, c)
+            scale_current.copy_(scale_previous)
+            if c < channels - pre_pad + 1:
+                square_next = input_square.select(1, c + pre_pad - 1)
+                scale_current.add_(square_next, alpha=1)
+
+            # Channel c - pre_pad leaves the window as soon as it exists,
+            # i.e. from c == pre_pad on. The previous ``c > pre_pad`` test
+            # skipped that first removal, so channel 0 stayed in every
+            # later window.
+            if c >= pre_pad:
+                square_previous = input_square.select(1, c - pre_pad)
+                scale_current.add_(square_previous, alpha=-1)
+
+        ctx.scale.mul_(ctx.alpha / ctx.size).add_(ctx.k)
+
+        torch.pow(ctx.scale, -ctx.beta, out=output)
+        output.mul_(input)
+
+        ctx.save_for_backward(input, output)
+        return output
+
+    @staticmethod
+    # pyrefly: ignore [bad-override]
+    def backward(ctx, grad_output):
+        """Return the gradient w.r.t. ``input`` (``None`` for the other arguments).
+
+        ``grad_input[c] = grad_output[c] * scale[c] ** -beta
+        - 2 * alpha * beta / size * input[c] * sum(ratio[j])`` where
+        ``ratio[j] = grad_output[j] * output[j] / scale[j]`` and ``j`` runs
+        over the channels whose forward window contains ``c``.
+        """
+        input, output = ctx.saved_tensors
+        grad_input = grad_output.new()
+
+        batch_size = input.size(0)
+        channels = input.size(1)
+        input_height = input.size(2)
+        input_width = input.size(3)
+
+        # Same window radius as the forward pass (pre_pad - 1). The ratio
+        # buffer gets ``radius`` zero rows on each side so that the running
+        # sum over 2 * radius + 1 rows is centred on the current channel.
+        # The previous offset (``size - (size - 1) / 2``) was one row too
+        # far, which shifted the summed window by one channel.
+        radius = int((ctx.size - 1) / 2)
+        window_tail = 2 * radius
+
+        padded_ratio = input.new(channels + window_tail, input_height, input_width)
+        accum_ratio = input.new(input_height, input_width)
+
+        cache_ratio_value = 2 * ctx.alpha * ctx.beta / ctx.size
+
+        grad_input.resize_as_(input)
+        torch.pow(ctx.scale, -ctx.beta, out=grad_input).mul_(grad_output)
+
+        padded_ratio.zero_()
+        padded_ratio_center = padded_ratio.narrow(0, radius, channels)
+        for n in range(batch_size):
+            torch.mul(grad_output[n], output[n], out=padded_ratio_center)
+            padded_ratio_center.div_(ctx.scale[n])
+            # Seed the running sum with every row of the first window
+            # except its last one, which the loop below adds.
+            torch.sum(
+                padded_ratio.narrow(0, 0, window_tail),
+                0,
+                keepdim=False,
+                out=accum_ratio,
+            )
+            for c in range(channels):
+                accum_ratio.add_(padded_ratio[c + window_tail])
+                grad_input[n][c].addcmul_(
+                    input[n][c], accum_ratio, value=-cache_ratio_value
+                )
+                accum_ratio.add_(padded_ratio[c], alpha=-1)
+
+        return grad_input, None, None, None, None
+
+
+class BackwardHookFunction(torch.autograd.Function):
+    """Pass-through autograd function: inputs and gradients are returned as given."""
+
+    @staticmethod
+    # pyrefly: ignore [bad-override]
+    def forward(ctx, *args):
+        """Return ``args`` unchanged, marking inputs without ``requires_grad`` non-differentiable."""
+        frozen = [tensor for tensor in args if not tensor.requires_grad]
+        ctx.mark_non_differentiable(*frozen)
+        return args
+
+    @staticmethod
+    def backward(ctx, *args):
+        """Return the incoming gradients unchanged."""
+        return args
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dac27cdb0d2464847a85e4ee8683326188875977
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/activation.py
@@ -0,0 +1,1905 @@
+# mypy: allow-untyped-defs
+import warnings
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
+from torch.nn.parameter import Parameter
+
+from .linear import NonDynamicallyQuantizableLinear
+from .module import Module
+
+
+# Names exported by a star-import of this module.
+__all__ = [
+    "Threshold",
+    "ReLU",
+    "RReLU",
+    "Hardtanh",
+    "ReLU6",
+    "Sigmoid",
+    "Hardsigmoid",
+    "Tanh",
+    "SiLU",
+    "Mish",
+    "Hardswish",
+    "ELU",
+    "CELU",
+    "SELU",
+    "GLU",
+    "GELU",
+    "Hardshrink",
+    "LeakyReLU",
+    "LogSigmoid",
+    "Softplus",
+    "Softshrink",
+    "MultiheadAttention",
+    "PReLU",
+    "Softsign",
+    "Tanhshrink",
+    "Softmin",
+    "Softmax",
+    "Softmax2d",
+    "LogSoftmax",
+]
+
+
+class Threshold(Module):
+    r"""Thresholds each element of the input Tensor.
+
+    Threshold is defined as:
+
+    .. math::
+        y =
+        \begin{cases}
+        x, &\text{ if } x > \text{threshold} \\
+        \text{value}, &\text{ otherwise }
+        \end{cases}
+
+    Args:
+        threshold: The value to threshold at
+        value: The value to replace with
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Threshold.png
+
+    Examples::
+
+        >>> m = nn.Threshold(0, 0.5)
+        >>> input = torch.arange(-3, 3)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["threshold", "value", "inplace"]
+
+    threshold: float
+    value: float
+    inplace: bool
+
+    def __init__(self, threshold: float, value: float, inplace: bool = False) -> None:
+        super().__init__()
+        self.threshold = threshold
+        self.value = value
+        self.inplace = inplace
+        # TODO: check in THNN (if inplace == True, then assert value <= threshold)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.threshold`` to ``input`` with the stored settings."""
+        return F.threshold(input, self.threshold, self.value, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return the threshold, the value and, when enabled, the in-place flag."""
+        summary = f"threshold={self.threshold}, value={self.value}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class ReLU(Module):
+    r"""Applies the rectified linear unit function element-wise.
+
+    :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)`
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/ReLU.png
+
+    Examples::
+
+        >>> m = nn.ReLU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+
+
+      An implementation of CReLU - https://arxiv.org/abs/1603.05201
+
+        >>> m = nn.ReLU()
+        >>> input = torch.randn(2).unsqueeze(0)
+        >>> output = torch.cat((m(input), m(-input)))
+    """
+
+    __constants__ = ["inplace"]
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.relu`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.relu(input, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``"inplace=True"`` when in-place mode is on, otherwise an empty string."""
+        if self.inplace:
+            return "inplace=True"
+        return ""
+
+
+class RReLU(Module):
+    r"""Applies the randomized leaky rectified linear unit function, element-wise.
+
+    Method described in the paper:
+    `Empirical Evaluation of Rectified Activations in Convolutional Network <https://arxiv.org/abs/1505.00853>`_.
+
+    The function is defined as:
+
+    .. math::
+        \text{RReLU}(x) =
+        \begin{cases}
+            x & \text{if } x \geq 0 \\
+            ax & \text{ otherwise }
+        \end{cases}
+
+    where :math:`a` is randomly sampled from uniform distribution
+    :math:`\mathcal{U}(\text{lower}, \text{upper})` during training while during
+    evaluation :math:`a` is fixed with :math:`a = \frac{\text{lower} + \text{upper}}{2}`.
+
+    Args:
+        lower: lower bound of the uniform distribution. Default: :math:`\frac{1}{8}`
+        upper: upper bound of the uniform distribution. Default: :math:`\frac{1}{3}`
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/RReLU.png
+
+    Examples::
+
+        >>> m = nn.RReLU(0.1, 0.3)
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+
+    """
+
+    __constants__ = ["lower", "upper", "inplace"]
+
+    lower: float
+    upper: float
+    inplace: bool
+
+    def __init__(
+        self, lower: float = 1.0 / 8, upper: float = 1.0 / 3, inplace: bool = False
+    ) -> None:
+        super().__init__()
+        self.lower = lower
+        self.upper = upper
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.rrelu`` to ``input`` using the stored bounds and the module's training flag."""
+        return F.rrelu(
+            input,
+            self.lower,
+            self.upper,
+            training=self.training,
+            inplace=self.inplace,
+        )
+
+    def extra_repr(self) -> str:
+        """Return the bounds and, when enabled, the in-place flag."""
+        summary = f"lower={self.lower}, upper={self.upper}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class Hardtanh(Module):
+    r"""Applies the HardTanh function element-wise.
+
+    HardTanh is defined as:
+
+    .. math::
+        \text{HardTanh}(x) = \begin{cases}
+            \text{max\_val} & \text{ if } x > \text{ max\_val } \\
+            \text{min\_val} & \text{ if } x < \text{ min\_val } \\
+            x & \text{ otherwise } \\
+        \end{cases}
+
+    Args:
+        min_val: minimum value of the linear region range. Default: -1
+        max_val: maximum value of the linear region range. Default: 1
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Keyword arguments :attr:`min_value` and :attr:`max_value`
+    have been deprecated in favor of :attr:`min_val` and :attr:`max_val`.
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Hardtanh.png
+
+    Examples::
+
+        >>> m = nn.Hardtanh(-2, 2)
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["min_val", "max_val", "inplace"]
+
+    min_val: float
+    max_val: float
+    inplace: bool
+
+    def __init__(
+        self,
+        min_val: float = -1.0,
+        max_val: float = 1.0,
+        inplace: bool = False,
+        min_value: float | None = None,
+        max_value: float | None = None,
+    ) -> None:
+        super().__init__()
+        # The deprecated keywords, when given, warn and then take precedence
+        # over ``min_val`` / ``max_val``.
+        if min_value is not None:
+            warnings.warn(
+                "keyword argument `min_value` is deprecated and rename to `min_val`",
+                category=FutureWarning,
+                stacklevel=2,
+            )
+        if max_value is not None:
+            warnings.warn(
+                "keyword argument `max_value` is deprecated and rename to `max_val`",
+                category=FutureWarning,
+                stacklevel=2,
+            )
+
+        self.min_val = min_val if min_value is None else min_value
+        self.max_val = max_val if max_value is None else max_value
+        self.inplace = inplace
+        assert self.max_val > self.min_val
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.hardtanh`` to ``input`` with the stored bounds."""
+        return F.hardtanh(input, self.min_val, self.max_val, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return the bounds and, when enabled, the in-place flag."""
+        summary = f"min_val={self.min_val}, max_val={self.max_val}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class ReLU6(Hardtanh):
+    r"""Applies the ReLU6 function element-wise.
+
+    .. math::
+        \text{ReLU6}(x) = \min(\max(0,x), 6)
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/ReLU6.png
+
+    Examples::
+
+        >>> m = nn.ReLU6()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def __init__(self, inplace: bool = False) -> None:
+        # A Hardtanh whose linear region is fixed to [0, 6].
+        super().__init__(0.0, 6.0, inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``"inplace=True"`` when in-place mode is on, otherwise an empty string."""
+        if self.inplace:
+            return "inplace=True"
+        return ""
+
+
+class Sigmoid(Module):
+    r"""Applies the Sigmoid function element-wise.
+
+    .. math::
+        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}
+
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Sigmoid.png
+
+    Examples::
+
+        >>> m = nn.Sigmoid()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return ``torch.sigmoid(input)``.
+        """
+        return torch.sigmoid(input)
+
+
+class Hardsigmoid(Module):
+    r"""Applies the Hardsigmoid function element-wise.
+
+    Hardsigmoid is defined as:
+
+    .. math::
+        \text{Hardsigmoid}(x) = \begin{cases}
+            0 & \text{if~} x \le -3, \\
+            1 & \text{if~} x \ge +3, \\
+            x / 6 + 1 / 2 & \text{otherwise}
+        \end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Hardsigmoid.png
+
+    Examples::
+
+        >>> m = nn.Hardsigmoid()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["inplace"]
+
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.hardsigmoid`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.hardsigmoid(input, inplace=self.inplace)
+
+
+class Tanh(Module):
+    r"""Applies the Hyperbolic Tangent (Tanh) function element-wise.
+
+    Tanh is defined as:
+
+    .. math::
+        \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)} {\exp(x) + \exp(-x)}
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Tanh.png
+
+    Examples::
+
+        >>> m = nn.Tanh()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return ``torch.tanh(input)``.
+        """
+        return torch.tanh(input)
+
+
+class SiLU(Module):
+    r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise.
+
+    The SiLU function is also known as the swish function.
+
+    .. math::
+        \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.}
+
+    .. note::
+        See `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_
+        where the SiLU (Sigmoid Linear Unit) was originally coined, and see
+        `Sigmoid-Weighted Linear Units for Neural Network Function Approximation
+        in Reinforcement Learning <https://arxiv.org/abs/1702.03118>`_ and `Swish:
+        a Self-Gated Activation Function <https://arxiv.org/abs/1710.05941v1>`_
+        where the SiLU was experimented with later.
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/SiLU.png
+
+    Examples::
+
+        >>> m = nn.SiLU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["inplace"]
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.silu`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.silu(input, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``"inplace=True"`` when in-place mode is on, otherwise an empty string."""
+        if self.inplace:
+            return "inplace=True"
+        return ""
+
+
+class Mish(Module):
+    r"""Applies the Mish function, element-wise.
+
+    Mish: A Self Regularized Non-Monotonic Neural Activation Function.
+
+    .. math::
+        \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x))
+
+    .. note::
+        See `Mish: A Self Regularized Non-Monotonic Neural Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Mish.png
+
+    Examples::
+
+        >>> m = nn.Mish()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["inplace"]
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.mish`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.mish(input, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``"inplace=True"`` when in-place mode is on, otherwise an empty string."""
+        if self.inplace:
+            return "inplace=True"
+        return ""
+
+
+class Hardswish(Module):
+    r"""Applies the Hardswish function, element-wise.
+
+    Method described in the paper: `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`_.
+
+    Hardswish is defined as:
+
+    .. math::
+        \text{Hardswish}(x) = \begin{cases}
+            0 & \text{if~} x \le -3, \\
+            x & \text{if~} x \ge +3, \\
+            x \cdot (x + 3) /6 & \text{otherwise}
+        \end{cases}
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Hardswish.png
+
+    Examples::
+
+        >>> m = nn.Hardswish()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["inplace"]
+
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.hardswish`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.hardswish(input, inplace=self.inplace)
+
+
+class ELU(Module):
+    r"""Applies the Exponential Linear Unit (ELU) function, element-wise.
+
+    Method described in the paper: `Fast and Accurate Deep Network Learning by Exponential Linear
+    Units (ELUs) <https://arxiv.org/abs/1511.07289>`__.
+
+    ELU is defined as:
+
+    .. math::
+        \text{ELU}(x) = \begin{cases}
+        x, & \text{ if } x > 0\\
+        \alpha * (\exp(x) - 1), & \text{ if } x \leq 0
+        \end{cases}
+
+    Args:
+        alpha: the :math:`\alpha` value for the ELU formulation. Default: 1.0
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/ELU.png
+
+    Examples::
+
+        >>> m = nn.ELU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["alpha", "inplace"]
+    alpha: float
+    inplace: bool
+
+    def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None:
+        super().__init__()
+        self.alpha = alpha
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.elu`` to ``input`` with the stored ``alpha``."""
+        return F.elu(input, self.alpha, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``alpha`` and, when enabled, the in-place flag."""
+        summary = f"alpha={self.alpha}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class CELU(Module):
+    r"""Applies the CELU function element-wise.
+
+    .. math::
+        \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))
+
+    More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ .
+
+    Args:
+        alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/CELU.png
+
+    Examples::
+
+        >>> m = nn.CELU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+
+    .. _`Continuously Differentiable Exponential Linear Units`:
+        https://arxiv.org/abs/1704.07483
+    """
+
+    __constants__ = ["alpha", "inplace"]
+    alpha: float
+    inplace: bool
+
+    def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None:
+        super().__init__()
+        self.alpha = alpha
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.celu`` to ``input`` with the stored ``alpha``."""
+        return F.celu(input, self.alpha, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``alpha`` and, when enabled, the in-place flag."""
+        summary = f"alpha={self.alpha}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class SELU(Module):
+    r"""Applies the SELU function element-wise.
+
+    .. math::
+        \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))
+
+    with :math:`\alpha = 1.6732632423543772848170429916717` and
+    :math:`\text{scale} = 1.0507009873554804934193349852946`.
+
+    .. warning::
+        When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation,
+        ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'``
+        in order to get `Self-Normalizing Neural Networks`_.
+        See :func:`torch.nn.init.calculate_gain` for more information.
+
+    More details can be found in the paper `Self-Normalizing Neural Networks`_ .
+
+    Args:
+        inplace (bool, optional): can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/SELU.png
+
+    Examples::
+
+        >>> m = nn.SELU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+
+    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
+    """
+
+    __constants__ = ["inplace"]
+    inplace: bool
+
+    def __init__(self, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.selu`` to ``input``, in place when ``self.inplace`` is set."""
+        return F.selu(input, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return ``"inplace=True"`` when in-place mode is on, otherwise an empty string."""
+        if self.inplace:
+            return "inplace=True"
+        return ""
+
+
+class GLU(Module):
+    r"""Applies the gated linear unit function.
+
+    :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half
+    of the input matrices and :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+
+    Shape:
+        - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
+
+    .. image:: ../scripts/activation_images/GLU.png
+
+    Examples::
+
+        >>> m = nn.GLU()
+        >>> input = torch.randn(4, 2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["dim"]
+    dim: int
+
+    def __init__(self, dim: int = -1) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.glu`` to ``input`` along ``self.dim``."""
+        return F.glu(input, dim=self.dim)
+
+    def extra_repr(self) -> str:
+        """Return the split dimension as ``dim=<value>``."""
+        return "dim={}".format(self.dim)
+
+
+class GELU(Module):
+    r"""Applies the Gaussian Error Linear Units function.
+
+    .. math:: \text{GELU}(x) = x * \Phi(x)
+
+    where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution.
+
+    When the approximate argument is 'tanh', Gelu is estimated with:
+
+    .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3)))
+
+    Args:
+        approximate (str, optional): the gelu approximation algorithm to use:
+            ``'none'`` | ``'tanh'``. Default: ``'none'``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/GELU.png
+
+    Examples::
+
+        >>> m = nn.GELU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["approximate"]
+    approximate: str
+
+    def __init__(self, approximate: str = "none") -> None:
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.gelu`` to ``input`` using the stored approximation mode."""
+        return F.gelu(input, approximate=self.approximate)
+
+    def extra_repr(self) -> str:
+        """Return the approximation mode as ``approximate=<repr>``."""
+        return f"approximate={self.approximate!r}"
+
+
+class Hardshrink(Module):
+    r"""Applies the Hard Shrinkage (Hardshrink) function element-wise.
+
+    Hardshrink is defined as:
+
+    .. math::
+        \text{HardShrink}(x) =
+        \begin{cases}
+        x, & \text{ if } x > \lambda \\
+        x, & \text{ if } x < -\lambda \\
+        0, & \text{ otherwise }
+        \end{cases}
+
+    Args:
+        lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Hardshrink.png
+
+    Examples::
+
+        >>> m = nn.Hardshrink()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["lambd"]
+    lambd: float
+
+    def __init__(self, lambd: float = 0.5) -> None:
+        super().__init__()
+        self.lambd = lambd
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Apply ``F.hardshrink`` to ``input`` with the stored ``lambd``.
+        """
+        return F.hardshrink(input, self.lambd)
+
+    def extra_repr(self) -> str:
+        """
+        Return the bare ``lambd`` value formatted as a string (no ``lambd=`` prefix).
+        """
+        return f"{self.lambd}"
+
+
+class LeakyReLU(Module):
+    r"""Applies the LeakyReLU function element-wise.
+
+    .. math::
+        \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)
+
+
+    or
+
+    .. math::
+        \text{LeakyReLU}(x) =
+        \begin{cases}
+        x, & \text{ if } x \geq 0 \\
+        \text{negative\_slope} \times x, & \text{ otherwise }
+        \end{cases}
+
+    Args:
+        negative_slope: Controls the angle of the negative slope (which is used for
+          negative input values). Default: 1e-2
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    .. image:: ../scripts/activation_images/LeakyReLU.png
+
+    Examples::
+
+        >>> m = nn.LeakyReLU(0.1)
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["inplace", "negative_slope"]
+    inplace: bool
+    negative_slope: float
+
+    def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None:
+        super().__init__()
+        self.negative_slope = negative_slope
+        self.inplace = inplace
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.leaky_relu`` to ``input`` with the stored slope."""
+        return F.leaky_relu(input, self.negative_slope, inplace=self.inplace)
+
+    def extra_repr(self) -> str:
+        """Return the negative slope and, when enabled, the in-place flag."""
+        summary = f"negative_slope={self.negative_slope}"
+        if self.inplace:
+            summary += ", inplace=True"
+        return summary
+
+
+class LogSigmoid(Module):
+    r"""Applies the Logsigmoid function element-wise.
+
+    .. math::
+        \text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right)
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/LogSigmoid.png
+
+    Examples::
+
+        >>> m = nn.LogSigmoid()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return ``F.logsigmoid(input)``.
+        """
+        return F.logsigmoid(input)
+
+
+class Softplus(Module):
+    r"""Applies the Softplus function element-wise.
+
+    .. math::
+        \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x))
+
+    SoftPlus is a smooth approximation to the ReLU function and can be used
+    to constrain the output of a machine to always be positive.
+
+    For numerical stability the implementation reverts to the linear function
+    when :math:`input \times \beta > threshold`.
+
+    Args:
+        beta: the :math:`\beta` value for the Softplus formulation. Default: 1
+        threshold: values above this revert to a linear function. Default: 20
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Softplus.png
+
+    Examples::
+
+        >>> m = nn.Softplus()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    # Both hyper-parameters are stored unchanged on the instance by __init__
+    # and forwarded as-is to F.softplus on every call.
+    __constants__ = ["beta", "threshold"]
+    beta: float
+    threshold: float
+
+    def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None:
+        super().__init__()
+        self.beta = beta
+        self.threshold = threshold
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Run forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.softplus(input, self.beta, self.threshold)``.
+        """
+        return F.softplus(input, self.beta, self.threshold)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+
+        The string has the form ``beta=<beta>, threshold=<threshold>``.
+        """
+        return f"beta={self.beta}, threshold={self.threshold}"
+
+
+class Softshrink(Module):
+    r"""Applies the soft shrinkage function element-wise.
+
+    .. math::
+        \text{SoftShrinkage}(x) =
+        \begin{cases}
+        x - \lambda, & \text{ if } x > \lambda \\
+        x + \lambda, & \text{ if } x < -\lambda \\
+        0, & \text{ otherwise }
+        \end{cases}
+
+    Args:
+        lambd: the :math:`\lambda` (must be no less than zero) value for the Softshrink formulation. Default: 0.5
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Softshrink.png
+
+    Examples::
+
+        >>> m = nn.Softshrink()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["lambd"]
+    lambd: float
+
+    def __init__(self, lambd: float = 0.5) -> None:
+        # NOTE: the "no less than zero" requirement from the class docstring is
+        # not checked here; the value is stored as given.
+        super().__init__()
+        self.lambd = lambd
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Run forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.softshrink(input, self.lambd)``.
+        """
+        return F.softshrink(input, self.lambd)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+
+        Only the bare value of ``lambd`` is returned, without a ``lambd=`` prefix.
+        """
+        return str(self.lambd)
+
+
+def _check_arg_device(x: torch.Tensor | None) -> bool:
+    """Return whether ``x`` lives on a device type accepted by the callers.
+
+    A ``None`` argument is accepted (returns ``True``). Otherwise the result is
+    ``True`` only when ``x.device.type`` is ``"cpu"``, ``"cuda"``, or the value of
+    ``torch.utils.backend_registration._privateuse1_backend_name``.
+    """
+    if x is not None:
+        return x.device.type in [
+            "cpu",
+            "cuda",
+            torch.utils.backend_registration._privateuse1_backend_name,
+        ]
+    # Missing (None) arguments never disqualify the caller.
+    return True
+
+
+def _arg_requires_grad(x: torch.Tensor | None) -> bool:
+    """Return ``x.requires_grad``, treating a ``None`` argument as ``False``."""
+    if x is not None:
+        return x.requires_grad
+    return False
+
+
+def _is_make_fx_tracing():
+    """Report whether the call is happening under proxy-tensor tracing or export.
+
+    Returns ``False`` whenever ``torch.jit.is_scripting()`` is true. Otherwise
+    returns ``True`` if any entry of the current torch-dispatch mode stack is
+    exactly a ``ProxyTorchDispatchMode`` (subclasses do not match, because the
+    check uses ``type(x) is ...``) or if ``torch.compiler.is_exporting()`` is true.
+    """
+    if not torch.jit.is_scripting():
+        torch_dispatch_mode_stack = (
+            torch.utils._python_dispatch._get_current_dispatch_mode_stack()
+        )
+        # this can be triggered when dynamo inlining the module too.
+        return (
+            any(
+                type(x) is torch.fx.experimental.proxy_tensor.ProxyTorchDispatchMode
+                for x in torch_dispatch_mode_stack
+            )
+            or torch.compiler.is_exporting()
+        )
+    else:
+        return False
+
+
+class MultiheadAttention(Module):
+    r"""Allows the model to jointly attend to information from different representation subspaces.
+
+    This MultiheadAttention layer implements the original architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build efficient layers from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    Multi-Head Attention is defined as:
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,\dots,\text{head}_h)W^O
+
+    where :math:`\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+    ``nn.MultiheadAttention`` will use the optimized implementations of
+    ``scaled_dot_product_attention()`` when possible.
+
+    In addition to support for the new ``scaled_dot_product_attention()``
+    function, for speeding up Inference, MHA will use
+    fastpath inference with support for Nested Tensors, iff:
+
+    - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
+    - inputs are batched (3D) with ``batch_first==True``
+    - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
+    - training is disabled (using ``.eval()``)
+    - ``add_bias_kv`` is ``False``
+    - ``add_zero_attn`` is ``False``
+    - ``kdim`` and ``vdim`` are equal to ``embed_dim``
+    - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
+      nor ``attn_mask`` is passed
+    - autocast is disabled
+
+    If the optimized inference fastpath implementation is in use, a
+    `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
+    ``query``/``key``/``value`` to represent padding more efficiently than using a
+    padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
+    will be returned, and an additional speedup proportional to the fraction of the input
+    that is padding can be expected.
+
+    Args:
+        embed_dim: Total dimension of the model.
+        num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
+            across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
+        dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
+        bias: If specified, adds bias to input / output projection layers. Default: ``True``.
+        add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
+        add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
+            Default: ``False``.
+        kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
+        vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+
+    .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
+         https://arxiv.org/abs/2205.14135
+
+    """
+
+    __constants__ = ["batch_first"]
+    # Optional learned biases; both are None unless ``add_bias_kv`` is true.
+    bias_k: torch.Tensor | None
+    bias_v: torch.Tensor | None
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        kdim=None,
+        vdim=None,
+        batch_first=False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Validate the sizes, create the projection parameters and initialize them.
+
+        Raises:
+            ValueError: if ``embed_dim`` or ``num_heads`` is not greater than 0.
+            AssertionError: if ``embed_dim`` is not divisible by ``num_heads``.
+        """
+        # Size validation happens before Module.__init__ so that an invalid
+        # configuration never produces a partially built module.
+        if embed_dim <= 0 or num_heads <= 0:
+            raise ValueError(
+                f"embed_dim and num_heads must be greater than 0,"
+                f" got embed_dim={embed_dim} and num_heads={num_heads} instead"
+            )
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        # True when keys and values have the same feature size as the queries;
+        # selects between one packed projection weight and three separate ones.
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.batch_first = batch_first
+        self.head_dim = embed_dim // num_heads
+        # NOTE(review): this is an ``assert``, so the divisibility check is
+        # skipped when Python runs with -O.
+        assert self.head_dim * num_heads == self.embed_dim, (
+            "embed_dim must be divisible by num_heads"
+        )
+
+        if not self._qkv_same_embed_dim:
+            # Separate q/k/v projection weights; the packed weight is registered
+            # as None so the attribute still exists.
+            self.q_proj_weight = Parameter(
+                torch.empty((embed_dim, embed_dim), **factory_kwargs)
+            )
+            self.k_proj_weight = Parameter(
+                torch.empty((embed_dim, self.kdim), **factory_kwargs)
+            )
+            self.v_proj_weight = Parameter(
+                torch.empty((embed_dim, self.vdim), **factory_kwargs)
+            )
+            self.register_parameter("in_proj_weight", None)
+        else:
+            # Single packed weight of shape (3 * embed_dim, embed_dim); the
+            # separate weights are registered as None.
+            self.in_proj_weight = Parameter(
+                torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
+            )
+            self.register_parameter("q_proj_weight", None)
+            self.register_parameter("k_proj_weight", None)
+            self.register_parameter("v_proj_weight", None)
+
+        # The input-projection bias is always packed (3 * embed_dim), regardless
+        # of whether the weights are packed or separate.
+        if bias:
+            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
+        else:
+            self.register_parameter("in_proj_bias", None)
+        self.out_proj = NonDynamicallyQuantizableLinear(
+            embed_dim, embed_dim, bias=bias, **factory_kwargs
+        )
+
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+            self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
+        else:
+            self.bias_k = self.bias_v = None
+
+        self.add_zero_attn = add_zero_attn
+
+        # All tensors above were created with torch.empty (uninitialized memory);
+        # this call gives them their initial values.
+        self._reset_parameters()
+
+    def _reset_parameters(self) -> None:
+        """Initialize the parameters created in ``__init__``.
+
+        Projection weights go through ``xavier_uniform_``, the input and output
+        projection biases are set to ``0.0`` with ``constant_``, and ``bias_k`` /
+        ``bias_v`` (when present) go through ``xavier_normal_``.
+        """
+        if self._qkv_same_embed_dim:
+            xavier_uniform_(self.in_proj_weight)
+        else:
+            xavier_uniform_(self.q_proj_weight)
+            xavier_uniform_(self.k_proj_weight)
+            xavier_uniform_(self.v_proj_weight)
+
+        if self.in_proj_bias is not None:
+            constant_(self.in_proj_bias, 0.0)
+            # out_proj.bias is zeroed under the same check: __init__ creates both
+            # biases from the same ``bias`` flag, so they exist together.
+            constant_(self.out_proj.bias, 0.0)
+        if self.bias_k is not None:
+            xavier_normal_(self.bias_k)
+        if self.bias_v is not None:
+            xavier_normal_(self.bias_v)
+
+    def __setstate__(self, state):
+        """Restore pickled state, defaulting ``_qkv_same_embed_dim`` to ``True`` when absent."""
+        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+        if "_qkv_same_embed_dim" not in state:
+            state["_qkv_same_embed_dim"] = True
+
+        super().__setstate__(state)
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        key_padding_mask: Tensor | None = None,
+        need_weights: bool = True,
+        attn_mask: Tensor | None = None,
+        average_attn_weights: bool = True,
+        is_causal: bool = False,
+    ) -> tuple[Tensor, Tensor | None]:
+        r"""Compute attention outputs using query, key, and value embeddings.
+
+            Supports optional parameters for padding, masks and attention weights.
+
+        Args:
+            query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
+                or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
+                :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
+                Queries are compared against key-value pairs to produce the output.
+                See "Attention Is All You Need" for more details.
+            key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
+                or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
+                :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
+                See "Attention Is All You Need" for more details.
+            value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
+                ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
+                sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
+                See "Attention Is All You Need" for more details.
+            key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
+                to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
+                Binary and float masks are supported.
+                For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
+                the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
+            need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
+                Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention``
+                and achieve the best performance for MHA.
+                Default: ``True``.
+            attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
+                :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
+                :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
+                broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
+                Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the
+                corresponding position is not allowed to attend. For a float mask, the mask values will be added to
+                the attention weight.
+                If both attn_mask and key_padding_mask are supplied, their types should match.
+            average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
+                heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
+                effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
+            is_causal: If specified, applies a causal mask as attention mask.
+                Default: ``False``.
+                Warning:
+                ``is_causal`` provides a hint that ``attn_mask`` is the
+                causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+
+        Outputs:
+            - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
+              :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
+              where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
+              embedding dimension ``embed_dim``.
+            - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
+              returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
+              :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
+              :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
+              head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
+
+            .. note::
+                `batch_first` argument is ignored for unbatched inputs.
+        """  # noqa: B950
+        # ``why_not_fast_path`` stays empty only while every fast-path
+        # precondition holds; the first failing check stores its reason here.
+        why_not_fast_path = ""
+        # ``and`` binds tighter than ``or``: the condition is true when either
+        # mask is present and floating point. This must be evaluated before the
+        # masks are canonicalized below, since that step rebinds both names.
+        if (
+            (attn_mask is not None and torch.is_floating_point(attn_mask))
+            or (key_padding_mask is not None)
+            and torch.is_floating_point(key_padding_mask)
+        ):
+            why_not_fast_path = "floating-point masks are not supported for fast path."
+
+        is_batched = query.dim() == 3
+
+        key_padding_mask = F._canonical_mask(
+            mask=key_padding_mask,
+            mask_name="key_padding_mask",
+            other_type=F._none_or_dtype(attn_mask),
+            other_name="attn_mask",
+            target_type=query.dtype,
+        )
+
+        attn_mask = F._canonical_mask(
+            mask=attn_mask,
+            mask_name="attn_mask",
+            other_type=None,
+            other_name="",
+            target_type=query.dtype,
+            check_other=False,
+        )
+
+        is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()
+
+        # NOTE: this chain starts with a plain ``if``, so a matching branch
+        # overwrites the floating-point-mask reason recorded above; when no
+        # branch matches, that earlier reason is kept.
+        if not is_fastpath_enabled:
+            why_not_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
+        elif not is_batched:
+            why_not_fast_path = (
+                f"input not batched; expected query.dim() of 3 but got {query.dim()}"
+            )
+        elif query is not key or key is not value:
+            # When lifting this restriction, don't forget to either
+            # enforce that the dtypes all match or test cases where
+            # they don't!
+            why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
+        elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
+            why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
+        elif self.in_proj_weight is None:
+            why_not_fast_path = "in_proj_weight was None"
+        elif query.dtype != self.in_proj_weight.dtype:
+            # this case will fail anyway, but at least they'll get a useful error message.
+            why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
+        elif self.training:
+            why_not_fast_path = "training is enabled"
+        elif (self.num_heads % 2) != 0:
+            why_not_fast_path = "self.num_heads is not even"
+        elif not self.batch_first:
+            why_not_fast_path = "batch_first was not True"
+        elif self.bias_k is not None:
+            why_not_fast_path = "self.bias_k was not None"
+        elif self.bias_v is not None:
+            why_not_fast_path = "self.bias_v was not None"
+        elif self.add_zero_attn:
+            why_not_fast_path = "add_zero_attn was enabled"
+        elif not self._qkv_same_embed_dim:
+            why_not_fast_path = "_qkv_same_embed_dim was not True"
+        # NOTE(review): the message below says "both" masks, but the condition
+        # fires when either one is supplied. The backslash continuation also
+        # embeds the next line's leading whitespace in the message text.
+        elif query.is_nested and (
+            key_padding_mask is not None or attn_mask is not None
+        ):
+            why_not_fast_path = (
+                "supplying both src_key_padding_mask and src_mask at the same time \
+                                 is not supported with NestedTensor input"
+            )
+        elif torch.is_autocast_enabled():
+            why_not_fast_path = "autocast is enabled"
+
+        # Second stage: checks that need the full set of tensor arguments.
+        if not why_not_fast_path:
+            tensor_args = (
+                query,
+                key,
+                value,
+                self.in_proj_weight,
+                self.in_proj_bias,
+                self.out_proj.weight,
+                self.out_proj.bias,
+            )
+            # We have to use list comprehensions below because TorchScript does not support
+            # generator expressions.
+            if torch.overrides.has_torch_function(tensor_args):
+                why_not_fast_path = "some Tensor argument has_torch_function"
+            elif _is_make_fx_tracing():
+                why_not_fast_path = "we are running make_fx tracing"
+            elif not all(_check_arg_device(x) for x in tensor_args):
+                why_not_fast_path = (
+                    "some Tensor argument's device is neither one of "
+                    f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}"
+                )
+            elif torch.is_grad_enabled() and any(
+                _arg_requires_grad(x) for x in tensor_args
+            ):
+                why_not_fast_path = (
+                    "grad is enabled and at least one of query or the "
+                    "input/output projection weights or biases requires_grad"
+                )
+            if not why_not_fast_path:
+                merged_mask, mask_type = self.merge_masks(
+                    attn_mask, key_padding_mask, query
+                )
+
+                # The native kernel is only called when both the packed weight
+                # and the packed bias exist; otherwise execution falls through
+                # to the general implementation below with an empty reason.
+                if self.in_proj_bias is not None and self.in_proj_weight is not None:
+                    return torch._native_multi_head_attention(
+                        query,
+                        key,
+                        value,
+                        self.embed_dim,
+                        self.num_heads,
+                        self.in_proj_weight,
+                        self.in_proj_bias,
+                        self.out_proj.weight,
+                        self.out_proj.bias,
+                        merged_mask,
+                        need_weights,
+                        average_attn_weights,
+                        mask_type,
+                    )
+
+        # From here on only the general (non-fast-path) implementation runs, and
+        # it rejects nested tensors outright.
+        any_nested = query.is_nested or key.is_nested or value.is_nested
+        assert not any_nested, (
+            "MultiheadAttention does not support NestedTensor outside of its fast path. "
+            + f"The fast path was not hit because {why_not_fast_path}"
+        )
+
+        # Batched batch_first inputs have their first two dims swapped before
+        # the functional call; the output is swapped back at the end.
+        if self.batch_first and is_batched:
+            # make sure that the transpose op does not affect the "is" property
+            if key is value:
+                if query is key:
+                    query = key = value = query.transpose(1, 0)
+                else:
+                    query, key = (x.transpose(1, 0) for x in (query, key))
+                    value = key
+            else:
+                query, key, value = (x.transpose(1, 0) for x in (query, key, value))
+
+        # The two calls differ only in the separate-projection-weight arguments.
+        if not self._qkv_same_embed_dim:
+            attn_output, attn_output_weights = F.multi_head_attention_forward(
+                query,
+                key,
+                value,
+                self.embed_dim,
+                self.num_heads,
+                self.in_proj_weight,
+                self.in_proj_bias,
+                self.bias_k,
+                self.bias_v,
+                self.add_zero_attn,
+                self.dropout,
+                self.out_proj.weight,
+                self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask,
+                need_weights=need_weights,
+                attn_mask=attn_mask,
+                use_separate_proj_weight=True,
+                q_proj_weight=self.q_proj_weight,
+                k_proj_weight=self.k_proj_weight,
+                v_proj_weight=self.v_proj_weight,
+                average_attn_weights=average_attn_weights,
+                is_causal=is_causal,
+            )
+        else:
+            attn_output, attn_output_weights = F.multi_head_attention_forward(
+                query,
+                key,
+                value,
+                self.embed_dim,
+                self.num_heads,
+                self.in_proj_weight,
+                self.in_proj_bias,
+                self.bias_k,
+                self.bias_v,
+                self.add_zero_attn,
+                self.dropout,
+                self.out_proj.weight,
+                self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask,
+                need_weights=need_weights,
+                attn_mask=attn_mask,
+                average_attn_weights=average_attn_weights,
+                is_causal=is_causal,
+            )
+        # Undo the input transpose on the output only; the weights are returned
+        # exactly as the functional call produced them.
+        if self.batch_first and is_batched:
+            return attn_output.transpose(1, 0), attn_output_weights
+        else:
+            return attn_output, attn_output_weights
+
+    def merge_masks(
+        self,
+        attn_mask: Tensor | None,
+        key_padding_mask: Tensor | None,
+        query: Tensor,
+    ) -> tuple[Tensor | None, int | None]:
+        r"""Determine mask type and combine masks if necessary.
+
+        If only one mask is provided, that mask
+        and the corresponding mask type will be returned. If both masks are provided, they will be both
+        expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or``
+        and mask type 2 will be returned.
+
+        Args:
+            attn_mask: attention mask of shape ``(seq_len, seq_len)``, mask type 0
+            key_padding_mask: padding mask of shape ``(batch_size, seq_len)``, mask type 1
+            query: query embeddings of shape ``(batch_size, seq_len, embed_dim)``
+
+        Returns:
+            merged_mask: merged mask
+            mask_type: merged mask type (0, 1, or 2)
+        """
+        mask_type: int | None = None
+        merged_mask: Tensor | None = None
+
+        if key_padding_mask is not None:
+            mask_type = 1
+            merged_mask = key_padding_mask
+
+        if attn_mask is not None:
+            # In this branch query can't be a nested tensor, so it has a shape
+            batch_size, seq_len, _ = query.shape
+            # NOTE(review): mask type 0 is never produced by this method. An
+            # attn_mask on its own is still expanded to 4D and reported as
+            # type 2, which differs from what the docstring describes.
+            mask_type = 2
+
+            # Always expands attn_mask to 4D
+            if attn_mask.dim() == 3:
+                attn_mask_expanded = attn_mask.view(batch_size, -1, seq_len, seq_len)
+            else:  # attn_mask.dim() == 2:
+                attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(
+                    batch_size, self.num_heads, -1, -1
+                )
+            merged_mask = attn_mask_expanded
+
+            if key_padding_mask is not None:
+                key_padding_mask_expanded = key_padding_mask.view(
+                    batch_size, 1, 1, seq_len
+                ).expand(-1, self.num_heads, -1, -1)
+                # NOTE(review): the masks are combined with ``+`` here, while the
+                # docstring says logical ``or``. Presumably equivalent for the
+                # canonicalized masks that forward() passes in; confirm.
+                merged_mask = attn_mask_expanded + key_padding_mask_expanded
+
+        # no attn_mask and no key_padding_mask, returns None, None
+        return merged_mask, mask_type
+
+
+class PReLU(Module):
+    r"""Applies the element-wise PReLU function.
+
+    .. math::
+        \text{PReLU}(x) = \max(0,x) + a * \min(0,x)
+
+    or
+
+    .. math::
+        \text{PReLU}(x) =
+        \begin{cases}
+        x, & \text{ if } x \ge 0 \\
+        ax, & \text{ otherwise }
+        \end{cases}
+
+    Here :math:`a` is a learnable parameter. When called without arguments, `nn.PReLU()` uses a single
+    parameter :math:`a` across all input channels. If called with `nn.PReLU(nChannels)`,
+    a separate :math:`a` is used for each input channel.
+
+
+    .. note::
+        weight decay should not be used when learning :math:`a` for good performance.
+
+    .. note::
+        Channel dim is the 2nd dim of input. When input has dims < 2, then there is
+        no channel dim and the number of channels = 1.
+
+    Args:
+        num_parameters (int): number of :math:`a` to learn.
+            Although it takes an int as input, only two values are legitimate:
+            1, or the number of channels at input. Default: 1
+        init (float): the initial value of :math:`a`. Default: 0.25
+
+    Shape:
+        - Input: :math:`( *)` where `*` means, any number of additional
+          dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    Attributes:
+        weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
+
+    .. image:: ../scripts/activation_images/PReLU.png
+
+    Examples::
+
+        >>> m = nn.PReLU()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["num_parameters"]
+    num_parameters: int
+
+    def __init__(
+        self, num_parameters: int = 1, init: float = 0.25, device=None, dtype=None
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # NOTE(review): this attribute is assigned before super().__init__()
+        # runs; presumably safe because it is a plain int rather than a
+        # Parameter or sub-module. Confirm before reordering.
+        self.num_parameters = num_parameters
+        super().__init__()
+        # ``init`` is kept on the instance so reset_parameters() can reuse it.
+        self.init = init
+        # Created uninitialized; reset_parameters() fills it with ``init``.
+        self.weight = Parameter(torch.empty(num_parameters, **factory_kwargs))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in ``__init__``.
+
+        Every element of ``weight`` is set to ``self.init``.
+        """
+        torch.nn.init.constant_(self.weight, self.init)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.prelu(input, self.weight)``.
+        """
+        return F.prelu(input, self.weight)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+
+        The string has the form ``num_parameters=<num_parameters>``.
+        """
+        return f"num_parameters={self.num_parameters}"
+
+
+class Softsign(Module):
+    r"""Applies the element-wise Softsign function.
+
+    .. math::
+        \text{SoftSign}(x) = \frac{x}{ 1 + |x|}
+
+    The class defines no ``__init__`` and no attributes of its own; ``forward``
+    simply delegates to ``F.softsign``.
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Softsign.png
+
+    Examples::
+
+        >>> m = nn.Softsign()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.softsign(input)``.
+        """
+        return F.softsign(input)
+
+
+class Tanhshrink(Module):
+    r"""Applies the element-wise Tanhshrink function.
+
+    .. math::
+        \text{Tanhshrink}(x) = x - \tanh(x)
+
+    The class defines no ``__init__`` and no attributes of its own; ``forward``
+    simply delegates to ``F.tanhshrink``.
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    .. image:: ../scripts/activation_images/Tanhshrink.png
+
+    Examples::
+
+        >>> m = nn.Tanhshrink()
+        >>> input = torch.randn(2)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.tanhshrink(input)``.
+        """
+        return F.tanhshrink(input)
+
+
+class Softmin(Module):
+    r"""Applies the Softmin function to an n-dimensional input Tensor.
+
+    Rescales them so that the elements of the n-dimensional output Tensor
+    lie in the range `[0, 1]` and sum to 1.
+
+    Softmin is defined as:
+
+    .. math::
+        \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)}
+
+    Shape:
+        - Input: :math:`(*)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Args:
+        dim (int): A dimension along which Softmin will be computed (so every slice
+            along dim will sum to 1).
+
+    Returns:
+        a Tensor of the same dimension and shape as the input, with
+        values in the range [0, 1]
+
+    Examples::
+
+        >>> m = nn.Softmin(dim=1)
+        >>> input = torch.randn(2, 3)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["dim"]
+    # ``None`` is accepted and passed through to F.softmin unchanged.
+    dim: int | None
+
+    def __init__(self, dim: int | None = None) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def __setstate__(self, state):
+        """Restore state, defaulting ``dim`` to ``None`` when the state lacks it."""
+        super().__setstate__(state)
+        if not hasattr(self, "dim"):
+            self.dim = None
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+
+        Args:
+            input: the tensor to transform.
+
+        Returns:
+            The result of ``F.softmin(input, self.dim, _stacklevel=5)``.
+        """
+        # NOTE(review): ``_stacklevel=5`` presumably positions a warning raised
+        # inside F.softmin at the user's call site; confirm against F.softmin.
+        return F.softmin(input, self.dim, _stacklevel=5)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+
+        The string has the form ``dim=<dim>``.
+        """
+        return f"dim={self.dim}"
+
+
+class Softmax(Module):
+    r"""Applies the Softmax function to an n-dimensional input Tensor.
+
+    Rescales the input so that the elements of the n-dimensional output Tensor
+    lie in the range [0, 1] and sum to 1.
+
+    Softmax is defined as:
+
+    .. math::
+        \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
+
+    When the input Tensor is a sparse tensor then the unspecified
+    values are treated as ``-inf``.
+
+    Shape:
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [0, 1]
+
+    Args:
+        dim (int, optional): A dimension along which Softmax will be computed (so every
+            slice along dim will sum to 1). Default: ``None``
+
+    .. note::
+        This module doesn't work directly with NLLLoss,
+        which expects the Log to be computed between the Softmax and itself.
+        Use `LogSoftmax` instead (it's faster and has better numerical properties).
+
+    Examples::
+
+        >>> m = nn.Softmax(dim=1)
+        >>> input = torch.randn(2, 3)
+        >>> output = m(input)
+
+    """
+
+    __constants__ = ["dim"]
+    dim: int | None
+
+    def __init__(self, dim: int | None = None) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        if not hasattr(self, "dim"):  # restored state carried no ``dim``; fall back to None
+            self.dim = None
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return ``F.softmax`` of ``input`` computed along ``self.dim``.
+        """
+        return F.softmax(input, self.dim, _stacklevel=5)
+
+    def extra_repr(self) -> str:
+        """
+        Return the ``dim=...`` string used as this module's extra representation.
+        """
+        return f"dim={self.dim}"
+
+
+class Softmax2d(Module):
+    r"""Applies SoftMax over features to each spatial location.
+
+    When given an image of ``Channels x Height x Width``, it will
+    apply `Softmax` to each location :math:`(Channels, h_i, w_j)`
+
+    Shape:
+        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`.
+        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [0, 1]
+
+    Examples::
+
+        >>> m = nn.Softmax2d()
+        >>> # softmax is taken over the 2nd dimension (dim -3) of this 4D input
+        >>> input = torch.randn(2, 3, 12, 13)
+        >>> output = m(input)
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return softmax of ``input`` along dim ``-3``; raise ``ValueError`` unless it is 3D or 4D.
+        """
+        if input.dim() not in (3, 4):
+            raise ValueError(
+                f"Softmax2d: expected input to be 3D or 4D, got {input.dim()}D instead"
+            )
+        return F.softmax(input, -3, _stacklevel=5)
+
+
+class LogSoftmax(Module):
+    r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor.
+
+    The LogSoftmax formulation can be simplified as:
+
+    .. math::
+        \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right)
+
+    Shape:
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
+
+    Args:
+        dim (int, optional): A dimension along which LogSoftmax will be computed. Default: ``None``
+
+    Returns:
+        a Tensor of the same dimension and shape as the input with
+        values in the range [-inf, 0] (0 is reached when a Softmax value equals 1)
+
+    Examples::
+
+        >>> m = nn.LogSoftmax(dim=1)
+        >>> input = torch.randn(2, 3)
+        >>> output = m(input)
+    """
+
+    __constants__ = ["dim"]
+    dim: int | None
+
+    def __init__(self, dim: int | None = None) -> None:
+        super().__init__()
+        self.dim = dim
+
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        if not hasattr(self, "dim"):  # restored state carried no ``dim``; fall back to None
+            self.dim = None
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Return ``F.log_softmax`` of ``input`` computed along ``self.dim``.
+        """
+        return F.log_softmax(input, self.dim, _stacklevel=5)
+
+    def extra_repr(self) -> str:
+        """
+        Return the ``dim=...`` string used as this module's extra representation.
+        """
+        return f"dim={self.dim}"
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py
new file mode 100644
index 0000000000000000000000000000000000000000..4267ed9993bff1ff69d57028308f4a3121ef2050
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/adaptive.py
@@ -0,0 +1,339 @@
+# mypy: allow-untyped-defs
+
+import itertools
+from collections import namedtuple
+from collections.abc import Sequence
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from .container import ModuleList, Sequential
+from .linear import Linear
+from .module import Module
+
+
+__all__ = ["AdaptiveLogSoftmaxWithLoss"]
+
+_ASMoutput = namedtuple("_ASMoutput", ["output", "loss"])  # value returned by AdaptiveLogSoftmaxWithLoss.forward
+
+
+class AdaptiveLogSoftmaxWithLoss(Module):
+    (
+        """Efficient softmax approximation.
+
+    As described in
+    `Efficient softmax approximation for GPUs by Edouard Grave, Armand Joulin,
+    Moustapha Ciss\u00e9, David Grangier, and Herv\u00e9 J\u00e9gou
+    <https://arxiv.org/abs/1609.04309>`__.
+"""
+        r"""
+    Adaptive softmax is an approximate strategy for training models with large
+    output spaces. It is most effective when the label distribution is highly
+    imbalanced, for example in natural language modelling, where the word
+    frequency distribution approximately follows the `Zipf's law`_.
+
+    Adaptive softmax partitions the labels into several clusters, according to
+    their frequency. These clusters may contain different number of targets
+    each.
+    Additionally, clusters containing less frequent labels assign lower
+    dimensional embeddings to those labels, which speeds up the computation.
+    For each minibatch, only clusters for which at least one target is
+    present are evaluated.
+
+    The idea is that the clusters which are accessed frequently
+    (like the first one, containing most frequent labels), should also be cheap
+    to compute -- that is, contain a small number of assigned labels.
+
+    We highly recommend taking a look at the original paper for more details.
+
+    * :attr:`cutoffs` should be an ordered Sequence of integers sorted
+      in increasing order.
+      It controls number of clusters and the partitioning of targets into
+      clusters. For example setting ``cutoffs = [10, 100, 1000]``
+      means that the first `10` targets (`0, 1, ..., 9`) will be assigned
+      to the 'head' of the adaptive softmax, targets `10, 11, ..., 99` will be
+      assigned to the first cluster, and targets `100, 101, ..., 999` will be
+      assigned to the second cluster, while targets
+      `1000, 1001, ..., n_classes - 1` will be assigned
+      to the last, third cluster.
+
+    * :attr:`div_value` is used to compute the projection (hidden) size of each
+      additional cluster, which is given as
+      :math:`\left\lfloor\frac{\texttt{in\_features}}{\texttt{div\_value}^{idx}}\right\rfloor`,
+      where :math:`idx` is the cluster index (with clusters
+      for less frequent words having larger indices,
+      and indices starting from :math:`1`).
+
+    * :attr:`head_bias` if set to True, adds a bias term to the 'head' of the
+      adaptive softmax. See paper for details. Set to False in the official
+      implementation.
+
+    .. warning::
+        Labels passed as inputs to this module should be sorted according to
+        their frequency. This means that the most frequent label should be
+        represented by the index `0`, and the least frequent
+        label should be represented by the index `n_classes - 1`.
+
+    .. note::
+        This module returns a ``NamedTuple`` with ``output``
+        and ``loss`` fields. See further documentation for details.
+
+    .. note::
+        To compute log-probabilities for all classes, the ``log_prob``
+        method can be used.
+
+    Args:
+        in_features (int): Number of features in the input tensor
+        n_classes (int): Number of classes in the dataset
+        cutoffs (Sequence): Cutoffs used to assign targets to their buckets
+        div_value (float, optional): value whose powers divide ``in_features`` to
+            give the projection sizes of the clusters. Default: 4.0
+        head_bias (bool, optional): If ``True``, adds a bias term to the 'head' of the
+            adaptive softmax. Default: ``False``
+
+    Returns:
+        ``NamedTuple`` with ``output`` and ``loss`` fields:
+            * **output** is a Tensor of size ``N`` containing computed target
+              log probabilities for each example
+            * **loss** is a Scalar representing the computed negative
+              log likelihood loss (the mean of ``-output``)
+
+    Shape:
+        - input: :math:`(N, \texttt{in\_features})` or :math:`(\texttt{in\_features})`
+        - target: :math:`(N)` or :math:`()` where each value satisfies :math:`0 <= \texttt{target[i]} < \texttt{n\_classes}`
+        - output1: :math:`(N)` or :math:`()`
+        - output2: ``Scalar``
+
+    .. _Zipf's law: https://en.wikipedia.org/wiki/Zipf%27s_law
+    """
+    )
+
+    in_features: int
+    n_classes: int
+    cutoffs: list[int]
+    div_value: float
+    head_bias: bool
+    head: Linear
+    tail: ModuleList
+
+    def __init__(
+        self,
+        in_features: int,
+        n_classes: int,
+        cutoffs: Sequence[int],
+        div_value: float = 4.0,
+        head_bias: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        cutoffs = list(cutoffs)
+
+        if len(cutoffs) == 0:
+            raise ValueError("cutoffs should be a sequence of length larger than 0")
+
+        if (
+            (cutoffs != sorted(cutoffs))
+            or (min(cutoffs) <= 0)
+            or (max(cutoffs) > (n_classes - 1))
+            or (len(set(cutoffs)) != len(cutoffs))
+            or any(int(c) != c for c in cutoffs)
+        ):
+            raise ValueError(
+                "cutoffs should be a sequence of unique, positive "
+                "integers sorted in an increasing order, where "
+                "each value is between 1 and n_classes-1"
+            )
+
+        self.in_features = in_features
+        self.n_classes = n_classes
+        self.cutoffs = cutoffs + [n_classes]  # n_classes is appended as the final bucket boundary
+        self.div_value = div_value
+        self.head_bias = head_bias
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters  # shortlist classes + one entry per tail cluster
+
+        self.head = Linear(
+            self.in_features, self.head_size, bias=self.head_bias, **factory_kwargs
+        )
+        self.tail = ModuleList()
+
+        for i in range(self.n_clusters):
+            hsz = int(self.in_features // (self.div_value ** (i + 1)))  # projection width of cluster i
+            osz = self.cutoffs[i + 1] - self.cutoffs[i]  # number of classes in cluster i
+
+            projection = Sequential(
+                Linear(self.in_features, hsz, bias=False, **factory_kwargs),
+                Linear(hsz, osz, bias=False, **factory_kwargs),
+            )
+
+            self.tail.append(projection)
+
+    def reset_parameters(self) -> None:
+        """
+        Re-run ``reset_parameters`` on the head and on both Linear layers of every tail cluster.
+        """
+        self.head.reset_parameters()
+        for i2h, h2o in self.tail:  # type: ignore[misc]
+            i2h.reset_parameters()  # type: ignore[has-type]
+            h2o.reset_parameters()  # type: ignore[has-type]
+
+    def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput:
+        """
+        Return ``_ASMoutput(output, loss)``: per-example target log-probabilities and the mean of their negation.
+        """
+        targ_dim = target_.dim()
+
+        if targ_dim == 1:
+            if input_.size(0) != target_.size(0):
+                raise RuntimeError(
+                    "Input and target should have the same size in the batch dimension."
+                )
+            if input_.dim() != 2:
+                raise RuntimeError(
+                    "1D target tensor expects 2D input tensors, "
+                    "but found inputs with size",
+                    input_.size(),  # NOTE(review): passed as a 2nd exception arg, not formatted into the message
+                )
+        elif targ_dim == 0:
+            if input_.dim() != 1:
+                raise RuntimeError(
+                    "0D target tensor expects 1D input tensors, "
+                    "but found inputs with size",
+                    input_.size(),  # NOTE(review): passed as a 2nd exception arg, not formatted into the message
+                )
+        else:
+            raise RuntimeError(
+                "0D or 1D target tensor expected, multi-target not supported"
+            )
+
+        is_batched = targ_dim > 0
+        input = input_ if is_batched else input_.unsqueeze(0)  # unbatched input gets a batch dim of 1
+        target = target_ if is_batched else target_.unsqueeze(0)
+
+        used_rows = 0
+        batch_size = target.size(0)
+
+        output = input.new_zeros(batch_size)
+        gather_inds = target.new_empty(batch_size)  # per-row index into the head output, filled below
+
+        cutoff_values = [0] + self.cutoffs
+        for i in range(len(cutoff_values) - 1):
+            low_idx = cutoff_values[i]
+            high_idx = cutoff_values[i + 1]
+
+            target_mask = (target >= low_idx) & (target < high_idx)
+            row_indices = target_mask.nonzero().squeeze()  # batch rows whose target lies in [low_idx, high_idx)
+
+            if row_indices.numel() == 0:
+                continue
+
+            if i == 0:  # shortlist bucket: the head index is the target itself
+                gather_inds.index_copy_(0, row_indices, target[target_mask])
+
+            else:
+                relative_target = target[target_mask] - low_idx  # class index within this cluster
+                input_subset = input.index_select(0, row_indices)
+
+                cluster_output = self.tail[i - 1](input_subset)
+                cluster_index = self.shortlist_size + i - 1  # head column that stands for this cluster
+
+                gather_inds.index_fill_(0, row_indices, cluster_index)
+                cluster_logprob = F.log_softmax(cluster_output, dim=1)
+                local_logprob = cluster_logprob.gather(1, relative_target.unsqueeze(1))
+                output.index_copy_(0, row_indices, local_logprob.squeeze(1))
+
+            used_rows += row_indices.numel()
+
+        if used_rows != batch_size:  # at least one target matched no bucket
+            raise RuntimeError(
+                f"Target values should be in [0, {self.n_classes - 1}], "
+                f"but values in range [{target.min().item()}, {target.max().item()}] "
+                "were found. "
+            )
+
+        head_output = self.head(input)
+        head_logprob = F.log_softmax(head_output, dim=1)
+        output += head_logprob.gather(1, gather_inds.unsqueeze(1)).squeeze()  # add the head term to the in-cluster term
+        loss = (-output).mean()
+
+        if not is_batched:
+            output = output.squeeze(0)  # drop the batch dim added for unbatched input
+
+        return _ASMoutput(output, loss)
+
+    def _get_full_log_prob(self, input, head_output):
+        """Given input tensor, and output of ``self.head``, compute the log of the full distribution."""
+        out = input.new_empty((head_output.size(0), self.n_classes))
+        head_logprob = F.log_softmax(head_output, dim=1)
+
+        out[:, : self.shortlist_size] = head_logprob[:, : self.shortlist_size]
+
+        for i, (start_idx, stop_idx) in enumerate(itertools.pairwise(self.cutoffs)):
+            cluster_output = self.tail[i](input)
+            cluster_logprob = F.log_softmax(cluster_output, dim=1)
+            output_logprob = cluster_logprob + head_logprob[
+                :, self.shortlist_size + i
+            ].unsqueeze(1)
+
+            out[:, start_idx:stop_idx] = output_logprob  # in-cluster log-prob plus the head's log-prob for cluster i
+
+        return out
+
+    def log_prob(self, input: Tensor) -> Tensor:
+        r"""Compute log probabilities for all :math:`\texttt{n\_classes}`.
+
+        Args:
+            input (Tensor): a minibatch of examples
+
+        Returns:
+            log-probabilities for each class :math:`c`
+            in range :math:`0 <= c < \texttt{n\_classes}`, where :math:`\texttt{n\_classes}` is a
+            parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
+
+        Shape:
+            - Input: :math:`(N, \texttt{in\_features})`
+            - Output: :math:`(N, \texttt{n\_classes})`
+
+        """
+        head_output = self.head(input)
+        return self._get_full_log_prob(input, head_output)
+
+    def predict(self, input: Tensor) -> Tensor:
+        r"""Return the class with the highest probability for each example in the input minibatch.
+
+        This is equivalent to ``self.log_prob(input).argmax(dim=1)``, but is more efficient in some cases.
+
+        Args:
+            input (Tensor): a minibatch of examples
+
+        Returns:
+            output (Tensor): a class with the highest probability for each example
+
+        Shape:
+            - Input: :math:`(N, \texttt{in\_features})`
+            - Output: :math:`(N)`
+        """
+        head_output = self.head(input)
+        output = torch.argmax(head_output, dim=1)
+        not_in_shortlist = output >= self.shortlist_size  # rows whose head argmax is a cluster entry
+        all_in_shortlist = not (not_in_shortlist.any())
+
+        if all_in_shortlist:
+            return output
+
+        elif not_in_shortlist.all():
+            log_prob = self._get_full_log_prob(input, head_output)
+            return torch.argmax(log_prob, dim=1)
+
+        else:
+            log_prob = self._get_full_log_prob(
+                input[not_in_shortlist], head_output[not_in_shortlist]
+            )
+            output[not_in_shortlist] = torch.argmax(log_prob, dim=1)  # only these rows need the full distribution
+            return output
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..40a912b4f05682792b1a3126b6df53230ced88c0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/batchnorm.py
@@ -0,0 +1,902 @@
+# mypy: allow-untyped-defs
+from typing import Any
+
+import torch
+from torch import Tensor
+from torch.nn import functional as F, init
+from torch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter
+
+from ._functions import SyncBatchNorm as sync_batch_norm
+from .lazy import LazyModuleMixin
+from .module import Module
+
+
+__all__ = [  # names exported by ``from <this module> import *``
+    "BatchNorm1d",
+    "LazyBatchNorm1d",
+    "BatchNorm2d",
+    "LazyBatchNorm2d",
+    "BatchNorm3d",
+    "LazyBatchNorm3d",
+    "SyncBatchNorm",
+]
+
+
+class _NormBase(Module):
+    """Common base of _InstanceNorm and _BatchNorm."""
+
+    _version = 2  # version 2 added the num_batches_tracked buffer (see _load_from_state_dict)
+    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
+    num_features: int
+    eps: float
+    momentum: float | None
+    affine: bool
+    track_running_stats: bool
+    # WARNING: weight and bias purposely not defined here.
+    # See https://github.com/pytorch/pytorch/issues/39670
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+        momentum: float | None = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.momentum = momentum
+        self.affine = affine
+        self.track_running_stats = track_running_stats
+        if self.affine:
+            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
+            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        if self.track_running_stats:
+            self.register_buffer(
+                "running_mean", torch.zeros(num_features, **factory_kwargs)
+            )
+            self.register_buffer(
+                "running_var", torch.ones(num_features, **factory_kwargs)
+            )
+            self.running_mean: Tensor | None
+            self.running_var: Tensor | None
+            self.register_buffer(
+                "num_batches_tracked",
+                torch.tensor(
+                    0,
+                    dtype=torch.long,  # the counter is always int64; the requested dtype is dropped below
+                    # pyrefly: ignore [bad-argument-type]
+                    **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
+                ),
+            )
+            self.num_batches_tracked: Tensor | None
+        else:
+            self.register_buffer("running_mean", None)
+            self.register_buffer("running_var", None)
+            self.register_buffer("num_batches_tracked", None)
+        self.reset_parameters()  # fills weight/bias (created with torch.empty) and resets the stats
+
+    def reset_running_stats(self) -> None:
+        if self.track_running_stats:
+            # running_mean/running_var/num_batches_tracked are only registered as tensors
+            # when self.track_running_stats is on; otherwise they are None
+            self.running_mean.zero_()  # type: ignore[union-attr]
+            self.running_var.fill_(1)  # type: ignore[union-attr]
+            self.num_batches_tracked.zero_()  # type: ignore[union-attr,operator]
+
+    def reset_parameters(self) -> None:
+        self.reset_running_stats()
+        if self.affine:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def _check_input_dim(self, input):  # overridden by subclasses to validate the input rank
+        raise NotImplementedError
+
+    def extra_repr(self):
+        return (
+            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
+            "track_running_stats={track_running_stats}".format(**self.__dict__)
+        )
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        version = local_metadata.get("version", None)
+
+        if (version is None or version < 2) and self.track_running_stats:
+            # at version 2: added num_batches_tracked buffer
+            #               this should have a default value of 0
+            num_batches_tracked_key = prefix + "num_batches_tracked"
+            if num_batches_tracked_key not in state_dict:
+                state_dict[num_batches_tracked_key] = (  # inject a value so the pre-v2 checkpoint loads
+                    self.num_batches_tracked
+                    if self.num_batches_tracked is not None
+                    and self.num_batches_tracked.device != torch.device("meta")
+                    else torch.tensor(0, dtype=torch.long)
+                )
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+
+class _BatchNorm(_NormBase):
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+        momentum: float | None = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded unchanged to _NormBase
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        self._check_input_dim(input)  # subclass hook that rejects unsupported input ranks
+
+        # exponential_average_factor is set to self.momentum
+        # (when it is available) only so that it gets updated
+        # in ONNX graph when this node is exported to ONNX.
+        if self.momentum is None:
+            exponential_average_factor = 0.0
+        else:
+            exponential_average_factor = self.momentum
+
+        if self.training and self.track_running_stats:
+            # TODO: if statement only here to tell the jit to skip emitting this when it is None
+            if self.num_batches_tracked is not None:  # type: ignore[has-type]
+                self.num_batches_tracked.add_(1)  # type: ignore[has-type]
+                if self.momentum is None:  # use cumulative moving average
+                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
+                else:  # use exponential moving average
+                    exponential_average_factor = self.momentum
+
+        r"""
+        Decide whether the mini-batch stats should be used for normalization rather than the buffers.
+        Mini-batch stats are used in training mode, and in eval mode when buffers are None.
+        """
+        if self.training:
+            bn_training = True
+        else:
+            bn_training = (self.running_mean is None) and (self.running_var is None)
+
+        r"""
+        Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
+        passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
+        used for normalization (i.e. in eval mode when buffers are not None).
+        """
+        return F.batch_norm(
+            input,
+            # If buffers are not to be tracked, ensure that they won't be updated
+            (
+                self.running_mean
+                if not self.training or self.track_running_stats
+                else None
+            ),
+            self.running_var if not self.training or self.track_running_stats else None,
+            self.weight,
+            self.bias,
+            bn_training,
+            exponential_average_factor,
+            self.eps,
+        )
+
+
+class _LazyNormBase(LazyModuleMixin, _NormBase):
+    weight: UninitializedParameter  # type: ignore[assignment]
+    bias: UninitializedParameter  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        eps=1e-5,
+        momentum=0.1,
+        affine=True,
+        track_running_stats=True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            # affine and track_running_stats are hardcoded to False to
+            # avoid creating tensors that will soon be overwritten.
+            0,
+            eps,
+            momentum,
+            False,
+            False,
+            **factory_kwargs,
+        )
+        self.affine = affine  # restore the caller's flags now that the base __init__ has run
+        self.track_running_stats = track_running_stats
+        if self.affine:
+            # pyrefly: ignore [bad-argument-type]
+            self.weight = UninitializedParameter(**factory_kwargs)
+            # pyrefly: ignore [bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+        if self.track_running_stats:
+            # pyrefly: ignore [bad-argument-type]
+            self.running_mean = UninitializedBuffer(**factory_kwargs)
+            # pyrefly: ignore [bad-argument-type]
+            self.running_var = UninitializedBuffer(**factory_kwargs)
+            self.num_batches_tracked = torch.tensor(
+                0,
+                dtype=torch.long,  # the counter is always int64; the requested dtype is dropped below
+                # pyrefly: ignore [bad-argument-type]
+                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
+            )
+
+    def reset_parameters(self) -> None:
+        # Only reset once every lazy tensor is materialized and num_features is known (non-zero).
+        if not self.has_uninitialized_params() and self.num_features != 0:
+            super().reset_parameters()
+
+    def initialize_parameters(self, input) -> None:  # type: ignore[override]
+        # Materialize the lazy tensors from the given input; a no-op once they are initialized.
+        if self.has_uninitialized_params():
+            self.num_features = input.shape[1]  # feature count is read from dim 1 of the input
+            if self.affine:
+                assert isinstance(self.weight, UninitializedParameter)
+                assert isinstance(self.bias, UninitializedParameter)
+                self.weight.materialize((self.num_features,))
+                self.bias.materialize((self.num_features,))
+            if self.track_running_stats:
+                self.running_mean.materialize(  # type:ignore[union-attr]
+                    (self.num_features,)
+                )
+                self.running_var.materialize(  # type:ignore[union-attr]
+                    (self.num_features,)
+                )
+            self.reset_parameters()  # give the freshly materialized tensors their initial values
+
+
+class BatchNorm1d(_BatchNorm):
+    r"""Applies Batch Normalization over a 2D or 3D input.
+
+    Method described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated per-dimension over
+    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the number of features or channels of the input). By default, the
+    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
+    At train time in the forward pass, the variance is calculated via the biased estimator,
+    equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
+    moving average of the variance is calculated via the unbiased  estimator, equivalent to
+    ``torch.var(input, correction=1)``.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.
+
+    Args:
+        num_features: number of features or channels :math:`C` of the input
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
+          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
+        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
+
+    Examples::
+
+        >>> # With Learnable Parameters
+        >>> m = nn.BatchNorm1d(100)
+        >>> # Without Learnable Parameters
+        >>> m = nn.BatchNorm1d(100, affine=False)
+        >>> input = torch.randn(20, 100)
+        >>> output = m(input)
+    """
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (2, 3):  # only (N, C) and (N, C, L) inputs are accepted
+            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
+
+
+# pyrefly: ignore [inconsistent-inheritance]
+class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
+    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.
+
+    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
+    from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm1d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is 2D or 3D."""
+        if input.dim() not in (2, 3):
+            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
+
+
+class BatchNorm2d(_BatchNorm):
+    r"""Applies Batch Normalization over a 4D input.
+
+    A 4D input is a mini-batch of 2D inputs with an additional channel dimension. Method described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and variance are calculated per-dimension over
+    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
+    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
+    variance is calculated via the biased estimator, equivalent to
+    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
+    variance is calculated via the unbiased estimator, equivalent to
+    ``torch.var(input, correction=1)``.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics
+    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+
+    Examples::
+
+        >>> # With Learnable Parameters
+        >>> m = nn.BatchNorm2d(100)
+        >>> # Without Learnable Parameters
+        >>> m = nn.BatchNorm2d(100, affine=False)
+        >>> input = torch.randn(20, 100, 35, 45)
+        >>> output = m(input)
+    """
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is exactly 4D."""
+        if input.dim() != 4:
+            raise ValueError(f"expected 4D input (got {input.dim()}D input)")
+
+
+# pyrefly: ignore [inconsistent-inheritance]
+class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
+    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.
+
+    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
+    from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm2d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is exactly 4D."""
+        if input.dim() != 4:
+            raise ValueError(f"expected 4D input (got {input.dim()}D input)")
+
+
+class BatchNorm3d(_BatchNorm):
+    r"""Applies Batch Normalization over a 5D input.
+
+    A 5D input is a mini-batch of 3D inputs with an additional channel dimension, as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and variance are calculated per-dimension over
+    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
+    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
+    variance is calculated via the biased estimator, equivalent to
+    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
+    variance is calculated via the unbiased estimator, equivalent to
+    ``torch.var(input, correction=1)``.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done over the `C` dimension, computing statistics on `(N, D, H, W)` slices,
+    it's common terminology to call this Volumetric Batch Normalization or Spatio-temporal Batch Normalization.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` (same shape as input)
+
+    Examples::
+
+        >>> # With Learnable Parameters
+        >>> m = nn.BatchNorm3d(100)
+        >>> # Without Learnable Parameters
+        >>> m = nn.BatchNorm3d(100, affine=False)
+        >>> input = torch.randn(20, 100, 35, 45, 10)
+        >>> output = m(input)
+    """
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is exactly 5D."""
+        if input.dim() != 5:
+            raise ValueError(f"expected 5D input (got {input.dim()}D input)")
+
+
+# pyrefly: ignore [inconsistent-inheritance]
+class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
+    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.
+
+    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
+    from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm3d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is exactly 5D."""
+        if input.dim() != 5:
+            raise ValueError(f"expected 5D input (got {input.dim()}D input)")
+
+
+class SyncBatchNorm(_BatchNorm):
+    r"""Applies Batch Normalization over a N-Dimensional input.
+
+    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension, as described in the paper
+    `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and variance are calculated per-dimension over all
+    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
+    are learnable parameter vectors of size `C` (where `C` is the input size).
+    By default, the elements of :math:`\gamma` are set to 1
+    and the elements of :math:`\beta` are set to 0.
+    The variance is calculated via the biased estimator, equivalent to
+    ``torch.var(input, correction=0)``.
+
+    Also by default, during training this layer keeps running estimates of its
+    computed mean and variance, which are then used for normalization during
+    evaluation. The running estimates are kept with a default :attr:`momentum`
+    of 0.1.
+
+    If :attr:`track_running_stats` is set to ``False``, this layer then does not
+    keep running estimates, and batch statistics are instead used during
+    evaluation time as well.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
+    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
+    Normalization or Spatio-temporal Batch Normalization.
+
+    Currently :class:`SyncBatchNorm` only supports
+    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
+    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
+    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
+    Network with DDP.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, +)`
+        eps: a value added to the denominator for numerical stability.
+            Default: ``1e-5``
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+        process_group: synchronization of stats happens within each process group
+            individually. Default behavior is synchronization across the whole
+            world
+
+    Shape:
+        - Input: :math:`(N, C, +)`
+        - Output: :math:`(N, C, +)` (same shape as input)
+
+    .. note::
+        Synchronization of batchnorm statistics occurs only while training, i.e.
+        synchronization is disabled when ``model.eval()`` is set or if
+        ``self.training`` is otherwise ``False``.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With Learnable Parameters
+        >>> m = nn.SyncBatchNorm(100)
+        >>> # creating process group (optional)
+        >>> # ranks is a list of int identifying rank ids.
+        >>> ranks = list(range(8))
+        >>> r1, r2 = ranks[:4], ranks[4:]
+        >>> # Note: every rank calls into new_group for every
+        >>> # process group created, even if that rank is not
+        >>> # part of the group.
+        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
+        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
+        >>> # Without Learnable Parameters
+        >>> m = nn.SyncBatchNorm(100, affine=False, process_group=process_group)
+        >>> input = torch.randn(20, 100, 35, 45, 10)
+        >>> output = m(input)
+
+        >>> # network is nn.BatchNorm layer
+        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
+        >>> # only single gpu per process is currently supported
+        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
+        >>>                         sync_bn_network,
+        >>>                         device_ids=[args.local_rank],
+        >>>                         output_device=args.local_rank)
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+        momentum: float | None = 0.1,
+        affine: bool = True,
+        track_running_stats: bool = True,
+        process_group: Any | None = None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Set up the base batch-norm state and remember ``process_group`` for syncing."""
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, device=device, dtype=dtype
+        )
+        self.process_group = process_group
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() < 2:  # anything with a batch and a channel dimension is accepted
+            raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")
+
+    def _check_non_zero_input_channels(self, input) -> None:
+        """Raise ``ValueError`` when ``input`` has zero channels (``input.size(1) == 0``)."""
+        if input.size(1) == 0:
+            message = "SyncBatchNorm number of input channels should be non-zero"
+            raise ValueError(message)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Normalizes ``input``, syncing batch statistics across the process group while training.
+        """
+        self._check_input_dim(input)
+        self._check_non_zero_input_channels(input)
+
+        # exponential_average_factor is set to self.momentum
+        # (when it is available) only so that it gets updated
+        # in ONNX graph when this node is exported to ONNX.
+        if self.momentum is None:
+            exponential_average_factor = 0.0
+        else:
+            exponential_average_factor = self.momentum
+
+        if self.training and self.track_running_stats:
+            assert self.num_batches_tracked is not None
+            self.num_batches_tracked.add_(1)
+            if self.momentum is None:  # use cumulative moving average
+                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
+            else:  # use exponential moving average
+                exponential_average_factor = self.momentum
+
+        # Decide whether the mini-batch stats should be used for normalization
+        # rather than the buffers. Mini-batch stats are used in training mode,
+        # and in eval mode when both buffers are None.
+        # (Plain comments: a bare string here would be a no-op statement.)
+        if self.training:
+            bn_training = True
+        else:
+            bn_training = (self.running_mean is None) and (self.running_var is None)
+
+        # Buffers are only updated if they are to be tracked and we are in
+        # training mode. Thus they only need to be passed when the update should
+        # occur (i.e. in training mode when they are tracked), or when buffer
+        # stats are used for normalization (i.e. in eval mode when buffers are
+        # not None).
+        # If buffers are not to be tracked, ensure that they won't be updated
+        running_mean = (
+            self.running_mean if not self.training or self.track_running_stats else None
+        )
+        running_var = (
+            self.running_var if not self.training or self.track_running_stats else None
+        )
+
+        # Don't sync batchnorm stats in inference mode (model.eval()).
+        need_sync = (
+            bn_training
+            and self.training
+            and torch.distributed.is_available()
+            and torch.distributed.is_initialized()
+        )
+        if need_sync:
+            # currently only GPU/PrivateUse1 input is supported
+            if input.device.type not in [
+                "cuda",
+                "hpu",
+                "xpu",
+                torch._C._get_privateuse1_backend_name(),
+            ]:
+                raise ValueError(
+                    "SyncBatchNorm expected input tensor to be on GPU or XPU or "
+                    f"{torch._C._get_privateuse1_backend_name()}"
+                )
+
+            process_group = torch.distributed.group.WORLD
+            if self.process_group:
+                process_group = self.process_group
+            world_size = torch.distributed.get_world_size(process_group)
+            need_sync = world_size > 1
+
+        # fallback to framework BN when synchronization is not necessary
+        if not need_sync:
+            return F.batch_norm(
+                input,
+                running_mean,
+                running_var,
+                self.weight,
+                self.bias,
+                bn_training,
+                exponential_average_factor,
+                self.eps,
+            )
+        else:
+            assert bn_training
+            return sync_batch_norm.apply(
+                input,
+                self.weight,
+                self.bias,
+                running_mean,
+                running_var,
+                self.eps,
+                exponential_average_factor,
+                process_group,  # type: ignore[possibly-undefined]
+                world_size,  # type: ignore[possibly-undefined]
+            )
+
+    @classmethod
+    def convert_sync_batchnorm(cls, module, process_group=None):
+        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.
+
+        Args:
+            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
+            process_group (optional): process group to scope synchronization,
+                default is the whole world
+
+        Returns:
+            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
+            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
+            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
+            instead.
+
+        Example::
+
+            >>> # Network with nn.BatchNorm layer
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+            >>> module = torch.nn.Sequential(
+            >>>            torch.nn.Linear(20, 100),
+            >>>            torch.nn.BatchNorm1d(100),
+            >>>          ).cuda()
+            >>> # creating process group (optional)
+            >>> # ranks is a list of int identifying rank ids.
+            >>> ranks = list(range(8))
+            >>> r1, r2 = ranks[:4], ranks[4:]
+            >>> # Note: every rank calls into new_group for every
+            >>> # process group created, even if that rank is not
+            >>> # part of the group.
+            >>> # xdoctest: +SKIP("distributed")
+            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
+            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
+            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)
+
+        """
+        module_output = module
+        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
+            module_output = torch.nn.SyncBatchNorm(
+                module.num_features,
+                module.eps,
+                module.momentum,
+                module.affine,
+                module.track_running_stats,
+                process_group,
+            )
+            if module.affine:
+                with torch.no_grad():
+                    module_output.weight = module.weight
+                    module_output.bias = module.bias
+            module_output.running_mean = module.running_mean
+            module_output.running_var = module.running_var
+            module_output.num_batches_tracked = module.num_batches_tracked
+            module_output.training = module.training
+            if hasattr(module, "qconfig"):
+                module_output.qconfig = module.qconfig
+        for name, child in module.named_children():
+            module_output.add_module(
+                name, cls.convert_sync_batchnorm(child, process_group)
+            )
+        del module
+        return module_output
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a48f04f853dd1c458b035635728a122e9cc4d3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/channelshuffle.py
@@ -0,0 +1,62 @@
+import torch.nn.functional as F
+from torch import Tensor
+
+from .module import Module
+
+
+__all__ = ["ChannelShuffle"]
+
+
+class ChannelShuffle(Module):
+    r"""Divides and rearranges the channels in a tensor.
+
+    The channels of a tensor of shape :math:`(N, C, *)` are split
+    into g groups as :math:`(N, \frac{C}{g}, g, *)` and then shuffled;
+    the final output keeps the shape of the original tensor.
+
+    Args:
+        groups (int): number of groups to divide the channels into.
+
+    Examples::
+
+        >>> channel_shuffle = nn.ChannelShuffle(2)
+        >>> input = torch.arange(1, 17, dtype=torch.float32).view(1, 4, 2, 2)
+        >>> input
+        tensor([[[[ 1.,  2.],
+                  [ 3.,  4.]],
+                 [[ 5.,  6.],
+                  [ 7.,  8.]],
+                 [[ 9., 10.],
+                  [11., 12.]],
+                 [[13., 14.],
+                  [15., 16.]]]])
+        >>> output = channel_shuffle(input)
+        >>> output
+        tensor([[[[ 1.,  2.],
+                  [ 3.,  4.]],
+                 [[ 9., 10.],
+                  [11., 12.]],
+                 [[ 5.,  6.],
+                  [ 7.,  8.]],
+                 [[13., 14.],
+                  [15., 16.]]]])
+    """
+
+    __constants__ = ["groups"]
+    groups: int
+
+    def __init__(self, groups: int) -> None:
+        super().__init__()
+        self.groups = groups  # forwarded unchanged to F.channel_shuffle
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Shuffles the channels of ``input`` with ``F.channel_shuffle`` and ``self.groups``.
+        """
+        return F.channel_shuffle(input, self.groups)
+
+    def extra_repr(self) -> str:
+        """
+        Returns the ``groups=<value>`` fragment shown in the module's repr.
+        """
+        return "groups={}".format(self.groups)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py
new file mode 100644
index 0000000000000000000000000000000000000000..d99151369e18e4d55ef843d6b8c6f4395d6a6453
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/container.py
@@ -0,0 +1,1043 @@
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
+import operator
+from collections import abc as container_abcs, OrderedDict
+from itertools import chain, islice
+from typing import Any, overload, TYPE_CHECKING, TypeVar
+from typing_extensions import deprecated, Self
+
+import torch
+from torch._jit_internal import _copy_to_script_wrapper
+from torch.nn.parameter import Parameter
+
+from .module import Module
+
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Mapping
+
+
+__all__ = [  # public names exported by this module
+    "Container",
+    "Sequential",
+    "ModuleList",
+    "ModuleDict",
+    "ParameterList",
+    "ParameterDict",
+]
+
+T = TypeVar("T", bound=Module)  # a Module subclass; used where a method returns its own type
+_V = TypeVar("_V")  # element type of the iterator handed to Sequential._get_item_by_idx
+
+
+# Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList
+def _addindent(s_, numSpaces):
+    """Indent every line of ``s_`` except the first by ``numSpaces`` spaces."""
+    head, newline, tail = s_.partition("\n")
+    # A string without any line break is returned untouched.
+    if not newline:
+        return s_
+    pad = numSpaces * " "
+    # Only the continuation lines receive the padding.
+    indented = "\n".join(pad + line for line in tail.split("\n"))
+    return head + "\n" + indented
+
+
+@deprecated(
+    "`nn.Container` is deprecated. "
+    "All of it's functionality is now implemented in `nn.Module`. Subclass that instead.",
+    category=FutureWarning,
+)
+class Container(Module):
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__()
+        for name in kwargs:  # each keyword becomes the name of a registered submodule
+            self.add_module(name, kwargs[name])
+
+
+class Sequential(Module):
+    r"""A sequential container.
+
+    Modules will be added to it in the order they are passed in the
+    constructor. Alternatively, an ``OrderedDict`` of modules can be
+    passed in. The ``forward()`` method of ``Sequential`` accepts any
+    input and forwards it to the first module it contains. It then
+    "chains" outputs to inputs sequentially for each subsequent module,
+    finally returning the output of the last module.
+
+    The value a ``Sequential`` provides over manually calling a sequence
+    of modules is that it allows treating the whole container as a
+    single module, such that performing a transformation on the
+    ``Sequential`` applies to each of the modules it stores (which are
+    each a registered submodule of the ``Sequential``).
+
+    What's the difference between a ``Sequential`` and a
+    :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
+    sounds like--a list for storing ``Module`` s! On the other hand,
+    the layers in a ``Sequential`` are connected in a cascading way.
+
+    Example::
+
+        # Using Sequential to create a small model. When `model` is run,
+        # input will first be passed to `Conv2d(1,20,5)`. The output of
+        # `Conv2d(1,20,5)` will be used as the input to the first
+        # `ReLU`; the output of the first `ReLU` will become the input
+        # for `Conv2d(20,64,5)`. Finally, the output of
+        # `Conv2d(20,64,5)` will be used as input to the second `ReLU`
+        model = nn.Sequential(
+            nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU()
+        )
+
+        # Using Sequential with OrderedDict. This is functionally the
+        # same as the above code
+        model = nn.Sequential(
+            OrderedDict(
+                [
+                    ("conv1", nn.Conv2d(1, 20, 5)),
+                    ("relu1", nn.ReLU()),
+                    ("conv2", nn.Conv2d(20, 64, 5)),
+                    ("relu2", nn.ReLU()),
+                ]
+            )
+        )
+    """
+
+    _modules: dict[str, Module]  # type: ignore[assignment]
+
+    @overload
+    def __init__(self, *args: Module) -> None: ...
+
+    @overload
+    # pyrefly: ignore [inconsistent-overload]
+    def __init__(self, arg: OrderedDict[str, Module]) -> None: ...
+
+    def __init__(self, *args):
+        super().__init__()
+        if len(args) == 1 and isinstance(args[0], OrderedDict):
+            for key, module in args[0].items():
+                self.add_module(key, module)
+        else:
+            for idx, module in enumerate(args):
+                self.add_module(str(idx), module)
+
+    def _get_item_by_idx(self, iterator: Iterable[_V], idx: int) -> _V:
+        """Return the ``idx``-th item of ``iterator``; a negative ``idx`` counts from the end."""
+        size = len(self)
+        idx = operator.index(idx)
+        if not -size <= idx < size:
+            raise IndexError(f"index {idx} is out of range")
+        idx %= size
+        return next(islice(iterator, idx, None))
+
+    @_copy_to_script_wrapper
+    def __getitem__(self, idx: slice | int) -> Sequential | Module:
+        if isinstance(idx, slice):
+            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
+        else:
+            return self._get_item_by_idx(self._modules.values(), idx)
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        key: str = self._get_item_by_idx(self._modules.keys(), idx)
+        return setattr(self, key, module)
+
+    def __delitem__(self, idx: slice | int) -> None:
+        if isinstance(idx, slice):
+            for key in list(self._modules.keys())[idx]:
+                delattr(self, key)
+        else:
+            key = self._get_item_by_idx(self._modules.keys(), idx)
+            delattr(self, key)
+        # Re-key the surviving modules as "0".."n-1" so the numbering stays contiguous
+        str_indices = [str(i) for i in range(len(self._modules))]
+        self._modules = OrderedDict(
+            zip(str_indices, self._modules.values(), strict=True)
+        )
+
+    @_copy_to_script_wrapper
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    def __add__(self, other) -> Sequential:
+        if isinstance(other, Sequential):
+            ret = Sequential()
+            for layer in self:
+                ret.append(layer)
+            for layer in other:
+                ret.append(layer)
+            return ret
+        else:
+            raise ValueError(
+                "add operator supports only objects "
+                f"of Sequential class, but {str(type(other))} is given."
+            )
+
+    def pop(self, key: int | slice) -> Module:
+        """
+        Remove the entry (or slice of entries) at ``key`` and return it.
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def __iadd__(self, other) -> Self:
+        if isinstance(other, Sequential):
+            offset = len(self)
+            for i, module in enumerate(other):
+                self.add_module(str(i + offset), module)
+            return self
+        else:
+            raise ValueError(
+                "add operator supports only objects "
+                f"of Sequential class, but {str(type(other))} is given."
+            )
+
+    def __mul__(self, other: int) -> Sequential:
+        if not isinstance(other, int):
+            raise TypeError(
+                f"unsupported operand type(s) for *: {type(self)} and {type(other)}"
+            )
+        elif other <= 0:
+            raise ValueError(
+                f"Non-positive multiplication factor {other} for {type(self)}"
+            )
+        else:
+            combined = Sequential()
+            offset = 0
+            for _ in range(other):
+                for module in self:
+                    combined.add_module(str(offset), module)
+                    offset += 1
+            return combined
+
+    def __rmul__(self, other: int) -> Sequential:
+        return self.__mul__(other)
+
+    def __imul__(self, other: int) -> Self:
+        if not isinstance(other, int):
+            raise TypeError(
+                f"unsupported operand type(s) for *: {type(self)} and {type(other)}"
+            )
+        elif other <= 0:
+            raise ValueError(
+                f"Non-positive multiplication factor {other} for {type(self)}"
+            )
+        else:
+            # Snapshot the values (not str(i) lookups) so named-key containers repeat too.
+            originals = list(self._modules.values())
+            for repeat in range(1, other):
+                offset = repeat * len(originals)
+                for i, module in enumerate(originals):
+                    self.add_module(str(offset + i), module)
+            return self
+
+    @_copy_to_script_wrapper
+    def __dir__(self) -> list[str]:
+        keys = super().__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    @_copy_to_script_wrapper
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    # NB: We can't really type check this function as the type of input
+    # may change dynamically (as is tested in
+    # TestScript.test_sequential_intermediary_types).  Cannot annotate
+    # with Any as TorchScript expects a more precise type
+    def forward(self, input):
+        """
+        Runs the forward pass.
+        """
+        for module in self:
+            input = module(input)
+        return input
+
+    def append(self, module: Module) -> Self:
+        r"""Append a given module to the end.
+
+        Args:
+            module (nn.Module): module to append
+
+        Example::
+
+            >>> import torch.nn as nn
+            >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3))
+            >>> n.append(nn.Linear(3, 4))
+            Sequential(
+                (0): Linear(in_features=1, out_features=2, bias=True)
+                (1): Linear(in_features=2, out_features=3, bias=True)
+                (2): Linear(in_features=3, out_features=4, bias=True)
+            )
+
+        """
+        self.add_module(str(len(self)), module)
+        return self
+
+    def insert(self, index: int, module: Module) -> Self:
+        """
+        Inserts a module into the Sequential container at the specified index.
+
+        Args:
+            index (int): The index to insert the module.
+            module (Module): The module to be inserted.
+
+        Example::
+
+            >>> import torch.nn as nn
+            >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3))
+            >>> n.insert(0, nn.Linear(3, 4))
+            Sequential(
+                (0): Linear(in_features=3, out_features=4, bias=True)
+                (1): Linear(in_features=1, out_features=2, bias=True)
+                (2): Linear(in_features=2, out_features=3, bias=True)
+            )
+
+        """
+        if not isinstance(module, Module):
+            raise AssertionError(f"module should be of type: {Module}")
+        n = len(self._modules)
+        if not (-n <= index <= n):
+            raise IndexError(f"Index out of range: {index}")
+        if index < 0:
+            index += n
+        for i in range(n, index, -1):
+            self._modules[str(i)] = self._modules[str(i - 1)]
+        self._modules[str(index)] = module
+        return self
+
+    def extend(self, sequential: Iterable[Module]) -> Self:
+        """
+        Appends every module yielded by ``sequential`` to the end of this container.
+
+        Args:
+            sequential (Iterable[Module]): modules to append in order, e.g. another Sequential.
+
+        Example::
+
+            >>> import torch.nn as nn
+            >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3))
+            >>> other = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 5))
+            >>> n.extend(other)  # in place; `n + other` builds a new container instead
+            Sequential(
+                (0): Linear(in_features=1, out_features=2, bias=True)
+                (1): Linear(in_features=2, out_features=3, bias=True)
+                (2): Linear(in_features=3, out_features=4, bias=True)
+                (3): Linear(in_features=4, out_features=5, bias=True)
+            )
+
+        """
+        for layer in sequential:
+            self.append(layer)
+        return self
+
+
+class ModuleList(Module):
+    r"""Holds submodules in a list.
+
+    :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but
+    modules it contains are properly registered, and will be visible by all
+    :class:`~torch.nn.Module` methods.
+
+    Args:
+        modules (iterable, optional): an iterable of modules to add
+
+    Example::
+
+        class MyModule(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])
+
+            def forward(self, x):
+                # ModuleList can act as an iterable, or be indexed using ints
+                for i, l in enumerate(self.linears):
+                    x = self.linears[i // 2](x) + l(x)
+                return x
+    """
+
+    _modules: dict[str, Module]  # type: ignore[assignment]
+
+    def __init__(self, modules: Iterable[Module] | None = None) -> None:
+        super().__init__()
+        if modules is not None:
+            self += modules
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute index for the list of modules."""
+        idx = operator.index(idx)
+        if not (-len(self) <= idx < len(self)):
+            raise IndexError(f"index {idx} is out of range")
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    @overload
+    def __getitem__(self, idx: slice) -> ModuleList: ...
+
+    @overload
+    def __getitem__(self, idx: int) -> Module: ...
+
+    @_copy_to_script_wrapper
+    def __getitem__(self, idx: int | slice) -> Module | ModuleList:
+        if isinstance(idx, slice):
+            return self.__class__(list(self._modules.values())[idx])
+        else:
+            return self._modules[self._get_abs_string_index(idx)]
+
+    def __setitem__(self, idx: int, module: Module) -> None:
+        idx = self._get_abs_string_index(idx)
+        return setattr(self, str(idx), module)
+
+    def __delitem__(self, idx: int | slice) -> None:
+        if isinstance(idx, slice):
+            for k in range(len(self._modules))[idx]:
+                delattr(self, str(k))
+        else:
+            delattr(self, self._get_abs_string_index(idx))
+        # To preserve numbering, self._modules is being reconstructed with modules after deletion
+        str_indices = [str(i) for i in range(len(self._modules))]
+        self._modules = OrderedDict(
+            zip(str_indices, self._modules.values(), strict=True)
+        )
+
+    @_copy_to_script_wrapper
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    @_copy_to_script_wrapper
+    def __iter__(self) -> Iterator[Module]:
+        return iter(self._modules.values())
+
+    def __iadd__(self, modules: Iterable[Module]) -> Self:
+        return self.extend(modules)
+
+    def __add__(self, other: Iterable[Module]) -> ModuleList:
+        combined = ModuleList()
+        for i, module in enumerate(chain(self, other)):
+            combined.add_module(str(i), module)
+        return combined
+
+    def __repr__(self) -> str:
+        """Return a custom repr for ModuleList that compresses repeated module representations."""
+        list_of_reprs = [repr(item) for item in self]
+        if len(list_of_reprs) == 0:
+            return self._get_name() + "()"
+
+        start_end_indices = [[0, 0]]
+        repeated_blocks = [list_of_reprs[0]]
+        for i, r in enumerate(list_of_reprs[1:], 1):
+            if r == repeated_blocks[-1]:
+                start_end_indices[-1][1] += 1
+                continue
+
+            start_end_indices.append([i, i])
+            repeated_blocks.append(r)
+
+        lines = []
+        main_str = self._get_name() + "("
+        for (start_id, end_id), b in zip(
+            start_end_indices, repeated_blocks, strict=True
+        ):
+            local_repr = f"({start_id}): {b}"  # default repr
+
+            if start_id != end_id:
+                n = end_id - start_id + 1
+                local_repr = f"({start_id}-{end_id}): {n} x {b}"
+
+            lines.append(_addindent(local_repr, 2))
+
+        return main_str + "\n  " + "\n  ".join(lines) + "\n)"
+
+    @_copy_to_script_wrapper
+    def __dir__(self) -> list[str]:
+        keys = super().__dir__()
+        keys = [key for key in keys if not key.isdigit()]
+        return keys
+
+    def insert(self, index: int, module: Module) -> None:
+        r"""Insert a given module before a given index in the list.
+
+        Args:
+            index (int): index to insert; negative/too-large values clamp like ``list.insert``.
+            module (nn.Module): module to insert
+        """
+        size, index = len(self._modules), operator.index(index)
+        index = max(index + size, 0) if index < 0 else min(index, size)
+        for i in range(size, index, -1):
+            self._modules[str(i)] = self._modules[str(i - 1)]
+        self._modules[str(index)] = module
+
+    def append(self, module: Module) -> Self:
+        r"""Append a given module to the end of the list.
+
+        Args:
+            module (nn.Module): module to append
+        """
+        self.add_module(str(len(self)), module)
+        return self
+
+    def pop(self, key: int | slice) -> Module:
+        """Remove the module (or slice of modules) at ``key`` and return it."""
+        v = self[key]
+        del self[key]
+        return v
+
+    def extend(self, modules: Iterable[Module]) -> Self:
+        r"""Append modules from a Python iterable to the end of the list.
+
+        Args:
+            modules (iterable): iterable of modules to append
+        """
+        if not isinstance(modules, container_abcs.Iterable):
+            raise TypeError(
+                "ModuleList.extend should be called with an "
+                "iterable, but got " + type(modules).__name__
+            )
+        offset = len(self)
+        for i, module in enumerate(modules):
+            self.add_module(str(offset + i), module)
+        return self
+
+    # remove forward altogether to fallback on Module's _forward_unimplemented
+
+
+class ModuleDict(Module):
+    r"""Holds submodules in a dictionary.
+
+    :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary,
+    but modules it contains are properly registered, and will be visible by all
+    :class:`~torch.nn.Module` methods.
+
+    :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects
+
+    * the order of insertion, and
+
+    * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged
+      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
+      :class:`~torch.nn.ModuleDict` (the argument to
+      :meth:`~torch.nn.ModuleDict.update`).
+
+    Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping
+    types does not preserve the order of the merged mapping.
+
+    Args:
+        modules (iterable, optional): a mapping (dictionary) of (string: module)
+            or an iterable of key-value pairs of type (string, module)
+
+    Example::
+
+        class MyModule(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.choices = nn.ModuleDict(
+                    {"conv": nn.Conv2d(10, 10, 3), "pool": nn.MaxPool2d(3)}
+                )
+                self.activations = nn.ModuleDict(
+                    [["lrelu", nn.LeakyReLU()], ["prelu", nn.PReLU()]]
+                )
+
+            def forward(self, x, choice, act):
+                x = self.choices[choice](x)
+                x = self.activations[act](x)
+                return x
+    """
+
+    _modules: dict[str, Module]  # type: ignore[assignment]
+
+    def __init__(self, modules: Mapping[str, Module] | None = None) -> None:
+        super().__init__()
+        if modules is not None:
+            self.update(modules)
+
+    @_copy_to_script_wrapper
+    def __getitem__(self, key: str) -> Module:
+        return self._modules[key]
+
+    def __setitem__(self, key: str, module: Module) -> None:
+        self.add_module(key, module)
+
+    def __delitem__(self, key: str) -> None:
+        del self._modules[key]
+
+    @_copy_to_script_wrapper
+    def __len__(self) -> int:
+        return len(self._modules)
+
+    @_copy_to_script_wrapper
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._modules)
+
+    @_copy_to_script_wrapper
+    def __contains__(self, key: str) -> bool:
+        return key in self._modules
+
+    def clear(self) -> None:
+        """Remove all items from the ModuleDict."""
+        self._modules.clear()
+
+    def pop(self, key: str) -> Module:
+        r"""Remove key from the ModuleDict and return its module.
+
+        Args:
+            key (str): key to pop from the ModuleDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    @_copy_to_script_wrapper
+    def keys(self) -> container_abcs.KeysView[str]:
+        r"""Return an iterable of the ModuleDict keys."""
+        return self._modules.keys()
+
+    @_copy_to_script_wrapper
+    def items(self) -> container_abcs.ItemsView[str, Module]:
+        r"""Return an iterable of the ModuleDict key/value pairs."""
+        return self._modules.items()
+
+    @_copy_to_script_wrapper
+    def values(self) -> container_abcs.ValuesView[Module]:
+        r"""Return an iterable of the ModuleDict values."""
+        return self._modules.values()
+
+    def update(self, modules: Mapping[str, Module]) -> None:
+        r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys.
+
+        .. note::
+            If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or
+            an iterable of key-value pairs, the order of new elements in it is preserved.
+
+        Args:
+            modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`,
+                or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`)
+        """
+        if not isinstance(modules, container_abcs.Iterable):
+            raise TypeError(
+                "ModuleDict.update should be called with an "
+                "iterable of key/value pairs, but got " + type(modules).__name__
+            )
+
+        if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)):
+            for key, module in modules.items():
+                self[key] = module
+        else:
+            # modules here can be a list with two items
+            for j, m in enumerate(modules):
+                if not isinstance(m, container_abcs.Iterable):
+                    raise TypeError(
+                        "ModuleDict update sequence element "
+                        "#" + str(j) + " should be Iterable; is " + type(m).__name__
+                    )
+                # pyrefly: ignore [bad-argument-type]
+                if not len(m) == 2:
+                    raise ValueError(
+                        "ModuleDict update sequence element "
+                        # pyrefly: ignore [bad-argument-type]
+                        "#" + str(j) + " has length " + str(len(m)) + "; 2 is required"
+                    )
+                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
+                # that's too cumbersome to type correctly with overloads, so we add an ignore here
+                self[m[0]] = m[1]  # type: ignore[assignment]
+
+    # remove forward altogether to fallback on Module's _forward_unimplemented
+
+
+class ParameterList(Module):
+    r"""Holds parameters in a list.
+
+    :class:`~torch.nn.ParameterList` can be used like a regular Python
+    list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered,
+    and will be visible by all :class:`~torch.nn.Module` methods.
+
+    Note that the constructor, assigning an element of the list, the
+    :meth:`~torch.nn.ParameterList.append` method and the :meth:`~torch.nn.ParameterList.extend`
+    method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`.
+
+    Args:
+        parameters (iterable, optional): an iterable of elements to add to the list.
+
+    Example::
+
+        class MyModule(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.params = nn.ParameterList(
+                    [nn.Parameter(torch.randn(10, 10)) for i in range(10)]
+                )
+
+            def forward(self, x):
+                # ParameterList can act as an iterable, or be indexed using ints
+                for i, p in enumerate(self.params):
+                    x = self.params[i // 2].mm(x) + p.mm(x)
+                return x
+    """
+
+    def __init__(self, values: Iterable[Any] | None = None) -> None:
+        super().__init__()
+        self._size = 0
+        if values is not None:
+            self += values
+
+    def _get_abs_string_index(self, idx):
+        """Get the absolute (non-negative) index, as a string, for the list of parameters."""
+        idx = operator.index(idx)
+        if not (-len(self) <= idx < len(self)):
+            raise IndexError(f"index {idx} is out of range")
+        if idx < 0:
+            idx += len(self)
+        return str(idx)
+
+    @overload
+    def __getitem__(self, idx: int) -> Any: ...
+
+    @overload
+    # pyrefly: ignore [inconsistent-overload]
+    def __getitem__(self: T, idx: slice) -> T: ...
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            start, stop, step = idx.indices(len(self))
+            out = self.__class__()
+            for i in range(start, stop, step):
+                out.append(self[i])
+            return out
+        else:
+            idx = self._get_abs_string_index(idx)
+            return getattr(self, str(idx))
+
+    def __setitem__(self, idx: int, param: Any) -> None:
+        # Note that all other function that add an entry to the list part of
+        # the ParameterList end up here. So this is the only place where we need
+        # to wrap things into Parameter if needed.
+        # Objects added via setattr() are not in the list part and thus won't
+        # call into this function.
+        idx = self._get_abs_string_index(idx)
+        if isinstance(param, torch.Tensor) and not isinstance(param, Parameter):
+            param = Parameter(param)
+        return setattr(self, str(idx), param)
+
+    def __len__(self) -> int:
+        return self._size
+
+    def __iter__(self) -> Iterator[Any]:
+        return iter(self[i] for i in range(len(self)))
+
+    def __iadd__(self, parameters: Iterable[Any]) -> Self:
+        return self.extend(parameters)
+
+    def __dir__(self) -> list[str]:
+        return [key for key in super().__dir__() if not key.isdigit()]
+
+    def append(self, value: Any) -> Self:
+        """Append a given value at the end of the list.
+
+        Args:
+            value (Any): value to append
+        """
+        new_idx = len(self)
+        self._size += 1
+        try:
+            self[new_idx] = value
+        except BaseException:
+            # Roll the length back so a rejected value cannot leave a phantom slot.
+            self._size -= 1
+            raise
+        return self
+
+    def extend(self, values: Iterable[Any]) -> Self:
+        """Append values from a Python iterable to the end of the list.
+
+        Args:
+            values (iterable): iterable of values to append
+        """
+        # Tensor is an iterable but we never want to unpack it here
+        if not isinstance(values, container_abcs.Iterable) or isinstance(
+            values, torch.Tensor
+        ):
+            raise TypeError(
+                "ParameterList.extend should be called with an "
+                "iterable, but got " + type(values).__name__
+            )
+        for value in values:
+            self.append(value)
+        return self
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        child_lines = []
+        for k, p in enumerate(self):
+            if isinstance(p, torch.Tensor):
+                size_str = "x".join(str(size) for size in p.size())
+                shown = p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]
+                device_str = f" ({p.device})" if shown else ""
+                parastr = "{} containing: [{} of size {}{}]".format(
+                    "Parameter" if isinstance(p, Parameter) else "Tensor",
+                    p.dtype,
+                    size_str,
+                    device_str,
+                )
+                # pyrefly: ignore [bad-argument-type]
+                child_lines.append("  (" + str(k) + "): " + parastr)
+            else:
+                child_lines.append(
+                    # pyrefly: ignore [bad-argument-type]
+                    "  (" + str(k) + "): Object of type: " + type(p).__name__
+                )
+
+        return "\n".join(child_lines)
+
+    def __call__(self, *args, **kwargs):
+        raise RuntimeError("ParameterList should not be called.")
+
+
+class ParameterDict(Module):
+    r"""Holds parameters in a dictionary.
+
+    ParameterDict can be indexed like a regular Python dictionary, but Parameters it
+    contains are properly registered, and will be visible by all Module methods.
+    Other objects are treated as would be done by a regular Python dictionary
+
+    :class:`~torch.nn.ParameterDict` is an **ordered** dictionary.
+    :meth:`~torch.nn.ParameterDict.update` with other unordered mapping
+    types (e.g., Python's plain ``dict``) does not preserve the order of the
+    merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict`
+    will preserve their ordering.
+
+    Note that the constructor, assigning an element of the dictionary and the
+    :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into
+    :class:`~torch.nn.Parameter`.
+
+    Args:
+        values (iterable, optional): a mapping (dictionary) of
+            (string : Any) or an iterable of key-value pairs
+            of type (string, Any)
+
+    Example::
+
+        class MyModule(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.params = nn.ParameterDict(
+                    {
+                        "left": nn.Parameter(torch.randn(5, 10)),
+                        "right": nn.Parameter(torch.randn(5, 10)),
+                    }
+                )
+
+            def forward(self, x, choice):
+                x = self.params[choice].mm(x)
+                return x
+    """
+
+    def __init__(self, parameters: Any = None) -> None:
+        super().__init__()
+        self._keys: dict[str, None] = {}  # keys only; every stored value is None
+        if parameters is not None:
+            self.update(parameters)  # fill the dictionary from the supplied entries
+
+    def _key_to_attr(self, key: str) -> str:
+        """Check that ``key`` is a string and return the attribute name used for it."""
+        if isinstance(key, str):
+            # Use the key as-is so that `.named_parameters()` returns the right thing
+            return key
+        raise TypeError(
+            "Index given to ParameterDict cannot be used as a key as it is "
+            f"not a string (type is '{type(key).__name__}'). Open an issue on "
+            "github if you need non-string keys."
+        )
+
+    def __getitem__(self, key: str) -> Any:
+        # The entry is read back from the attribute named after the validated key.
+        return getattr(self, self._key_to_attr(key))
+
+    def __setitem__(self, key: str, value: Any) -> None:
+        # Every other function that adds an entry to the dictionary part of the
+        # ParameterDict ends up here, so this is the only place that wraps plain
+        # Tensors into Parameter. Objects added via setattr() are not in the
+        # dictionary part and never reach this function.
+        # The key is recorded last so a rejected key or value leaves no stale entry.
+        attr = self._key_to_attr(key)
+        if isinstance(value, torch.Tensor) and not isinstance(value, Parameter):
+            value = Parameter(value)
+        setattr(self, attr, value)
+        self._keys[key] = None
+
+    def __delitem__(self, key: str) -> None:
+        del self._keys[key]
+        # With the key forgotten, drop the attribute that held its value.
+        delattr(self, self._key_to_attr(key))
+
+    def __len__(self) -> int:
+        return len(self._keys)  # one entry in _keys per dictionary key
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._keys)  # iterates keys only, like a plain dict
+
+    def __reversed__(self) -> Iterator[str]:
+        return reversed(self._keys)  # keys in the reverse of _keys' iteration order
+
+    def copy(self) -> ParameterDict:
+        """Return a new :class:`~torch.nn.ParameterDict` built from this instance's entries."""
+        # An OrderedDict is handed over because the ParameterDict constructor
+        # behaves differently on plain dict vs OrderedDict
+        return ParameterDict(OrderedDict([(name, self[name]) for name in self._keys]))
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._keys  # membership is decided by _keys alone, not by attribute lookup
+
+    def setdefault(self, key: str, default: Any | None = None) -> Any:
+        """Set the default for a key in the ParameterDict.
+
+        If key is in the ParameterDict, return its value.
+        If not, insert `key` with a parameter `default` and return the stored value.
+        `default` defaults to `None`.
+
+        Args:
+            key (str): key to set default for
+            default (Any): the parameter set to the key
+        """
+        if key not in self:
+            self[key] = default  # plain tensors get wrapped into Parameter on insertion
+        return self[key]
+
+    def clear(self) -> None:
+        """Remove all items from the ParameterDict."""
+        for k in self._keys.copy():
+            del self[k]
+
+    def pop(self, key: str) -> Any:
+        r"""Remove key from the ParameterDict and return its parameter.
+
+        Args:
+            key (str): key to pop from the ParameterDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def popitem(self) -> tuple[str, Any]:
+        """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict."""
+        k, _ = self._keys.popitem()
+        # Re-register the key: __delitem__ below deletes it from _keys and would fail otherwise
+        self._keys[k] = None
+        val = self[k]
+        del self[k]
+        return k, val
+
+    def get(self, key: str, default: Any | None = None) -> Any:
+        r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not.
+
+        Args:
+            key (str): key to get from the ParameterDict
+            default (Parameter, optional): value to return if key not present
+        """
+        return self[key] if key in self else default  # noqa: SIM401
+
+    def fromkeys(
+        self, keys: Iterable[str], default: Any | None = None
+    ) -> ParameterDict:
+        r"""Return a new ParameterDict with the keys provided.
+
+        Args:
+            keys (iterable, string): keys to make the new ParameterDict from
+            default (Parameter, optional): value to set for all keys
+        """
+        return ParameterDict((k, default) for k in keys)  # passed as (key, value) pairs; self is left untouched
+
+    def keys(self) -> container_abcs.KeysView[str]:
+        r"""Return an iterable of the ParameterDict keys."""
+        return self._keys.keys()  # live view onto the internal key dict
+
+    def items(self) -> Iterable[tuple[str, Any]]:
+        r"""Return an iterable of the ParameterDict key/value pairs."""
+        return ((k, self[k]) for k in self._keys)  # lazy generator: values are looked up while iterating
+
+    def values(self) -> Iterable[Any]:
+        r"""Return an iterable of the ParameterDict values."""
+        return (self[k] for k in self._keys)  # lazy generator: values are looked up while iterating
+
+    def update(self, parameters: Mapping[str, Any] | ParameterDict) -> None:
+        r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.
+
+        .. note::
+            If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or
+            an iterable of key-value pairs, the order of new elements in it is preserved.
+
+        Args:
+            parameters (iterable): a mapping (dictionary) from string to
+                :class:`~torch.nn.Parameter`, or an iterable of
+                key-value pairs of type (string, :class:`~torch.nn.Parameter`)
+        """
+        if not isinstance(parameters, container_abcs.Iterable):
+            raise TypeError(
+                "ParametersDict.update should be called with an "
+                "iterable of key/value pairs, but got " + type(parameters).__name__
+            )
+
+        if isinstance(parameters, (OrderedDict, ParameterDict)):
+            for key, parameter in parameters.items():
+                self[key] = parameter
+        elif isinstance(parameters, container_abcs.Mapping):
+            for key, parameter in sorted(parameters.items()):
+                self[key] = parameter
+        else:
+            for j, p in enumerate(parameters):
+                if not isinstance(p, container_abcs.Iterable):
+                    raise TypeError(
+                        "ParameterDict update sequence element "
+                        "#" + str(j) + " should be Iterable; is" + type(p).__name__
+                    )
+                # pyrefly: ignore [bad-argument-type]
+                if not len(p) == 2:
+                    raise ValueError(
+                        "ParameterDict update sequence element "
+                        # pyrefly: ignore [bad-argument-type]
+                        "#" + str(j) + " has length " + str(len(p)) + "; 2 is required"
+                    )
+                # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
+                self[p[0]] = p[1]  # type: ignore[assignment]
+
+    def extra_repr(self) -> str:
+        child_lines = []
+        for k, p in self.items():
+            if isinstance(p, torch.Tensor):
+                size_str = "x".join(str(size) for size in p.size())
+                if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]:
+                    device_str = f" ({p.device})"
+                else:
+                    device_str = ""
+                parastr = "{} containing: [{} of size {}{}]".format(
+                    "Parameter" if isinstance(p, Parameter) else "Tensor",
+                    torch.typename(p),
+                    size_str,
+                    device_str,
+                )
+                # pyrefly: ignore [bad-argument-type]
+                child_lines.append("  (" + str(k) + "): " + parastr)
+            else:
+                child_lines.append(
+                    # pyrefly: ignore [bad-argument-type]
+                    "  (" + str(k) + "): Object of type: " + type(p).__name__
+                )
+        tmpstr = "\n".join(child_lines)
+        return tmpstr
+
+    def __call__(self, input):
+        raise RuntimeError("ParameterDict should not be called.")  # unconditional: calling the dict is always an error
+
+    def __or__(self, other: ParameterDict) -> ParameterDict:
+        copy = self.copy()  # neither operand is modified
+        copy.update(other)  # on duplicate keys the entry from `other` wins
+        return copy
+
+    def __ror__(self, other: ParameterDict) -> ParameterDict:
+        copy = other.copy()  # `other` is the left operand here; it must provide copy()
+        copy.update(self)  # on duplicate keys the entry from `self` wins
+        return copy
+
+    def __ior__(self, other: ParameterDict) -> Self:
+        self.update(other)  # in-place merge; on duplicate keys the entry from `other` wins
+        return self
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b74b6a5a39e8ebfec821a047936e82b3cf002f0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py
@@ -0,0 +1,1904 @@
+# mypy: allow-untyped-defs
+import math
+from typing import Literal, Optional
+from typing_extensions import deprecated
+
+import torch
+from torch import Tensor
+from torch._torch_docs import reproducibility_notes
+from torch.nn import functional as F, init
+from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from .lazy import LazyModuleMixin
+from .module import Module
+from .utils import _pair, _reverse_repeat_tuple, _single, _triple
+
+
+__all__ = [
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "LazyConv1d",
+    "LazyConv2d",
+    "LazyConv3d",
+    "LazyConvTranspose1d",
+    "LazyConvTranspose2d",
+    "LazyConvTranspose3d",
+]
+
+convolution_notes = {
+    "groups_note": r"""* :attr:`groups` controls the connections between inputs and outputs.
+      :attr:`in_channels` and :attr:`out_channels` must both be divisible by
+      :attr:`groups`. For example,
+
+        * At groups=1, all inputs are convolved to all outputs.
+        * At groups=2, the operation becomes equivalent to having two conv
+          layers side by side, each seeing half the input channels
+          and producing half the output channels, and both subsequently
+          concatenated.
+        * At groups= :attr:`in_channels`, each input channel is convolved with
+          its own set of filters (of size
+          :math:`\frac{\text{out\_channels}}{\text{in\_channels}}`).""",
+    "depthwise_separable_note": r"""When `groups == in_channels` and `out_channels == K * in_channels`,
+        where `K` is a positive integer, this operation is also known as a "depthwise convolution".
+
+        In other words, for an input of size :math:`(N, C_{in}, L_{in})`,
+        a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments
+        :math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`.""",
+}  # noqa: B950
+
+
+class _ConvNd(Module):
+    __constants__ = [
+        "stride",
+        "padding",
+        "dilation",
+        "groups",
+        "padding_mode",
+        "output_padding",
+        "in_channels",
+        "out_channels",
+        "kernel_size",
+    ]
+    __annotations__ = {"bias": Optional[torch.Tensor]}
+
+    def _conv_forward(  # type: ignore[empty-body]
+        self, input: Tensor, weight: Tensor, bias: Tensor | None
+    ) -> Tensor: ...  # signature-only placeholder; subclasses such as Conv1d/Conv2d define the body
+
+    in_channels: int
+    _reversed_padding_repeated_twice: list[int]
+    out_channels: int
+    kernel_size: tuple[int, ...]
+    stride: tuple[int, ...]
+    padding: str | tuple[int, ...]
+    dilation: tuple[int, ...]
+    transposed: bool
+    output_padding: tuple[int, ...]
+    groups: int
+    padding_mode: Literal["zeros", "reflect", "replicate", "circular"]
+    weight: Tensor
+    bias: Tensor | None
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: tuple[int, ...],
+        stride: tuple[int, ...],
+        padding: str | tuple[int, ...],
+        dilation: tuple[int, ...],
+        transposed: bool,
+        output_padding: tuple[int, ...],
+        groups: int,
+        bias: bool,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"],
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every torch.empty() call below
+        super().__init__()
+        if groups <= 0:
+            raise ValueError("groups must be a positive integer")
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups")
+        if out_channels % groups != 0:
+            raise ValueError("out_channels must be divisible by groups")
+        valid_padding_strings = {"same", "valid"}
+        if isinstance(padding, str):
+            if padding not in valid_padding_strings:
+                raise ValueError(
+                    f"Invalid padding string {padding!r}, should be one of {valid_padding_strings}"
+                )
+            if padding == "same" and any(s != 1 for s in stride):
+                raise ValueError(
+                    "padding='same' is not supported for strided convolutions"
+                )
+
+        valid_padding_modes = {"zeros", "reflect", "replicate", "circular"}
+        if padding_mode not in valid_padding_modes:
+            raise ValueError(
+                f"padding_mode must be one of {valid_padding_modes}, but got padding_mode='{padding_mode}'"
+            )
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.transposed = transposed
+        self.output_padding = output_padding
+        self.groups = groups
+        self.padding_mode = padding_mode
+        # `_reversed_padding_repeated_twice` is the padding to be passed to
+        # `F.pad` if needed (e.g., for non-zero padding types that are
+        # implemented as two ops: padding + conv). `F.pad` expects the paddings
+        # in reverse dimension order (last dimension first), two entries per dim.
+        if isinstance(self.padding, str):
+            self._reversed_padding_repeated_twice = [0, 0] * len(kernel_size)  # 'valid' keeps these zeros
+            if padding == "same":
+                for d, k, i in zip(
+                    dilation,
+                    kernel_size,
+                    range(len(kernel_size) - 1, -1, -1),  # first dimension gets the last slot pair
+                    strict=False,
+                ):
+                    total_padding = d * (k - 1)  # total padding 'same' needs along this dimension
+                    left_pad = total_padding // 2  # an odd total puts the extra element on the right
+                    self._reversed_padding_repeated_twice[2 * i] = left_pad
+                    self._reversed_padding_repeated_twice[2 * i + 1] = (
+                        total_padding - left_pad
+                    )
+        else:
+            self._reversed_padding_repeated_twice = _reverse_repeat_tuple(
+                self.padding, 2
+            )
+
+        if transposed:  # transposed layers lead the weight shape with in_channels instead of out_channels
+            self.weight = Parameter(
+                torch.empty(
+                    (in_channels, out_channels // groups, *kernel_size),
+                    **factory_kwargs,
+                )
+            )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    (out_channels, in_channels // groups, *kernel_size),
+                    **factory_kwargs,
+                )
+            )
+        if bias:
+            self.bias = Parameter(torch.empty(out_channels, **factory_kwargs))
+        else:
+            self.register_parameter("bias", None)  # registers `bias` as None when the bias term is disabled
+
+        self.reset_parameters()  # weight/bias were allocated with torch.empty(); initialise them now
+
+    def reset_parameters(self) -> None:
+        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
+        # uniform(-1/sqrt(k), 1/sqrt(k)), where k = weight.size(1) * prod(*kernel_size)
+        # For more details see: https://github.com/pytorch/pytorch/issues/15314#issuecomment-477448573
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            if fan_in != 0:  # a zero fan-in would divide by zero below; the bias is then left as allocated
+                bound = 1 / math.sqrt(fan_in)
+                init.uniform_(self.bias, -bound, bound)
+
+    def extra_repr(self):
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
+        if self.padding != (0,) * len(self.padding):  # a string padding never equals a tuple, so it is always shown
+            s += ", padding={padding}"
+        if self.dilation != (1,) * len(self.dilation):
+            s += ", dilation={dilation}"
+        if self.output_padding != (0,) * len(self.output_padding):
+            s += ", output_padding={output_padding}"
+        if self.groups != 1:
+            s += ", groups={groups}"
+        if self.bias is None:
+            s += ", bias=False"
+        if self.padding_mode != "zeros":
+            s += ", padding_mode={padding_mode}"
+        return s.format(**self.__dict__)  # placeholders are filled from the instance attributes
+
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        if not hasattr(self, "padding_mode"):  # restored state without this attribute falls back to "zeros"
+            self.padding_mode = "zeros"
+
+
+class Conv1d(_ConvNd):
+    __doc__ = (
+        r"""Applies a 1D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\text{in}}, L)` and output :math:`(N, C_{\text{out}}, L_{\text{out}})` can be
+    precisely described as:
+
+    .. math::
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{in} - 1} \text{weight}(C_{\text{out}_j}, k)
+        \star \text{input}(N_i, k)
+
+    where :math:`\star` is the valid `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`L` is a length of signal sequence.
+    """
+        + r"""
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a one-element tuple.
+
+    * :attr:`padding` controls the amount of padding applied to the input. It
+      can be either a string {{'valid', 'same'}} or a tuple of ints giving the
+      amount of implicit padding applied on both sides.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    {groups_note}
+
+    Note:
+        {depthwise_separable_note}
+    Note:
+        {cudnn_reproducibility_note}
+
+    Note:
+        ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
+        the input so the output has the shape as the input. However, this mode
+        doesn't support any stride values other than 1.
+
+    Note:
+        This module supports complex data types i.e. ``complex32, complex64, complex128``.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, L_{in})` or :math:`(C_{in}, L_{in})`
+        - Output: :math:`(N, C_{out}, L_{out})` or :math:`(C_{out}, L_{out})`, where
+
+          .. math::
+              L_{out} = \left\lfloor\frac{L_{in} + 2 \times \text{padding} - \text{dilation}
+                        \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out\_channels},
+            \frac{\text{in\_channels}}{\text{groups}}, \text{kernel\_size})`.
+            The values of these weights are sampled from
+            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+            :math:`k = \frac{groups}{C_\text{in} * \text{kernel\_size}}`
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``, then the values of these weights are
+            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+            :math:`k = \frac{groups}{C_\text{in} * \text{kernel\_size}}`
+
+    Examples::
+
+        >>> m = nn.Conv1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 50)
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: str | _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # we create new variables below to make mypy happy since kernel_size has
+        # type Union[int, Tuple[int]] and kernel_size_ has type Tuple[int]
+        kernel_size_ = _single(kernel_size)
+        stride_ = _single(stride)
+        padding_ = padding if isinstance(padding, str) else _single(padding)  # string modes are passed through unchanged
+        dilation_ = _single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride_,
+            padding_,
+            dilation_,
+            False,  # transposed
+            _single(0),  # output_padding
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None):
+        if self.padding_mode != "zeros":  # other modes: pad explicitly with F.pad, then convolve
+            return F.conv1d(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                weight,
+                bias,
+                self.stride,
+                _single(0),  # padding argument: zero, because F.pad above already padded the input
+                self.dilation,
+                self.groups,
+            )
+
+        return F.conv1d(
+            input, weight, bias, self.stride, self.padding, self.dilation, self.groups
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        return self._conv_forward(input, self.weight, self.bias)  # delegates, passing the module's own weight and bias
+
+
+class Conv2d(_ConvNd):
+    __doc__ = (
+        r"""Applies a 2D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size
+    :math:`(N, C_{\text{in}}, H, W)` and output :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})`
+    can be precisely described as:
+
+    .. math::
+        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
+        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)
+
+
+    where :math:`\star` is the valid 2D `cross-correlation`_ operator,
+    :math:`N` is a batch size, :math:`C` denotes a number of channels,
+    :math:`H` is a height of input planes in pixels, and :math:`W` is
+    width in pixels.
+    """
+        + r"""
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation, a single
+      number or a tuple.
+
+    * :attr:`padding` controls the amount of padding applied to the input. It
+      can be either a string {{'valid', 'same'}} or an int / a tuple of ints giving the
+      amount of implicit padding applied on both sides.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also
+      known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_
+      has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+
+    {groups_note}
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Note:
+        {depthwise_separable_note}
+
+    Note:
+        {cudnn_reproducibility_note}
+
+    Note:
+        ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
+        the input so the output has the shape as the input. However, this mode
+        doesn't support any stride values other than 1.
+
+    Note:
+        This module supports complex data types i.e. ``complex32, complex64, complex128``.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all four sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` or :math:`(C_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding}[0] - \text{dilation}[0]
+                        \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding}[1] - \text{dilation}[1]
+                        \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+            :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
+            :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
+            The values of these weights are sampled from
+            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+        bias (Tensor):   the learnable bias of the module of shape
+            (out_channels). If :attr:`bias` is ``True``,
+            then the values of these weights are
+            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+
+    Examples:
+
+        >>> # With square kernels and equal stride
+        >>> m = nn.Conv2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: str | _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size_ = _pair(kernel_size)
+        stride_ = _pair(stride)
+        padding_ = padding if isinstance(padding, str) else _pair(padding)  # string modes are passed through unchanged
+        dilation_ = _pair(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride_,
+            padding_,
+            dilation_,
+            False,  # transposed
+            _pair(0),  # output_padding
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None):
+        if self.padding_mode != "zeros":  # other modes: pad explicitly with F.pad, then convolve
+            return F.conv2d(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                weight,
+                bias,
+                self.stride,
+                _pair(0),  # padding argument: zero, because F.pad above already padded the input
+                self.dilation,
+                self.groups,
+            )
+
+        return F.conv2d(
+            input, weight, bias, self.stride, self.padding, self.dilation, self.groups
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        return self._conv_forward(input, self.weight, self.bias)  # delegates, passing the module's own weight and bias
+
+
+class Conv3d(_ConvNd):
+    __doc__ = (
+        r"""Applies a 3D convolution over an input signal composed of several input
+    planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, D, H, W)`
+    and output :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` can be precisely described as:
+
+    .. math::
+        out(N_i, C_{out_j}) = bias(C_{out_j}) +
+                                \sum_{k = 0}^{C_{in} - 1} weight(C_{out_j}, k) \star input(N_i, k)
+
+    where :math:`\star` is the valid 3D `cross-correlation`_ operator
+    """
+        + r"""
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation.
+
+    * :attr:`padding` controls the amount of padding applied to the input. It
+      can be either a string {{'valid', 'same'}} or a tuple of ints giving the
+      amount of implicit padding applied on both sides.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+
+    {groups_note}
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Note:
+        {depthwise_separable_note}
+
+    Note:
+        {cudnn_reproducibility_note}
+
+    Note:
+        ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
+        the input so the output has the shape as the input. However, this mode
+        doesn't support any stride values other than 1.
+
+    Note:
+        This module supports complex data types i.e. ``complex32, complex64, complex128``.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all six sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or :math:`(C_{out}, D_{out}, H_{out}, W_{out})`,
+          where
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0]
+                    \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1]
+                    \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2]
+                    \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
+                         The values of these weights are sampled from
+                         :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
+        bias (Tensor):   the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``,
+                         then the values of these weights are
+                         sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
+
+    Examples::
+
+        >>> # With square kernels and equal stride
+        >>> m = nn.Conv3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0))
+        >>> input = torch.randn(20, 16, 10, 50, 100)
+        >>> output = m(input)
+
+    .. _cross-correlation:
+        https://en.wikipedia.org/wiki/Cross-correlation
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: str | _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Normalize the size arguments with ``_triple`` and initialize the base class.
+
+        ``kernel_size``, ``stride`` and ``dilation`` always go through
+        ``_triple``.  ``padding`` is forwarded unchanged when it is a string and
+        goes through ``_triple`` otherwise.  ``device`` and ``dtype`` are passed
+        on as keyword arguments.
+        """
+        super().__init__(
+            in_channels,
+            out_channels,
+            _triple(kernel_size),
+            _triple(stride),
+            padding if isinstance(padding, str) else _triple(padding),
+            _triple(dilation),
+            # The next two positional slots are the ones `_ConvTransposeNd`
+            # fills with its `transposed` and `output_padding` arguments.
+            False,
+            _triple(0),
+            groups,
+            bias,
+            padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def _conv_forward(self, input: Tensor, weight: Tensor, bias: Tensor | None):
+        """Run ``F.conv3d`` on ``input`` with the given ``weight`` and ``bias``.
+
+        When ``padding_mode`` is ``"zeros"`` the module's ``padding`` is handed
+        straight to ``F.conv3d``.  For any other mode the input is first padded
+        with ``F.pad`` (driven by ``_reversed_padding_repeated_twice``) and the
+        convolution is then called with a padding of ``_triple(0)``.
+        """
+        if self.padding_mode == "zeros":
+            return F.conv3d(
+                input, weight, bias, self.stride, self.padding, self.dilation, self.groups
+            )
+
+        # Non-"zeros" modes: pad explicitly, then convolve without extra padding.
+        padded_input = F.pad(
+            input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+        )
+        return F.conv3d(
+            padded_input,
+            weight,
+            bias,
+            self.stride,
+            _triple(0),
+            self.dilation,
+            self.groups,
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Convolve ``input`` using this module's own ``weight`` and ``bias``."""
+        weight, bias = self.weight, self.bias
+        return self._conv_forward(input, weight, bias)
+
+
+class _ConvTransposeNd(_ConvNd):
+    """Shared base class of the ``ConvTranspose{1,2,3}d`` modules.
+
+    It rejects every ``padding_mode`` other than ``"zeros"`` and provides
+    :meth:`_output_padding`, which the subclasses' ``forward`` methods use to
+    turn an optional ``output_size`` request into an ``output_padding`` value.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        bias,
+        padding_mode,
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Validate ``padding_mode`` and forward all arguments to ``_ConvNd``.
+
+        Raises:
+            ValueError: if ``padding_mode`` is not ``"zeros"``.
+        """
+        if padding_mode != "zeros":
+            raise ValueError(
+                f'Only "zeros" padding mode is supported for {self.__class__.__name__}'
+            )
+
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    # dilation being an optional parameter is for backwards
+    # compatibility
+    def _output_padding(
+        self,
+        input: Tensor,
+        output_size: list[int] | None,
+        stride: list[int],
+        padding: list[int],
+        kernel_size: list[int],
+        num_spatial_dims: int,
+        dilation: list[int] | None = None,
+    ) -> list[int]:
+        """Return the per-dimension ``output_padding`` to use for ``input``.
+
+        Args:
+            input: Tensor about to be convolved; only its sizes and number of
+                dimensions are read.
+            output_size: Requested output size, or ``None`` to fall back to the
+                module's configured ``output_padding``.  It may list only the
+                spatial sizes, or the full size including the leading
+                non-spatial dimensions (which are then dropped).
+            stride: Stride for each spatial dimension.
+            padding: Padding for each spatial dimension.
+            kernel_size: Kernel size for each spatial dimension.
+            num_spatial_dims: Number of spatial dimensions of the operation.
+            dilation: Dilation for each spatial dimension; ``None`` is treated
+                as a dilation of 1 everywhere.
+
+        Returns:
+            ``_single(self.output_padding)`` when ``output_size`` is ``None``;
+            otherwise, for each spatial dimension, the requested size minus the
+            smallest size the operation can produce.
+
+        Raises:
+            ValueError: if ``output_size`` has the wrong number of elements, or
+                a requested size lies outside ``[min_size, min_size + stride - 1]``.
+        """
+        if output_size is None:
+            ret = _single(self.output_padding)  # converting to list if was not already
+        else:
+            # An input with `num_spatial_dims + 2` dimensions carries two leading
+            # non-spatial dimensions; any other input is treated as having one.
+            has_batch_dim = input.dim() == num_spatial_dims + 2
+            num_non_spatial_dims = 2 if has_batch_dim else 1
+            if len(output_size) == num_non_spatial_dims + num_spatial_dims:
+                output_size = output_size[num_non_spatial_dims:]
+            if len(output_size) != num_spatial_dims:
+                raise ValueError(
+                    f"ConvTranspose{num_spatial_dims}D: for {input.dim()}D input, output_size must have {num_spatial_dims} "
+                    f"or {num_non_spatial_dims + num_spatial_dims} elements (got {len(output_size)})"
+                )
+
+            # Smallest and largest output size reachable in each spatial dimension.
+            min_sizes = torch.jit.annotate(list[int], [])
+            max_sizes = torch.jit.annotate(list[int], [])
+            for d in range(num_spatial_dims):
+                dim_size = (
+                    (input.size(d + num_non_spatial_dims) - 1) * stride[d]
+                    - 2 * padding[d]
+                    + (dilation[d] if dilation is not None else 1)
+                    * (kernel_size[d] - 1)
+                    + 1
+                )
+                min_sizes.append(dim_size)
+                max_sizes.append(min_sizes[d] + stride[d] - 1)
+
+            for i in range(len(output_size)):
+                size = output_size[i]
+                min_size = min_sizes[i]
+                max_size = max_sizes[i]
+                if size < min_size or size > max_size:
+                    # Slice at `num_non_spatial_dims` (not a fixed 2) so the
+                    # reported input size is correct for unbatched input too.
+                    raise ValueError(
+                        f"requested an output size of {output_size}, but valid sizes range "
+                        f"from {min_sizes} to {max_sizes} (for an input of {input.size()[num_non_spatial_dims:]})"
+                    )
+
+            res = torch.jit.annotate(list[int], [])
+            for d in range(num_spatial_dims):
+                res.append(output_size[d] - min_sizes[d])
+
+            ret = res
+        return ret
+
+
+class ConvTranspose1d(_ConvTransposeNd):
+    __doc__ = (
+        r"""Applies a 1D transposed convolution operator over an input image
+    composed of several input planes.
+
+    This module can be seen as the gradient of Conv1d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation as it does
+    not compute a true inverse of convolution). For more information, see the visualizations
+    `here`_ and the `Deconvolutional Networks`_ paper.
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation.
+
+    * :attr:`padding` controls the amount of implicit zero padding on both
+      sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note
+      below for details.
+
+    * :attr:`output_padding` controls the additional size added to one side
+      of the output shape. See note below for details.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    {groups_note}
+
+    Note:
+        The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding``
+        amount of zero padding to both sides of the input. This is set so that
+        when a :class:`~torch.nn.Conv1d` and a :class:`~torch.nn.ConvTranspose1d`
+        are initialized with same parameters, they are inverses of each other in
+        regard to the input and output shapes. However, when ``stride > 1``,
+        :class:`~torch.nn.Conv1d` maps multiple input shapes to the same output
+        shape. :attr:`output_padding` is provided to resolve this ambiguity by
+        effectively increasing the calculated output shape on one side. Note
+        that :attr:`output_padding` is only used to find output shape, but does
+        not actually add zero-padding to output.
+
+    Note:
+        In some circumstances when using the CUDA backend with CuDNN, this operator
+        may select a nondeterministic algorithm to increase performance. If this is
+        undesirable, you can try to make the operation deterministic (potentially at
+        a performance cost) by setting ``torch.backends.cudnn.deterministic =
+        True``.
+        Please see the notes on :doc:`/notes/randomness` for background.
+
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, L_{in})` or :math:`(C_{in}, L_{in})`
+        - Output: :math:`(N, C_{out}, L_{out})` or :math:`(C_{out}, L_{out})`, where
+
+          .. math::
+              L_{out} = (L_{in} - 1) \times \text{stride} - 2 \times \text{padding} + \text{dilation}
+                        \times (\text{kernel\_size} - 1) + \text{output\_padding} + 1
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size})`.
+                         The values of these weights are sampled from
+                         :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \text{kernel\_size}}`
+        bias (Tensor):   the learnable bias of the module of shape (out_channels).
+                         If :attr:`bias` is ``True``, then the values of these weights are
+                         sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \text{kernel\_size}}`
+
+    Examples::
+
+        >>> # With square kernels and equal stride
+        >>> m = nn.ConvTranspose1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 50)
+        >>> output = m(input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12)
+        >>> downsample = nn.Conv1d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nn.ConvTranspose1d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12])
+
+    .. _`here`:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    .. _`Deconvolutional Networks`:
+        https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        output_padding: _size_1_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_1_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Normalize the size arguments with ``_single`` and initialize the base class.
+
+        The arguments are described in the class documentation.
+
+        Raises:
+            ValueError: from ``_ConvTransposeNd.__init__`` when ``padding_mode``
+                is not ``"zeros"``.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = _single(padding)
+        dilation = _single(dilation)
+        output_padding = _single(output_padding)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            True,  # transposed
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
+        """Apply the 1D transposed convolution to ``input``.
+
+        Args:
+            input (Tensor): The input tensor.
+            output_size (list[int], optional): Requested output size. When
+                given, ``_output_padding`` converts it into the
+                ``output_padding`` passed to ``F.conv_transpose1d``. Default is
+                None, which uses the module's configured ``output_padding``.
+
+        Returns:
+            Tensor: the result of ``F.conv_transpose1d``.
+
+        Raises:
+            ValueError: if ``padding_mode`` is not ``"zeros"``, or if
+                ``_output_padding`` rejects ``output_size``.
+        """
+        if self.padding_mode != "zeros":
+            raise ValueError(
+                "Only `zeros` padding mode is supported for ConvTranspose1d"
+            )
+
+        # NOTE(review): presumably a type-narrowing aid rather than input validation.
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        num_spatial_dims = 1
+        output_padding = self._output_padding(
+            input,
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            num_spatial_dims,
+            self.dilation,  # type: ignore[arg-type]
+        )
+        return F.conv_transpose1d(
+            input,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+
+
+class ConvTranspose2d(_ConvTransposeNd):
+    __doc__ = (
+        r"""Applies a 2D transposed convolution operator over an input image
+    composed of several input planes.
+
+    This module can be seen as the gradient of Conv2d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation as it does
+    not compute a true inverse of convolution). For more information, see the visualizations
+    `here`_ and the `Deconvolutional Networks`_ paper.
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation. When stride > 1, ConvTranspose2d inserts zeros between input
+      elements along the spatial dimensions before applying the convolution kernel. This zero-insertion operation is the standard
+      behavior of transposed convolutions, which can increase the spatial resolution and is equivalent to a learnable
+      upsampling operation.
+
+    * :attr:`padding` controls the amount of implicit zero padding on both
+      sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note
+      below for details.
+
+    * :attr:`output_padding` controls the additional size added to one side
+      of the output shape. See note below for details.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    {groups_note}
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding`
+    can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimensions
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Note:
+        The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding``
+        amount of zero padding to both sides of the input. This is set so that
+        when a :class:`~torch.nn.Conv2d` and a :class:`~torch.nn.ConvTranspose2d`
+        are initialized with same parameters, they are inverses of each other in
+        regard to the input and output shapes. However, when ``stride > 1``,
+        :class:`~torch.nn.Conv2d` maps multiple input shapes to the same output
+        shape. :attr:`output_padding` is provided to resolve this ambiguity by
+        effectively increasing the calculated output shape on one side. Note
+        that :attr:`output_padding` is only used to find output shape, but does
+        not actually add zero-padding to output.
+
+    Note:
+        {cudnn_reproducibility_note}
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, H_{in}, W_{in})` or :math:`(C_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` or :math:`(C_{out}, H_{out}, W_{out})`, where
+
+        .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0]
+                        \times (\text{kernel\_size}[0] - 1) + \text{output\_padding}[0] + 1
+        .. math::
+              W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{dilation}[1]
+                        \times (\text{kernel\_size}[1] - 1) + \text{output\_padding}[1] + 1
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
+                         The values of these weights are sampled from
+                         :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+        bias (Tensor):   the learnable bias of the module of shape (out_channels)
+                         If :attr:`bias` is ``True``, then the values of these weights are
+                         sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+
+    Examples::
+
+        >>> # With square kernels and equal stride
+        >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> output = m(input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12, 12)
+        >>> downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12])
+
+    .. _`here`:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    .. _`Deconvolutional Networks`:
+        https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        output_padding: _size_2_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_2_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Normalize the size arguments with ``_pair`` and initialize the base class.
+
+        The arguments are described in the class documentation.
+
+        Raises:
+            ValueError: from ``_ConvTransposeNd.__init__`` when ``padding_mode``
+                is not ``"zeros"``.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        output_padding = _pair(output_padding)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            True,  # transposed
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
+        """
+        Performs the forward pass.
+
+        Args:
+            input (Tensor): The input tensor.
+            output_size (list[int], optional): A list of integers representing
+                the size of the output tensor. When given, ``_output_padding``
+                converts it into the ``output_padding`` passed to
+                ``F.conv_transpose2d``. Default is None.
+
+        Returns:
+            Tensor: the result of ``F.conv_transpose2d``.
+
+        Raises:
+            ValueError: if ``padding_mode`` is not ``"zeros"``, or if
+                ``_output_padding`` rejects ``output_size``.
+        """
+        if self.padding_mode != "zeros":
+            raise ValueError(
+                "Only `zeros` padding mode is supported for ConvTranspose2d"
+            )
+
+        # NOTE(review): presumably a type-narrowing aid rather than input validation.
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        num_spatial_dims = 2
+        output_padding = self._output_padding(
+            input,
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            num_spatial_dims,
+            self.dilation,  # type: ignore[arg-type]
+        )
+
+        return F.conv_transpose2d(
+            input,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+
+
+class ConvTranspose3d(_ConvTransposeNd):
+    __doc__ = (
+        r"""Applies a 3D transposed convolution operator over an input image composed of several input
+    planes.
+    The transposed convolution operator multiplies each input value element-wise by a learnable kernel,
+    and sums over the outputs from all input feature planes.
+
+    This module can be seen as the gradient of Conv3d with respect to its input.
+    It is also known as a fractionally-strided convolution or
+    a deconvolution (although it is not an actual deconvolution operation as it does
+    not compute a true inverse of convolution). For more information, see the visualizations
+    `here`_ and the `Deconvolutional Networks`_ paper.
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    * :attr:`stride` controls the stride for the cross-correlation.
+
+    * :attr:`padding` controls the amount of implicit zero padding on both
+      sides for ``dilation * (kernel_size - 1) - padding`` number of points. See note
+      below for details.
+
+    * :attr:`output_padding` controls the additional size added to one side
+      of the output shape. See note below for details.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    {groups_note}
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding`
+    can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimensions
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Note:
+        The :attr:`padding` argument effectively adds ``dilation * (kernel_size - 1) - padding``
+        amount of zero padding to both sides of the input. This is set so that
+        when a :class:`~torch.nn.Conv3d` and a :class:`~torch.nn.ConvTranspose3d`
+        are initialized with same parameters, they are inverses of each other in
+        regard to the input and output shapes. However, when ``stride > 1``,
+        :class:`~torch.nn.Conv3d` maps multiple input shapes to the same output
+        shape. :attr:`output_padding` is provided to resolve this ambiguity by
+        effectively increasing the calculated output shape on one side. Note
+        that :attr:`output_padding` is only used to find output shape, but does
+        not actually add zero-padding to output.
+
+    Note:
+        {cudnn_reproducibility_note}
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+    """.format(**reproducibility_notes, **convolution_notes)
+        + r"""
+
+    Shape:
+        - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or
+          :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, where
+
+        .. math::
+              D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0]
+                        \times (\text{kernel\_size}[0] - 1) + \text{output\_padding}[0] + 1
+        .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{dilation}[1]
+                        \times (\text{kernel\_size}[1] - 1) + \text{output\_padding}[1] + 1
+        .. math::
+              W_{out} = (W_{in} - 1) \times \text{stride}[2] - 2 \times \text{padding}[2] + \text{dilation}[2]
+                        \times (\text{kernel\_size}[2] - 1) + \text{output\_padding}[2] + 1
+
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
+                         The values of these weights are sampled from
+                         :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
+        bias (Tensor):   the learnable bias of the module of shape (out_channels)
+                         If :attr:`bias` is ``True``, then the values of these weights are
+                         sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                         :math:`k = \frac{groups}{C_\text{out} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
+
+    Examples::
+
+        >>> # With square kernels and equal stride
+        >>> m = nn.ConvTranspose3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2))
+        >>> input = torch.randn(20, 16, 10, 50, 100)
+        >>> output = m(input)
+
+    .. _`here`:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    .. _`Deconvolutional Networks`:
+        https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf
+    """
+    )
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: _size_3_t = 0,
+        output_padding: _size_3_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_3_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Normalize the size arguments with ``_triple`` and initialize the base class.
+
+        The arguments are described in the class documentation.
+
+        Raises:
+            ValueError: from ``_ConvTransposeNd.__init__`` when ``padding_mode``
+                is not ``"zeros"``.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        output_padding = _triple(output_padding)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            True,  # transposed
+            output_padding,
+            groups,
+            bias,
+            padding_mode,
+            **factory_kwargs,
+        )
+
+    def forward(self, input: Tensor, output_size: list[int] | None = None) -> Tensor:
+        """Apply the 3D transposed convolution to ``input``.
+
+        Args:
+            input (Tensor): The input tensor.
+            output_size (list[int], optional): Requested output size. When
+                given, ``_output_padding`` converts it into the
+                ``output_padding`` passed to ``F.conv_transpose3d``. Default is
+                None, which uses the module's configured ``output_padding``.
+
+        Returns:
+            Tensor: the result of ``F.conv_transpose3d``.
+
+        Raises:
+            ValueError: if ``padding_mode`` is not ``"zeros"``, or if
+                ``_output_padding`` rejects ``output_size``.
+        """
+        if self.padding_mode != "zeros":
+            raise ValueError(
+                "Only `zeros` padding mode is supported for ConvTranspose3d"
+            )
+
+        # NOTE(review): presumably a type-narrowing aid rather than input validation.
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        num_spatial_dims = 3
+        output_padding = self._output_padding(
+            input,
+            output_size,
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.kernel_size,  # type: ignore[arg-type]
+            num_spatial_dims,
+            self.dilation,  # type: ignore[arg-type]
+        )
+
+        return F.conv_transpose3d(
+            input,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            output_padding,
+            self.groups,
+            self.dilation,
+        )
+
+
+# TODO: Deprecate and remove the following alias `_ConvTransposeMixin`.
+#
+# `_ConvTransposeMixin` was a mixin that has since been removed.  It was meant
+# to be used with `_ConvNd` to construct actual module classes that implement
+# conv transpose ops:
+#
+#   class MyConvTranspose(_ConvNd, _ConvTransposeMixin):
+#       ...
+#
+# In PyTorch, it has been replaced by `_ConvTransposeNd`, which is a proper
+# subclass of `_ConvNd`.  However, some user code in the wild still (incorrectly)
+# uses the internal class `_ConvTransposeMixin`.  Hence, we provide this alias
+# for BC, because it is cheap and easy for us to do so, even though
+# `_ConvTransposeNd` is really not a mixin anymore (but multiple inheritance as
+# above would still work).
+class _ConvTransposeMixin(_ConvTransposeNd):
+    @deprecated(
+        "`_ConvTransposeMixin` is a deprecated internal class. "
+        "Please consider using public APIs.",
+        category=FutureWarning,
+    )
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)  # forwards all arguments unchanged to _ConvTransposeNd
+
+
+# TODO: Conv2dLocal
+# TODO: Conv2dMap
+# TODO: ConvTranspose2dMap
+
+
+class _LazyConvXdMixin(LazyModuleMixin):
+    groups: int
+    transposed: bool
+    in_channels: int
+    out_channels: int
+    kernel_size: tuple[int, ...]
+    weight: UninitializedParameter
+    bias: UninitializedParameter
+
+    def reset_parameters(self) -> None:
+        # has_uninitialized_params is defined in the parent class, which uses a protocol on self
+        if not self.has_uninitialized_params() and self.in_channels != 0:  # type: ignore[misc]
+            # "type: ignore[..]" is required because mypy thinks that "reset_parameters" is undefined
+            # in the super class. It is in fact defined in _ConvNd, which is inherited by every class
+            # that also inherits _LazyConvXdMixin
+            super().reset_parameters()  # type: ignore[misc]
+
+    # The signature of "initialize_parameters" is incompatible with the definition in supertype LazyModuleMixin
+    def initialize_parameters(self, input: Tensor, *args, **kwargs) -> None:  # type: ignore[override]
+        # has_uninitialized_params is defined by the parent class, which uses a protocol
+        if self.has_uninitialized_params():  # type: ignore[misc]
+            self.in_channels = self._get_in_channels(input)  # inferred from the input's shape
+            if self.in_channels % self.groups != 0:
+                raise ValueError("in_channels must be divisible by groups")
+            assert isinstance(self.weight, UninitializedParameter)
+            if self.transposed:  # transposed layout puts in_channels first
+                self.weight.materialize(
+                    (
+                        self.in_channels,
+                        self.out_channels // self.groups,
+                        *self.kernel_size,
+                    )
+                )
+            else:  # regular layout puts out_channels first
+                self.weight.materialize(
+                    (
+                        self.out_channels,
+                        self.in_channels // self.groups,
+                        *self.kernel_size,
+                    )
+                )
+            if self.bias is not None:
+                assert isinstance(self.bias, UninitializedParameter)
+                self.bias.materialize((self.out_channels,))
+            self.reset_parameters()
+
+    # Extract in_channels from the first input (dim 1 if batched, dim 0 if unbatched).
+    def _get_in_channels(self, input: Tensor) -> int:
+        num_spatial_dims = self._get_num_spatial_dims()
+        num_dims_no_batch = num_spatial_dims + 1  # +1 for channels dim
+        num_dims_batch = num_dims_no_batch + 1  # +1 for batch dim
+        if input.dim() not in (num_dims_no_batch, num_dims_batch):
+            raise RuntimeError(
+                f"Expected {num_dims_no_batch}D (unbatched) or {num_dims_batch}D (batched) input "
+                f"to {self.__class__.__name__}, but "
+                f"got input of size: {input.shape}"
+            )
+        return input.shape[1] if input.dim() == num_dims_batch else input.shape[0]
+
+    # Return the number of spatial dims expected for inputs to the module.
+    # Subclasses are expected to override this; the base version always raises.
+    def _get_num_spatial_dims(self) -> int:
+        raise NotImplementedError
+
+
+# LazyConv1d redefines weight (a Tensor in Conv1d) as an UninitializedParameter
+class LazyConv1d(_LazyConvXdMixin, Conv1d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.Conv1d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv1d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`torch.nn.Conv1d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = Conv1d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 1
+
+
+# LazyConv2d redefines weight (a Tensor in Conv2d) as an UninitializedParameter
+class LazyConv2d(_LazyConvXdMixin, Conv2d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.Conv2d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv2d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`torch.nn.Conv2d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = Conv2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 2
+
+
+# LazyConv3d redefines weight (a Tensor in Conv3d) as an UninitializedParameter
+class LazyConv3d(_LazyConvXdMixin, Conv3d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.Conv3d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv3d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`torch.nn.Conv3d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = Conv3d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 3
+
+
+# LazyConvTranspose1d redefines weight (a Tensor in ConvTranspose1d) as an UninitializedParameter
+class LazyConvTranspose1d(_LazyConvXdMixin, ConvTranspose1d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.ConvTranspose1d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose1d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`torch.nn.ConvTranspose1d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose1d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        output_padding: _size_1_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_1_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 1
+
+
+# LazyConvTranspose2d redefines weight (a Tensor in ConvTranspose2d) as an UninitializedParameter
+class LazyConvTranspose2d(_LazyConvXdMixin, ConvTranspose2d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.ConvTranspose2d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose2d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`torch.nn.ConvTranspose2d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        output_padding: _size_2_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_2_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 2
+
+
+# LazyConvTranspose3d redefines weight (a Tensor in ConvTranspose3d) as an UninitializedParameter
+class LazyConvTranspose3d(_LazyConvXdMixin, ConvTranspose3d):  # type: ignore[misc]
+    r"""A :class:`torch.nn.ConvTranspose3d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose3d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`torch.nn.ConvTranspose3d` and :class:`torch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose3d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: _size_3_t = 0,
+        output_padding: _size_3_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_3_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        # pyrefly: ignore [bad-override, bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            # pyrefly: ignore [bad-override, bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 3
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..27ab92fef5eb4a6da80d97d8559a204a6956ac4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/distance.py
@@ -0,0 +1,100 @@
+import torch.nn.functional as F
+from torch import Tensor
+
+from .module import Module
+
+
+__all__ = ["PairwiseDistance", "CosineSimilarity"]
+
+
+class PairwiseDistance(Module):
+    r"""
+    Computes the pairwise distance between input vectors, or between columns of input matrices.
+
+    Distances are computed using ``p``-norm, with constant ``eps`` added to avoid division by zero
+    if ``p`` is negative, i.e.:
+
+    .. math ::
+        \mathrm{dist}\left(x, y\right) = \left\Vert x-y + \epsilon e \right\Vert_p,
+
+    where :math:`e` is the vector of ones and the ``p``-norm is given by
+
+    .. math ::
+        \Vert x \Vert _p = \left( \sum_{i=1}^n  \vert x_i \vert ^ p \right) ^ {1/p}.
+
+    Args:
+        p (real, optional): the norm degree. Can be negative. Default: 2
+        eps (float, optional): Small value to avoid division by zero.
+            Default: 1e-6
+        keepdim (bool, optional): Determines whether or not to keep the vector dimension.
+            Default: False
+    Shape:
+        - Input1: :math:`(N, D)` or :math:`(D)` where `N = batch dimension` and `D = vector dimension`
+        - Input2: :math:`(N, D)` or :math:`(D)`, same shape as the Input1
+        - Output: :math:`(N)` or :math:`()` based on input dimension.
+          If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension.
+
+    Examples:
+        >>> pdist = nn.PairwiseDistance(p=2)
+        >>> input1 = torch.randn(100, 128)
+        >>> input2 = torch.randn(100, 128)
+        >>> output = pdist(input1, input2)
+    """
+
+    __constants__ = ["norm", "eps", "keepdim"]
+    norm: float
+    eps: float
+    keepdim: bool
+
+    def __init__(
+        self, p: float = 2.0, eps: float = 1e-6, keepdim: bool = False
+    ) -> None:
+        super().__init__()
+        self.norm = p  # the constructor argument ``p`` is stored under the name ``norm``
+        self.eps = eps
+        self.keepdim = keepdim
+
+    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
+        """
+        Compute the pairwise distance between ``x1`` and ``x2`` via ``F.pairwise_distance``.
+        """
+        return F.pairwise_distance(x1, x2, self.norm, self.eps, self.keepdim)
+
+
+class CosineSimilarity(Module):
+    r"""Returns cosine similarity between :math:`x_1` and :math:`x_2`, computed along `dim`.
+
+    .. math ::
+        \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)}.
+
+    Args:
+        dim (int, optional): Dimension where cosine similarity is computed. Default: 1
+        eps (float, optional): Small value to avoid division by zero.
+            Default: 1e-8
+    Shape:
+        - Input1: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`
+        - Input2: :math:`(\ast_1, D, \ast_2)`, same number of dimensions as x1, matching x1 size at dimension `dim`,
+          and broadcastable with x1 at other dimensions.
+        - Output: :math:`(\ast_1, \ast_2)`
+
+    Examples:
+        >>> input1 = torch.randn(100, 128)
+        >>> input2 = torch.randn(100, 128)
+        >>> cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+        >>> output = cos(input1, input2)
+    """
+
+    __constants__ = ["dim", "eps"]
+    dim: int
+    eps: float
+
+    def __init__(self, dim: int = 1, eps: float = 1e-8) -> None:
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+
+    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
+        """
+        Compute the cosine similarity of ``x1`` and ``x2`` along ``self.dim`` via ``F.cosine_similarity``.
+        """
+        return F.cosine_similarity(x1, x2, self.dim, self.eps)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3de5d61dc0b56d6f708a242611bfc5b2850288
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/dropout.py
@@ -0,0 +1,323 @@
+import torch.nn.functional as F
+from torch import Tensor
+
+from .module import Module
+
+
+__all__ = [
+    "Dropout",
+    "Dropout1d",
+    "Dropout2d",
+    "Dropout3d",
+    "AlphaDropout",
+    "FeatureAlphaDropout",
+]
+
+
+class _DropoutNd(Module):
+    """Common base of the dropout modules: validates ``p`` and stores ``p`` and ``inplace``."""
+
+    __constants__ = ["p", "inplace"]
+    p: float
+    inplace: bool
+
+    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
+        super().__init__()
+        if p < 0 or p > 1:
+            msg = f"dropout probability has to be between 0 and 1, but got {p}"
+            raise ValueError(msg)
+        self.p, self.inplace = p, inplace
+
+    def extra_repr(self) -> str:
+        return f"p={self.p}, inplace={self.inplace}"
+
+
+class Dropout(_DropoutNd):
+    r"""During training, randomly zeroes some of the elements of the input tensor with probability :attr:`p`.
+
+    The zeroed elements are chosen independently for each forward call and are sampled from a Bernoulli distribution.
+
+    Every element is zeroed out independently of the others on every forward call.
+
+    This has proven to be an effective technique for regularization and
+    preventing the co-adaptation of neurons as described in the paper
+    `Improving neural networks by preventing co-adaptation of feature
+    detectors`_ .
+
+    Furthermore, the outputs are scaled by a factor of :math:`\frac{1}{1-p}` during
+    training. This means that during evaluation the module simply computes an
+    identity function.
+
+    Args:
+        p: probability of an element to be zeroed. Default: 0.5
+        inplace: If set to ``True``, will do this operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input
+
+    Examples::
+
+        >>> m = nn.Dropout(p=0.2)
+        >>> input = torch.randn(20, 16)
+        >>> output = m(input)
+
+    .. _Improving neural networks by preventing co-adaptation of feature
+        detectors: https://arxiv.org/abs/1207.0580
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.dropout`` with the stored ``p``, ``training`` and ``inplace`` values."""
+        return F.dropout(
+            input, p=self.p, training=self.training, inplace=self.inplace
+        )
+
+
+class Dropout1d(_DropoutNd):
+    r"""Randomly zero out entire channels.
+
+    A channel is a 1D feature map,
+    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
+    batched input is a 1D tensor :math:`\text{input}[i, j]`.
+
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
+
+    Usually the input comes from :class:`nn.Conv1d` modules.
+
+    As described in the paper
+    `Efficient Object Localization Using Convolutional Networks`_ ,
+    if adjacent pixels within feature maps are strongly correlated
+    (as is normally the case in early convolution layers) then i.i.d. dropout
+    will not regularize the activations and will otherwise just result
+    in an effective learning rate decrease.
+
+    In this case, :class:`nn.Dropout1d` will help promote independence between
+    feature maps and should be used instead.
+
+    Args:
+        p (float, optional): probability of a channel to be zeroed. Default: 0.5
+        inplace (bool, optional): If set to ``True``, will do this operation
+            in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L)` or :math:`(C, L)`.
+        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input).
+
+    Examples::
+
+        >>> m = nn.Dropout1d(p=0.2)
+        >>> input = torch.randn(20, 16, 32)
+        >>> output = m(input)
+
+    .. _Efficient Object Localization Using Convolutional Networks:
+       https://arxiv.org/abs/1411.4280
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.dropout1d`` with the stored ``p``, ``training`` and ``inplace`` values."""
+        return F.dropout1d(
+            input, p=self.p, training=self.training, inplace=self.inplace
+        )
+
+
+class Dropout2d(_DropoutNd):
+    r"""Randomly zero out entire channels.
+
+    A channel is a 2D feature map,
+    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
+    batched input is a 2D tensor :math:`\text{input}[i, j]`.
+
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
+
+    Usually the input comes from :class:`nn.Conv2d` modules.
+
+    As described in the paper
+    `Efficient Object Localization Using Convolutional Networks`_ ,
+    if adjacent pixels within feature maps are strongly correlated
+    (as is normally the case in early convolution layers) then i.i.d. dropout
+    will not regularize the activations and will otherwise just result
+    in an effective learning rate decrease.
+
+    In this case, :class:`nn.Dropout2d` will help promote independence between
+    feature maps and should be used instead.
+
+    Args:
+        p (float, optional): probability of a channel to be zeroed. Default: 0.5
+        inplace (bool, optional): If set to ``True``, will do this operation
+            in-place. Default: ``False``
+
+    .. warning ::
+        Due to historical reasons, this class will perform 1D channel-wise dropout
+        for 3D inputs (as done by :class:`nn.Dropout1d`). Thus, it currently does NOT
+        support inputs without a batch dimension of shape :math:`(C, H, W)`. This
+        behavior will change in a future release to interpret 3D inputs as no-batch-dim
+        inputs. To maintain the old behavior, switch to :class:`nn.Dropout1d`.
+
+    Shape:
+        - Input: :math:`(N, C, H, W)` or :math:`(N, C, L)`.
+        - Output: :math:`(N, C, H, W)` or :math:`(N, C, L)` (same shape as input).
+
+    Examples::
+
+        >>> m = nn.Dropout2d(p=0.2)
+        >>> input = torch.randn(20, 16, 32, 32)
+        >>> output = m(input)
+
+    .. _Efficient Object Localization Using Convolutional Networks:
+       https://arxiv.org/abs/1411.4280
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.dropout2d`` with the stored ``p``, ``training`` and ``inplace`` values."""
+        return F.dropout2d(
+            input, p=self.p, training=self.training, inplace=self.inplace
+        )
+
+
+class Dropout3d(_DropoutNd):
+    r"""Randomly zero out entire channels.
+
+    A channel is a 3D feature map,
+    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
+    batched input is a 3D tensor :math:`\text{input}[i, j]`.
+
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
+
+    Usually the input comes from :class:`nn.Conv3d` modules.
+
+    As described in the paper
+    `Efficient Object Localization Using Convolutional Networks`_ ,
+    if adjacent pixels within feature maps are strongly correlated
+    (as is normally the case in early convolution layers) then i.i.d. dropout
+    will not regularize the activations and will otherwise just result
+    in an effective learning rate decrease.
+
+    In this case, :class:`nn.Dropout3d` will help promote independence between
+    feature maps and should be used instead.
+
+    Args:
+        p (float, optional): probability of a channel to be zeroed. Default: 0.5
+        inplace (bool, optional): If set to ``True``, will do this operation
+            in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
+        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).
+
+    Examples::
+
+        >>> m = nn.Dropout3d(p=0.2)
+        >>> input = torch.randn(20, 16, 4, 32, 32)
+        >>> output = m(input)
+
+    .. _Efficient Object Localization Using Convolutional Networks:
+       https://arxiv.org/abs/1411.4280
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.dropout3d`` with the stored ``p``, ``training`` and ``inplace`` values."""
+        return F.dropout3d(
+            input, p=self.p, training=self.training, inplace=self.inplace
+        )
+
+
+class AlphaDropout(_DropoutNd):
+    r"""Applies Alpha Dropout over the input.
+
+    Alpha Dropout is a type of Dropout that maintains the self-normalizing
+    property.
+    For an input with zero mean and unit standard deviation, the output of
+    Alpha Dropout maintains the original mean and standard deviation of the
+    input.
+    Alpha Dropout goes hand-in-hand with SELU activation function, which ensures
+    that the outputs have zero mean and unit standard deviation.
+
+    During training, it randomly masks some of the elements of the input
+    tensor with probability *p* using samples from a Bernoulli distribution.
+    The elements to be masked are randomized on every forward call, and scaled
+    and shifted to maintain zero mean and unit standard deviation.
+
+    During evaluation the module simply computes an identity function.
+
+    More details can be found in the paper `Self-Normalizing Neural Networks`_ .
+
+    Args:
+        p (float): probability of an element to be dropped. Default: 0.5
+        inplace (bool, optional): If set to ``True``, will do this operation
+            in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input
+
+    Examples::
+
+        >>> m = nn.AlphaDropout(p=0.2)
+        >>> input = torch.randn(20, 16)
+        >>> output = m(input)
+
+    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.alpha_dropout`` with the stored ``p``, ``training`` and ``inplace`` values."""
+        # ``inplace`` must be forwarded explicitly: without it the functional
+        # falls back to its own default and a requested in-place run is lost.
+        return F.alpha_dropout(input, self.p, self.training, self.inplace)
+
+
+class FeatureAlphaDropout(_DropoutNd):
+    r"""Randomly masks out entire channels.
+
+    A channel is a feature map
+    (e.g. the :math:`j`-th channel of the :math:`i`-th sample in the batch input
+    is a tensor :math:`\text{input}[i, j]` of the input tensor). Instead of
+    setting activations to zero, as in regular Dropout, the activations are set
+    to the negative saturation value of the SELU activation function. More details
+    can be found in the paper `Self-Normalizing Neural Networks`_ .
+
+    Each element will be masked independently for each sample on every forward
+    call with probability :attr:`p` using samples from a Bernoulli distribution.
+    The elements to be masked are randomized on every forward call, and scaled
+    and shifted to maintain zero mean and unit variance.
+
+    Usually the input comes from :class:`nn.AlphaDropout` modules.
+
+    As described in the paper
+    `Efficient Object Localization Using Convolutional Networks`_ ,
+    if adjacent pixels within feature maps are strongly correlated
+    (as is normally the case in early convolution layers) then i.i.d. dropout
+    will not regularize the activations and will otherwise just result
+    in an effective learning rate decrease.
+
+    In this case, :class:`nn.FeatureAlphaDropout` will help promote independence between
+    feature maps and should be used instead.
+
+    Args:
+        p (float, optional): probability of an element to be masked. Default: 0.5
+        inplace (bool, optional): If set to ``True``, will do this operation
+            in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
+        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).
+
+    Examples::
+
+        >>> m = nn.FeatureAlphaDropout(p=0.2)
+        >>> input = torch.randn(20, 16, 4, 32, 32)
+        >>> output = m(input)
+
+    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
+    .. _Efficient Object Localization Using Convolutional Networks:
+       https://arxiv.org/abs/1411.4280
+    """
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply ``F.feature_alpha_dropout`` with the stored ``p``, ``training`` and ``inplace``."""
+        # ``inplace`` must be forwarded explicitly: without it the functional
+        # falls back to its own default and a requested in-place run is lost.
+        return F.feature_alpha_dropout(input, self.p, self.training, self.inplace)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..146a1890d422475712c9d62d0ff841530282d30e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/flatten.py
@@ -0,0 +1,167 @@
+# mypy: allow-untyped-defs
+
+from torch import Tensor
+from torch.types import _size
+
+from .module import Module
+
+
+__all__ = ["Flatten", "Unflatten"]
+
+
+class Flatten(Module):
+    r"""
+    Flattens a contiguous range of dims into a tensor.
+
+    For use with :class:`~nn.Sequential`, see :meth:`torch.flatten` for details.
+
+    Shape:
+        - Input: :math:`(*, S_{\text{start}},..., S_{i}, ..., S_{\text{end}}, *)`,
+          where :math:`S_{i}` is the size at dimension :math:`i` and :math:`*` means any
+          number of dimensions including none.
+        - Output: :math:`(*, \prod_{i=\text{start}}^{\text{end}} S_{i}, *)`.
+
+    Args:
+        start_dim: first dim to flatten (default = 1).
+        end_dim: last dim to flatten (default = -1).
+
+    Examples::
+        >>> input = torch.randn(32, 1, 5, 5)
+        >>> # With default parameters
+        >>> m = nn.Flatten()
+        >>> output = m(input)
+        >>> output.size()
+        torch.Size([32, 25])
+        >>> # With non-default parameters
+        >>> m = nn.Flatten(0, 2)
+        >>> output = m(input)
+        >>> output.size()
+        torch.Size([160, 5])
+    """
+
+    __constants__ = ["start_dim", "end_dim"]
+    start_dim: int
+    end_dim: int
+
+    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
+        super().__init__()
+        # Keep the inclusive dim range exactly as given.
+        self.start_dim, self.end_dim = start_dim, end_dim
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Return ``input`` flattened over the stored range of dims."""
+        # Both bounds go to ``Tensor.flatten`` positionally.
+        first, last = self.start_dim, self.end_dim
+        return input.flatten(first, last)
+
+    def extra_repr(self) -> str:
+        """Return the ``start_dim``/``end_dim`` summary used in the module repr."""
+        # Read the bounds once, then format them.
+        first, last = self.start_dim, self.end_dim
+        return f"start_dim={first}, end_dim={last}"
+
+
+class Unflatten(Module):
+    r"""
+    Unflattens a tensor dim expanding it to a desired shape. For use with :class:`~nn.Sequential`.
+
+    * :attr:`dim` specifies the dimension of the input tensor to be unflattened, and it can
+      be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively.
+
+    * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be
+      a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input;  a `NamedShape`
+      (tuple of `(name, size)` tuples) for `NamedTensor` input.
+
+    Shape:
+        - Input: :math:`(*, S_{\text{dim}}, *)`, where :math:`S_{\text{dim}}` is the size at
+          dimension :attr:`dim` and :math:`*` means any number of dimensions including none.
+        - Output: :math:`(*, U_1, ..., U_n, *)`, where :math:`U` = :attr:`unflattened_size` and
+          :math:`\prod_{i=1}^n U_i = S_{\text{dim}}`.
+
+    Args:
+        dim (Union[int, str]): Dimension to be unflattened
+        unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension
+
+    Examples:
+        >>> input = torch.randn(2, 50)
+        >>> # With tuple of ints
+        >>> m = nn.Sequential(
+        >>>     nn.Linear(50, 50),
+        >>>     nn.Unflatten(1, (2, 5, 5))
+        >>> )
+        >>> output = m(input)
+        >>> output.size()
+        torch.Size([2, 2, 5, 5])
+        >>> # With torch.Size
+        >>> m = nn.Sequential(
+        >>>     nn.Linear(50, 50),
+        >>>     nn.Unflatten(1, torch.Size([2, 5, 5]))
+        >>> )
+        >>> output = m(input)
+        >>> output.size()
+        torch.Size([2, 2, 5, 5])
+        >>> # With namedshape (tuple of tuples)
+        >>> input = torch.randn(2, 50, names=("N", "features"))
+        >>> unflatten = nn.Unflatten("features", (("C", 2), ("H", 5), ("W", 5)))
+        >>> output = unflatten(input)
+        >>> output.size()
+        torch.Size([2, 2, 5, 5])
+    """
+
+    NamedShape = tuple[tuple[str, int], ...]  # any number of (name, size) pairs
+
+    __constants__ = ["dim", "unflattened_size"]
+    dim: int | str
+    unflattened_size: _size | NamedShape
+
+    def __init__(self, dim: int | str, unflattened_size: _size | NamedShape) -> None:
+        super().__init__()
+
+        if isinstance(dim, int):
+            self._require_tuple_int(unflattened_size)
+        elif isinstance(dim, str):
+            self._require_tuple_tuple(unflattened_size)
+        else:
+            raise TypeError("invalid argument type for dim parameter")
+
+        self.dim = dim
+        self.unflattened_size = unflattened_size
+
+    def _require_tuple_tuple(self, input) -> None:
+        """Raise ``TypeError`` unless ``input`` is a tuple whose elements are all tuples."""
+        if not isinstance(input, tuple):
+            raise TypeError(
+                "unflattened_size must be a tuple of tuples, "
+                + f"but found type {type(input).__name__}"
+            )
+        for idx, elem in enumerate(input):
+            if not isinstance(elem, tuple):
+                raise TypeError(
+                    "unflattened_size must be tuple of tuples, "
+                    + f"but found element of type {type(elem).__name__} at pos {idx}"
+                )
+
+    def _require_tuple_int(self, input) -> None:
+        """Raise ``TypeError`` unless ``input`` is a tuple or list containing only ints."""
+        if not isinstance(input, (tuple, list)):
+            raise TypeError(
+                f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}"
+            )
+        for idx, elem in enumerate(input):
+            if not isinstance(elem, int):
+                raise TypeError(
+                    "unflattened_size must be tuple of ints, "
+                    + f"but found element of type {type(elem).__name__} at pos {idx}"
+                )
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Returns ``input.unflatten(dim, unflattened_size)`` using the stored values.
+        """
+        return input.unflatten(self.dim, self.unflattened_size)
+
+    def extra_repr(self) -> str:
+        """
+        Returns the ``dim``/``unflattened_size`` summary used in the module repr.
+        """
+        return f"dim={self.dim}, unflattened_size={self.unflattened_size}"
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab1a58882c852370141e1e1dd911278334b425d8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/fold.py
@@ -0,0 +1,335 @@
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.common_types import _size_any_t
+
+from .module import Module
+
+
+__all__ = ["Fold", "Unfold"]
+
+
+class Fold(Module):
+    (
+        r"""Combines an array of sliding local blocks into a large containing tensor.
+
+    Consider a batched :attr:`input` tensor containing sliding local blocks,
+    e.g., patches of images, of shape :math:`(N, C \times  \prod(\text{kernel\_size}), L)`,
+    where :math:`N` is batch dimension, :math:`C \times \prod(\text{kernel\_size})`
+    is the number of values within a block (a block has :math:`\prod(\text{kernel\_size})`
+    spatial locations each containing a :math:`C`-channeled vector), and
+    :math:`L` is the total number of blocks. (This is exactly the
+    same specification as the output shape of :class:`~torch.nn.Unfold`.) This
+    operation combines these local blocks into the large :attr:`output` tensor
+    of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
+    by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the
+    arguments must satisfy
+
+    .. math::
+        L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] %
+            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,
+
+    where :math:`d` is over all spatial dimensions.
+
+    * :attr:`output_size` describes the spatial shape of the large containing
+      tensor of the sliding local blocks. It is useful to resolve the ambiguity
+      when multiple input shapes map to the same number of sliding blocks, e.g.,
+      with ``stride > 0``.
+
+    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
+    how the sliding blocks are retrieved.
+
+    * :attr:`stride` controls the stride for the sliding blocks.
+
+    * :attr:`padding` controls the amount of implicit zero-paddings on both
+      sides for :attr:`padding` number of points for each dimension before
+      reshaping.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    Args:
+        output_size (int or tuple): the shape of the spatial dimensions of the
+                                    output (i.e., ``output.sizes()[2:]``)
+        kernel_size (int or tuple): the size of the sliding blocks
+        dilation (int or tuple, optional): a parameter that controls the
+                                           stride of elements within the
+                                           neighborhood. Default: 1
+        padding (int or tuple, optional): implicit zero padding to be added on
+                                          both sides of input. Default: 0
+        stride (int or tuple, optional): the stride of the sliding blocks in the input
+                                         spatial dimensions. Default: 1
+
+    * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`,
+      :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then
+      their values will be replicated across all spatial dimensions.
+
+    * For the case of two output spatial dimensions this operation is sometimes
+      called ``col2im``.
+
+    .. note::
+        :class:`~torch.nn.Fold` calculates each combined value in the resulting
+        large tensor by summing all values from all containing blocks.
+        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
+        copying from the large tensor. So, if the blocks overlap, they are not
+        inverses of each other.
+
+        In general, folding and unfolding operations are related as
+        follows. Consider :class:`~torch.nn.Fold` and
+        :class:`~torch.nn.Unfold` instances created with the same
+        parameters:
+
+        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
+        >>> fold = nn.Fold(output_size=..., **fold_params)
+        >>> unfold = nn.Unfold(**fold_params)
+
+        Then for any (supported) ``input`` tensor the following
+        equality holds:
+
+        ::
+
+            fold(unfold(input)) == divisor * input
+
+        where ``divisor`` is a tensor that depends only on the shape
+        and dtype of the ``input``:
+
+        >>> # xdoctest: +SKIP
+        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
+        >>> divisor = fold(unfold(input_ones))
+
+        When the ``divisor`` tensor contains no zero elements, then
+        ``fold`` and ``unfold`` operations are inverses of each
+        other (up to constant divisor).
+
+    .. warning::
+        Currently, only unbatched (3D) or batched (4D) image-like output tensors are supported.
+
+    Shape:
+        - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)`
+        - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
+          or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above
+
+    Examples::
+
+        >>> fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2))
+        >>> input = torch.randn(1, 3 * 2 * 2, 12)
+        >>> output = fold(input)
+        >>> output.size()
+        torch.Size([1, 3, 4, 5])
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    """
+    )
+
+    __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"]
+    output_size: _size_any_t
+    kernel_size: _size_any_t
+    dilation: _size_any_t
+    padding: _size_any_t
+    stride: _size_any_t
+
+    def __init__(
+        self,
+        output_size: _size_any_t,
+        kernel_size: _size_any_t,
+        dilation: _size_any_t = 1,
+        padding: _size_any_t = 0,
+        stride: _size_any_t = 1,
+    ) -> None:
+        """Store the fold settings unchanged; nothing is validated here."""
+        super().__init__()
+        # Same attribute names as the arguments; ``forward`` and
+        # ``extra_repr`` read them back later.
+        self.output_size, self.kernel_size = output_size, kernel_size
+        self.dilation, self.padding, self.stride = dilation, padding, stride
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Return ``F.fold`` of ``input`` under the stored settings."""
+        # Keywords spell out which stored setting feeds which
+        # ``F.fold`` parameter.
+        return F.fold(
+            input,
+            output_size=self.output_size,
+            kernel_size=self.kernel_size,
+            dilation=self.dilation,
+            padding=self.padding,
+            stride=self.stride,
+        )
+
+    def extra_repr(self) -> str:
+        """Return the ``name=value`` summary of the fold settings."""
+        # The template is filled from ``self.__dict__``, so it relies on the
+        # values assigned in ``__init__`` being present there; keys that the
+        # template does not name are ignored by ``str.format``.
+        template = (
+            "output_size={output_size}, kernel_size={kernel_size}, "
+            "dilation={dilation}, padding={padding}, stride={stride}"
+        )
+        return template.format(**self.__dict__)
+
+
+class Unfold(Module):
+    (
+        r"""Extracts sliding local blocks from a batched input tensor.
+
+    Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`,
+    where :math:`N` is the batch dimension, :math:`C` is the channel dimension,
+    and :math:`*` represent arbitrary spatial dimensions. This operation flattens
+    each sliding :attr:`kernel_size`-sized block within the spatial dimensions
+    of :attr:`input` into a column (i.e., last dimension) of a 3-D :attr:`output`
+    tensor of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, where
+    :math:`C \times \prod(\text{kernel\_size})` is the total number of values
+    within each block (a block has :math:`\prod(\text{kernel\_size})` spatial
+    locations each containing a :math:`C`-channeled vector), and :math:`L` is
+    the total number of such blocks:
+
+    .. math::
+        L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] %
+            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,
+
+    where :math:`\text{spatial\_size}` is formed by the spatial dimensions
+    of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial
+    dimensions.
+
+    Therefore, indexing :attr:`output` at the last dimension (column dimension)
+    gives all values within a certain block.
+
+    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
+    how the sliding blocks are retrieved.
+
+    * :attr:`stride` controls the stride for the sliding blocks.
+
+    * :attr:`padding` controls the amount of implicit zero-paddings on both
+      sides for :attr:`padding` number of points for each dimension before
+      reshaping.
+"""
+        """
+    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
+      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+"""
+        r"""
+    Args:
+        kernel_size (int or tuple): the size of the sliding blocks
+        dilation (int or tuple, optional): a parameter that controls the
+                                           stride of elements within the
+                                           neighborhood. Default: 1
+        padding (int or tuple, optional): implicit zero padding to be added on
+                                          both sides of input. Default: 0
+        stride (int or tuple, optional): the stride of the sliding blocks in the input
+                                         spatial dimensions. Default: 1
+
+    * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or
+      :attr:`stride` is an int or a tuple of length 1, their values will be
+      replicated across all spatial dimensions.
+
+    * For the case of two input spatial dimensions this operation is sometimes
+      called ``im2col``.
+
+    .. note::
+        :class:`~torch.nn.Fold` calculates each combined value in the resulting
+        large tensor by summing all values from all containing blocks.
+        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
+        copying from the large tensor. So, if the blocks overlap, they are not
+        inverses of each other.
+
+        In general, folding and unfolding operations are related as
+        follows. Consider :class:`~torch.nn.Fold` and
+        :class:`~torch.nn.Unfold` instances created with the same
+        parameters:
+
+        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
+        >>> fold = nn.Fold(output_size=..., **fold_params)
+        >>> unfold = nn.Unfold(**fold_params)
+
+        Then for any (supported) ``input`` tensor the following
+        equality holds:
+
+        ::
+
+            fold(unfold(input)) == divisor * input
+
+        where ``divisor`` is a tensor that depends only on the shape
+        and dtype of the ``input``:
+
+        >>> # xdoctest: +SKIP
+        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
+        >>> divisor = fold(unfold(input_ones))
+
+        When the ``divisor`` tensor contains no zero elements, then
+        ``fold`` and ``unfold`` operations are inverses of each
+        other (up to constant divisor).
+
+    .. warning::
+        Currently, only 4-D input tensors (batched image-like tensors) are
+        supported.
+
+    Shape:
+        - Input: :math:`(N, C, *)`
+        - Output: :math:`(N, C \times \prod(\text{kernel\_size}), L)` as described above
+
+    Examples::
+
+        >>> unfold = nn.Unfold(kernel_size=(2, 3))
+        >>> input = torch.randn(2, 5, 3, 4)
+        >>> output = unfold(input)
+        >>> # each patch contains 30 values (2x3=6 vectors, each of 5 channels)
+        >>> # 4 blocks (2x3 kernels) in total in the 3x4 input
+        >>> output.size()
+        torch.Size([2, 30, 4])
+
+        >>> # xdoctest: +IGNORE_WANT
+        >>> # Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
+        >>> inp = torch.randn(1, 3, 10, 12)
+        >>> w = torch.randn(2, 3, 4, 5)
+        >>> inp_unf = torch.nn.functional.unfold(inp, (4, 5))
+        >>> out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
+        >>> out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1))
+        >>> # or equivalently (and avoiding a copy),
+        >>> # out = out_unf.view(1, 2, 7, 8)
+        >>> (torch.nn.functional.conv2d(inp, w) - out).abs().max()
+        tensor(1.9073e-06)
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+
+    """
+    )
+
+    __constants__ = ["kernel_size", "dilation", "padding", "stride"]
+    kernel_size: _size_any_t
+    dilation: _size_any_t
+    padding: _size_any_t
+    stride: _size_any_t
+
+    def __init__(
+        self,
+        kernel_size: _size_any_t,
+        dilation: _size_any_t = 1,
+        padding: _size_any_t = 0,
+        stride: _size_any_t = 1,
+    ) -> None:
+        """Store the unfold settings unchanged; nothing is validated here."""
+        super().__init__()
+        # Same attribute names as the arguments.
+        self.kernel_size, self.dilation = kernel_size, dilation
+        self.padding, self.stride = padding, stride
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Return ``F.unfold`` of ``input`` under the stored settings."""
+        kernel_size, dilation = self.kernel_size, self.dilation
+        padding, stride = self.padding, self.stride
+        return F.unfold(
+            input, kernel_size, dilation=dilation, padding=padding, stride=stride
+        )
+
+    def extra_repr(self) -> str:
+        """Return the ``name=value`` summary of the unfold settings."""
+        # Filled from ``self.__dict__``; relies on the __init__ values being there.
+        template = (
+            "kernel_size={kernel_size}, dilation={dilation}, padding={padding},"
+            " stride={stride}"
+        )
+        return template.format(**self.__dict__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..058ffb3ed9aa9fa9bf496c709b4f3e6c48e72178
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/instancenorm.py
@@ -0,0 +1,472 @@
+# mypy: allow-untyped-defs
+
+import warnings
+
+import torch.nn.functional as F
+from torch import Tensor
+
+from .batchnorm import _LazyNormBase, _NormBase
+
+
+__all__ = [
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+    "LazyInstanceNorm1d",
+    "LazyInstanceNorm2d",
+    "LazyInstanceNorm3d",
+]
+
+
+class _InstanceNorm(_NormBase):
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+        momentum: float = 0.1,
+        affine: bool = False,
+        track_running_stats: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
+        )
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError
+
+    def _get_no_batch_dim(self):
+        raise NotImplementedError
+
+    def _handle_no_batch_input(self, input):
+        return self._apply_instance_norm(input.unsqueeze(0)).squeeze(0)
+
+    def _apply_instance_norm(self, input):
+        return F.instance_norm(
+            input,
+            self.running_mean,
+            self.running_var,
+            self.weight,
+            self.bias,
+            self.training or not self.track_running_stats,
+            self.momentum if self.momentum is not None else 0.0,
+            self.eps,
+        )
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        version = local_metadata.get("version", None)
+        # at version 1: removed running_mean and running_var when
+        # track_running_stats=False (default)
+        if version is None and not self.track_running_stats:
+            running_stats_keys = []
+            for name in ("running_mean", "running_var"):
+                key = prefix + name
+                if key in state_dict:
+                    running_stats_keys.append(key)
+            if len(running_stats_keys) > 0:
+                error_msgs.append(
+                    "Unexpected running stats buffer(s) {names} for {klass} "
+                    "with track_running_stats=False. If state_dict is a "
+                    "checkpoint saved before 0.4.0, this may be expected "
+                    "because {klass} does not track running stats by default "
+                    "since 0.4.0. Please remove these keys from state_dict. If "
+                    "the running stats are actually needed, instead set "
+                    "track_running_stats=True in {klass} to enable them. See "
+                    "the documentation of {klass} for details.".format(
+                        names=" and ".join(f'"{k}"' for k in running_stats_keys),
+                        klass=self.__class__.__name__,
+                    )
+                )
+                for key in running_stats_keys:
+                    state_dict.pop(key)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        self._check_input_dim(input)
+
+        feature_dim = input.dim() - self._get_no_batch_dim()
+        if input.size(feature_dim) != self.num_features:
+            if self.affine:
+                raise ValueError(
+                    f"expected input's size at dim={feature_dim} to match num_features"
+                    f" ({self.num_features}), but got: {input.size(feature_dim)}."
+                )
+            else:
+                warnings.warn(
+                    f"input's size at dim={feature_dim} does not match num_features. "
+                    "You can silence this warning by not passing in num_features, "
+                    "which is not used because affine=False",
+                    stacklevel=2,
+                )
+
+        if input.dim() == self._get_no_batch_dim():
+            return self._handle_no_batch_input(input)
+
+        return self._apply_instance_norm(input)
+
+
+class InstanceNorm1d(_InstanceNorm):
+    r"""Applies Instance Normalization.
+
+    This operation applies Instance Normalization
+    over a 2D (unbatched) or 3D (batched) input as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``.
+    The variance is calculated via the biased estimator, equivalent to
+    `torch.var(input, correction=0)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm1d` is applied
+        on each channel of channeled data like multidimensional time series, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm1d` usually doesn't apply affine
+        transform.
+
+    Args:
+        num_features: number of features or channels :math:`C` of the input
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L)` or :math:`(C, L)`
+        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input)
+
+    Examples::
+
+        >>> # Without Learnable Parameters
+        >>> m = nn.InstanceNorm1d(100)
+        >>> # With Learnable Parameters
+        >>> m = nn.InstanceNorm1d(100, affine=True)
+        >>> input = torch.randn(20, 100, 40)
+        >>> output = m(input)
+    """
+
+    def _get_no_batch_dim(self) -> int:
+        return 2
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (2, 3):
+            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
+
+
+class LazyInstanceNorm1d(_LazyNormBase, _InstanceNorm):
+    r"""A :class:`torch.nn.InstanceNorm1d` module with lazy initialization of the ``num_features`` argument.
+
+    The ``num_features`` argument of the :class:`InstanceNorm1d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`(C, L)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L)` or :math:`(C, L)`
+        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input)
+    """
+
+    cls_to_become = InstanceNorm1d  # type: ignore[assignment]
+
+    def _get_no_batch_dim(self) -> int:
+        return 2
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (2, 3):
+            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
+
+
+class InstanceNorm2d(_InstanceNorm):
+    r"""Applies Instance Normalization.
+
+    This operation applies Instance Normalization
+    over a 4D input (a mini-batch of 2D inputs
+    with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``.
+    The variance is calculated via the biased estimator, equivalent to
+    `torch.var(input, correction=0)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm2d` is applied
+        on each channel of channeled data like RGB images, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm2d` usually doesn't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)` or :math:`(C, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`
+        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
+
+    Examples::
+
+        >>> # Without Learnable Parameters
+        >>> m = nn.InstanceNorm2d(100)
+        >>> # With Learnable Parameters
+        >>> m = nn.InstanceNorm2d(100, affine=True)
+        >>> input = torch.randn(20, 100, 35, 45)
+        >>> output = m(input)
+    """
+
+    def _get_no_batch_dim(self) -> int:
+        return 3
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (3, 4):
+            raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)")
+
+
+class LazyInstanceNorm2d(_LazyNormBase, _InstanceNorm):
+    r"""A :class:`torch.nn.InstanceNorm2d` module with lazy initialization of the ``num_features`` argument.
+
+    The ``num_features`` argument of the :class:`InstanceNorm2d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`,
+    `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, H, W)` or :math:`(C, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`
+        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
+    """
+
+    cls_to_become = InstanceNorm2d  # type: ignore[assignment]
+
+    def _get_no_batch_dim(self) -> int:
+        return 3
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (3, 4):
+            raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)")
+
+
+class InstanceNorm3d(_InstanceNorm):
+    r"""Applies Instance Normalization.
+
+    This operation applies Instance Normalization
+    over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper
+    `Instance Normalization: The Missing Ingredient for Fast Stylization
+    <https://arxiv.org/abs/1607.08022>`__.
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated per-dimension separately
+    for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+    of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``.
+    The variance is calculated via the biased estimator, equivalent to
+    `torch.var(input, correction=0)`.
+
+    By default, this layer uses instance statistics computed from input data in
+    both training and evaluation modes.
+
+    If :attr:`track_running_stats` is set to ``True``, during training this
+    layer keeps running estimates of its computed mean and variance, which are
+    then used for normalization during evaluation. The running estimates are
+    kept with a default :attr:`momentum` of 0.1.
+
+    .. note::
+        This :attr:`momentum` argument is different from one used in optimizer
+        classes and the conventional notion of momentum. Mathematically, the
+        update rule for running statistics here is
+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
+        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+        new observed value.
+
+    .. note::
+        :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but
+        have some subtle differences. :class:`InstanceNorm3d` is applied
+        on each channel of channeled data like 3D models with RGB color, but
+        :class:`LayerNorm` is usually applied on entire sample and often in NLP
+        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
+        transform, while :class:`InstanceNorm3d` usually doesn't apply affine
+        transform.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input)
+
+    Examples::
+
+        >>> # Without Learnable Parameters
+        >>> m = nn.InstanceNorm3d(100)
+        >>> # With Learnable Parameters
+        >>> m = nn.InstanceNorm3d(100, affine=True)
+        >>> input = torch.randn(20, 100, 35, 45, 10)
+        >>> output = m(input)
+    """
+
+    def _get_no_batch_dim(self) -> int:
+        return 4
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (4, 5):
+            raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")
+
+
+class LazyInstanceNorm3d(_LazyNormBase, _InstanceNorm):
+    r"""A :class:`torch.nn.InstanceNorm3d` module with lazy initialization of the ``num_features`` argument.
+
+    The ``num_features`` argument of the :class:`InstanceNorm3d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`,
+    `running_mean` and `running_var`.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses instance
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input)
+    """
+
+    cls_to_become = InstanceNorm3d  # type: ignore[assignment]
+
+    def _get_no_batch_dim(self) -> int:
+        return 4
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (4, 5):
+            raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..72d90d1c10364ea380b1f27069dd69dda6ec80cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/lazy.py
@@ -0,0 +1,278 @@
+# mypy: allow-untyped-defs
+import itertools
+from typing import Any, Protocol
+
+import torch
+from torch.nn.parameter import is_lazy
+
+
+__all__ = ["LazyModuleMixin"]
+
+
+class _LazyProtocol(Protocol):
+    """This class is used to avoid errors with mypy checks for the attributes in a mixin.
+
+    https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes
+    """
+
+    def _register_load_state_dict_pre_hook(self, hook): ...
+
+    def register_forward_pre_hook(self, hook, *, prepend=False, with_kwargs=False): ...
+
+    def _lazy_load_hook(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ): ...
+
+    def _get_name(self): ...
+
+    def _infer_parameters(self, module, input): ...
+
+    @property
+    def _parameters(self): ...
+
+    @property
+    def _buffers(self): ...
+
+    @property
+    def _non_persistent_buffers_set(self): ...
+
+    @property
+    def _load_hook(self): ...
+
+    @property
+    def _initialize_hook(self): ...
+
+
+class LazyModuleMixin:
+    r"""A mixin for modules that lazily initialize parameters, also known as "lazy modules".
+
+    .. warning::
+        Lazy modules are an experimental new feature under active development,
+        and their API is likely to change.
+
+    Modules that lazily initialize parameters, or "lazy modules",
+    derive the shapes of their parameters from the first input(s)
+    to their forward method. Until that first forward they contain
+    :class:`torch.nn.UninitializedParameter` s that should not be accessed
+    or used, and afterward they contain regular :class:`torch.nn.Parameter` s.
+    Lazy modules are convenient since they don't require computing some
+    module arguments, like the :attr:`in_features` argument of a
+    typical :class:`torch.nn.Linear`.
+
+    After construction, networks with lazy modules should first
+    be converted to the desired dtype and placed on the expected device.
+    This is because lazy modules only perform shape inference so the usual dtype
+    and device placement behavior applies.
+    The lazy modules should then perform "dry runs" to initialize all the components in the module.
+    These "dry runs" send inputs of the correct size, dtype, and device through
+    the network and to each one of its lazy modules. After this the network can be used as usual.
+
+    >>> # xdoctest: +SKIP
+    >>> class LazyMLP(torch.nn.Module):
+    ...     def __init__(self) -> None:
+    ...         super().__init__()
+    ...         self.fc1 = torch.nn.LazyLinear(10)
+    ...         self.relu1 = torch.nn.ReLU()
+    ...         self.fc2 = torch.nn.LazyLinear(1)
+    ...         self.relu2 = torch.nn.ReLU()
+    ...
+    ...     def forward(self, input):
+    ...         x = self.relu1(self.fc1(input))
+    ...         y = self.relu2(self.fc2(x))
+    ...         return y
+    >>> # constructs a network with lazy modules
+    >>> lazy_mlp = LazyMLP()
+    >>> # transforms the network's device and dtype
+    >>> # NOTE: these transforms can and should be applied after construction and before any 'dry runs'
+    >>> lazy_mlp = lazy_mlp.cuda()
+    >>> lazy_mlp
+    LazyMLP( (fc1): LazyLinear(in_features=0, out_features=10, bias=True)
+      (relu1): ReLU()
+      (fc2): LazyLinear(in_features=0, out_features=1, bias=True)
+      (relu2): ReLU()
+    )
+    >>> # performs a dry run to initialize the network's lazy modules
+    >>> lazy_mlp(torch.ones(10, 10).cuda())
+    >>> # after initialization, LazyLinear modules become regular Linear modules
+    >>> lazy_mlp
+    LazyMLP(
+      (fc1): Linear(in_features=10, out_features=10, bias=True)
+      (relu1): ReLU()
+      (fc2): Linear(in_features=10, out_features=1, bias=True)
+      (relu2): ReLU()
+    )
+    >>> # attaches an optimizer, since parameters can now be used as usual
+    >>> optim = torch.optim.SGD(lazy_mlp.parameters(), lr=0.01)
+
+    A final caveat when using lazy modules is that the order of initialization of a network's
+    parameters may change, since the lazy modules are always initialized after other modules.
+    For example, if the LazyMLP class defined above had a :class:`torch.nn.LazyLinear` module
+    first and then a regular :class:`torch.nn.Linear` second, the second module would be
+    initialized on construction and the first module would be initialized during the first dry run.
+    This can cause the parameters of a network using lazy modules to be initialized differently
+    than the parameters of a network without lazy modules as the order of parameter initializations,
+    which often depends on a stateful random number generator, is different.
+    Check :doc:`/notes/randomness` for more details.
+
+    Lazy modules can be serialized with a state dict like other modules. For example:
+
+    >>> lazy_mlp = LazyMLP()
+    >>> # The state dict shows the uninitialized parameters
+    >>> lazy_mlp.state_dict()
+    OrderedDict({'fc1.weight': <UninitializedParameter>,
+                 'fc1.bias': <UninitializedParameter>,
+                 'fc2.weight': <UninitializedParameter>,
+                 'fc2.bias': <UninitializedParameter>})
+
+    Lazy modules can load regular :class:`torch.nn.Parameter` s (i.e. you can serialize/deserialize
+    initialized LazyModules and they will remain initialized)
+
+
+    >>> full_mlp = LazyMLP()
+    >>> # Dry run to initialize another module
+    >>> full_mlp.forward(torch.ones(10, 1))
+    >>> # Load an initialized state into a lazy module
+    >>> lazy_mlp.load_state_dict(full_mlp.state_dict())
+    >>> # The state dict now holds valid values
+    >>> lazy_mlp.state_dict()
+    OrderedDict([('fc1.weight',
+                  tensor([[-0.3837],
+                          [ 0.0907],
+                          [ 0.6708],
+                          [-0.5223],
+                          [-0.9028],
+                          [ 0.2851],
+                          [-0.4537],
+                          [ 0.6813],
+                          [ 0.5766],
+                          [-0.8678]])),
+                 ('fc1.bias',
+                  tensor([-1.8832e+25,  4.5636e-41, -1.8832e+25,  4.5636e-41, -6.1598e-30,
+                           4.5637e-41, -1.8788e+22,  4.5636e-41, -2.0042e-31,  4.5637e-41])),
+                 ('fc2.weight',
+                  tensor([[ 0.1320,  0.2938,  0.0679,  0.2793,  0.1088, -0.1795, -0.2301,  0.2807,
+                            0.2479,  0.1091]])),
+                 ('fc2.bias', tensor([0.0019]))])
+
+    Note, however, that the loaded parameters will not be replaced when doing a "dry run" if they are initialized
+    when the state is loaded. This prevents using initialized modules in different contexts.
+    """
+
+    # modules inheriting from this will change their __class__ to the specified
+    # one after they are fully initialized
+    cls_to_become: type[Any] | None = None
+
+    def __init__(self: _LazyProtocol, *args, **kwargs):
+        # Mypy doesn't like this super call in a mixin
+        super().__init__(*args, **kwargs)  # type: ignore[misc]
+        # pyrefly: ignore [read-only]
+        self._load_hook = self._register_load_state_dict_pre_hook(self._lazy_load_hook)
+        # pyrefly: ignore [read-only]
+        self._initialize_hook = self.register_forward_pre_hook(
+            self._infer_parameters, with_kwargs=True
+        )
+
+    def _save_to_state_dict(self: _LazyProtocol, destination, prefix, keep_vars):
+        # This should be ideally implemented as a hook,
+        # but we should override `detach` in the UninitializedParameter to return itself
+        # which is not clean
+        for name, param in self._parameters.items():
+            if param is not None:
+                if not (is_lazy(param) or keep_vars):
+                    param = param.detach()
+                destination[prefix + name] = param
+        for name, buf in self._buffers.items():
+            if buf is not None and name not in self._non_persistent_buffers_set:
+                if not (is_lazy(buf) or keep_vars):
+                    buf = buf.detach()
+                destination[prefix + name] = buf
+
+    def _lazy_load_hook(
+        self: _LazyProtocol,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        """load_state_dict pre-hook function for lazy buffers and parameters.
+
+        The purpose of this hook is to adjust the current state and/or
+        ``state_dict`` being loaded so that a module instance serialized in
+        both un/initialized state can be deserialized onto both un/initialized
+        module instance.
+        See comment in ``torch.nn.Module._register_load_state_dict_pre_hook``
+        for the details of the hook specification.
+        """
+        for name, param in itertools.chain(
+            self._parameters.items(), self._buffers.items()
+        ):
+            key = prefix + name
+            if key in state_dict and param is not None:
+                input_param = state_dict[key]
+                if is_lazy(param):
+                    # The current parameter is not initialized but the one being loaded is:
+                    # materialize the uninitialized one using the loaded parameter's shape
+                    if not is_lazy(input_param):
+                        with torch.no_grad():
+                            param.materialize(input_param.shape)
+
+    def initialize_parameters(self: _LazyProtocol, *args, **kwargs):
+        r"""Initialize parameters according to the input batch properties.
+
+        This adds an interface to isolate parameter initialization from the
+        forward pass when doing parameter shape inference.
+        """
+        raise NotImplementedError(
+            f"initialize_parameters is not implemented for {self.__class__.__name__}"
+        )
+
+    def has_uninitialized_params(self: _LazyProtocol):
+        r"""Check if a module has parameters that are not initialized."""
+        # This is to avoid the JIT to track this parameter and force
+        # custom modules __setstate__ to add it
+        params = self._parameters.values()
+        buffers = self._buffers.values()
+        for param in itertools.chain(params, buffers):
+            if is_lazy(param):
+                return True
+        return False
+
+    # torchrec tests the code consistency with the following code
+    # fmt: off
+    def _infer_parameters(self: _LazyProtocol, module, args, kwargs=None):
+        r"""Infers the size and initializes the parameters according to the provided input batch.
+
+        Given a module that contains parameters that were declared inferable
+        using :class:`torch.nn.parameter.ParameterMode.Infer`, runs a forward pass
+        in the complete module using the provided input to initialize all the parameters
+        as needed.
+        The module is set into evaluation mode before running the forward pass in order
+        to avoid saving statistics or calculating gradients
+        """
+        kwargs = kwargs if kwargs else {}
+        module.initialize_parameters(*args, **kwargs)
+        if module.has_uninitialized_params():
+            raise RuntimeError(f'module {self._get_name()} has not been fully initialized')
+        module._initialize_hook.remove()
+        module._load_hook.remove()
+        delattr(module, '_initialize_hook')
+        delattr(module, '_load_hook')
+        if module.cls_to_become is not None:
+            module.__class__ = module.cls_to_become
+    # fmt: on
+
+    def _replicate_for_data_parallel(self: _LazyProtocol):
+        raise RuntimeError(
+            "Modules with uninitialized parameters can't be used with `DataParallel`. "
+            "Run a dummy forward pass to correctly initialize the modules"
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..c58bdcefd0e0a9212d44891d6ade694e55c5f529
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/linear.py
@@ -0,0 +1,337 @@
+# mypy: allow-untyped-defs
+import math
+from typing import Any
+
+import torch
+from torch import Tensor
+from torch.nn import functional as F, init
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from .lazy import LazyModuleMixin
+from .module import Module
+
+
+__all__ = [
+    "Bilinear",
+    "Identity",
+    "LazyLinear",
+    "Linear",
+]
+
+
+class Identity(Module):
+    r"""A placeholder identity operator that is argument-insensitive.
+
+    Args:
+        args: any argument (unused)
+        kwargs: any keyword argument (unused)
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
+    Examples::
+
+        >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 20])
+
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__()  # args/kwargs are accepted but intentionally ignored
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Returns ``input`` itself, unchanged.
+        """
+        return input
+
+
+class Linear(Module):
+    r"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.
+
+    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    Args:
+        in_features: size of each input sample
+        out_features: size of each output sample
+        bias: If set to ``False``, the layer will not learn an additive bias.
+            Default: ``True``
+
+    Shape:
+        - Input: :math:`(*, H_\text{in})` where :math:`*` means any number of
+          dimensions including none and :math:`H_\text{in} = \text{in\_features}`.
+        - Output: :math:`(*, H_\text{out})` where all but the last dimension
+          are the same shape as the input and :math:`H_\text{out} = \text{out\_features}`.
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            :math:`(\text{out\_features}, \text{in\_features})`. The values are
+            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+            :math:`k = \frac{1}{\text{in\_features}}`
+        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized from
+                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                :math:`k = \frac{1}{\text{in\_features}}`
+
+    Examples::
+
+        >>> m = nn.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+
+    __constants__ = ["in_features", "out_features"]
+    in_features: int
+    out_features: int
+    weight: Tensor
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.weight = Parameter(
+            torch.empty((out_features, in_features), **factory_kwargs)
+        )
+        if bias:
+            self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
+        else:
+            self.register_parameter("bias", None)  # ``self.bias`` is read below
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in ``__init__``.
+        """
+        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
+        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
+        # https://github.com/pytorch/pytorch/issues/57109
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Applies ``F.linear`` to ``input`` using ``self.weight`` and ``self.bias``.
+        """
+        return F.linear(input, self.weight, self.bias)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
+
+
+# This class exists solely to avoid triggering an obscure error when scripting
+# an improperly quantized attention layer. See this issue for details:
+# https://github.com/pytorch/pytorch/issues/58969
+# TODO: fail fast on quantization API usage error, then remove this class
+# and replace uses of it with plain Linear
+class NonDynamicallyQuantizableLinear(Linear):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__(  # pure pass-through: every argument goes to ``Linear``
+            in_features, out_features, bias=bias, device=device, dtype=dtype
+        )
+
+
+class Bilinear(Module):
+    r"""Applies a bilinear transformation to the incoming data: :math:`y = x_1^T A x_2 + b`.
+
+    Args:
+        in1_features: size of each first input sample, must be > 0
+        in2_features: size of each second input sample, must be > 0
+        out_features: size of each output sample, must be > 0
+        bias: If set to ``False``, the layer will not learn an additive bias.
+            Default: ``True``
+
+    Shape:
+        - Input1: :math:`(*, H_\text{in1})` where :math:`H_\text{in1}=\text{in1\_features}` and
+          :math:`*` means any number of additional dimensions including none. All but the last dimension
+          of the inputs should be the same.
+        - Input2: :math:`(*, H_\text{in2})` where :math:`H_\text{in2}=\text{in2\_features}`.
+        - Output: :math:`(*, H_\text{out})` where :math:`H_\text{out}=\text{out\_features}`
+          and all but the last dimension are the same shape as the input.
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`.
+            The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+            :math:`k = \frac{1}{\text{in1\_features}}`
+        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized from
+                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+                :math:`k = \frac{1}{\text{in1\_features}}`
+
+    Examples::
+
+        >>> m = nn.Bilinear(20, 30, 40)
+        >>> input1 = torch.randn(128, 20)
+        >>> input2 = torch.randn(128, 30)
+        >>> output = m(input1, input2)
+        >>> print(output.size())
+        torch.Size([128, 40])
+    """
+
+    __constants__ = ["in1_features", "in2_features", "out_features"]
+    in1_features: int
+    in2_features: int
+    out_features: int
+    weight: Tensor
+
+    def __init__(
+        self,
+        in1_features: int,
+        in2_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.in1_features = in1_features
+        self.in2_features = in2_features
+        self.out_features = out_features
+        self.weight = Parameter(
+            torch.empty((out_features, in1_features, in2_features), **factory_kwargs)
+        )
+
+        if bias:
+            self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
+        else:
+            self.register_parameter("bias", None)  # ``self.bias`` is read below
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in ``__init__``.
+        """
+        if self.in1_features <= 0:  # guards the 1 / sqrt(...) division below
+            raise ValueError(
+                f"in1_features must be > 0, but got (in1_features={self.in1_features})"
+            )
+        bound = 1 / math.sqrt(self.weight.size(1))  # size(1) is in1_features
+        init.uniform_(self.weight, -bound, bound)
+        if self.bias is not None:
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
+        """
+        Applies ``F.bilinear`` to the inputs using ``self.weight`` and ``self.bias``.
+        """
+        return F.bilinear(input1, input2, self.weight, self.bias)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        return (
+            f"in1_features={self.in1_features}, in2_features={self.in2_features}, "
+            f"out_features={self.out_features}, bias={self.bias is not None}"
+        )
+
+
+class LazyLinear(LazyModuleMixin, Linear):
+    r"""A :class:`torch.nn.Linear` module where `in_features` is inferred.
+
+    In this module, the `weight` and `bias` are of :class:`torch.nn.UninitializedParameter`
+    class. They will be initialized after the first call to ``forward`` is done and the
+    module will become a regular :class:`torch.nn.Linear` module. The ``in_features`` argument
+    of the :class:`Linear` is inferred from the ``input.shape[-1]``.
+
+    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_features: size of each output sample
+        bias: If set to ``False``, the layer will not learn an additive bias.
+            Default: ``True``
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            :math:`(\text{out\_features}, \text{in\_features})`. The values are
+            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+            :math:`k = \frac{1}{\text{in\_features}}`
+        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized from
+                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+                :math:`k = \frac{1}{\text{in\_features}}`
+
+
+    """
+
+    cls_to_become = Linear  # type: ignore[assignment]
+    # pyrefly: ignore [bad-override]
+    weight: UninitializedParameter
+    bias: UninitializedParameter  # type: ignore[assignment]
+
+    def __init__(
+        self, out_features: int, bias: bool = True, device=None, dtype=None
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        # bias is hardcoded to False to avoid creating tensor
+        # that will soon be overwritten.
+        # pyrefly: ignore [bad-argument-type]
+        super().__init__(0, 0, False)
+        # pyrefly: ignore [bad-argument-type]
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_features = out_features
+        if bias:
+            # pyrefly: ignore [bad-argument-type]
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters, but only once they are materialized and ``in_features != 0``.
+        """
+        # pyrefly: ignore [bad-argument-type]
+        if not self.has_uninitialized_params() and self.in_features != 0:
+            super().reset_parameters()
+
+    def initialize_parameters(self, input) -> None:  # type: ignore[override]
+        """
+        Infers ``in_features`` from ``input``; raises ``AssertionError`` on mismatch.
+        """
+        # pyrefly: ignore [bad-argument-type]
+        if self.has_uninitialized_params():
+            with torch.no_grad():
+                self.in_features = input.shape[-1]
+                self.weight.materialize((self.out_features, self.in_features))
+                if self.bias is not None:
+                    self.bias.materialize((self.out_features,))
+                self.reset_parameters()
+        if self.in_features == 0:  # params materialized elsewhere (load?)
+            if input.shape[-1] != self.weight.shape[-1]:  # explicit: survives -O
+                raise AssertionError(
+                    f"The in_features inferred from input: {input.shape[-1]} is not "
+                    f"equal to in_features from self.weight: {self.weight.shape[-1]}"
+                )
+            self.in_features = input.shape[-1]
+
+
+# TODO: PartialLinear - maybe in sparse?
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ada62febded14af25c6a32ec8c1e5998349d74
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/loss.py
@@ -0,0 +1,2083 @@
+# mypy: allow-untyped-defs
+from collections.abc import Callable
+from typing_extensions import deprecated
+
+from torch import Tensor
+from torch.nn import _reduction as _Reduction, functional as F
+
+from .distance import PairwiseDistance
+from .module import Module
+
+
+__all__ = [
+    "L1Loss",
+    "NLLLoss",
+    "NLLLoss2d",
+    "PoissonNLLLoss",
+    "GaussianNLLLoss",
+    "KLDivLoss",
+    "MSELoss",
+    "BCELoss",
+    "BCEWithLogitsLoss",
+    "HingeEmbeddingLoss",
+    "MultiLabelMarginLoss",
+    "SmoothL1Loss",
+    "HuberLoss",
+    "SoftMarginLoss",
+    "CrossEntropyLoss",
+    "MultiLabelSoftMarginLoss",
+    "CosineEmbeddingLoss",
+    "MarginRankingLoss",
+    "MultiMarginLoss",
+    "TripletMarginLoss",
+    "TripletMarginWithDistanceLoss",
+    "CTCLoss",
+]
+
+
+class _Loss(Module):
+    reduction: str  # reduction mode name; the constructor default is "mean"
+
+    def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None:
+        super().__init__()
+        if size_average is not None or reduce is not None:  # legacy args override
+            self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
+        else:
+            self.reduction = reduction
+
+
+class _WeightedLoss(_Loss):
+    def __init__(
+        self,
+        weight: Tensor | None = None,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.register_buffer("weight", weight)  # a buffer, not a Parameter
+        self.weight: Tensor | None  # annotation only; nothing is assigned here
+
+
+class L1Loss(_Loss):
+    r"""Creates a criterion that measures the mean absolute error (MAE) between each element in
+    the input :math:`x` and target :math:`y`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_n = \left| x_n - y_n \right|,
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \ell(x, y) =
+        \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`N` elements each.
+
+    The mean operation still operates over all the elements, and divides by :math:`N`.
+
+    The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``.
+
+    Supports real-valued and complex-valued inputs.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then
+          :math:`(*)`, same shape as the input.
+
+    Examples:
+
+        >>> loss = nn.L1Loss()
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.randn(3, 5)
+        >>> output = loss(input, target)
+        >>> output.backward()
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """
+        Applies ``F.l1_loss`` to ``input`` and ``target`` with ``self.reduction``.
+        """
+        return F.l1_loss(input, target, reduction=self.reduction)
+
+
+class NLLLoss(_WeightedLoss):
+    r"""The negative log likelihood loss. It is useful to train a classification
+    problem with `C` classes.
+
+    If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
+    weight to each of the classes. This is particularly useful when you have an
+    unbalanced training set.
+
+    The `input` given through a forward call is expected to contain
+    log-probabilities of each class. `input` has to be a Tensor of size either
+    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
+    with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for
+    higher dimension inputs, such as computing NLL loss per-pixel for 2D images.
+
+    Obtaining log-probabilities in a neural network is easily achieved by
+    adding a  `LogSoftmax`  layer in the last layer of your network.
+    You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
+    layer.
+
+    The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
+    where `C = number of classes`; if `ignore_index` is specified, this loss also accepts
+    this class index (this index may not necessarily be in the class range).
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \\
+        l_n = - w_{y_n} x_{n,y_n}, \\
+        w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
+
+    where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and
+    :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
+            \text{if reduction} = \text{`mean';}\\
+            \sum_{n=1}^N l_n,  &
+            \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    Args:
+        weight (Tensor, optional): a manual rescaling weight given to each
+            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
+            treated as if having all ones.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``None``
+        ignore_index (int, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. When
+            :attr:`size_average` is ``True``, the loss is averaged over
+            non-ignored targets.
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``None``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in
+            the meantime, specifying either of those two args will override
+            :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, `N = batch size`, or
+          :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
+          in the case of `K`-dimensional loss.
+        - Target: :math:`(N)` or :math:`()`, where each value is
+          :math:`0 \leq \text{targets}[i] \leq C-1`, or
+          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
+          K-dimensional loss.
+        - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or
+          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss.
+          Otherwise, scalar.
+
+    Examples:
+
+        >>> log_softmax = nn.LogSoftmax(dim=1)
+        >>> loss_fn = nn.NLLLoss()
+        >>> # input to NLLLoss is of size N x C = 3 x 5
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> # each element in target must have 0 <= value < C
+        >>> target = torch.tensor([1, 0, 4])
+        >>> loss = loss_fn(log_softmax(input), target)
+        >>> loss.backward()
+        >>>
+        >>>
+        >>> # 2D loss example (used, for example, with image inputs)
+        >>> N, C = 5, 4
+        >>> loss_fn = nn.NLLLoss()
+        >>> data = torch.randn(N, 16, 10, 10)
+        >>> conv = nn.Conv2d(16, C, (3, 3))
+        >>> log_softmax = nn.LogSoftmax(dim=1)
+        >>> # output of conv forward is of shape [N, C, 8, 8]
+        >>> output = log_softmax(conv(data))
+        >>> # each element in target must have 0 <= value < C
+        >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
+        >>> # input to NLLLoss is of size N x C x height (8) x width (8)
+        >>> loss = loss_fn(output, target)
+        >>> loss.backward()
+    """
+
+    __constants__ = ["ignore_index", "reduction"]
+    ignore_index: int
+
+    def __init__(
+        self,
+        weight: Tensor | None = None,
+        size_average=None,
+        ignore_index: int = -100,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(weight, size_average, reduce, reduction)
+        self.ignore_index = ignore_index
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """
+        Applies ``F.nll_loss`` using this module's weight, ignore_index and reduction.
+        """
+        return F.nll_loss(
+            input,
+            target,
+            weight=self.weight,
+            ignore_index=self.ignore_index,
+            reduction=self.reduction,
+        )
+
+
+@deprecated(
+    "`NLLLoss2d` has been deprecated. "
+    "Please use `NLLLoss` instead as a drop-in replacement and see "
+    "https://pytorch.org/docs/main/nn.html#torch.nn.NLLLoss for more details.",
+    category=FutureWarning,
+)
+class NLLLoss2d(NLLLoss):
+    def __init__(
+        self,
+        weight: Tensor | None = None,
+        size_average=None,
+        ignore_index: int = -100,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        # Same signature as NLLLoss; every argument is forwarded unchanged.
+        super().__init__(weight, size_average, ignore_index, reduce, reduction)
+
+
+class PoissonNLLLoss(_Loss):
+    r"""Negative log likelihood loss with Poisson distribution of target.
+
+    The loss can be described as:
+
+    .. math::
+        \text{target} \sim \mathrm{Poisson}(\text{input})
+
+        \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
+                                    + \log(\text{target!})
+
+    The last term can be omitted or approximated with Stirling formula. The
+    approximation is used for target values more than 1. For targets less or
+    equal to 1 zeros are added to the loss.
+
+    Args:
+        log_input (bool, optional): if ``True`` the loss is computed as
+            :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is
+            :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
+        full (bool, optional): whether to compute full loss, i.e. to add the
+            Stirling approximation term
+
+            .. math::
+                \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when
+            :attr:`log_input = False`. Default: 1e-8
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Examples:
+
+        >>> loss = nn.PoissonNLLLoss()
+        >>> log_input = torch.randn(5, 2, requires_grad=True)
+        >>> target = torch.randn(5, 2)
+        >>> output = loss(log_input, target)
+        >>> output.backward()
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`,
+          the same shape as the input.
+    """
+
+    __constants__ = ["log_input", "full", "eps", "reduction"]
+    log_input: bool
+    full: bool
+    eps: float
+
+    def __init__(
+        self,
+        log_input: bool = True,
+        full: bool = False,
+        size_average=None,
+        eps: float = 1e-8,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.log_input = log_input
+        self.full = full
+        self.eps = eps
+
+    def forward(self, log_input: Tensor, target: Tensor) -> Tensor:
+        """
+        Applies the loss; ``log_input`` is the input tensor, not the bool flag.
+        """
+        return F.poisson_nll_loss(
+            log_input,
+            target,
+            log_input=self.log_input,
+            full=self.full,
+            eps=self.eps,
+            reduction=self.reduction,
+        )
+
+
+class GaussianNLLLoss(_Loss):
+    r"""Gaussian negative log likelihood loss.
+
+    The targets are treated as samples from Gaussian distributions with
+    expectations and variances predicted by the neural network. For a
+    ``target`` tensor modelled as having Gaussian distribution with a tensor
+    of expectations ``input`` and a tensor of positive variances ``var`` the loss is:
+
+    .. math::
+        \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
+        \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{target}\right)^2}
+        {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.}
+
+    where :attr:`eps` is used for stability. By default, the constant term of
+    the loss function is omitted unless :attr:`full` is ``True``. If ``var`` is not the same
+    size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension
+    of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting.
+
+    Args:
+        full (bool, optional): include the constant term in the loss
+            calculation. Default: ``False``.
+        eps (float, optional): value used to clamp ``var`` (see note below), for
+            stability. Default: 1e-6.
+        reduction (str, optional): specifies the reduction to apply to the
+            output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction
+            will be applied, ``'mean'``: the output is the average of all batch
+            member losses, ``'sum'``: the output is the sum of all batch member
+            losses. Default: ``'mean'``.
+
+    Shape:
+        - Input: :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional
+          dimensions
+        - Target: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input
+          but with one dimension equal to 1 (to allow for broadcasting)
+        - Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but
+          with one dimension equal to 1, or same shape as the input but with one fewer
+          dimension (to allow for broadcasting), or a scalar value
+        - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
+          ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
+          shape as the input
+
+    Examples:
+        >>> loss = nn.GaussianNLLLoss()
+        >>> input = torch.randn(5, 2, requires_grad=True)
+        >>> target = torch.randn(5, 2)
+        >>> var = torch.ones(5, 2, requires_grad=True)  # heteroscedastic
+        >>> output = loss(input, target, var)
+        >>> output.backward()
+
+        >>> loss = nn.GaussianNLLLoss()
+        >>> input = torch.randn(5, 2, requires_grad=True)
+        >>> target = torch.randn(5, 2)
+        >>> var = torch.ones(5, 1, requires_grad=True)  # homoscedastic
+        >>> output = loss(input, target, var)
+        >>> output.backward()
+
+    Note:
+        The clamping of ``var`` is ignored with respect to autograd, and so the
+        gradients are unaffected by it.
+
+    Reference:
+        Nix, D. A. and Weigend, A. S., "Estimating the mean and variance of the
+        target probability distribution", Proceedings of 1994 IEEE International
+        Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60
+        vol.1, doi: 10.1109/ICNN.1994.374138.
+    """
+
+    __constants__ = ["full", "eps", "reduction"]
+    full: bool
+    eps: float
+
+    def __init__(
+        self, *, full: bool = False, eps: float = 1e-6, reduction: str = "mean"
+    ) -> None:
+        super().__init__(None, None, reduction)  # legacy size_average/reduce unset
+        self.full = full
+        self.eps = eps
+
+    def forward(self, input: Tensor, target: Tensor, var: Tensor | float) -> Tensor:
+        """
+        Applies ``F.gaussian_nll_loss`` to ``input``, ``target`` and variance ``var``.
+        """
+        return F.gaussian_nll_loss(
+            input, target, var, full=self.full, eps=self.eps, reduction=self.reduction
+        )
+
+
+class KLDivLoss(_Loss):
+    r"""The Kullback-Leibler divergence loss.
+
+    For tensors of the same shape :math:`y_{\text{pred}},\ y_{\text{true}}`,
+    where :math:`y_{\text{pred}}` is the :attr:`input` and :math:`y_{\text{true}}` is the
+    :attr:`target`, we define the **pointwise KL-divergence** as
+
+    .. math::
+
+        L(y_{\text{pred}},\ y_{\text{true}})
+            = y_{\text{true}} \cdot \log \frac{y_{\text{true}}}{y_{\text{pred}}}
+            = y_{\text{true}} \cdot (\log y_{\text{true}} - \log y_{\text{pred}})
+
+    To avoid underflow issues when computing this quantity, this loss expects the argument
+    :attr:`input` in the log-space. The argument :attr:`target` may also be provided in the
+    log-space if :attr:`log_target`\ `= True`.
+
+    To summarise, this function is roughly equivalent to computing
+
+    .. code-block:: python
+
+        if not log_target:  # default
+            loss_pointwise = target * (target.log() - input)
+        else:
+            loss_pointwise = target.exp() * (target - input)
+
+    and then reducing this result depending on the argument :attr:`reduction` as
+
+    .. code-block:: python
+
+        if reduction == "mean":  # default
+            loss = loss_pointwise.mean()
+        elif reduction == "batchmean":  # mathematically correct
+            loss = loss_pointwise.sum() / input.size(0)
+        elif reduction == "sum":
+            loss = loss_pointwise.sum()
+        else:  # reduction == "none"
+            loss = loss_pointwise
+
+    .. note::
+        Like all the other losses in PyTorch, this function expects the first argument,
+        :attr:`input`, to be the output of the model (e.g. the neural network)
+        and the second, :attr:`target`, to be the observations in the dataset.
+        This differs from the standard mathematical notation :math:`KL(P\ ||\ Q)` where
+        :math:`P` denotes the distribution of the observations and :math:`Q` denotes the model.
+
+    .. warning::
+        :attr:`reduction`\ `= "mean"` doesn't return the true KL divergence value, please use
+        :attr:`reduction`\ `= "batchmean"` which aligns with the mathematical definition.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to `False`, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is `False`. Default: `True`
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is `False`, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: `True`
+        reduction (str, optional): Specifies the reduction to apply to the output: `"none"` | `"batchmean"` | `"sum"` | `"mean"`. Default: `"mean"`
+        log_target (bool, optional): Specifies whether `target` is given in the log space. Default: `False`
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`,
+          same shape as the input.
+
+    Examples:
+        >>> kl_loss = nn.KLDivLoss(reduction="batchmean")
+        >>> # input should be a distribution in the log space
+        >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1)
+        >>> # Sample a batch of distributions. Usually this would come from the dataset
+        >>> target = F.softmax(torch.rand(3, 5), dim=1)
+        >>> output = kl_loss(input, target)
+        >>>
+        >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True)
+        >>> log_target = F.log_softmax(torch.rand(3, 5), dim=1)
+        >>> output = kl_loss(input, log_target)
+    """
+
+    __constants__ = ["reduction"]
+
+    def __init__(
+        self,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+        log_target: bool = False,
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.log_target = log_target
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
+        return F.kl_div(
+            input, target, reduction=self.reduction, log_target=self.log_target
+        )
+
+
+class MSELoss(_Loss):
+    r"""Creates a criterion that measures the mean squared error (squared L2 norm) between
+    each element in the input :math:`x` and target :math:`y`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_n = \left( x_n - y_n \right)^2,
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \ell(x, y) =
+        \begin{cases}
+            \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`N` elements each.
+
+    The mean operation still operates over all the elements, and divides by :math:`N`.
+
+    The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+
+    Examples:
+
+        >>> loss = nn.MSELoss()
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.randn(3, 5)
+        >>> output = loss(input, target)
+        >>> output.backward()
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
+        return F.mse_loss(input, target, reduction=self.reduction)
+
+
+class BCELoss(_WeightedLoss):
+    r"""Creates a criterion that measures the Binary Cross Entropy between the target and
+    the input probabilities:
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    This is used for measuring the error of a reconstruction in, for example,
+    an auto-encoder. Note that the targets :math:`y` should be numbers
+    between 0 and 1.
+
+    Notice that if :math:`x_n` is either 0 or 1, one of the log terms would be
+    mathematically undefined in the above loss equation. PyTorch chooses to set
+    :math:`\log (0) = -\infty`, since :math:`\lim_{x\to 0} \log (x) = -\infty`.
+    However, an infinite term in the loss equation is not desirable for several reasons.
+
+    For one, if either :math:`y_n = 0` or :math:`(1 - y_n) = 0`, then we would be
+    multiplying 0 with infinity. Secondly, if we have an infinite loss value, then
+    we would also have an infinite term in our gradient, since
+    :math:`\lim_{x\to 0} \frac{d}{dx} \log (x) = \infty`.
+    This would make BCELoss's backward method nonlinear with respect to :math:`x_n`,
+    and using it for things like linear regression would not be straightforward.
+
+    Our solution is that BCELoss clamps its log function outputs to be greater than
+    or equal to -100. This way, we can always have a finite loss value and a linear
+    backward method.
+
+
+    Args:
+        weight (Tensor, optional): a manual rescaling weight given to the loss
+            of each batch element. If given, has to be a Tensor of size `nbatch`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
+          shape as input.
+
+    Examples:
+
+        >>> m = nn.Sigmoid()
+        >>> loss = nn.BCELoss()
+        >>> input = torch.randn(3, 2, requires_grad=True)
+        >>> target = torch.rand(3, 2, requires_grad=False)
+        >>> output = loss(m(input), target)
+        >>> output.backward()
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
+        return F.binary_cross_entropy(
+            input, target, weight=self.weight, reduction=self.reduction
+        )
+
+
+class BCEWithLogitsLoss(_Loss):
+    r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
+    class. This version is more numerically stable than using a plain `Sigmoid`
+    followed by a `BCELoss` as, by combining the operations into one layer,
+    we take advantage of the log-sum-exp trick for numerical stability.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
+        + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    This is used for measuring the error of a reconstruction in, for example,
+    an auto-encoder. Note that the targets :math:`y` should be numbers
+    between 0 and 1.
+
+    It's possible to trade off recall and precision by adding weights to positive examples.
+    In the case of multi-label classification the loss can be described as:
+
+    .. math::
+        \ell_c(x, y) = L_c = \{l_{1,c},\dots,l_{N,c}\}^\top, \quad
+        l_{n,c} = - w_{n,c} \left[ p_c y_{n,c} \cdot \log \sigma(x_{n,c})
+        + (1 - y_{n,c}) \cdot \log (1 - \sigma(x_{n,c})) \right],
+
+    where :math:`c` is the class number (:math:`c > 1` for multi-label binary classification,
+    :math:`c = 1` for single-label binary classification),
+    :math:`n` is the number of the sample in the batch and
+    :math:`p_c` is the weight of the positive answer for the class :math:`c`.
+
+    :math:`p_c > 1` increases the recall, :math:`p_c < 1` increases the precision.
+
+    For example, if a dataset contains 100 positive and 300 negative examples of a single class,
+    then ``pos_weight`` for the class should be equal to :math:`\frac{300}{100}=3`.
+    The loss would act as if the dataset contains :math:`3\times 100=300` positive examples.
+
+    Examples:
+
+        >>> target = torch.ones([10, 64], dtype=torch.float32)  # 64 classes, batch size = 10
+        >>> output = torch.full([10, 64], 1.5)  # A prediction (logit)
+        >>> pos_weight = torch.ones([64])  # All weights are equal to 1
+        >>> criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
+        >>> criterion(output, target)  # -log(sigmoid(1.5))
+        tensor(0.20...)
+
+    In the above example, the ``pos_weight`` tensor's elements correspond to the 64 distinct classes
+    in a multi-label binary classification scenario. Each element in ``pos_weight`` is designed to adjust the
+    loss function based on the imbalance between negative and positive samples for the respective class.
+    This approach is useful in datasets with varying levels of class imbalance, ensuring that the loss
+    calculation accurately accounts for the distribution in each class.
+
+    Args:
+        weight (Tensor, optional): a manual rescaling weight given to the loss
+            of each batch element. If given, has to be a Tensor of size `nbatch`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+        pos_weight (Tensor, optional): a weight of positive examples to be broadcast with target.
+            Must be a tensor with equal size along the class dimension to the number of classes.
+            Pay close attention to PyTorch's broadcasting semantics in order to achieve the desired
+            operations. For a target of size [B, C, H, W] (where B is batch size), a pos_weight of
+            size [B, C, H, W] applies different pos_weights to each element of the batch, while one
+            of size [C, H, W] applies the same pos_weights across the batch. To apply the same positive
+            weight along all spatial dimensions for a 2D multi-class target [C, H, W], use: [C, 1, 1].
+            Default: ``None``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
+          shape as input.
+
+    Examples:
+
+        >>> loss = nn.BCEWithLogitsLoss()
+        >>> input = torch.randn(3, requires_grad=True)
+        >>> target = torch.empty(3).random_(2)
+        >>> output = loss(input, target)
+        >>> output.backward()
+    """
+
+    def __init__(
+        self,
+        weight: Tensor | None = None,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+        pos_weight: Tensor | None = None,
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.register_buffer("weight", weight)
+        self.register_buffer("pos_weight", pos_weight)
+        self.weight: Tensor | None
+        self.pos_weight: Tensor | None
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Runs the forward pass."""
+        return F.binary_cross_entropy_with_logits(
+            input,
+            target,
+            self.weight,
+            pos_weight=self.pos_weight,
+            reduction=self.reduction,
+        )
+
+
+class HingeEmbeddingLoss(_Loss):
+    r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`
+    (containing 1 or -1).
+    This is usually used for measuring whether two inputs are similar or
+    dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
+    used for learning nonlinear embeddings or semi-supervised learning.
+
+    The loss function for :math:`n`-th sample in the mini-batch is
+
+    .. math::
+        l_n = \begin{cases}
+            x_n, & \text{if}\; y_n = 1,\\
+            \max \{0, margin - x_n\}, & \text{if}\; y_n = -1,
+        \end{cases}
+
+    and the total loss function is
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    where :math:`L = \{l_1,\dots,l_N\}^\top`.
+
+    Args:
+        margin (float, optional): Has a default value of `1`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(*)` where :math:`*` means any number of dimensions. The sum operation
+          operates over all the elements.
+        - Target: :math:`(*)`, same shape as the input
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
+    """
+
+    __constants__ = ["margin", "reduction"]
+    margin: float
+
+    def __init__(
+        self,
+        margin: float = 1.0,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.margin = margin
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Runs the forward pass."""
+        return F.hinge_embedding_loss(
+            input, target, margin=self.margin, reduction=self.reduction
+        )
+
+
+class MultiLabelMarginLoss(_Loss):
+    r"""Creates a criterion that optimizes a multi-class multi-classification
+    hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
+    and output :math:`y` (which is a 2D `Tensor` of target class indices).
+    For each sample in the mini-batch:
+
+    .. math::
+        \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
+
+    where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
+    :math:`j \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
+    :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
+    and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
+
+    :math:`y` and :math:`x` must have the same size.
+
+    The criterion only considers a contiguous block of non-negative targets that
+    starts at the front.
+
+    This allows for different samples to have variable amounts of target classes.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
+          is the number of classes.
+        - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
+
+    Examples:
+
+        >>> loss = nn.MultiLabelMarginLoss()
+        >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
+        >>> # for target y, only consider labels 3 and 0, not after label -1
+        >>> y = torch.LongTensor([[3, 0, -1, 1]])
+        >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
+        >>> loss(x, y)
+        tensor(0.85...)
+
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Runs the forward pass."""
+        return F.multilabel_margin_loss(input, target, reduction=self.reduction)
+
+
+class SmoothL1Loss(_Loss):
+    r"""Creates a criterion that uses a squared term if the absolute
+    element-wise error falls below beta and an L1 term otherwise.
+    It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases
+    prevents exploding gradients (e.g. see the paper `Fast R-CNN`_ by Ross Girshick).
+
+    For a batch of size :math:`N`, the unreduced loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1, ..., l_N\}^T
+
+    with
+
+    .. math::
+        l_n = \begin{cases}
+        0.5 (x_n - y_n)^2 / beta, & \text{if } |x_n - y_n| < beta \\
+        |x_n - y_n| - 0.5 * beta, & \text{otherwise }
+        \end{cases}
+
+    If `reduction` is not `none`, then:
+
+    .. math::
+        \ell(x, y) =
+        \begin{cases}
+            \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    .. note::
+        Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta`
+        portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`.
+        The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`.
+
+    .. note::
+        Smooth L1 loss is closely related to :class:`HuberLoss`, being
+        equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is
+        also known as delta for Huber). This leads to the following differences:
+
+        * As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss`
+          converges to a constant 0 loss. When beta is 0, Smooth L1 loss is equivalent to L1 loss.
+        * As beta -> :math:`+\infty`, Smooth L1 loss converges to a constant 0 loss, while
+          :class:`HuberLoss` converges to :class:`MSELoss`.
+        * For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1.
+          For :class:`HuberLoss`, the slope of the L1 segment is beta.
+
+    .. _`Fast R-CNN`: https://arxiv.org/abs/1504.08083
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+        beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss.
+            The value must be non-negative. Default: 1.0
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
+    """
+
+    __constants__ = ["reduction"]
+
+    def __init__(
+        self, size_average=None, reduce=None, reduction: str = "mean", beta: float = 1.0
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.beta = beta
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Runs the forward pass."""
+        return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
+
+
+class HuberLoss(_Loss):
+    r"""Creates a criterion that uses a squared term if the absolute
+    element-wise error falls below delta and a delta-scaled L1 term otherwise.
+    This loss combines advantages of both :class:`L1Loss` and :class:`MSELoss`; the
+    delta-scaled L1 region makes the loss less sensitive to outliers than :class:`MSELoss`,
+    while the L2 region provides smoothness over :class:`L1Loss` near 0. See
+    `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`_ for more information.
+
+    For a batch of size :math:`N`, the unreduced loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1, ..., l_N\}^T
+
+    with
+
+    .. math::
+        l_n = \begin{cases}
+        0.5 (x_n - y_n)^2, & \text{if } |x_n - y_n| < delta \\
+        delta * (|x_n - y_n| - 0.5 * delta), & \text{otherwise }
+        \end{cases}
+
+    If `reduction` is not `none`, then:
+
+    .. math::
+        \ell(x, y) =
+        \begin{cases}
+            \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    .. note::
+        When delta is set to 1, this loss is equivalent to :class:`SmoothL1Loss`.
+        In general, this loss differs from :class:`SmoothL1Loss` by a factor of delta (AKA beta
+        in Smooth L1).
+        See :class:`SmoothL1Loss` for additional discussion on the differences in behavior
+        between the two losses.
+
+    Args:
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+        delta (float, optional): Specifies the threshold at which to change between delta-scaled L1 and L2 loss.
+            The value must be positive.  Default: 1.0
+
+    Shape:
+        - Input: :math:`(*)` where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
+    """
+
+    __constants__ = ["reduction", "delta"]
+
+    def __init__(self, reduction: str = "mean", delta: float = 1.0) -> None:
+        super().__init__(reduction=reduction)  # the base class is given only the reduction mode
+        self.delta = delta  # stored as-is (no positivity check here); passed to F.huber_loss in forward()
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Return ``F.huber_loss`` of ``input`` vs. ``target`` using ``self.reduction`` and ``self.delta``."""
+        return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta)
+
+
+class SoftMarginLoss(_Loss):
+    r"""Creates a criterion that optimizes a two-class classification
+    logistic loss between input tensor :math:`x` and target tensor :math:`y`
+    (containing 1 or -1).
+
+    .. math::
+        \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Target: :math:`(*)`, same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
+          shape as input.
+
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.soft_margin_loss(input, target)`` with this module's ``reduction``."""
+        return F.soft_margin_loss(input, target, reduction=self.reduction)
+
+
+class CrossEntropyLoss(_WeightedLoss):
+    r"""This criterion computes the cross entropy loss between input logits
+    and target.
+
+    It is useful when training a classification problem with `C` classes.
+    If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
+    assigning weight to each of the classes.
+    This is particularly useful when you have an unbalanced training set.
+
+    The `input` is expected to contain the unnormalized logits for each class (which do `not` need
+    to be positive or sum to 1, in general).
+    `input` has to be a Tensor of size :math:`(C)` for unbatched input,
+    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the
+    `K`-dimensional case. The latter is useful for higher-dimensional inputs, such
+    as computing cross entropy loss per-pixel for 2D images.
+
+    The `target` that this criterion expects should contain either:
+
+    - Class indices in the range :math:`[0, C)` where :math:`C` is the number of classes; if
+      `ignore_index` is specified, this loss also accepts this class index (this index
+      may not necessarily be in the class range). The unreduced (i.e. with :attr:`reduction`
+      set to ``'none'``) loss for this case can be described as:
+
+      .. math::
+          \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+          l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
+          \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}
+
+      where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
+      :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
+      :math:`d_1, ..., d_K` for the `K`-dimensional case. If
+      :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
+
+      .. math::
+          \ell(x, y) = \begin{cases}
+              \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}} l_n, &
+               \text{if reduction} = \text{`mean';}\\
+                \sum_{n=1}^N l_n,  &
+                \text{if reduction} = \text{`sum'.}
+            \end{cases}
+
+      Note that this case is equivalent to applying :class:`~torch.nn.LogSoftmax`
+      on an input, followed by :class:`~torch.nn.NLLLoss`.
+
+    - Probabilities for each class; useful when labels beyond a single class per minibatch item
+      are required, such as for blended labels, label smoothing, etc. The unreduced (i.e. with
+      :attr:`reduction` set to ``'none'``) loss for this case can be described as:
+
+      .. math::
+          \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+          l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}
+
+      where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
+      :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
+      :math:`d_1, ..., d_K` for the `K`-dimensional case. If
+      :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
+
+      .. math::
+          \ell(x, y) = \begin{cases}
+              \frac{\sum_{n=1}^N l_n}{N}, &
+               \text{if reduction} = \text{`mean';}\\
+                \sum_{n=1}^N l_n,  &
+                \text{if reduction} = \text{`sum'.}
+            \end{cases}
+
+    .. note::
+        The performance of this criterion is generally better when `target` contains class
+        indices, as this allows for optimized computation. Consider providing `target` as
+        class probabilities only when a single class label per minibatch item is too restrictive.
+
+    Args:
+        weight (Tensor, optional): a manual rescaling weight given to each class.
+            If given, has to be a Tensor of size `C`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        ignore_index (int, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. When :attr:`size_average` is
+            ``True``, the loss is averaged over non-ignored targets. Note that
+            :attr:`ignore_index` is only applicable when the target contains class indices. Default: ``-100``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
+            be applied, ``'mean'``: the weighted mean of the output is taken,
+            ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in
+            the meantime, specifying either of those two args will override
+            :attr:`reduction`. Default: ``'mean'``
+        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
+            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
+
+    Shape:
+        - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
+          in the case of `K`-dimensional loss.
+        - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
+          :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. The
+          target data type is required to be long when using class indices. If containing class probabilities, the
+          target must be the same shape as the input, and each value should be between :math:`[0, 1]`. This means the target
+          data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce
+          probability constraints on the class probabilities and that it is the user's responsibility to ensure
+          ``target`` contains valid probability distributions (see below examples section for more details).
+        - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
+          in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
+
+
+        where:
+
+        .. math::
+            \begin{aligned}
+                C ={} & \text{number of classes} \\
+                N ={} & \text{batch size} \\
+            \end{aligned}
+
+    Examples:
+
+        >>> # Example of target with class indices
+        >>> loss = nn.CrossEntropyLoss()
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> output = loss(input, target)
+        >>> output.backward()
+        >>>
+        >>> # Example of target with class probabilities
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.randn(3, 5).softmax(dim=1)
+        >>> output = loss(input, target)
+        >>> output.backward()
+
+    .. note::
+        When ``target`` contains class probabilities, it should consist of soft labels—that is,
+        each ``target`` entry should represent a probability distribution over the possible classes for a given data sample,
+        with individual probabilities between ``[0,1]`` and the total distribution summing to 1.
+        This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above.
+
+        PyTorch does not validate whether the values provided in ``target`` lie in the range ``[0,1]``
+        or whether the distribution of each data sample sums to ``1``.
+        No warning will be raised and it is the user's responsibility
+        to ensure that ``target`` contains valid probability distributions.
+        Providing arbitrary values may yield misleading loss values and unstable gradients during training.
+
+    Examples:
+        >>> # xdoctest: +SKIP
+        >>> # Example of target with incorrectly specified class probabilities
+        >>> loss = nn.CrossEntropyLoss()
+        >>> torch.manual_seed(283)
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.randn(3, 5)
+        >>> # Provided target class probabilities are not in range [0,1]
+        >>> target
+        tensor([[ 0.7105,  0.4446,  2.0297,  0.2671, -0.6075],
+                [-1.0496, -0.2753, -0.3586,  0.9270,  1.0027],
+                [ 0.7551,  0.1003,  1.3468, -0.3581, -0.9569]])
+        >>> # Provided target class probabilities do not sum to 1
+        >>> target.sum(axis=1)
+        tensor([2.8444, 0.2462, 0.8873])
+        >>> # No error message and possible misleading loss value
+        >>> loss(input, target).item()
+        4.6379876136779785
+        >>>
+        >>> # Example of target with correctly specified class probabilities
+        >>> # Use .softmax() to ensure true probability distribution
+        >>> target_new = target.softmax(dim=1)
+        >>> # New target class probabilities all in range [0,1]
+        >>> target_new
+        tensor([[0.1559, 0.1195, 0.5830, 0.1000, 0.0417],
+                [0.0496, 0.1075, 0.0990, 0.3579, 0.3860],
+                [0.2607, 0.1355, 0.4711, 0.0856, 0.0471]])
+        >>> # New target class probabilities sum to 1
+        >>> target_new.sum(axis=1)
+        tensor([1.0000, 1.0000, 1.0000])
+        >>> loss(input, target_new).item()
+        2.55349063873291
+    """
+
+    __constants__ = ["ignore_index", "reduction", "label_smoothing"]
+    ignore_index: int
+    label_smoothing: float
+
+    def __init__(
+        self,
+        weight: Tensor | None = None,
+        size_average=None,
+        ignore_index: int = -100,
+        reduce=None,
+        reduction: str = "mean",
+        label_smoothing: float = 0.0,
+    ) -> None:
+        super().__init__(weight, size_average, reduce, reduction)
+        self.ignore_index = ignore_index
+        self.label_smoothing = label_smoothing
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.cross_entropy`` of ``input`` logits against ``target`` using the stored settings."""
+        return F.cross_entropy(
+            input,
+            target,
+            weight=self.weight,
+            ignore_index=self.ignore_index,
+            reduction=self.reduction,
+            label_smoothing=self.label_smoothing,
+        )
+
+
+class MultiLabelSoftMarginLoss(_WeightedLoss):
+    r"""Creates a criterion that optimizes a multi-label one-versus-all
+    loss based on max-entropy, between input :math:`x` and target :math:`y` of size
+    :math:`(N, C)`.
+    For each sample in the minibatch:
+
+    .. math::
+        loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
+                         + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)
+
+    where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
+    :math:`y[i] \in \left\{0, \; 1\right\}`.
+
+    Args:
+        weight (Tensor, optional): a manual rescaling weight given to each
+            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
+            treated as if having all ones.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
+        - Target: :math:`(N, C)`, label targets must have the same shape as the input.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
+    """
+
+    __constants__ = ["reduction"]
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.multilabel_soft_margin_loss`` using the stored ``weight`` and ``reduction``."""
+        return F.multilabel_soft_margin_loss(
+            input, target, weight=self.weight, reduction=self.reduction
+        )
+
+
+class CosineEmbeddingLoss(_Loss):
+    r"""Creates a criterion that measures the loss given input tensors
+    :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
+    Use (:math:`y=1`) to maximize the cosine similarity of two inputs, and (:math:`y=-1`) otherwise.
+    This is typically used for learning nonlinear
+    embeddings or semi-supervised learning.
+
+    The loss function for each sample is:
+
+    .. math::
+        \text{loss}(x, y) =
+        \begin{cases}
+        1 - \cos(x_1, x_2), & \text{if } y = 1 \\
+        \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
+        \end{cases}
+
+    Args:
+        margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
+            :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
+            default value is :math:`0`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input1: :math:`(N, D)` or :math:`(D)`, where `N` is the batch size and `D` is the embedding dimension.
+        - Input2: :math:`(N, D)` or :math:`(D)`, same shape as Input1.
+        - Target: :math:`(N)` or :math:`()`.
+        - Output: If :attr:`reduction` is ``'none'``, then :math:`(N)`, otherwise scalar.
+
+    Examples:
+
+        >>> loss = nn.CosineEmbeddingLoss()
+        >>> input1 = torch.randn(3, 5, requires_grad=True)
+        >>> input2 = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.ones(3)
+        >>> output = loss(input1, input2, target)
+        >>> output.backward()
+    """
+
+    __constants__ = ["margin", "reduction"]
+    margin: float
+
+    def __init__(
+        self,
+        margin: float = 0.0,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.margin = margin
+
+    def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.cosine_embedding_loss`` using the stored ``margin`` and ``reduction``."""
+        return F.cosine_embedding_loss(
+            input1, input2, target, margin=self.margin, reduction=self.reduction
+        )
+
+
+class MarginRankingLoss(_Loss):
+    r"""Creates a criterion that measures the loss given
+    inputs :math:`x1`, :math:`x2`, two 1D mini-batch or 0D `Tensors`,
+    and a label 1D mini-batch or 0D `Tensor` :math:`y` (containing 1 or -1).
+
+    If :math:`y = 1` then it is assumed the first input should be ranked higher
+    (have a larger value) than the second input, and vice-versa for :math:`y = -1`.
+
+    The loss function for each pair of samples in the mini-batch is:
+
+    .. math::
+        \text{loss}(x1, x2, y) = \max(0, -y * (x1 - x2) + \text{margin})
+
+    Args:
+        margin (float, optional): Has a default value of :math:`0`.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input1: :math:`(N)` or :math:`()` where `N` is the batch size.
+        - Input2: :math:`(N)` or :math:`()`, same shape as the Input1.
+        - Target: :math:`(N)` or :math:`()`, same shape as the inputs.
+        - Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`.
+
+    Examples:
+
+        >>> loss = nn.MarginRankingLoss()
+        >>> input1 = torch.randn(3, requires_grad=True)
+        >>> input2 = torch.randn(3, requires_grad=True)
+        >>> target = torch.randn(3).sign()
+        >>> output = loss(input1, input2, target)
+        >>> output.backward()
+    """
+
+    __constants__ = ["margin", "reduction"]
+    margin: float
+
+    def __init__(
+        self,
+        margin: float = 0.0,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        self.margin = margin
+
+    def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.margin_ranking_loss`` using the stored ``margin`` and ``reduction``."""
+        return F.margin_ranking_loss(
+            input1, input2, target, margin=self.margin, reduction=self.reduction
+        )
+
+
+class MultiMarginLoss(_WeightedLoss):
+    r"""Creates a criterion that optimizes a multi-class classification hinge
+    loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
+    output :math:`y` (which is a 1D tensor of target class indices,
+    :math:`0 \leq y \leq \text{x.size}(1)-1`):
+
+    For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
+    output :math:`y` is:
+
+    .. math::
+        \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}
+
+    where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
+    and :math:`i \neq y`.
+
+    Optionally, you can give non-equal weighting on the classes by passing
+    a 1D :attr:`weight` tensor into the constructor.
+
+    The loss function then becomes:
+
+    .. math::
+        \text{loss}(x, y) = \frac{\sum_i w[y] * \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}
+
+    Args:
+        p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
+            are the only supported values.
+        margin (float, optional): Has a default value of :math:`1`.
+        weight (Tensor, optional): a manual rescaling weight given to each
+            class. If given, it has to be a 1D Tensor of size `C`. Otherwise, it is
+            treated as if having all ones.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes.
+        - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target.
+
+    Examples:
+
+        >>> loss = nn.MultiMarginLoss()
+        >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]])
+        >>> y = torch.tensor([3])
+        >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
+        >>> loss(x, y)
+        tensor(0.32...)
+    """
+
+    __constants__ = ["p", "margin", "reduction"]
+    margin: float
+    p: int
+
+    def __init__(
+        self,
+        p: int = 1,
+        margin: float = 1.0,
+        weight: Tensor | None = None,
+        size_average=None,
+        reduce=None,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(weight, size_average, reduce, reduction)
+        if p != 1 and p != 2:
+            raise ValueError("only p == 1 and p == 2 supported")
+        if weight is not None and weight.dim() != 1:
+            raise ValueError(
+                f"MultiMarginLoss: expected weight to be None or 1D tensor, got {weight.dim()}D instead"
+            )
+        self.p = p
+        self.margin = margin
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        """Compute ``F.multi_margin_loss`` using the stored ``p``, ``margin``, ``weight`` and ``reduction``."""
+        return F.multi_margin_loss(
+            input,
+            target,
+            p=self.p,
+            margin=self.margin,
+            weight=self.weight,
+            reduction=self.reduction,
+        )
+
+
+class TripletMarginLoss(_Loss):
+    r"""Creates a criterion that measures the triplet loss given an input
+    tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
+    This is used for measuring a relative similarity between samples. A triplet
+    is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative
+    examples` respectively). The shapes of all input tensors should be
+    :math:`(N, D)`.
+
+    The distance swap is described in detail in the paper `Learning shallow
+    convolutional feature descriptors with triplet losses`_ by
+    V. Balntas, E. Riba et al.
+
+    The loss function for each sample in the mini-batch is:
+
+    .. math::
+        L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}
+
+
+    where
+
+    .. math::
+        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
+
+    The norm is calculated using the specified p value and a small constant :math:`\varepsilon` is
+    added for numerical stability.
+
+    See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the
+    triplet margin loss for input tensors using a custom distance function.
+
+    Args:
+        margin (float, optional): Default: :math:`1`.
+        p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
+        eps (float, optional): Small constant for numerical stability. Default: :math:`1e-6`.
+        swap (bool, optional): The distance swap is described in detail in the paper
+            `Learning shallow convolutional feature descriptors with triplet losses` by
+            V. Balntas, E. Riba et al. Default: ``False``.
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, D)` or :math:`(D)` where :math:`D` is the vector dimension.
+        - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and
+          input shape is :math:`(N, D)`; a scalar otherwise.
+
+    Examples:
+
+    >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7)
+    >>> anchor = torch.randn(100, 128, requires_grad=True)
+    >>> positive = torch.randn(100, 128, requires_grad=True)
+    >>> negative = torch.randn(100, 128, requires_grad=True)
+    >>> output = triplet_loss(anchor, positive, negative)
+    >>> output.backward()
+
+    .. _Learning shallow convolutional feature descriptors with triplet losses:
+        https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html
+    """
+
+    __constants__ = ["margin", "p", "eps", "swap", "reduction"]
+    margin: float
+    p: float
+    eps: float
+    swap: bool
+
+    def __init__(
+        self,
+        margin: float = 1.0,
+        p: float = 2.0,
+        eps: float = 1e-6,
+        swap: bool = False,
+        size_average=None,  # legacy argument, forwarded to the base class as-is
+        reduce=None,  # legacy argument, forwarded to the base class as-is
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average, reduce, reduction)
+        if margin <= 0:  # zero is rejected as well as negative values
+            raise ValueError(
+                f"TripletMarginLoss: expected margin to be greater than 0, got {margin} instead"
+            )
+        self.margin = margin
+        self.p = p
+        self.eps = eps
+        self.swap = swap
+
+    def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
+        """Call ``F.triplet_margin_loss`` on the three inputs with the stored settings."""
+        return F.triplet_margin_loss(
+            anchor,
+            positive,
+            negative,
+            margin=self.margin,
+            p=self.p,
+            eps=self.eps,
+            swap=self.swap,
+            reduction=self.reduction,
+        )
+
+
+class TripletMarginWithDistanceLoss(_Loss):
+    r"""Creates a criterion that measures the triplet loss given input
+    tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor,
+    positive, and negative examples, respectively), and a nonnegative,
+    real-valued function ("distance function") used to compute the relationship
+    between the anchor and positive example ("positive distance") and the
+    anchor and negative example ("negative distance").
+
+    The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``)
+    can be described as:
+
+    .. math::
+        \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}
+
+    where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function
+    quantifying the closeness of two tensors, referred to as the :attr:`distance_function`;
+    and :math:`margin` is a positive margin representing the minimum difference
+    between the positive and negative distances that is required for the loss to
+    be 0.  The input tensors have :math:`N` elements each and can be of any shape
+    that the distance function can handle.
+
+    If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \ell(a, p, n) =
+        \begin{cases}
+            \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet
+    loss for input tensors using the :math:`l_p` distance as the distance function.
+
+    Args:
+        distance_function (Callable, optional): A nonnegative, real-valued function that
+            quantifies the closeness of two tensors. If not specified,
+            `nn.PairwiseDistance` will be used.  Default: ``None``
+        margin (float, optional): A positive margin representing the minimum difference
+            between the positive and negative distances required for the loss to be 0. Larger
+            margins penalize cases where the negative examples are not distant enough from the
+            anchors, relative to the positives. Default: :math:`1`.
+        swap (bool, optional): Whether to use the distance swap described in the paper
+            `Learning shallow convolutional feature descriptors with triplet losses` by
+            V. Balntas, E. Riba et al. If True, and if the positive example is closer to the
+            negative example than the anchor is, swaps the positive example and the anchor in
+            the loss computation. Default: ``False``.
+        reduction (str, optional): Specifies the (optional) reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions
+          as supported by the distance function.
+        - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar
+          otherwise.
+
+    Examples:
+
+    >>> # Initialize embeddings
+    >>> embedding = nn.Embedding(1000, 128)
+    >>> anchor_ids = torch.randint(0, 1000, (1,))
+    >>> positive_ids = torch.randint(0, 1000, (1,))
+    >>> negative_ids = torch.randint(0, 1000, (1,))
+    >>> anchor = embedding(anchor_ids)
+    >>> positive = embedding(positive_ids)
+    >>> negative = embedding(negative_ids)
+    >>>
+    >>> # Built-in Distance Function
+    >>> triplet_loss = \
+    >>>     nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance())
+    >>> output = triplet_loss(anchor, positive, negative)
+    >>> output.backward()
+    >>>
+    >>> # Custom Distance Function
+    >>> def l_infinity(x1, x2):
+    >>>     return torch.max(torch.abs(x1 - x2), dim=1).values
+    >>>
+    >>> # xdoctest: +SKIP("FIXME: Would call backwards a second time")
+    >>> triplet_loss = (
+    >>>     nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5))
+    >>> output = triplet_loss(anchor, positive, negative)
+    >>> output.backward()
+    >>>
+    >>> # Custom Distance Function (Lambda)
+    >>> triplet_loss = (
+    >>>     nn.TripletMarginWithDistanceLoss(
+    >>>         distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)))
+    >>> output = triplet_loss(anchor, positive, negative)
+    >>> output.backward()
+
+    Reference:
+        V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses:
+        https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html
+    """
+
+    __constants__ = ["margin", "swap", "reduction"]
+    margin: float
+    swap: bool
+
+    def __init__(
+        self,
+        *,
+        distance_function: Callable[[Tensor, Tensor], Tensor] | None = None,
+        margin: float = 1.0,
+        swap: bool = False,
+        reduction: str = "mean",
+    ) -> None:
+        super().__init__(size_average=None, reduce=None, reduction=reduction)
+        if margin <= 0:  # zero is rejected as well as negative values
+            raise ValueError(
+                f"TripletMarginWithDistanceLoss: expected margin to be greater than 0, got {margin} instead"
+            )
+        self.distance_function: Callable[[Tensor, Tensor], Tensor] | None = (
+            distance_function if distance_function is not None else PairwiseDistance()
+        )
+        self.margin = margin
+        self.swap = swap
+
+    def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
+        """Call ``F.triplet_margin_with_distance_loss`` with the stored settings."""
+        return F.triplet_margin_with_distance_loss(
+            anchor,
+            positive,
+            negative,
+            distance_function=self.distance_function,
+            margin=self.margin,
+            swap=self.swap,
+            reduction=self.reduction,
+        )
+
+
+class CTCLoss(_Loss):
+    r"""The Connectionist Temporal Classification loss.
+
+    Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the
+    probability of possible alignments of input to target, producing a loss value which is differentiable
+    with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which
+    limits the length of the target sequence such that it must be :math:`\leq` the input length.
+
+    Args:
+        blank (int, optional): blank label. Default :math:`0`.
+        reduction (str, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the output losses will be divided by the target lengths and
+            then the mean over the batch is taken, ``'sum'``: the output losses will be summed.
+            Default: ``'mean'``
+        zero_infinity (bool, optional):
+            Whether to zero infinite losses and the associated gradients.
+            Default: ``False``
+            Infinite losses mainly occur when the inputs are too short
+            to be aligned to the targets.
+
+    Shape:
+        - Log_probs: Tensor of size :math:`(T, N, C)` or :math:`(T, C)`,
+          where :math:`T = \text{input length}`,
+          :math:`N = \text{batch size}`, and
+          :math:`C = \text{number of classes (including blank)}`.
+          The logarithmized probabilities of the outputs (e.g. obtained with
+          :func:`torch.nn.functional.log_softmax`).
+        - Targets: Tensor of size :math:`(N, S)` or
+          :math:`(\operatorname{sum}(\text{target\_lengths}))`,
+          where :math:`N = \text{batch size}` and
+          :math:`S = \text{max target length, if shape is } (N, S)`.
+          It represents the target sequences. Each element in the target
+          sequence is a class index, and the target index cannot be blank (default=0).
+          In the :math:`(N, S)` form, targets are padded to the
+          length of the longest sequence, and stacked.
+          In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form,
+          the targets are assumed to be un-padded and
+          concatenated within 1 dimension.
+        - Input_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
+          where :math:`N = \text{batch size}`. It represents the lengths of the
+          inputs (must each be :math:`\leq T`). The lengths are specified
+          for each sequence to achieve masking under the assumption that sequences
+          are padded to equal lengths.
+        - Target_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
+          where :math:`N = \text{batch size}`. It represents lengths of the targets.
+          Lengths are specified for each sequence to achieve masking under the
+          assumption that sequences are padded to equal lengths. If target shape is
+          :math:`(N,S)`, target_lengths are effectively the stop index
+          :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for
+          each target in a batch. Lengths must each be :math:`\leq S`.
+          If the targets are given as a 1d tensor that is the concatenation of individual
+          targets, the target_lengths must add up to the total length of the tensor.
+        - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
+          ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N)` if input is batched or
+          :math:`()` if input is unbatched, where :math:`N = \text{batch size}`.
+
+    Examples:
+
+        >>> # Targets are to be padded
+        >>> T = 50  # Input sequence length
+        >>> C = 20  # Number of classes (including blank)
+        >>> N = 16  # Batch size
+        >>> S = 30  # Target sequence length of longest target in batch (padding length)
+        >>> S_min = 10  # Minimum target length, for demonstration purposes
+        >>>
+        >>> # Initialize random batch of input vectors, for *size = (T,N,C)
+        >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
+        >>>
+        >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
+        >>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
+        >>>
+        >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
+        >>> target_lengths = torch.randint(
+        ...     low=S_min,
+        ...     high=S,
+        ...     size=(N,),
+        ...     dtype=torch.long,
+        ... )
+        >>> ctc_loss = nn.CTCLoss()
+        >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
+        >>> loss.backward()
+        >>>
+        >>>
+        >>> # Targets are to be un-padded
+        >>> T = 50  # Input sequence length
+        >>> C = 20  # Number of classes (including blank)
+        >>> N = 16  # Batch size
+        >>>
+        >>> # Initialize random batch of input vectors, for *size = (T,N,C)
+        >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
+        >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
+        >>>
+        >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
+        >>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
+        >>> target = torch.randint(
+        ...     low=1,
+        ...     high=C,
+        ...     size=(sum(target_lengths),),
+        ...     dtype=torch.long,
+        ... )
+        >>> ctc_loss = nn.CTCLoss()
+        >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
+        >>> loss.backward()
+        >>>
+        >>>
+        >>> # Targets are to be un-padded and unbatched (effectively N=1)
+        >>> T = 50  # Input sequence length
+        >>> C = 20  # Number of classes (including blank)
+        >>>
+        >>> # Initialize random batch of input vectors, for *size = (T,C)
+        >>> # xdoctest: +SKIP("FIXME: error in doctest")
+        >>> input = torch.randn(T, C).log_softmax(1).detach().requires_grad_()
+        >>> input_lengths = torch.tensor(T, dtype=torch.long)
+        >>>
+        >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
+        >>> target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long)
+        >>> target = torch.randint(
+        ...     low=1,
+        ...     high=C,
+        ...     size=(target_lengths,),
+        ...     dtype=torch.long,
+        ... )
+        >>> ctc_loss = nn.CTCLoss()
+        >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
+        >>> loss.backward()
+
+    Reference:
+        A. Graves et al.: Connectionist Temporal Classification:
+        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
+        https://www.cs.toronto.edu/~graves/icml_2006.pdf
+
+    Note:
+        In order to use CuDNN, the following must be satisfied: the :attr:`targets` must be
+        in concatenated format, all :attr:`input_lengths` must be `T`, :math:`blank=0`,
+        :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of
+        dtype :attr:`torch.int32`, and the :attr:`log_probs` itself must be of
+        dtype :attr:`torch.float32`.
+
+        The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
+
+
+    Note:
+        In some circumstances when using the CUDA backend with CuDNN, this operator
+        may select a nondeterministic algorithm to increase performance. If this is
+        undesirable, you can try to make the operation deterministic (potentially at
+        a performance cost) by setting ``torch.backends.cudnn.deterministic =
+        True``.
+        Please see the notes on :doc:`/notes/randomness` for background.
+    """
+
+    __constants__ = ["blank", "reduction"]
+    blank: int
+    zero_infinity: bool
+
+    def __init__(
+        self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False
+    ) -> None:
+        super().__init__(reduction=reduction)
+        self.blank = blank
+        self.zero_infinity = zero_infinity
+
+    def forward(
+        self,
+        log_probs: Tensor,
+        targets: Tensor,
+        input_lengths: Tensor,
+        target_lengths: Tensor,
+    ) -> Tensor:
+        """Call ``F.ctc_loss`` with the stored ``blank``, ``reduction`` and ``zero_infinity``."""
+        return F.ctc_loss(
+            log_probs,
+            targets,
+            input_lengths,
+            target_lengths,
+            self.blank,
+            self.reduction,
+            self.zero_infinity,
+        )
+
+
+# TODO: L1HingeEmbeddingCriterion
+# TODO: MSECriterion weight
+# TODO: ClassSimplexCriterion
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9123f76b75c31d71c2e863c2cdb3c87f862291f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py
@@ -0,0 +1,3046 @@
+# mypy: allow-untyped-defs
+
+import functools
+import inspect
+import itertools
+import warnings
+import weakref
+from collections import namedtuple, OrderedDict
+from collections.abc import Callable, Iterator, Mapping
+from typing import Any, Optional, overload, TypeVar, Union
+from typing_extensions import Self
+
+import torch
+from torch import device, dtype, Tensor
+from torch._prims_common import DeviceLikeType
+from torch.nn.parameter import Buffer, Parameter
+from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+from torch.utils.hooks import BackwardHook, RemovableHandle
+
+
+__all__ = [
+    "register_module_forward_pre_hook",
+    "register_module_forward_hook",
+    "register_module_full_backward_pre_hook",
+    "register_module_backward_hook",
+    "register_module_full_backward_hook",
+    "register_module_buffer_registration_hook",
+    "register_module_module_registration_hook",
+    "register_module_parameter_registration_hook",
+    "Module",
+]
+
+_grad_t = Union[tuple[Tensor, ...], Tensor]
+# See https://mypy.readthedocs.io/en/latest/generics.html#generic-methods-and-generic-self for the use
+# of `T` to annotate `self`. Many methods of `Module` return `self` and we want those return values to be
+# the type of the subclass, not the looser type of `Module`.
+T = TypeVar("T", bound="Module")
+
+
+class _IncompatibleKeys(
+    # pyrefly: ignore [invalid-inheritance]
+    namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"]),
+):
+    __slots__ = ()  # no per-instance __dict__ on top of the namedtuple base
+
+    def __repr__(self) -> str:
+        # pyrefly: ignore [missing-attribute]
+        if self.missing_keys or self.unexpected_keys:
+            return super().__repr__()
+        return "<All keys matched successfully>"  # both key lists are empty
+
+    __str__ = __repr__  # str() and repr() share the same text
+
+
+def _addindent(s_, numSpaces):
+    """Indent every line of ``s_`` after the first by ``numSpaces`` spaces."""
+    head, *tail = s_.split("\n")
+    if not tail:
+        # a single-line string is handed back unchanged
+        return s_
+
+    pad = numSpaces * " "
+    indented = [pad + line for line in tail]
+    return "\n".join([head, *indented])
+
+
+r"""This tracks hooks common to all modules that are executed immediately before
+registering the buffer/module/parameter."""
+_global_buffer_registration_hooks: dict[int, Callable] = OrderedDict()
+_global_module_registration_hooks: dict[int, Callable] = OrderedDict()
+_global_parameter_registration_hooks: dict[int, Callable] = OrderedDict()
+
+
+class _WrappedHook:
+    # Callable wrapper around a hook; can prepend a weakly referenced module to the call.
+
+    def __init__(self, hook: Callable, module: Optional["Module"] = None) -> None:
+        self.hook: Callable = hook
+        functools.update_wrapper(self, hook)
+        bound = module is not None
+        self.with_module: bool = bound
+        if bound:
+            self.module: weakref.ReferenceType[Module] = weakref.ref(module)
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        if not self.with_module:
+            return self.hook(*args, **kwargs)
+        owner = self.module()
+        if owner is None:
+            raise RuntimeError("You are trying to call the hook of a dead Module!")
+        return self.hook(owner, *args, **kwargs)
+
+    def __getstate__(self) -> dict:
+        state = {"hook": self.hook, "with_module": self.with_module}
+        if self.with_module:
+            # pyrefly: ignore [unsupported-operation]
+            state["module"] = self.module()
+        return state
+
+    def __setstate__(self, state: dict):
+        self.hook = state["hook"]
+        self.with_module = state["with_module"]
+        if not self.with_module:
+            return
+        owner = state["module"]
+        if owner is None:
+            raise RuntimeError(
+                "You are trying to revive the hook of a dead Module!"
+            )
+        self.module = weakref.ref(owner)
+
+
+r"""This tracks hooks common to all modules that are executed before/after
+calling forward and backward. This is global state used for debugging/profiling
+purposes"""
+_global_backward_pre_hooks: dict[int, Callable] = OrderedDict()
+_global_backward_hooks: dict[int, Callable] = OrderedDict()
+_global_is_full_backward_hook: bool | None = None
+_global_forward_pre_hooks: dict[int, Callable] = OrderedDict()
+_global_forward_hooks: dict[int, Callable] = OrderedDict()
+_global_forward_hooks_always_called: dict[int, bool] = OrderedDict()
+_global_forward_hooks_with_kwargs: dict[int, bool] = OrderedDict()
+
+
+def _has_any_global_hook():
+    """Return a truthy value when any global forward/backward hook table is non-empty.
+
+    The result is the first non-empty table (or the last, empty one), not a ``bool``.
+    """
+    backward = _global_backward_pre_hooks or _global_backward_hooks
+    forward = _global_forward_pre_hooks or _global_forward_hooks
+    flags = _global_forward_hooks_always_called or _global_forward_hooks_with_kwargs
+    return backward or forward or flags
+
+
+_EXTRA_STATE_KEY_SUFFIX = "_extra_state"
+
+
+def register_module_buffer_registration_hook(
+    hook: Callable[..., None],
+) -> RemovableHandle:
+    r"""Install a buffer registration hook shared by every module.
+
+    .. warning ::
+
+        This adds global state to the `nn.Module` module
+
+    The hook runs each time :func:`register_buffer` is invoked and must have
+    the following signature::
+
+        hook(module, name, buffer) -> None or new buffer
+
+    It may modify its input or return a single modified value.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle whose ``handle.remove()`` call removes the added hook
+    """
+    registry = _global_buffer_registration_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_module_registration_hook(
+    hook: Callable[..., None],
+) -> RemovableHandle:
+    r"""Install a module registration hook shared by every module.
+
+    .. warning ::
+
+        This adds global state to the `nn.Module` module
+
+    The hook runs each time :func:`register_module` is invoked and must have
+    the following signature::
+
+        hook(module, name, submodule) -> None or new submodule
+
+    It may modify its input or return a single modified value.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle whose ``handle.remove()`` call removes the added hook
+    """
+    registry = _global_module_registration_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_parameter_registration_hook(
+    hook: Callable[..., None],
+) -> RemovableHandle:
+    r"""Install a parameter registration hook shared by every module.
+
+    .. warning ::
+
+        This adds global state to the `nn.Module` module
+
+    The hook runs each time :func:`register_parameter` is invoked and must
+    have the following signature::
+
+        hook(module, name, param) -> None or new parameter
+
+    It may modify its input or return a single modified value.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle whose ``handle.remove()`` call removes the added hook
+    """
+    registry = _global_parameter_registration_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_forward_pre_hook(hook: Callable[..., None]) -> RemovableHandle:
+    r"""Install a forward pre-hook shared by all modules.
+
+    .. warning ::
+
+        This adds global state to the `nn.module` module
+        and it is only intended for debugging/profiling purposes.
+
+    The hook runs every time before :func:`forward` is invoked and must have
+    the following signature::
+
+        hook(module, input) -> None or modified input
+
+    ``input`` holds only the positional arguments given to the module; keyword
+    arguments go to ``forward`` alone and are not passed to the hook. The hook
+    may modify the input and return either a tuple or a single value; a single
+    value is wrapped into a tuple (unless that value is already a tuple).
+
+    This hook has precedence over the specific module hooks registered with
+    ``register_forward_pre_hook``.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    registry = _global_forward_pre_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_forward_hook(
+    hook: Callable[..., None],
+    *,
+    with_kwargs: bool = False,
+    always_call: bool = False,
+) -> RemovableHandle:
+    r"""Register a global forward hook for all the modules.
+
+    .. warning ::
+
+        This adds global state to the `nn.module` module
+        and it is only intended for debugging/profiling purposes.
+
+    The hook will be called every time after :func:`forward` has computed an output.
+    It should have the following signature::
+
+        hook(module, input, output) -> None or modified output
+
+    The input contains only the positional arguments given to the module.
+    Keyword arguments only go to ``forward`` unless ``with_kwargs`` is set.
+    You can optionally modify the output of the module by returning a new value
+    that will replace the output from the :func:`forward` function.
+
+    Parameters:
+        hook (Callable): The user defined hook to be registered.
+        with_kwargs (bool): If ``True`` the ``hook`` is flagged as accepting the
+            keyword arguments given to ``forward``. Default: ``False``
+        always_call (bool): If ``True`` the ``hook`` will be run regardless of
+            whether an exception is raised while calling the Module. Default: ``False``
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle that can be used to remove the added hook by calling ``handle.remove()``
+
+    This hook will be executed before specific module hooks registered with
+    ``register_forward_hook``.
+    """
+    # Both flag tables are tied to the handle so ``handle.remove()`` also clears them.
+    flag_tables = [_global_forward_hooks_always_called, _global_forward_hooks_with_kwargs]
+    handle = RemovableHandle(_global_forward_hooks, extra_dict=flag_tables)
+    _global_forward_hooks[handle.id] = hook
+    if with_kwargs:
+        _global_forward_hooks_with_kwargs[handle.id] = True
+    if always_call:
+        _global_forward_hooks_always_called[handle.id] = True
+    return handle
+
+
+def register_module_backward_hook(
+    hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t],
+) -> RemovableHandle:
+    r"""Install a backward hook shared by all the modules.
+
+    Deprecated in favor of
+    :func:`torch.nn.modules.module.register_module_full_backward_hook`;
+    the behavior of this function will change in future versions.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    global _global_is_full_backward_hook
+    # Regular and full global backward hooks cannot be mixed.
+    if _global_is_full_backward_hook is True:
+        raise RuntimeError(
+            "Cannot use both regular backward hooks and full backward hooks as a "
+            "global Module hook. Please use only one of them."
+        )
+    _global_is_full_backward_hook = False
+
+    registry = _global_backward_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_full_backward_pre_hook(
+    hook: Callable[["Module", _grad_t], None | _grad_t],
+) -> RemovableHandle:
+    r"""Install a backward pre-hook shared by all the modules.
+
+    .. warning ::
+        This adds global state to the `nn.module` module
+        and it is only intended for debugging/profiling purposes.
+
+    These hooks behave in the same way as those registered by
+    :meth:`torch.nn.Module.register_full_backward_pre_hook`; refer to its
+    documentation for more details.
+
+    They will be called before hooks registered using
+    :meth:`torch.nn.Module.register_full_backward_pre_hook`.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    registry = _global_backward_pre_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+def register_module_full_backward_hook(
+    hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t],
+) -> RemovableHandle:
+    r"""Install a full backward hook shared by all the modules.
+
+    .. warning ::
+        This adds global state to the `nn.module` module
+        and it is only intended for debugging/profiling purposes.
+
+    These hooks behave in the same way as those registered by
+    :meth:`torch.nn.Module.register_full_backward_hook`; refer to its
+    documentation for more details.
+
+    They will be called before hooks registered using
+    :meth:`torch.nn.Module.register_full_backward_hook`.
+
+    Returns:
+        :class:`torch.utils.hooks.RemovableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    global _global_is_full_backward_hook
+    # Regular and full global backward hooks cannot be mixed.
+    if _global_is_full_backward_hook is False:
+        raise RuntimeError(
+            "Cannot use both regular backward hooks and full backward hooks as a "
+            "global Module hook. Please use only one of them."
+        )
+    _global_is_full_backward_hook = True
+
+    registry = _global_backward_hooks
+    handle = RemovableHandle(registry)
+    registry[handle.id] = hook
+    return handle
+
+
+# Trick mypy into not applying contravariance rules to inputs by defining
+# forward as a value, rather than a function.  See also
+# https://github.com/python/mypy/issues/8795
+def _forward_unimplemented(self, *input: Any) -> None:
+    r"""Define the computation performed at every call.
+
+    Every subclass should override this.
+
+    .. note::
+        The forward-pass recipe belongs in this function, but callers should
+        invoke the :class:`Module` instance afterwards rather than this: the
+        instance call runs the registered hooks, a direct call ignores them.
+    """
+    message = (
+        f'Module [{type(self).__name__}] is missing the required "forward" function'
+    )
+    raise NotImplementedError(message)
+
+
+class Module:
+    r"""Base class for all neural network modules.
+
+    Your models should also subclass this class.
+
+    Modules can also contain other Modules, allowing them to be nested in
+    a tree structure. You can assign the submodules as regular attributes::
+
+        import torch.nn as nn
+        import torch.nn.functional as F
+
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.conv1 = nn.Conv2d(1, 20, 5)
+                self.conv2 = nn.Conv2d(20, 20, 5)
+
+            def forward(self, x):
+                x = F.relu(self.conv1(x))
+                return F.relu(self.conv2(x))
+
+    Submodules assigned in this way will be registered, and will also have their
+    parameters converted when you call :meth:`to`, etc.
+
+    .. note::
+        As per the example above, an ``__init__()`` call to the parent class
+        must be made before assignment on the child.
+
+    :ivar training: Boolean represents whether this module is in training or
+                    evaluation mode.
+    :vartype training: bool
+    """
+
+    dump_patches: bool = False
+
+    _version: int = 1
+    r"""This allows better BC support for :meth:`load_state_dict`. In
+    :meth:`state_dict`, the version number will be saved as in the attribute
+    `_metadata` of the returned state dict, and thus pickled. `_metadata` is a
+    dictionary with keys that follow the naming convention of state dict. See
+    ``_load_from_state_dict`` on how to use this information in loading.
+
+    If new parameters/buffers are added/removed from a module, this number shall
+    be bumped, and the module's `_load_from_state_dict` method can compare the
+    version number and do appropriate changes if the state dict is from before
+    the change."""
+
+    training: bool
+    _parameters: dict[str, Parameter | None]
+    _buffers: dict[str, Tensor | None]
+    _non_persistent_buffers_set: set[str]
+    _backward_pre_hooks: dict[int, Callable]
+    _backward_hooks: dict[int, Callable]
+    _is_full_backward_hook: bool | None
+    _forward_hooks: dict[int, Callable]
+    # Marks whether the corresponding _forward_hooks accept kwargs or not.
+    # As JIT does not support set[int], this dict is used as a set, where all
+    # hooks represented in this dict accept kwargs.
+    _forward_hooks_with_kwargs: dict[int, bool]
+    # forward hooks that should always be called even if an exception is raised
+    _forward_hooks_always_called: dict[int, bool]
+    _forward_pre_hooks: dict[int, Callable]
+    # Marks whether the corresponding _forward_pre_hooks accept kwargs or not.
+    # As JIT does not support set[int], this dict is used as a set, where all
+    # hooks represented in this dict accept kwargs.
+    _forward_pre_hooks_with_kwargs: dict[int, bool]
+    _state_dict_hooks: dict[int, Callable]
+    _load_state_dict_pre_hooks: dict[int, Callable]
+    _state_dict_pre_hooks: dict[int, Callable]
+    _load_state_dict_post_hooks: dict[int, Callable]
+    _modules: dict[str, Optional["Module"]]
+    call_super_init: bool = False
+    _compiled_call_impl: Callable | None = None
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize internal Module state, shared by both nn.Module and ScriptModule.
+
+        Args:
+            *args: Extra positional arguments. They are only accepted when the
+                class attribute ``call_super_init`` is truthy, in which case
+                they are forwarded to ``super().__init__``.
+            **kwargs: Extra keyword arguments, handled the same way as ``*args``.
+
+        Raises:
+            TypeError: If ``call_super_init`` is ``False`` and any positional
+                or keyword argument is supplied.
+        """
+        torch._C._log_api_usage_once("python.nn_module")
+
+        # Backward compatibility: when call_super_init is False this constructor
+        # accepts no extra arguments, so reject them explicitly with a TypeError
+        # instead of silently swallowing them through *args / **kwargs.
+        if self.call_super_init is False and bool(kwargs):
+            raise TypeError(
+                f"{type(self).__name__}.__init__() got an unexpected keyword argument '{next(iter(kwargs))}'"
+                ""
+            )
+
+        if self.call_super_init is False and bool(args):
+            # `len(args) + 1` accounts for `self` in the reported argument count.
+            raise TypeError(
+                f"{type(self).__name__}.__init__() takes 1 positional argument but {len(args) + 1} were"
+                " given"
+            )
+
+        """
+        Calls super().__setattr__('a', a) instead of the typical self.a = a
+        to avoid Module.__setattr__ overhead. Module's __setattr__ has special
+        handling for parameters, submodules, and buffers but simply calls into
+        super().__setattr__ for all other attributes.
+        """
+        # A freshly constructed module starts with `training` set to True.
+        super().__setattr__("training", True)
+        # Registries for parameters and buffers, plus the set of buffer names
+        # that are marked non-persistent.
+        super().__setattr__("_parameters", {})
+        super().__setattr__("_buffers", {})
+        super().__setattr__("_non_persistent_buffers_set", set())
+        # Hook registries; all of them start out empty.
+        super().__setattr__("_backward_pre_hooks", OrderedDict())
+        super().__setattr__("_backward_hooks", OrderedDict())
+        super().__setattr__("_is_full_backward_hook", None)
+        super().__setattr__("_forward_hooks", OrderedDict())
+        super().__setattr__("_forward_hooks_with_kwargs", OrderedDict())
+        super().__setattr__("_forward_hooks_always_called", OrderedDict())
+        super().__setattr__("_forward_pre_hooks", OrderedDict())
+        super().__setattr__("_forward_pre_hooks_with_kwargs", OrderedDict())
+        super().__setattr__("_state_dict_hooks", OrderedDict())
+        super().__setattr__("_state_dict_pre_hooks", OrderedDict())
+        super().__setattr__("_load_state_dict_pre_hooks", OrderedDict())
+        super().__setattr__("_load_state_dict_post_hooks", OrderedDict())
+        # Registry of child modules.
+        super().__setattr__("_modules", {})
+
+        # Opt-in cooperative initialization: forward the extra arguments to the
+        # next class in the MRO only when the class asks for it.
+        if self.call_super_init:
+            super().__init__(*args, **kwargs)
+
+    forward: Callable[..., Any] = _forward_unimplemented
+
+    def register_buffer(
+        self, name: str, tensor: Tensor | None, persistent: bool = True
+    ) -> None:
+        r"""Add a buffer to the module.
+
+        This is typically used to register a buffer that should not be
+        considered a model parameter. For example, BatchNorm's ``running_mean``
+        is not a parameter, but is part of the module's state. Buffers, by
+        default, are persistent and will be saved alongside parameters. This
+        behavior can be changed by setting :attr:`persistent` to ``False``. The
+        only difference between a persistent buffer and a non-persistent buffer
+        is that the latter will not be a part of this module's
+        :attr:`state_dict`.
+
+        Buffers can be accessed as attributes using given names.
+
+        Args:
+            name (str): name of the buffer. The buffer can be accessed
+                from this module using the given name
+            tensor (Tensor or None): buffer to be registered. If ``None``, then operations
+                that run on buffers, such as :attr:`cuda`, are ignored. If ``None``,
+                the buffer is **not** included in the module's :attr:`state_dict`.
+            persistent (bool): whether the buffer is part of this module's
+                :attr:`state_dict`.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> self.register_buffer('running_mean', torch.zeros(num_features))
+
+        """
+        if persistent is False and isinstance(self, torch.jit.ScriptModule):
+            raise RuntimeError("ScriptModule does not support non-persistent buffers")
+
+        if "_buffers" not in self.__dict__:
+            raise AttributeError("cannot assign buffer before Module.__init__() call")
+        elif not isinstance(name, str):
+            raise TypeError(
+                f"buffer name should be a string. Got {torch.typename(name)}"
+            )
+        elif "." in name:
+            raise KeyError('buffer name can\'t contain "."')
+        elif name == "":
+            raise KeyError('buffer name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._buffers:
+            raise KeyError(f"attribute '{name}' already exists")
+        elif tensor is not None and not (
+            isinstance(tensor, torch.Tensor) or hasattr(tensor, "__torch_function__")
+        ):
+            raise TypeError(
+                f"cannot assign '{torch.typename(tensor)}' object to buffer '{name}' "
+                "(torch Tensor or None required)"
+            )
+        else:
+            for hook in _global_buffer_registration_hooks.values():
+                output = hook(self, name, tensor)
+                if output is not None:
+                    tensor = output
+            self._buffers[name] = tensor
+            if persistent:
+                self._non_persistent_buffers_set.discard(name)
+            else:
+                self._non_persistent_buffers_set.add(name)
+
+    def register_parameter(self, name: str, param: Parameter | None) -> None:
+        r"""Add a parameter to the module.
+
+        The parameter can be accessed as an attribute using given name.
+
+        Args:
+            name (str): name of the parameter. The parameter can be accessed
+                from this module using the given name
+            param (Parameter or None): parameter to be added to the module. If
+                ``None``, then operations that run on parameters, such as :attr:`cuda`,
+                are ignored. If ``None``, the parameter is **not** included in the
+                module's :attr:`state_dict`.
+        """
+        if "_parameters" not in self.__dict__:
+            raise AttributeError(
+                "cannot assign parameter before Module.__init__() call"
+            )
+
+        elif not isinstance(name, str):
+            raise TypeError(
+                f"parameter name should be a string. Got {torch.typename(name)}"
+            )
+        elif "." in name:
+            raise KeyError('parameter name can\'t contain "."')
+        elif name == "":
+            raise KeyError('parameter name can\'t be empty string ""')
+        elif hasattr(self, name) and name not in self._parameters:
+            raise KeyError(f"attribute '{name}' already exists")
+
+        if param is None:
+            self._parameters[name] = None
+        elif not isinstance(param, Parameter):
+            raise TypeError(
+                f"cannot assign '{torch.typename(param)}' object to parameter '{name}' "
+                "(torch.nn.Parameter or None required)"
+            )
+        elif param.grad_fn:
+            raise ValueError(
+                f"Cannot assign non-leaf Tensor to parameter '{name}'. Model "
+                f"parameters must be created explicitly. To express '{name}' "
+                "as a function of another Tensor, compute the value in "
+                "the forward() method."
+            )
+        else:
+            for hook in _global_parameter_registration_hooks.values():
+                output = hook(self, name, param)
+                if output is not None:
+                    param = output
+            self._parameters[name] = param
+
+    def add_module(self, name: str, module: Optional["Module"]) -> None:
+        r"""Add a child module to the current module.
+
+        The module can be accessed as an attribute using the given name.
+
+        Args:
+            name (str): name of the child module. The child module can be
+                accessed from this module using the given name
+            module (Module): child module to be added to the module.
+        """
+        if not isinstance(module, Module) and module is not None:
+            raise TypeError(f"{torch.typename(module)} is not a Module subclass")
+        elif not isinstance(name, str):
+            raise TypeError(
+                f"module name should be a string. Got {torch.typename(name)}"
+            )
+        elif hasattr(self, name) and name not in self._modules:
+            raise KeyError(f"attribute '{name}' already exists")
+        elif "." in name:
+            raise KeyError(f'module name can\'t contain ".", got: {name}')
+        elif name == "":
+            raise KeyError('module name can\'t be empty string ""')
+        for hook in _global_module_registration_hooks.values():
+            output = hook(self, name, module)
+            if output is not None:
+                module = output
+        self._modules[name] = module
+
+    def register_module(self, name: str, module: Optional["Module"]) -> None:
+        r"""Alias for :func:`add_module`.
+
+        Args:
+            name (str): name of the child module, forwarded unchanged to
+                :func:`add_module`.
+            module (Module or None): child module, forwarded unchanged to
+                :func:`add_module`.
+        """
+        self.add_module(name, module)
+
+    def get_submodule(self, target: str) -> "Module":
+        """Return the submodule given by ``target`` if it exists, otherwise throw an error.
+
+        For example, let's say you have an ``nn.Module`` ``A`` that
+        looks like this:
+
+        .. code-block:: text
+
+            A(
+                (net_b): Module(
+                    (net_c): Module(
+                        (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))
+                    )
+                    (linear): Linear(in_features=100, out_features=200, bias=True)
+                )
+            )
+
+        (The diagram shows an ``nn.Module`` ``A``. ``A`` which has a nested
+        submodule ``net_b``, which itself has two submodules ``net_c``
+        and ``linear``. ``net_c`` then has a submodule ``conv``.)
+
+        To check whether or not we have the ``linear`` submodule, we
+        would call ``get_submodule("net_b.linear")``. To check whether
+        we have the ``conv`` submodule, we would call
+        ``get_submodule("net_b.net_c.conv")``.
+
+        The runtime of ``get_submodule`` is bounded by the degree
+        of module nesting in ``target``. A query against
+        ``named_modules`` achieves the same result, but it is O(N) in
+        the number of transitive modules. So, for a simple check to see
+        if some submodule exists, ``get_submodule`` should always be
+        used.
+
+        Args:
+            target: The fully-qualified string name of the submodule
+                to look for. (See above example for how to specify a
+                fully-qualified string.)
+
+        Returns:
+            torch.nn.Module: The submodule referenced by ``target``
+
+        Raises:
+            AttributeError: If at any point along the path resulting from
+                the target string the (sub)path resolves to a non-existent
+                attribute name or an object that is not an instance of ``nn.Module``.
+        """
+        if target == "":
+            return self
+
+        atoms: list[str] = target.split(".")
+        mod: torch.nn.Module = self
+
+        for item in atoms:
+            if not hasattr(mod, item):
+                raise AttributeError(
+                    mod._get_name() + " has no attribute `" + item + "`"
+                )
+
+            mod = getattr(mod, item)
+
+            if not isinstance(mod, torch.nn.Module):
+                raise AttributeError("`" + item + "` is not an nn.Module")
+
+        return mod
+
+    def set_submodule(
+        self, target: str, module: "Module", strict: bool = False
+    ) -> None:
+        """
+        Set the submodule given by ``target`` if it exists, otherwise throw an error.
+
+        .. note::
+            If ``strict`` is set to ``False`` (default), the method will replace an existing submodule
+            or create a new submodule if the parent module exists. If ``strict`` is set to ``True``,
+            the method will only attempt to replace an existing submodule and throw an error if
+            the submodule does not exist.
+
+        For example, let's say you have an ``nn.Module`` ``A`` that
+        looks like this:
+
+        .. code-block:: text
+
+            A(
+                (net_b): Module(
+                    (net_c): Module(
+                        (conv): Conv2d(3, 3, 3)
+                    )
+                    (linear): Linear(3, 3)
+                )
+            )
+
+        (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested
+        submodule ``net_b``, which itself has two submodules ``net_c``
+        and ``linear``. ``net_c`` then has a submodule ``conv``.)
+
+        To override the ``Conv2d`` with a new submodule ``Linear``, you
+        could call ``set_submodule("net_b.net_c.conv", nn.Linear(1, 1))``
+        where ``strict`` could be ``True`` or ``False``
+
+        To add a new submodule ``Conv2d`` to the existing ``net_b`` module,
+        you would call ``set_submodule("net_b.conv", nn.Conv2d(1, 1, 1))``.
+
+        In the above if you set ``strict=True`` and call
+        ``set_submodule("net_b.conv", nn.Conv2d(1, 1, 1), strict=True)``, an AttributeError
+        will be raised because ``net_b`` does not have a submodule named ``conv``.
+
+        Args:
+            target: The fully-qualified string name of the submodule
+                to look for. (See above example for how to specify a
+                fully-qualified string.)
+            module: The module to set the submodule to.
+            strict: If ``False``, the method will replace an existing submodule
+                or create a new submodule if the parent module exists. If ``True``,
+                the method will only attempt to replace an existing submodule and throw an error
+                if the submodule doesn't already exist.
+
+        Raises:
+            ValueError: If the ``target`` string is empty or if ``module`` is not an instance of ``nn.Module``.
+            AttributeError: If at any point along the path resulting from
+                the ``target`` string the (sub)path resolves to a non-existent
+                attribute name or an object that is not an instance of ``nn.Module``.
+        """
+        if target == "":
+            raise ValueError("Cannot set the submodule without a target name!")
+
+        atoms: list[str] = target.split(".")
+        if not isinstance(module, torch.nn.Module):
+            raise ValueError(
+                "`" + "module" + f"` is not an nn.Module, found {type(module)}"
+            )
+        if len(atoms) == 1:
+            parent: torch.nn.Module = self
+        else:
+            parent_key = ".".join(atoms[:-1])
+            parent = self.get_submodule(parent_key)
+
+        if strict and not hasattr(parent, atoms[-1]):
+            raise AttributeError(
+                parent._get_name() + " has no attribute `" + atoms[-1] + "`"
+            )
+        if hasattr(parent, atoms[-1]):
+            mod = getattr(parent, atoms[-1])
+            if not isinstance(mod, torch.nn.Module):
+                raise AttributeError("`" + atoms[-1] + "` is not an nn.Module")
+        setattr(parent, atoms[-1], module)
+
+    def get_parameter(self, target: str) -> "Parameter":
+        """Return the parameter given by ``target`` if it exists, otherwise throw an error.
+
+        See the docstring for ``get_submodule`` for a more detailed
+        explanation of this method's functionality as well as how to
+        correctly specify ``target``.
+
+        Args:
+            target: The fully-qualified string name of the Parameter
+                to look for. (See ``get_submodule`` for how to specify a
+                fully-qualified string.)
+
+        Returns:
+            torch.nn.Parameter: The Parameter referenced by ``target``
+
+        Raises:
+            AttributeError: If the target string references an invalid
+                path or resolves to something that is not an
+                ``nn.Parameter``
+        """
+        module_path, _, param_name = target.rpartition(".")
+
+        mod: torch.nn.Module = self.get_submodule(module_path)
+
+        if not hasattr(mod, param_name):
+            raise AttributeError(
+                mod._get_name() + " has no attribute `" + param_name + "`"
+            )
+
+        param: torch.nn.Parameter = getattr(mod, param_name)
+
+        if not isinstance(param, torch.nn.Parameter):
+            raise AttributeError("`" + param_name + "` is not an nn.Parameter")
+
+        return param
+
+    def get_buffer(self, target: str) -> "Tensor":
+        """Return the buffer given by ``target`` if it exists, otherwise throw an error.
+
+        See the docstring for ``get_submodule`` for a more detailed
+        explanation of this method's functionality as well as how to
+        correctly specify ``target``.
+
+        Args:
+            target: The fully-qualified string name of the buffer
+                to look for. (See ``get_submodule`` for how to specify a
+                fully-qualified string.)
+
+        Returns:
+            torch.Tensor: The buffer referenced by ``target``
+
+        Raises:
+            AttributeError: If the target string references an invalid
+                path or resolves to something that is not a
+                buffer
+        """
+        module_path, _, buffer_name = target.rpartition(".")
+
+        mod: torch.nn.Module = self.get_submodule(module_path)
+
+        if not hasattr(mod, buffer_name):
+            raise AttributeError(
+                mod._get_name() + " has no attribute `" + buffer_name + "`"
+            )
+
+        buffer: torch.Tensor = getattr(mod, buffer_name)
+
+        if buffer_name not in mod._buffers:
+            raise AttributeError("`" + buffer_name + "` is not a buffer")
+
+        return buffer
+
+    def get_extra_state(self) -> Any:
+        """Return any extra state to include in the module's state_dict.
+
+        Implement this and a corresponding :func:`set_extra_state` for your module
+        if you need to store extra state. This function is called when building the
+        module's `state_dict()`.
+
+        Note that extra state should be picklable to ensure working serialization
+        of the state_dict. We only provide backwards compatibility guarantees
+        for serializing Tensors; other objects may break backwards compatibility if
+        their serialized pickled form changes.
+
+        Returns:
+            object: Any extra state to store in the module's state_dict
+        """
+        raise RuntimeError(
+            "Reached a code path in Module.get_extra_state() that should never be called. "
+            "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml "
+            "to report this bug."
+        )
+
+    def set_extra_state(self, state: Any) -> None:
+        """Set extra state contained in the loaded `state_dict`.
+
+        This function is called from :func:`load_state_dict` to handle any extra state
+        found within the `state_dict`. Implement this function and a corresponding
+        :func:`get_extra_state` for your module if you need to store extra state within its
+        `state_dict`.
+
+        Args:
+            state (dict): Extra state from the `state_dict`
+        """
+        raise RuntimeError(
+            "Reached a code path in Module.set_extra_state() that should never be called. "
+            "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml "
+            "to report this bug."
+        )
+
+    def _apply(self, fn, recurse=True):
+        """Apply ``fn`` to every parameter, parameter gradient and buffer of this module.
+
+        Args:
+            fn: Callable that takes a tensor and returns the converted tensor.
+            recurse: If true, ``fn`` is first applied to every child module
+                through the child's own ``_apply``.
+
+        Returns:
+            This module (``self``).
+
+        Raises:
+            RuntimeError: If replacing a parameter or its gradient through
+                ``torch.utils.swap_tensors`` fails.
+        """
+        if recurse:
+            for module in self.children():
+                module._apply(fn)
+
+        from torch._subclasses.fake_tensor import FakeTensor
+
+        # Decides whether the converted tensor may be written back through
+        # `.data =` instead of replacing the tensor object.
+        def compute_should_use_set_data(tensor, tensor_applied) -> bool:
+            if torch._has_compatible_shallow_copy_type(
+                tensor, tensor_applied
+            ) and not isinstance(tensor_applied, FakeTensor):
+                # If the new tensor has compatible tensor type as the existing tensor,
+                # the current behavior is to change the tensor in-place using `.data =`,
+                # and the future behavior is to overwrite the existing tensor. However,
+                # changing the current behavior is a BC-breaking change, and we want it
+                # to happen in future releases. So for now we introduce the
+                # `torch.__future__.get_overwrite_module_params_on_conversion()`
+                # global flag to let the user control whether they want the future
+                # behavior of overwriting the existing tensor or not.
+                return not torch.__future__.get_overwrite_module_params_on_conversion()
+            else:
+                return False
+
+        # Global opt-in flag; it is read once and reused for every parameter.
+        should_use_swap_tensors = (
+            torch.__future__.get_swap_module_params_on_conversion()
+        )
+
+        for key, param in self._parameters.items():
+            if param is None:
+                continue
+            # Tensors stored in modules are graph leaves, and we don't want to
+            # track autograd history of `param_applied`, so we have to use
+            # `with torch.no_grad():`
+            with torch.no_grad():
+                param_applied = fn(param)
+            p_should_use_set_data = compute_should_use_set_data(param, param_applied)
+
+            # subclasses may have multiple child tensors so we need to use swap_tensors
+            p_should_use_swap_tensors = (
+                should_use_swap_tensors
+                or is_traceable_wrapper_subclass(param_applied)
+                or isinstance(param, FakeTensor)
+            )
+
+            # Capture the gradient before the parameter is replaced so that it
+            # can be converted (and restored on failure) below.
+            param_grad = param.grad
+            if p_should_use_swap_tensors:
+                try:
+                    if param_grad is not None:
+                        # Accessing param.grad makes its at::Tensor's use_count 2, which will prevent swapping.
+                        # Decrement use count of the gradient by setting to None
+                        param.grad = None
+                    param_applied = torch.nn.Parameter(
+                        # pyrefly: ignore [bad-argument-type]
+                        param_applied,
+                        requires_grad=param.requires_grad,
+                    )
+                    torch.utils.swap_tensors(param, param_applied)
+                except Exception as e:
+                    # Put the gradient back before reporting the failure.
+                    if param_grad is not None:
+                        param.grad = param_grad
+                    raise RuntimeError(
+                        f"_apply(): Couldn't swap {self._get_name()}.{key}"
+                    ) from e
+                out_param = param
+            elif p_should_use_set_data:
+                # pyrefly: ignore [bad-assignment]
+                param.data = param_applied
+                out_param = param
+            else:
+                # Neither in-place path applies: build a new Parameter and
+                # rebind the entry in `_parameters` to it.
+                assert isinstance(param, Parameter)
+                assert param.is_leaf
+                # pyrefly: ignore [bad-argument-type]
+                out_param = Parameter(param_applied, param.requires_grad)
+                self._parameters[key] = out_param
+
+            # Convert the gradient using the same three strategies as above.
+            if param_grad is not None:
+                with torch.no_grad():
+                    grad_applied = fn(param_grad)
+                g_should_use_set_data = compute_should_use_set_data(
+                    param_grad, grad_applied
+                )
+                if p_should_use_swap_tensors:
+                    grad_applied.requires_grad_(param_grad.requires_grad)
+                    try:
+                        torch.utils.swap_tensors(param_grad, grad_applied)
+                    except Exception as e:
+                        raise RuntimeError(
+                            f"_apply(): Couldn't swap {self._get_name()}.{key}.grad"
+                        ) from e
+                    # Reattach the gradient that was detached before the swap.
+                    out_param.grad = param_grad
+                elif g_should_use_set_data:
+                    assert out_param.grad is not None
+                    out_param.grad.data = grad_applied
+                else:
+                    assert param_grad.is_leaf
+                    out_param.grad = grad_applied.requires_grad_(
+                        param_grad.requires_grad
+                    )
+
+        # Buffers are rebound to whatever `fn` returns; `None` buffers are skipped.
+        for key, buf in self._buffers.items():
+            if buf is not None:
+                self._buffers[key] = fn(buf)
+
+        return self
+
+    def apply(self, fn: Callable[["Module"], None]) -> Self:
+        r"""Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self.
+
+        Typical use includes initializing the parameters of a model
+        (see also :ref:`nn-init-doc`).
+
+        Args:
+            fn (:class:`Module` -> None): function to be applied to each submodule
+
+        Returns:
+            Module: self
+
+        Example::
+
+            >>> @torch.no_grad()
+            >>> def init_weights(m):
+            >>>     print(m)
+            >>>     if type(m) is nn.Linear:
+            >>>         m.weight.fill_(1.0)
+            >>>         print(m.weight)
+            >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+            >>> net.apply(init_weights)
+            Linear(in_features=2, out_features=2, bias=True)
+            Parameter containing:
+            tensor([[1., 1.],
+                    [1., 1.]], requires_grad=True)
+            Linear(in_features=2, out_features=2, bias=True)
+            Parameter containing:
+            tensor([[1., 1.],
+                    [1., 1.]], requires_grad=True)
+            Sequential(
+              (0): Linear(in_features=2, out_features=2, bias=True)
+              (1): Linear(in_features=2, out_features=2, bias=True)
+            )
+
+        """
+        for module in self.children():
+            module.apply(fn)
+        fn(self)
+        return self
+
+    def cuda(self, device: int | device | None = None) -> Self:
+        r"""Move all model parameters and buffers to the GPU.
+
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing the optimizer if the module will
+        live on GPU while being optimized.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Args:
+            device (int, optional): if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.cuda(device))
+
+    def ipu(self, device: int | device | None = None) -> Self:
+        r"""Move all model parameters and buffers to the IPU.
+
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing the optimizer if the module will
+        live on IPU while being optimized.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Arguments:
+            device (int, optional): if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.ipu(device))
+
+    def xpu(self, device: int | device | None = None) -> Self:
+        r"""Move all model parameters and buffers to the XPU.
+
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing optimizer if the module will
+        live on XPU while being optimized.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Arguments:
+            device (int, optional): if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.xpu(device))
+
+    def mtia(self, device: int | device | None = None) -> Self:
+        r"""Move all model parameters and buffers to the MTIA.
+
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing the optimizer if the module will
+        live on MTIA while being optimized.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Arguments:
+            device (int, optional): if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.mtia(device))
+
+    def cpu(self) -> Self:
+        r"""Move all model parameters and buffers to the CPU.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.cpu())
+
+    def type(self, dst_type: dtype | str) -> Self:
+        r"""Casts all parameters and buffers to :attr:`dst_type`.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Args:
+            dst_type (type or string): the desired type
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.type(dst_type))
+
+    def float(self) -> Self:
+        r"""Casts all floating point parameters and buffers to ``float`` datatype.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.float() if t.is_floating_point() else t)
+
+    def double(self) -> Self:
+        r"""Casts all floating point parameters and buffers to ``double`` datatype.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.double() if t.is_floating_point() else t)
+
+    def half(self) -> Self:
+        r"""Casts all floating point parameters and buffers to ``half`` datatype.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.half() if t.is_floating_point() else t)
+
+    def bfloat16(self) -> Self:
+        r"""Casts all floating point parameters and buffers to ``bfloat16`` datatype.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(lambda t: t.bfloat16() if t.is_floating_point() else t)
+
+    def to_empty(self, *, device: DeviceLikeType | None, recurse: bool = True) -> Self:
+        r"""Move the parameters and buffers to the specified device without copying storage.
+
+        Args:
+            device (:class:`torch.device`): The desired device of the parameters
+                and buffers in this module.
+            recurse (bool): Whether parameters and buffers of submodules should
+                be recursively moved to the specified device.
+
+        Returns:
+            Module: self
+        """
+        return self._apply(
+            lambda t: torch.empty_like(t, device=device), recurse=recurse
+        )
+
+    @overload
+    def to(
+        self,
+        device: DeviceLikeType | None = ...,
+        dtype: dtype | None = ...,
+        non_blocking: bool = ...,
+    ) -> Self: ...
+
+    @overload
+    def to(self, dtype: dtype, non_blocking: bool = ...) -> Self: ...
+
+    @overload
+    def to(self, tensor: Tensor, non_blocking: bool = ...) -> Self: ...
+
+    def to(self, *args, **kwargs):
+        r"""Move and/or cast the parameters and buffers.
+
+        This can be called as
+
+        .. function:: to(device=None, dtype=None, non_blocking=False)
+           :noindex:
+
+        .. function:: to(dtype, non_blocking=False)
+           :noindex:
+
+        .. function:: to(tensor, non_blocking=False)
+           :noindex:
+
+        .. function:: to(memory_format=torch.channels_last)
+           :noindex:
+
+        Its signature is similar to :meth:`torch.Tensor.to`, but only accepts
+        floating point or complex :attr:`dtype`\ s. In addition, this method will
+        only cast the floating point or complex parameters and buffers to :attr:`dtype`
+        (if given). The integral parameters and buffers will be moved to
+        :attr:`device`, if that is given, but with dtypes unchanged. When
+        :attr:`non_blocking` is set, it tries to convert/move asynchronously
+        with respect to the host if possible, e.g., moving CPU Tensors with
+        pinned memory to CUDA devices.
+
+        See below for examples.
+
+        .. note::
+            This method modifies the module in-place.
+
+        Args:
+            device (:class:`torch.device`): the desired device of the parameters
+                and buffers in this module
+            dtype (:class:`torch.dtype`): the desired floating point or complex dtype of
+                the parameters and buffers in this module
+            tensor (torch.Tensor): Tensor whose dtype and device are the desired
+                dtype and device for all parameters and buffers in this module
+            memory_format (:class:`torch.memory_format`): the desired memory
+                format for 4D or 5D parameters and buffers in this module
+                (keyword only argument)
+
+        Returns:
+            Module: self
+
+        Raises:
+            TypeError: if :attr:`dtype` is given and is neither a floating
+                point nor a complex dtype.
+            NotImplementedError: if a tensor cannot be copied out of the meta
+                device; the message then points to :meth:`to_empty`.
+
+        Examples::
+
+            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+            >>> linear = nn.Linear(2, 2)
+            >>> linear.weight
+            Parameter containing:
+            tensor([[ 0.1913, -0.3420],
+                    [-0.5113, -0.2325]])
+            >>> linear.to(torch.double)
+            Linear(in_features=2, out_features=2, bias=True)
+            >>> linear.weight
+            Parameter containing:
+            tensor([[ 0.1913, -0.3420],
+                    [-0.5113, -0.2325]], dtype=torch.float64)
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1)
+            >>> gpu1 = torch.device("cuda:1")
+            >>> linear.to(gpu1, dtype=torch.half, non_blocking=True)
+            Linear(in_features=2, out_features=2, bias=True)
+            >>> linear.weight
+            Parameter containing:
+            tensor([[ 0.1914, -0.3420],
+                    [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1')
+            >>> cpu = torch.device("cpu")
+            >>> linear.to(cpu)
+            Linear(in_features=2, out_features=2, bias=True)
+            >>> linear.weight
+            Parameter containing:
+            tensor([[ 0.1914, -0.3420],
+                    [-0.5112, -0.2324]], dtype=torch.float16)
+
+            >>> linear = nn.Linear(2, 2, bias=None).to(torch.cdouble)
+            >>> linear.weight
+            Parameter containing:
+            tensor([[ 0.3741+0.j,  0.2382+0.j],
+                    [ 0.5593+0.j, -0.4443+0.j]], dtype=torch.complex128)
+            >>> linear(torch.ones(3, 2, dtype=torch.cdouble))
+            tensor([[0.6122+0.j, 0.1150+0.j],
+                    [0.6122+0.j, 0.1150+0.j],
+                    [0.6122+0.j, 0.1150+0.j]], dtype=torch.complex128)
+
+        """
+        # Collapse the accepted call forms into one (device, dtype,
+        # non_blocking, memory_format) tuple.
+        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(
+            # pyrefly: ignore [not-iterable]
+            *args,
+            **kwargs,
+        )
+
+        if dtype is not None:
+            # Integral / boolean target dtypes are rejected up front.
+            if not (dtype.is_floating_point or dtype.is_complex):
+                raise TypeError(
+                    "nn.Module.to only accepts floating point or complex "
+                    f"dtypes, but got desired dtype={dtype}"
+                )
+            if dtype.is_complex:
+                warnings.warn(
+                    "Complex modules are a new feature under active development whose design may change, "
+                    "and some modules might not work as expected when using complex tensors as parameters or buffers. "
+                    "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml "
+                    "if a complex module does not work as expected.",
+                    stacklevel=2,
+                )
+
+        def convert(t):
+            # Per-tensor conversion handed to ``_apply``. The dtype is only
+            # forwarded for floating point / complex tensors; every other
+            # tensor gets ``None`` in the dtype position.
+            try:
+                # The memory format is only forwarded for 4D and 5D tensors.
+                if convert_to_format is not None and t.dim() in (4, 5):
+                    return t.to(
+                        device,
+                        dtype if t.is_floating_point() or t.is_complex() else None,
+                        non_blocking,
+                        memory_format=convert_to_format,
+                    )
+                return t.to(
+                    device,
+                    dtype if t.is_floating_point() or t.is_complex() else None,
+                    non_blocking,
+                )
+            except NotImplementedError as e:
+                # Only the meta-tensor copy failure (matched by its exact
+                # message) is rewritten to mention ``to_empty()``; ``from None``
+                # suppresses chaining to the original exception. Any other
+                # NotImplementedError propagates unchanged.
+                if str(e) == "Cannot copy out of meta tensor; no data!":
+                    raise NotImplementedError(
+                        f"{e} Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() "
+                        f"when moving module from meta to a different device."
+                    ) from None
+                else:
+                    raise
+
+        return self._apply(convert)
+
+    def register_full_backward_pre_hook(
+        self,
+        hook: Callable[["Module", _grad_t], None | _grad_t],
+        prepend: bool = False,
+    ) -> RemovableHandle:
+        r"""Register a backward pre-hook on the module.
+
+        The hook will be called every time the gradients for the module are computed.
+        The hook should have the following signature::
+
+            hook(module, grad_output) -> tuple[Tensor, ...], Tensor or None
+
+        The :attr:`grad_output` is a tuple. The hook should
+        not modify its arguments, but it can optionally return a new gradient with
+        respect to the output that will be used in place of :attr:`grad_output` in
+        subsequent computations. Entries in :attr:`grad_output` will be ``None`` for
+        all non-Tensor arguments.
+
+        For technical reasons, when this hook is applied to a Module, its forward function will
+        receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
+        of each Tensor returned by the Module's forward function.
+
+        .. warning ::
+            Modifying inputs inplace is not allowed when using backward hooks and
+            will raise an error.
+
+        Args:
+            hook (Callable): The user-defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``backward_pre`` hooks on this
+                :class:`torch.nn.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``backward_pre`` hooks
+                on this :class:`torch.nn.Module`. Note that global
+                ``backward_pre`` hooks registered with
+                :func:`register_module_full_backward_pre_hook` will fire before
+                all hooks registered by this method.
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+
+        """
+        handle = RemovableHandle(self._backward_pre_hooks)
+        self._backward_pre_hooks[handle.id] = hook
+        if prepend:
+            self._backward_pre_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
+        return handle
+
+    def register_backward_hook(
+        self, hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t]
+    ) -> RemovableHandle:
+        r"""Register a backward hook on the module.
+
+        This function is deprecated in favor of :meth:`~torch.nn.Module.register_full_backward_hook` and
+        the behavior of this function will change in future versions.
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+
+        """
+        if self._is_full_backward_hook is True:
+            raise RuntimeError(
+                "Cannot use both regular backward hooks and full backward hooks on a "
+                "single Module. Please use only one of them."
+            )
+
+        self._is_full_backward_hook = False
+
+        handle = RemovableHandle(self._backward_hooks)
+        self._backward_hooks[handle.id] = hook
+        return handle
+
+    def register_full_backward_hook(
+        self,
+        hook: Callable[["Module", _grad_t, _grad_t], None | _grad_t],
+        prepend: bool = False,
+    ) -> RemovableHandle:
+        r"""Register a backward hook on the module.
+
+        The hook will be called every time the gradients with respect to a module are computed, and its firing rules are as follows:
+
+            1. Ordinarily, the hook fires when the gradients are computed with respect to the module inputs.
+            2. If none of the module inputs require gradients, the hook will fire when the gradients are computed
+               with respect to module outputs.
+            3. If none of the module outputs require gradients, then the hooks will not fire.
+
+        The hook should have the following signature::
+
+            hook(module, grad_input, grad_output) -> tuple(Tensor) or None
+
+        The :attr:`grad_input` and :attr:`grad_output` are tuples that contain the gradients
+        with respect to the inputs and outputs respectively. The hook should
+        not modify its arguments, but it can optionally return a new gradient with
+        respect to the input that will be used in place of :attr:`grad_input` in
+        subsequent computations. :attr:`grad_input` will only correspond to the inputs given
+        as positional arguments and all kwarg arguments are ignored. Entries
+        in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor
+        arguments.
+
+        For technical reasons, when this hook is applied to a Module, its forward function will
+        receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
+        of each Tensor returned by the Module's forward function.
+
+        .. warning ::
+            Modifying inputs or outputs inplace is not allowed when using backward hooks and
+            will raise an error.
+
+        Args:
+            hook (Callable): The user-defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``backward`` hooks on this
+                :class:`torch.nn.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``backward`` hooks on
+                this :class:`torch.nn.Module`. Note that global
+                ``backward`` hooks registered with
+                :func:`register_module_full_backward_hook` will fire before
+                all hooks registered by this method.
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+
+        """
+        if self._is_full_backward_hook is False:
+            raise RuntimeError(
+                "Cannot use both regular backward hooks and full backward hooks on a "
+                "single Module. Please use only one of them."
+            )
+
+        self._is_full_backward_hook = True
+
+        handle = RemovableHandle(self._backward_hooks)
+        self._backward_hooks[handle.id] = hook
+        if prepend:
+            self._backward_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
+        return handle
+
+    def _get_backward_hooks(self):
+        r"""Return the backward hooks for use in the call function.
+
+        It returns two lists, one with the full backward hooks and one with the non-full
+        backward hooks.
+        """
+        full_backward_hooks: list[Callable] = []
+        if _global_is_full_backward_hook is True:
+            full_backward_hooks += _global_backward_hooks.values()
+        if self._is_full_backward_hook is True:
+            full_backward_hooks += self._backward_hooks.values()
+
+        non_full_backward_hooks: list[Callable] = []
+        if _global_is_full_backward_hook is False:
+            non_full_backward_hooks += _global_backward_hooks.values()
+        if self._is_full_backward_hook is False:
+            non_full_backward_hooks += self._backward_hooks.values()
+
+        return full_backward_hooks, non_full_backward_hooks
+
+    def _get_backward_pre_hooks(self):
+        backward_pre_hooks: list[Callable] = []
+        backward_pre_hooks += _global_backward_pre_hooks.values()
+        backward_pre_hooks += self._backward_pre_hooks.values()
+
+        return backward_pre_hooks
+
+    def _maybe_warn_non_full_backward_hook(self, inputs, result, grad_fn) -> None:
+        r"""Warn when a non-full backward hook is expected to see incomplete gradients.
+
+        At most one ``FutureWarning`` is emitted per call; the checks run in
+        order and the first one that triggers ends the function.
+
+        Args:
+            inputs: the module inputs; a Tensor or a tuple of Tensors is
+                required to get past the input check.
+            result: the module output; a Tensor or a tuple of Tensors is
+                required to get past the output check.
+            grad_fn: the autograd node compared against the ``grad_fn`` of the
+                outputs and whose ``next_functions`` are compared against the
+                ``grad_fn`` of the inputs.
+        """
+        # Outputs: anything but a Tensor or a tuple made only of Tensors warns
+        # and stops; a single Tensor is wrapped into a 1-tuple.
+        if not isinstance(result, torch.Tensor):
+            if not (
+                isinstance(result, tuple)
+                and all(isinstance(r, torch.Tensor) for r in result)
+            ):
+                warnings.warn(
+                    "Using non-full backward hooks on a Module that does not return a "
+                    "single Tensor or a tuple of Tensors is deprecated and will be removed "
+                    "in future versions. This hook will be missing some of the grad_output. "
+                    "Please use register_full_backward_hook to get the documented behavior.",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+                return
+        else:
+            result = (result,)
+
+        # Inputs: same normalization and early exit as for the outputs.
+        if not isinstance(inputs, torch.Tensor):
+            if not (
+                isinstance(inputs, tuple)
+                and all(isinstance(i, torch.Tensor) for i in inputs)
+            ):
+                warnings.warn(
+                    "Using non-full backward hooks on a Module that does not take as input a "
+                    "single Tensor or a tuple of Tensors is deprecated and will be removed "
+                    "in future versions. This hook will be missing some of the grad_input. "
+                    "Please use register_full_backward_hook to get the documented behavior.",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+                return
+        else:
+            inputs = (inputs,)
+
+        # At this point we are sure that inputs and result are tuple of Tensors
+        # Distinct autograd nodes that produced the outputs (outputs without a
+        # grad_fn are ignored).
+        out_grad_fn = {r.grad_fn for r in result if r.grad_fn is not None}
+        if len(out_grad_fn) == 0 or (
+            len(out_grad_fn) == 1 and grad_fn not in out_grad_fn
+        ):
+            # No output has a grad_fn, or the only one differs from ``grad_fn``.
+            warnings.warn(
+                "Using a non-full backward hook when outputs are nested in python data structure "
+                "is deprecated and will be removed in future versions. This hook will be missing "
+                "some grad_output.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        elif len(out_grad_fn) > 1:
+            warnings.warn(
+                "Using a non-full backward hook when outputs are generated by different autograd Nodes "
+                "is deprecated and will be removed in future versions. This hook will be missing "
+                "some grad_output. Please use register_full_backward_hook to get the documented behavior.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        else:
+            # At this point the grad_output part of the hook will most likely be correct
+            inputs_grad_fn = {i.grad_fn for i in inputs if i.grad_fn is not None}
+
+            # First element of every ``next_functions`` entry of ``grad_fn``.
+            next_functions = {n[0] for n in grad_fn.next_functions}
+
+            # Warn unless the set of input grad_fns equals that set exactly.
+            if inputs_grad_fn != next_functions:
+                warnings.warn(
+                    "Using a non-full backward hook when the forward contains multiple autograd Nodes "
+                    "is deprecated and will be removed in future versions. This hook will be missing "
+                    "some grad_input. Please use register_full_backward_hook to get the documented "
+                    "behavior.",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+
+    def register_forward_pre_hook(
+        self,
+        hook: Callable[[T, tuple[Any, ...]], Any | None]
+        | Callable[
+            [T, tuple[Any, ...], dict[str, Any]], tuple[Any, dict[str, Any]] | None
+        ],
+        *,
+        prepend: bool = False,
+        with_kwargs: bool = False,
+    ) -> RemovableHandle:
+        r"""Register a forward pre-hook on the module.
+
+        The hook will be called every time before :func:`forward` is invoked.
+
+
+        If ``with_kwargs`` is false or not specified, the input contains only
+        the positional arguments given to the module. Keyword arguments won't be
+        passed to the hooks and only to the ``forward``. The hook can modify the
+        input. User can either return a tuple or a single modified value in the
+        hook. We will wrap the value into a tuple if a single value is returned
+        (unless that value is already a tuple). The hook should have the
+        following signature::
+
+            hook(module, args) -> None or modified input
+
+        If ``with_kwargs`` is true, the forward pre-hook will be passed the
+        kwargs given to the forward function. And if the hook modifies the
+        input, both the args and kwargs should be returned. The hook should have
+        the following signature::
+
+            hook(module, args, kwargs) -> None or a tuple of modified input and kwargs
+
+        Args:
+            hook (Callable): The user defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``forward_pre`` hooks on this
+                :class:`torch.nn.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``forward_pre`` hooks
+                on this :class:`torch.nn.Module`. Note that global
+                ``forward_pre`` hooks registered with
+                :func:`register_module_forward_pre_hook` will fire before all
+                hooks registered by this method.
+                Default: ``False``
+            with_kwargs (bool): If true, the ``hook`` will be passed the kwargs
+                given to the forward function.
+                Default: ``False``
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+        """
+        handle = RemovableHandle(
+            self._forward_pre_hooks, extra_dict=self._forward_pre_hooks_with_kwargs
+        )
+        self._forward_pre_hooks[handle.id] = hook
+        if with_kwargs:
+            self._forward_pre_hooks_with_kwargs[handle.id] = True
+
+        if prepend:
+            self._forward_pre_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
+        return handle
+
+    def register_forward_hook(
+        self,
+        hook: Callable[[T, tuple[Any, ...], Any], Any | None]
+        | Callable[[T, tuple[Any, ...], dict[str, Any], Any], Any | None],
+        *,
+        prepend: bool = False,
+        with_kwargs: bool = False,
+        always_call: bool = False,
+    ) -> RemovableHandle:
+        r"""Register a forward hook on the module.
+
+        The hook will be called every time after :func:`forward` has computed an output.
+
+        If ``with_kwargs`` is ``False`` or not specified, the input contains only
+        the positional arguments given to the module. Keyword arguments won't be
+        passed to the hooks and only to the ``forward``. The hook can modify the
+        output. It can modify the input inplace but it will not have effect on
+        forward since this is called after :func:`forward` is called. The hook
+        should have the following signature::
+
+            hook(module, args, output) -> None or modified output
+
+        If ``with_kwargs`` is ``True``, the forward hook will be passed the
+        ``kwargs`` given to the forward function and be expected to return the
+        output possibly modified. The hook should have the following signature::
+
+            hook(module, args, kwargs, output) -> None or modified output
+
+        Args:
+            hook (Callable): The user defined hook to be registered.
+            prepend (bool): If ``True``, the provided ``hook`` will be fired
+                before all existing ``forward`` hooks on this
+                :class:`torch.nn.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``forward`` hooks on
+                this :class:`torch.nn.Module`. Note that global
+                ``forward`` hooks registered with
+                :func:`register_module_forward_hook` will fire before all hooks
+                registered by this method.
+                Default: ``False``
+            with_kwargs (bool): If ``True``, the ``hook`` will be passed the
+                kwargs given to the forward function.
+                Default: ``False``
+            always_call (bool): If ``True`` the ``hook`` will be run regardless of
+                whether an exception is raised while calling the Module.
+                Default: ``False``
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+        """
+        handle = RemovableHandle(
+            self._forward_hooks,
+            extra_dict=[
+                self._forward_hooks_with_kwargs,
+                self._forward_hooks_always_called,
+            ],
+        )
+        self._forward_hooks[handle.id] = hook
+        if with_kwargs:
+            self._forward_hooks_with_kwargs[handle.id] = True
+        if always_call:
+            self._forward_hooks_always_called[handle.id] = True
+        if prepend:
+            self._forward_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
+        return handle
+
+    def _slow_forward(self, *input, **kwargs):
+        tracing_state = torch._C._get_tracing_state()
+        if not tracing_state or isinstance(self.forward, torch._C.ScriptMethod):
+            return self.forward(*input, **kwargs)
+        recording_scopes = torch.jit._trace._trace_module_map is not None
+        if recording_scopes:
+            # type ignore was added because at this point one knows that
+            # torch.jit._trace._trace_module_map is not Optional and has type Dict[Any, Any]
+            name = torch.jit._trace._trace_module_map.get(self, None)  # type: ignore[operator, union-attr]
+            if name:
+                tracing_state.push_scope(name)
+            else:
+                recording_scopes = False
+        try:
+            result = self.forward(*input, **kwargs)
+        finally:
+            if recording_scopes:
+                tracing_state.pop_scope()
+        return result
+
+    def _wrapped_call_impl(self, *args, **kwargs):
+        if self._compiled_call_impl is not None:
+            return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
+        else:
+            return self._call_impl(*args, **kwargs)
+
+    # torchrec tests the code consistency with the following code
+    # fmt: off
+    def _call_impl(self, *args, **kwargs):
+        forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward)
+        # If we don't have any hooks, we want to skip the rest of the logic in
+        # this function, and just call forward.
+        if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
+                or _global_backward_pre_hooks or _global_backward_hooks
+                or _global_forward_hooks or _global_forward_pre_hooks):
+            return forward_call(*args, **kwargs)
+
+        result = None
+        called_always_called_hooks = set()
+
+        def inner():
+            nonlocal result, args, kwargs
+
+            full_backward_hooks, non_full_backward_hooks = [], []
+            backward_pre_hooks = []
+            if self._backward_pre_hooks or _global_backward_pre_hooks:
+                backward_pre_hooks = self._get_backward_pre_hooks()
+
+            if self._backward_hooks or _global_backward_hooks:
+                full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks()
+
+            if _global_forward_pre_hooks or self._forward_pre_hooks:
+                for hook_id, hook in (
+                    *_global_forward_pre_hooks.items(),
+                    *self._forward_pre_hooks.items(),
+                ):
+                    if hook_id in self._forward_pre_hooks_with_kwargs:
+                        args_kwargs_result = hook(self, args, kwargs)  # type: ignore[misc]
+                        if args_kwargs_result is not None:
+                            if isinstance(args_kwargs_result, tuple) and len(args_kwargs_result) == 2:
+                                args, kwargs = args_kwargs_result
+                            else:
+                                raise RuntimeError(
+                                    "forward pre-hook must return None or a tuple "
+                                    f"of (new_args, new_kwargs), but got {args_kwargs_result}."
+                                )
+                    else:
+                        args_result = hook(self, args)
+                        if args_result is not None:
+                            if not isinstance(args_result, tuple):
+                                args_result = (args_result,)
+                            args = args_result
+
+            bw_hook = None
+            if full_backward_hooks or backward_pre_hooks:
+                bw_hook = BackwardHook(self, full_backward_hooks, backward_pre_hooks)
+                args = bw_hook.setup_input_hook(args)
+
+            result = forward_call(*args, **kwargs)
+            if _global_forward_hooks or self._forward_hooks:
+                for hook_id, hook in (
+                    *_global_forward_hooks.items(),
+                    *self._forward_hooks.items(),
+                ):
+                    # mark that always called hook is run
+                    if hook_id in self._forward_hooks_always_called or hook_id in _global_forward_hooks_always_called:
+                        called_always_called_hooks.add(hook_id)
+
+                    if hook_id in self._forward_hooks_with_kwargs or hook_id in _global_forward_hooks_with_kwargs:
+                        hook_result = hook(self, args, kwargs, result)
+                    else:
+                        hook_result = hook(self, args, result)
+
+                    if hook_result is not None:
+                        result = hook_result
+
+            if bw_hook:
+                if not isinstance(result, (torch.Tensor, tuple)):
+                    warnings.warn("For backward hooks to be called,"
+                                  " module output should be a Tensor or a tuple of Tensors"
+                                  f" but received {type(result)}", stacklevel=2)
+                result = bw_hook.setup_output_hook(result)
+
+            # Handle the non-full backward hooks
+            if non_full_backward_hooks:
+                var = result
+                while not isinstance(var, torch.Tensor):
+                    if isinstance(var, dict):
+                        var = next(v for v in var.values() if isinstance(v, torch.Tensor))
+                    else:
+                        var = var[0]
+                grad_fn = var.grad_fn
+                if grad_fn is not None:
+                    for hook in non_full_backward_hooks:
+                        grad_fn.register_hook(_WrappedHook(hook, self))
+                    self._maybe_warn_non_full_backward_hook(args, result, grad_fn)
+
+            return result
+
+        # This is technically not behavior equivalent when compiling, but it's
+        # incredibly unlikely we will ever support throwing an exception in NN
+        # module, and then catching it here, and then reraising it, and then
+        # catching it again, and expecting the resulting frame to be compiled.
+        # The reraise here just gunks up our exception handling for no good
+        # reason.  Don't try to run the always called hooks in event of
+        # exception.
+        if torch.compiler.is_compiling():
+            return inner()
+
+        try:
+            return inner()
+        except Exception:
+            # run always called hooks if they have not already been run
+            # For now only forward hooks have the always_call option but perhaps
+            # this functionality should be added to full backward hooks as well.
+            for hook_id, hook in _global_forward_hooks.items():
+                if hook_id in _global_forward_hooks_always_called and hook_id not in called_always_called_hooks:  # type: ignore[possibly-undefined]
+                    try:
+                        hook_result = hook(self, args, result)  # type: ignore[possibly-undefined]
+                        if hook_result is not None:
+                            result = hook_result
+                    except Exception as e:
+                        warnings.warn("global module forward hook with ``always_call=True`` raised an exception "
+                                      f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2)
+                        continue
+
+            for hook_id, hook in self._forward_hooks.items():
+                if hook_id in self._forward_hooks_always_called and hook_id not in called_always_called_hooks:  # type: ignore[possibly-undefined]
+                    try:
+                        if hook_id in self._forward_hooks_with_kwargs:
+                            hook_result = hook(self, args, kwargs, result)  # type: ignore[possibly-undefined]
+                        else:
+                            hook_result = hook(self, args, result)  # type: ignore[possibly-undefined]
+                        if hook_result is not None:
+                            result = hook_result
+                    except Exception as e:
+                        warnings.warn("module forward hook with ``always_call=True`` raised an exception "
+                                      f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2)
+                        continue
+            # raise exception raised in try block
+            raise
+    # fmt: on
+
+    # Calling an instance dispatches to ``_wrapped_call_impl``; the attribute is
+    # annotated as ``Callable[..., Any]`` so the call signature stays open.
+    __call__: Callable[..., Any] = _wrapped_call_impl
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state.pop("_compiled_call_impl", None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+        # Support loading old checkpoints that don't have the following attrs:
+        if "_forward_pre_hooks" not in self.__dict__:
+            self._forward_pre_hooks = OrderedDict()
+        if "_forward_pre_hooks_with_kwargs" not in self.__dict__:
+            self._forward_pre_hooks_with_kwargs = OrderedDict()
+        if "_forward_hooks_with_kwargs" not in self.__dict__:
+            self._forward_hooks_with_kwargs = OrderedDict()
+        if "_forward_hooks_always_called" not in self.__dict__:
+            self._forward_hooks_always_called = OrderedDict()
+        if "_state_dict_hooks" not in self.__dict__:
+            self._state_dict_hooks = OrderedDict()
+        if "_state_dict_pre_hooks" not in self.__dict__:
+            self._state_dict_pre_hooks = OrderedDict()
+        if "_load_state_dict_pre_hooks" not in self.__dict__:
+            self._load_state_dict_pre_hooks = OrderedDict()
+        if "_load_state_dict_post_hooks" not in self.__dict__:
+            self._load_state_dict_post_hooks = OrderedDict()
+        if "_non_persistent_buffers_set" not in self.__dict__:
+            self._non_persistent_buffers_set = set()
+        if "_is_full_backward_hook" not in self.__dict__:
+            self._is_full_backward_hook = None
+        if "_backward_pre_hooks" not in self.__dict__:
+            self._backward_pre_hooks = OrderedDict()
+
+    # It is crucial that the return type is not annotated as `Any`, otherwise type checking
+    # on `torch.nn.Module` and all its subclasses is largely disabled as a result. See:
+    # https://github.com/pytorch/pytorch/pull/115074
+    def __getattr__(self, name: str) -> Union[Tensor, "Module"]:
+        if "_parameters" in self.__dict__:
+            _parameters = self.__dict__["_parameters"]
+            if name in _parameters:
+                return _parameters[name]
+        if "_buffers" in self.__dict__:
+            _buffers = self.__dict__["_buffers"]
+            if name in _buffers:
+                return _buffers[name]
+        if "_modules" in self.__dict__:
+            modules = self.__dict__["_modules"]
+            if name in modules:
+                return modules[name]
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+
+    def __setattr__(self, name: str, value: Union[Tensor, "Module"]) -> None:
+        """Assign ``value`` to ``name``, routing it to the matching registry.
+
+        ``Parameter`` values are stored through :meth:`register_parameter`,
+        ``Module`` values in ``_modules`` (after running the global module
+        registration hooks) and ``Buffer`` values, or values assigned to an
+        existing buffer name, through :meth:`register_buffer`.  Anything else
+        falls through to the default ``__setattr__``.
+
+        Raises:
+            AttributeError: if a ``Parameter`` or ``Module`` is assigned while the
+                corresponding registry is missing from ``__dict__``.
+            TypeError: if a non-``None`` value of the wrong kind is assigned to a
+                name that is already a parameter, child module or buffer.
+            RuntimeError: if a non-persistent buffer is assigned on a subclass
+                whose ``register_buffer`` does not accept ``persistent``.
+        """
+        # Drop ``name`` from every given dict or set that currently holds it.
+        def remove_from(*dicts_or_sets) -> None:
+            for d in dicts_or_sets:
+                if name in d:
+                    if isinstance(d, dict):
+                        del d[name]
+                    else:
+                        d.discard(name)
+
+        # Read from ``__dict__`` directly: ``None`` means the registry is absent.
+        params = self.__dict__.get("_parameters")
+        if isinstance(value, Parameter):
+            if params is None:
+                raise AttributeError(
+                    "cannot assign parameters before Module.__init__() call"
+                )
+            # The name becomes a parameter, so clear it everywhere else first.
+            remove_from(
+                self.__dict__,
+                self._buffers,
+                self._modules,
+                self._non_persistent_buffers_set,
+            )
+            self.register_parameter(name, value)
+        elif params is not None and name in params:
+            # Existing parameter name: only ``None`` may replace it here.
+            if value is not None:
+                raise TypeError(
+                    f"cannot assign '{torch.typename(value)}' as parameter '{name}' "
+                    "(torch.nn.Parameter or None expected)"
+                )
+            self.register_parameter(name, value)
+        else:
+            modules = self.__dict__.get("_modules")
+            if isinstance(value, Module):
+                if modules is None:
+                    raise AttributeError(
+                        "cannot assign module before Module.__init__() call"
+                    )
+                remove_from(
+                    self.__dict__,
+                    self._parameters,
+                    self._buffers,
+                    self._non_persistent_buffers_set,
+                )
+                # A hook returning non-None substitutes the module being stored.
+                for hook in _global_module_registration_hooks.values():
+                    output = hook(self, name, value)
+                    if output is not None:
+                        value = output
+                modules[name] = value
+            elif modules is not None and name in modules:
+                # Existing child-module name: only ``None`` may replace it here.
+                if value is not None:
+                    raise TypeError(
+                        f"cannot assign '{torch.typename(value)}' as child module '{name}' "
+                        "(torch.nn.Module or None expected)"
+                    )
+                for hook in _global_module_registration_hooks.values():
+                    output = hook(self, name, value)
+                    if output is not None:
+                        value = output
+                modules[name] = value
+            else:
+                buffers = self.__dict__.get("_buffers")
+                # ``and`` binds tighter than ``or``: either a Buffer value, or an
+                # assignment to a name that is already a registered buffer.
+                if isinstance(value, Buffer) or buffers is not None and name in buffers:
+                    if value is not None and not (
+                        isinstance(value, torch.Tensor)
+                        or hasattr(value, "__torch_function__")
+                    ):
+                        raise TypeError(
+                            f"cannot assign '{torch.typename(value)}' as buffer '{name}' "
+                            "(torch.nn.Buffer, torch.Tensor or None expected)"
+                        )
+                    # A Buffer carries its own flag; otherwise keep whatever
+                    # persistence the name currently has.
+                    if isinstance(value, Buffer):
+                        persistent = value.persistent
+                    else:
+                        persistent = name not in self._non_persistent_buffers_set
+                    # === HACK ===
+                    # This whole block below should just be:
+                    # self.register_buffer(name, value, persistent)
+
+                    # But to support subclasses of nn.Module that (wrongfully) implement a
+                    # register_buffer() method that doesn't have the "persistent"
+                    # argument. Only pass it in if it is accepted otherwise assume
+                    # it is always true
+                    if (
+                        getattr(self.register_buffer, "__func__", None)
+                        is torch.nn.Module.register_buffer
+                    ):
+                        # Not overridden: skip the signature inspection.
+                        self.register_buffer(name, value, persistent)
+                    else:
+                        sign = inspect.signature(self.register_buffer)
+                        if "persistent" in sign.parameters:
+                            self.register_buffer(name, value, persistent)
+                        else:
+                            if not persistent:
+                                raise RuntimeError(
+                                    "Registering a non-persistent buffer "
+                                    "on a Module subclass that implements "
+                                    "register_buffer() without the persistent "
+                                    "argument is not allowed."
+                                )
+                            # Assume that the implementation without the argument has the
+                            # behavior from before the argument was added: persistent=True
+                            self.register_buffer(name, value)
+                    # === HACK END ===
+                else:
+                    # Ordinary attribute: defer to the default implementation.
+                    super().__setattr__(name, value)
+
+    def __delattr__(self, name) -> None:
+        if name in self._parameters:
+            del self._parameters[name]
+        elif name in self._buffers:
+            del self._buffers[name]
+            self._non_persistent_buffers_set.discard(name)
+        elif name in self._modules:
+            del self._modules[name]
+        else:
+            super().__delattr__(name)
+
+    def _register_state_dict_hook(self, hook):
+        r"""Register a post-hook for the :meth:`~torch.nn.Module.state_dict` method.
+
+        It should have the following signature::
+            hook(module, state_dict, prefix, local_metadata) -> None or state_dict
+
+        The registered hooks can modify the ``state_dict`` inplace or return a new one.
+        If a new ``state_dict`` is returned, it will only be respected if it is the root
+        module that :meth:`~nn.Module.state_dict` is called from.
+        """
+        if getattr(hook, "_from_public_api", False):
+            raise RuntimeError(
+                "Cannot register the same function as the state dict post hook that was "
+                "previously registered via register_state_dict_post_hook"
+            )
+        handle = RemovableHandle(self._state_dict_hooks)
+        self._state_dict_hooks[handle.id] = hook
+        return handle
+
+    def register_state_dict_post_hook(self, hook):
+        r"""Register a post-hook for the :meth:`~torch.nn.Module.state_dict` method.
+
+        It should have the following signature::
+            hook(module, state_dict, prefix, local_metadata) -> None
+
+        The registered hooks can modify the ``state_dict`` inplace.
+        """
+        # In _register_state_dict_hook there was a bug described in
+        # https://github.com/pytorch/pytorch/issues/117437 where the return value
+        # was only respected for the root module but not child submodules.
+        # We fix this in this public version by only allowing inplace modifications on
+        # the state_dict by the hook. However, since hooks registered via both these
+        # APIs will be added to `_state_dict_hooks` and the type of `_state_dict_hooks`
+        # cannot be changed due to many dependencies on it, we mark a hook
+        # as being registered via the public API by setting `_from_public_api` on it.
+        # In the implementation of `state_dict`, if the callable does not have this
+        # flag, the old behavior of respecting the return value will be preserved
+        # for the root module, otherwise, we ensure that the hook returns None.
+        hook._from_public_api = True
+        handle = RemovableHandle(self._state_dict_hooks)
+        self._state_dict_hooks[handle.id] = hook
+        return handle
+
+    def register_state_dict_pre_hook(self, hook):
+        r"""Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method.
+
+        It should have the following signature::
+            hook(module, prefix, keep_vars) -> None
+
+        The registered hooks can be used to perform pre-processing before the ``state_dict``
+        call is made.
+        """
+        handle = RemovableHandle(self._state_dict_pre_hooks)
+        self._state_dict_pre_hooks[handle.id] = hook
+        return handle
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars) -> None:
+        r"""Save module state to the `destination` dictionary.
+
+        The `destination` dictionary will contain the state
+        of the module, but not its descendants. This is called on every
+        submodule in :meth:`~torch.nn.Module.state_dict`.
+
+        In rare cases, subclasses can achieve class-specific behavior by
+        overriding this method with custom logic.
+
+        Args:
+            destination (dict): a dict where state will be stored
+            prefix (str): the prefix for parameters and buffers used in this
+                module
+        """
+        for name, param in self._parameters.items():
+            if param is not None:
+                destination[prefix + name] = param if keep_vars else param.detach()
+        for name, buf in self._buffers.items():
+            if buf is not None and name not in self._non_persistent_buffers_set:
+                destination[prefix + name] = buf if keep_vars else buf.detach()
+        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
+        if (
+            getattr(self.__class__, "get_extra_state", Module.get_extra_state)
+            is not Module.get_extra_state
+        ):
+            destination[extra_state_key] = self.get_extra_state()
+
+    # The user can optionally pass an arbitrary dict-like object to `state_dict` as `destination`,
+    # in which case `state_dict` returns that same object. If they pass nothing, an `OrderedDict`
+    # is created and returned.
+    T_destination = TypeVar("T_destination", bound=dict[str, Any])
+
+    # Typing-only overload: with an explicit ``destination`` the declared
+    # return type is the type of that object.
+    @overload
+    def state_dict(
+        self,
+        *,
+        destination: T_destination,
+        prefix: str = ...,
+        keep_vars: bool = ...,
+    ) -> T_destination: ...
+
+    # Typing-only overload: without ``destination`` the declared return type is
+    # ``dict[str, Any]``.
+    @overload
+    def state_dict(
+        self,
+        *,
+        prefix: str = ...,
+        keep_vars: bool = ...,
+    ) -> dict[str, Any]: ...
+
+    # TODO: Change `*args` to `*` and remove the corresponding warning in docs when BC allows.
+    # Remove the positional-argument parsing logic at the same time.
+    def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
+        r"""Return a dictionary containing references to the whole state of the module.
+
+        Both parameters and persistent buffers (e.g. running averages) are
+        included. Keys are corresponding parameter and buffer names.
+        Parameters and buffers set to ``None`` are not included.
+
+        .. note::
+            The returned object is a shallow copy. It contains references
+            to the module's parameters and buffers.
+
+        .. warning::
+            Currently ``state_dict()`` also accepts positional arguments for
+            ``destination``, ``prefix`` and ``keep_vars`` in order. However,
+            this is being deprecated and keyword arguments will be enforced in
+            future releases.
+
+        .. warning::
+            Please avoid the use of argument ``destination`` as it is not
+            designed for end-users.
+
+        Args:
+            destination (dict, optional): If provided, the state of module will
+                be updated into the dict and the same object is returned.
+                Otherwise, an ``OrderedDict`` will be created and returned.
+                Default: ``None``.
+            prefix (str, optional): a prefix added to parameter and buffer
+                names to compose the keys in state_dict. Default: ``''``.
+            keep_vars (bool, optional): by default the :class:`~torch.Tensor` s
+                returned in the state dict are detached from autograd. If it's
+                set to ``True``, detaching will not be performed.
+                Default: ``False``.
+
+        Returns:
+            dict:
+                a dictionary containing a whole state of the module
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> module.state_dict().keys()
+            ['bias', 'weight']
+
+        """
+        # TODO: Remove `args` and the parsing logic when BC allows.
+        if len(args) > 0:
+            # DeprecationWarning is ignored by default
+            warnings.warn(
+                "Positional args are being deprecated, use kwargs instead. Refer to "
+                "https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict"
+                " for details.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if destination is None:
+                destination = args[0]
+            if len(args) > 1 and prefix == "":
+                prefix = args[1]
+            if len(args) > 2 and keep_vars is False:
+                keep_vars = args[2]
+
+        if destination is None:
+            destination = OrderedDict()
+            # pyrefly: ignore [missing-attribute]
+            destination._metadata = OrderedDict()
+
+        local_metadata = dict(version=self._version)
+        if hasattr(destination, "_metadata"):
+            destination._metadata[prefix[:-1]] = local_metadata
+
+        for hook in self._state_dict_pre_hooks.values():
+            hook(self, prefix, keep_vars)
+        self._save_to_state_dict(destination, prefix, keep_vars)
+        for name, module in self._modules.items():
+            if module is not None:
+                module.state_dict(
+                    destination=destination,
+                    prefix=prefix + name + ".",
+                    keep_vars=keep_vars,
+                )
+        for hook in self._state_dict_hooks.values():
+            hook_result = hook(self, destination, prefix, local_metadata)
+            if not getattr(hook, "_from_public_api", False):
+                if hook_result is not None:
+                    destination = hook_result
+            else:
+                if hook_result is not None:
+                    raise RuntimeError("state_dict post-hook must return None")
+        return destination
+
+    def _register_load_state_dict_pre_hook(self, hook, with_module=False):
+        r"""See :meth:`~torch.nn.Module.register_load_state_dict_pre_hook` for details.
+
+        A subtle difference is that if ``with_module`` is set to ``False``, then the
+        hook will not take the ``module`` as the first argument whereas
+        :meth:`~torch.nn.Module.register_load_state_dict_pre_hook` always takes the
+        ``module`` as the first argument.
+
+        Arguments:
+            hook (Callable): Callable hook that will be invoked before
+                loading the state dict.
+            with_module (bool, optional): Whether or not to pass the module
+                instance to the hook as the first parameter.
+        """
+        handle = RemovableHandle(self._load_state_dict_pre_hooks)
+        self._load_state_dict_pre_hooks[handle.id] = _WrappedHook(
+            hook, self if with_module else None
+        )
+        return handle
+
+    def register_load_state_dict_pre_hook(self, hook):
+        r"""Register a pre-hook to be run before module's :meth:`~nn.Module.load_state_dict` is called.
+
+        It should have the following signature::
+            hook(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) -> None  # noqa: B950
+
+        Arguments:
+            hook (Callable): Callable hook that will be invoked before
+                loading the state dict.
+        """
+        return self._register_load_state_dict_pre_hook(hook, with_module=True)
+
+    def register_load_state_dict_post_hook(self, hook):
+        r"""Register a post-hook to be run after module's :meth:`~nn.Module.load_state_dict` is called.
+
+        It should have the following signature::
+            hook(module, incompatible_keys) -> None
+
+        The ``module`` argument is the current module that this hook is registered
+        on, and the ``incompatible_keys`` argument is a ``NamedTuple`` consisting
+        of attributes ``missing_keys`` and ``unexpected_keys``. ``missing_keys``
+        is a ``list`` of ``str`` containing the missing keys and
+        ``unexpected_keys`` is a ``list`` of ``str`` containing the unexpected keys.
+
+        The given incompatible_keys can be modified inplace if needed.
+
+        Note that the checks performed when calling :func:`load_state_dict` with
+        ``strict=True`` are affected by modifications the hook makes to
+        ``missing_keys`` or ``unexpected_keys``, as expected. Additions to either
+        set of keys will result in an error being thrown when ``strict=True``, and
+        clearing out both missing and unexpected keys will avoid an error.
+
+        Returns:
+            :class:`torch.utils.hooks.RemovableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+        """
+        handle = RemovableHandle(self._load_state_dict_post_hooks)
+        self._load_state_dict_post_hooks[handle.id] = hook
+        return handle
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        r"""Copy parameters and buffers from :attr:`state_dict` into only this module, but not its descendants.
+
+        This is called on every submodule
+        in :meth:`~torch.nn.Module.load_state_dict`. Metadata saved for this
+        module in input :attr:`state_dict` is provided as :attr:`local_metadata`.
+        For state dicts without metadata, :attr:`local_metadata` is empty.
+        Subclasses can achieve class-specific backward compatible loading using
+        the version number at `local_metadata.get("version", None)`.
+        Additionally, :attr:`local_metadata` can also contain the key
+        `assign_to_params_buffers` that indicates whether keys should be
+        assigned their corresponding tensor in the state_dict.
+
+        Problems found while loading are not raised from here: they are
+        appended to :attr:`error_msgs`, :attr:`missing_keys` and
+        :attr:`unexpected_keys`.
+
+        .. note::
+            :attr:`state_dict` is not the same object as the input
+            :attr:`state_dict` to :meth:`~torch.nn.Module.load_state_dict`. So
+            it can be modified.
+
+        Args:
+            state_dict (dict): a dict containing parameters and
+                persistent buffers.
+            prefix (str): the prefix for parameters and buffers used in this
+                module
+            local_metadata (dict): a dict containing the metadata for this module
+                (the ``version`` and ``assign_to_params_buffers`` entries
+                described above).
+            strict (bool): whether to strictly enforce that the keys in
+                :attr:`state_dict` with :attr:`prefix` match the names of
+                parameters and buffers in this module
+            missing_keys (list of str): if ``strict=True``, add missing keys to
+                this list
+            unexpected_keys (list of str): if ``strict=True``, add unexpected
+                keys to this list
+            error_msgs (list of str): error messages should be added to this
+                list, and will be reported together in
+                :meth:`~torch.nn.Module.load_state_dict`
+        """
+        # Pre-hooks run before anything is read from ``state_dict`` and receive
+        # the same (mutable) arguments as this method.
+        for hook in self._load_state_dict_pre_hooks.values():
+            hook(
+                state_dict,
+                prefix,
+                local_metadata,
+                strict,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+
+        # Buffers flagged as non-persistent take no part in loading.
+        persistent_buffers = {
+            k: v
+            for k, v in self._buffers.items()
+            if k not in self._non_persistent_buffers_set
+        }
+        local_name_params = itertools.chain(
+            self._parameters.items(),
+            # pyrefly: ignore [bad-argument-type]
+            persistent_buffers.items(),
+        )
+        # Entries registered as ``None`` are left out of the local state.
+        local_state = {k: v for k, v in local_name_params if v is not None}
+        assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False)
+        use_swap_tensors = torch.__future__.get_swap_module_params_on_conversion()
+
+        for name, param in local_state.items():
+            key = prefix + name
+            if key in state_dict:
+                input_param = state_dict[key]
+                if not torch.overrides.is_tensor_like(input_param):
+                    error_msgs.append(
+                        f'While copying the parameter named "{key}", '
+                        "expected torch.Tensor or Tensor-like object from checkpoint but "
+                        f"received {type(input_param)}"
+                    )
+                    continue
+
+                # This is used to avoid copying uninitialized parameters into
+                # non-lazy modules, since they don't have the hook to do the checks
+                # in such case, it will error when accessing the .shape attribute.
+                is_param_lazy = torch.nn.parameter.is_lazy(param)
+                # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+
+                if (
+                    not is_param_lazy
+                    and len(param.shape) == 0
+                    and len(input_param.shape) == 1
+                    and input_param.shape[0] == 1
+                ):
+                    input_param = input_param[0]
+
+                if not is_param_lazy and input_param.shape != param.shape:
+                    # local shape should match the one in checkpoint
+                    error_msgs.append(
+                        f"size mismatch for {key}: copying a param with shape {input_param.shape} from checkpoint, "
+                        f"the shape in current model is {param.shape}."
+                    )
+                    continue
+
+                # Warn only; loading still proceeds to the try block below.
+                if (
+                    param.is_meta
+                    and not input_param.is_meta
+                    and not assign_to_params_buffers
+                ):
+                    warnings.warn(
+                        f"for {key}: copying from a non-meta parameter in the checkpoint to a meta "
+                        "parameter in the current model, which is a no-op. (Did you mean to "
+                        "pass `assign=True` to assign items in the state dictionary to their "
+                        "corresponding key in the module instead of copying them in place?)",
+                        stacklevel=2,
+                    )
+
+                # Three loading strategies, tried in this order: swap tensors,
+                # assign the checkpoint tensor, or copy in place.  Any exception
+                # is recorded in ``error_msgs`` instead of propagating.
+                try:
+                    with torch.no_grad():
+                        if use_swap_tensors:
+                            new_input_param = param.module_load(
+                                input_param, assign=assign_to_params_buffers
+                            )
+                            # The result must be a distinct object from both inputs.
+                            if id(new_input_param) == id(input_param) or id(
+                                new_input_param
+                            ) == id(param):
+                                raise RuntimeError(
+                                    "module_load returned one of self or other, please .detach() "
+                                    "the result if returning one of the inputs in module_load"
+                                )
+                            # Keep the module's ``requires_grad`` on the loaded Parameter.
+                            if isinstance(param, torch.nn.Parameter):
+                                if not isinstance(new_input_param, torch.nn.Parameter):
+                                    new_input_param = torch.nn.Parameter(
+                                        new_input_param,
+                                        requires_grad=param.requires_grad,
+                                    )
+                                else:
+                                    new_input_param.requires_grad_(param.requires_grad)
+                            torch.utils.swap_tensors(param, new_input_param)
+                            del new_input_param
+                        elif assign_to_params_buffers:
+                            # Shape checks are already done above
+                            if isinstance(param, torch.nn.Parameter):
+                                if not isinstance(input_param, torch.nn.Parameter):
+                                    input_param = torch.nn.Parameter(
+                                        input_param, requires_grad=param.requires_grad
+                                    )
+                                else:
+                                    input_param.requires_grad_(param.requires_grad)
+                            setattr(self, name, input_param)
+                        else:
+                            param.copy_(input_param)
+                except Exception as ex:
+                    action = "swapping" if use_swap_tensors else "copying"
+                    error_msgs.append(
+                        f'While {action} the parameter named "{key}", '
+                        f"whose dimensions in the model are {param.size()} and "
+                        f"whose dimensions in the checkpoint are {input_param.size()}, "
+                        f"an exception occurred : {ex.args}."
+                    )
+            elif strict:
+                missing_keys.append(key)
+
+        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
+        # Extra state is consumed only when the class supplies its own
+        # ``set_extra_state``; otherwise a stored extra-state key is unexpected.
+        if (
+            getattr(self.__class__, "set_extra_state", Module.set_extra_state)
+            is not Module.set_extra_state
+        ):
+            if extra_state_key in state_dict:
+                self.set_extra_state(state_dict[extra_state_key])
+            elif strict:
+                missing_keys.append(extra_state_key)
+        elif strict and (extra_state_key in state_dict):
+            unexpected_keys.append(extra_state_key)
+
+        if strict:
+            # Flag keys under this prefix that match neither a child module
+            # nor an entry of the local state.
+            for key in state_dict:
+                if key.startswith(prefix) and key != extra_state_key:
+                    input_name = key[len(prefix) :].split(".", 1)
+                    # Must be a Module if it has attributes
+                    if len(input_name) > 1:
+                        if input_name[0] not in self._modules:
+                            unexpected_keys.append(key)
+                    elif input_name[0] not in local_state:
+                        unexpected_keys.append(key)
+
+    def load_state_dict(
+        self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+    ):
+        r"""Copy parameters and buffers from :attr:`state_dict` into this module and its descendants.
+
+        If :attr:`strict` is ``True``, then
+        the keys of :attr:`state_dict` must exactly match the keys returned
+        by this module's :meth:`~torch.nn.Module.state_dict` function.
+
+        .. warning::
+            If :attr:`assign` is ``True`` the optimizer must be created after
+            the call to :attr:`load_state_dict` unless
+            :func:`~torch.__future__.get_swap_module_params_on_conversion` is ``True``.
+
+        Args:
+            state_dict (dict): a dict containing parameters and
+                persistent buffers.
+            strict (bool, optional): whether to strictly enforce that the keys
+                in :attr:`state_dict` match the keys returned by this module's
+                :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
+            assign (bool, optional): When set to ``False``, the properties of the tensors
+                in the current module are preserved whereas setting it to ``True`` preserves
+                properties of the Tensors in the state dict. The only
+                exception is the ``requires_grad`` field of :class:`~torch.nn.Parameter`
+                for which the value from the module is preserved. Default: ``False``
+
+        Returns:
+            ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
+                * ``missing_keys`` is a list of str containing any keys that are expected
+                    by this module but missing from the provided ``state_dict``.
+                * ``unexpected_keys`` is a list of str containing the keys that are not
+                    expected by this module but present in the provided ``state_dict``.
+
+        Note:
+            If a parameter or buffer is registered as ``None`` and its corresponding key
+            exists in :attr:`state_dict`, :meth:`load_state_dict` will raise a
+            ``RuntimeError``.
+        """
+        if not isinstance(state_dict, Mapping):
+            raise TypeError(
+                f"Expected state_dict to be dict-like, got {type(state_dict)}."
+            )
+
+        missing_keys: list[str] = []
+        unexpected_keys: list[str] = []
+        error_msgs: list[str] = []
+
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, "_metadata", None)
+        state_dict = OrderedDict(state_dict)
+        if metadata is not None:
+            # mypy isn't aware that "_metadata" exists in state_dict
+            state_dict._metadata = metadata  # type: ignore[attr-defined]
+
+        def load(module, local_state_dict, prefix="") -> None:
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            if assign:
+                local_metadata["assign_to_params_buffers"] = assign
+            module._load_from_state_dict(
+                local_state_dict,
+                prefix,
+                local_metadata,
+                True,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+            for name, child in module._modules.items():
+                if child is not None:
+                    child_prefix = prefix + name + "."
+                    child_state_dict = {
+                        k: v
+                        for k, v in local_state_dict.items()
+                        if k.startswith(child_prefix)
+                    }
+                    load(child, child_state_dict, child_prefix)  # noqa: F821
+
+            # Note that the hook can modify missing_keys and unexpected_keys.
+            incompatible_keys = _IncompatibleKeys(missing_keys, unexpected_keys)
+            for hook in module._load_state_dict_post_hooks.values():
+                out = hook(module, incompatible_keys)
+                assert out is None, (
+                    "Hooks registered with ``register_load_state_dict_post_hook`` are not"
+                    "expected to return new values, if incompatible_keys need to be modified,"
+                    "it should be done inplace."
+                )
+
+        load(self, state_dict)
+        del load
+
+        if strict:
+            if len(unexpected_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Unexpected key(s) in state_dict: {}. ".format(
+                        ", ".join(f'"{k}"' for k in unexpected_keys)
+                    ),
+                )
+            if len(missing_keys) > 0:
+                error_msgs.insert(
+                    0,
+                    "Missing key(s) in state_dict: {}. ".format(
+                        ", ".join(f'"{k}"' for k in missing_keys)
+                    ),
+                )
+
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(
+                    self.__class__.__name__, "\n\t".join(error_msgs)
+                )
+            )
+        return _IncompatibleKeys(missing_keys, unexpected_keys)
+
+    def _named_members(
+        self, get_members_fn, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True
+    ):
+        r"""Yield ``(qualified_name, member)`` pairs from ``get_members_fn(module)`` for this module and, if ``recurse``, its submodules."""
+        memo = set()  # members already yielded; only filled when remove_duplicate is True
+        modules = (
+            self.named_modules(prefix=prefix, remove_duplicate=remove_duplicate)
+            if recurse
+            else [(prefix, self)]
+        )
+        for module_prefix, module in modules:
+            members = get_members_fn(module)  # iterable of (local_name, member) pairs
+            for k, v in members:
+                if v is None or v in memo:  # skip ``None`` entries and members seen before
+                    continue
+                if remove_duplicate:
+                    memo.add(v)
+                name = module_prefix + ("." if module_prefix else "") + k  # no leading "." at the root
+                yield name, v
+
+    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
+        r"""Return an iterator over module parameters.
+
+        This is typically passed to an optimizer.
+
+        Args:
+            recurse (bool): if True, then yields parameters of this module
+                and all submodules. Otherwise, yields only parameters that
+                are direct members of this module.
+
+        Yields:
+            Parameter: module parameter
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> for param in model.parameters():
+            >>>     print(type(param), param.size())
+            <class 'torch.nn.parameter.Parameter'> torch.Size([20])
+            <class 'torch.nn.parameter.Parameter'> torch.Size([20, 1, 5, 5])
+
+        """
+        for _name, param in self.named_parameters(recurse=recurse):  # names are dropped here
+            yield param
+
+    def named_parameters(
+        self, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True
+    ) -> Iterator[tuple[str, Parameter]]:
+        r"""Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself.
+
+        Args:
+            prefix (str): prefix to prepend to all parameter names.
+            recurse (bool): if True, then yields parameters of this module
+                and all submodules. Otherwise, yields only parameters that
+                are direct members of this module.
+            remove_duplicate (bool, optional): whether to remove the duplicated
+                parameters in the result. Defaults to True.
+
+        Yields:
+            (str, Parameter): Tuple containing the name and parameter; entries set to ``None`` are skipped
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> for name, param in self.named_parameters():
+            >>>     if name in ['bias']:
+            >>>         print(param.size())
+
+        """
+        gen = self._named_members(
+            lambda module: module._parameters.items(),  # each visited module's own ``_parameters`` entries
+            prefix=prefix,
+            recurse=recurse,
+            remove_duplicate=remove_duplicate,
+        )
+        yield from gen
+
+    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:
+        r"""Return an iterator over module buffers.
+
+        Args:
+            recurse (bool): if True, then yields buffers of this module
+                and all submodules. Otherwise, yields only buffers that
+                are direct members of this module.
+
+        Yields:
+            torch.Tensor: module buffer
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> for buf in model.buffers():
+            >>>     print(type(buf), buf.size())
+            <class 'torch.Tensor'> torch.Size([20])
+            <class 'torch.Tensor'> torch.Size([20, 1, 5, 5])
+
+        """
+        for _, buf in self.named_buffers(recurse=recurse):  # names are dropped here
+            yield buf
+
+    def named_buffers(
+        self, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True
+    ) -> Iterator[tuple[str, Tensor]]:
+        r"""Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself.
+
+        Args:
+            prefix (str): prefix to prepend to all buffer names.
+            recurse (bool, optional): if True, then yields buffers of this module
+                and all submodules. Otherwise, yields only buffers that
+                are direct members of this module. Defaults to True.
+            remove_duplicate (bool, optional): whether to remove the duplicated buffers in the result. Defaults to True.
+
+        Yields:
+            (str, torch.Tensor): Tuple containing the name and buffer; entries set to ``None`` are skipped
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> for name, buf in self.named_buffers():
+            >>>     if name in ['running_var']:
+            >>>         print(buf.size())
+
+        """
+        gen = self._named_members(
+            lambda module: module._buffers.items(),  # each visited module's own ``_buffers`` entries
+            prefix=prefix,
+            recurse=recurse,
+            remove_duplicate=remove_duplicate,
+        )
+        yield from gen
+
+    def children(self) -> Iterator["Module"]:
+        r"""Return an iterator over immediate children modules.
+
+        Yields:
+            Module: a child module (each distinct child once, ``None`` entries skipped)
+        """
+        for _name, module in self.named_children():  # names are dropped here
+            yield module
+
+    def named_children(self) -> Iterator[tuple[str, "Module"]]:
+        r"""Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself.
+
+        Yields:
+            (str, Module): Tuple containing a name and child module; a child registered twice is yielded under its first name only
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined vars")
+            >>> for name, module in model.named_children():
+            >>>     if name in ['conv4', 'conv5']:
+            >>>         print(module)
+
+        """
+        memo = set()  # children already yielded
+        for name, module in self._modules.items():
+            if module is not None and module not in memo:  # skip empty slots and repeats
+                memo.add(module)
+                yield name, module
+
+    def modules(self) -> Iterator["Module"]:
+        r"""Return an iterator over all modules in the network, starting with this module itself.
+
+        Yields:
+            Module: a module in the network
+
+        Note:
+            Duplicate modules are returned only once. In the following
+            example, ``l`` will be returned only once.
+
+        Example::
+
+            >>> l = nn.Linear(2, 2)
+            >>> net = nn.Sequential(l, l)
+            >>> for idx, m in enumerate(net.modules()):
+            ...     print(idx, '->', m)
+
+            0 -> Sequential(
+              (0): Linear(in_features=2, out_features=2, bias=True)
+              (1): Linear(in_features=2, out_features=2, bias=True)
+            )
+            1 -> Linear(in_features=2, out_features=2, bias=True)
+
+        """
+        for _, module in self.named_modules():  # names are dropped here
+            yield module
+
+    def named_modules(
+        self,
+        memo: set["Module"] | None = None,
+        prefix: str = "",
+        remove_duplicate: bool = True,
+    ) -> Iterator[tuple[str, "Module"]]:
+        r"""Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself.
+
+        Args:
+            memo: a memo to store the set of modules already added to the result
+            prefix: a prefix that will be added to the name of the module
+            remove_duplicate: whether to remove the duplicated module instances in the result
+                or not
+
+        Yields:
+            (str, Module): Tuple of name and module; this module comes first, under the name ``prefix``
+
+        Note:
+            Duplicate modules are returned only once. In the following
+            example, ``l`` will be returned only once.
+
+        Example::
+
+            >>> l = nn.Linear(2, 2)
+            >>> net = nn.Sequential(l, l)
+            >>> for idx, m in enumerate(net.named_modules()):
+            ...     print(idx, '->', m)
+
+            0 -> ('', Sequential(
+              (0): Linear(in_features=2, out_features=2, bias=True)
+              (1): Linear(in_features=2, out_features=2, bias=True)
+            ))
+            1 -> ('0', Linear(in_features=2, out_features=2, bias=True))
+
+        """
+        if memo is None:
+            memo = set()  # fresh memo per top-level call; recursive calls share it
+        if self not in memo:
+            if remove_duplicate:
+                memo.add(self)  # memo stays empty when duplicates are wanted
+            yield prefix, self
+            for name, module in self._modules.items():
+                if module is None:
+                    continue
+                submodule_prefix = prefix + ("." if prefix else "") + name  # dotted path, no leading "."
+                yield from module.named_modules(
+                    memo, submodule_prefix, remove_duplicate
+                )
+
+    def train(self, mode: bool = True) -> Self:
+        r"""Set the module in training mode.
+
+        This has an effect only on certain modules. See the documentation of
+        particular modules for details of their behaviors in training/evaluation
+        mode, i.e., whether they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
+        etc.
+
+        Args:
+            mode (bool): whether to set training mode (``True``) or evaluation
+                         mode (``False``). Default: ``True``. Non-bool values raise ``ValueError``.
+
+        Returns:
+            Module: self
+        """
+        if not isinstance(mode, bool):  # strict check: truthy non-bools are rejected
+            raise ValueError("training mode is expected to be boolean")
+        self.training = mode
+        for module in self.children():  # each child applies the mode to its own children in turn
+            module.train(mode)
+        return self
+
+    def eval(self) -> Self:
+        r"""Set the module in evaluation mode.
+
+        This has an effect only on certain modules. See the documentation of
+        particular modules for details of their behaviors in training/evaluation
+        mode, i.e. whether they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
+        etc.
+
+        This is equivalent to :meth:`self.train(False) <torch.nn.Module.train>`.
+
+        See :ref:`locally-disable-grad-doc` for a comparison between
+        `.eval()` and several similar mechanisms that may be confused with it.
+
+        Returns:
+            Module: self
+        """
+        return self.train(False)
+
+    def requires_grad_(self, requires_grad: bool = True) -> Self:
+        r"""Change whether autograd should record operations on parameters in this module.
+
+        This method sets the parameters' :attr:`requires_grad` attributes
+        in-place.
+
+        This method is helpful for freezing part of the module for finetuning
+        or training parts of a model individually (e.g., GAN training).
+
+        See :ref:`locally-disable-grad-doc` for a comparison between
+        `.requires_grad_()` and several similar mechanisms that may be confused with it.
+
+        Args:
+            requires_grad (bool): whether autograd should record operations on
+                                  parameters in this module. Default: ``True``.
+
+        Returns:
+            Module: self
+        """
+        for p in self.parameters():  # default recurse=True, so submodule parameters are included
+            p.requires_grad_(requires_grad)
+        return self
+
+    def zero_grad(self, set_to_none: bool = True) -> None:
+        r"""Reset gradients of all model parameters.
+
+        See similar function under :class:`torch.optim.Optimizer` for more context.
+
+        Args:
+            set_to_none (bool): instead of setting to zero, set the grads to None.
+                See :meth:`torch.optim.Optimizer.zero_grad` for details.
+        """
+        if getattr(self, "_is_replica", False):  # flag set by _replicate_for_data_parallel
+            warnings.warn(
+                "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. "
+                "The parameters are copied (in a differentiable manner) from the original module. "
+                "This means they are not leaf nodes in autograd and so don't accumulate gradients. "
+                "If you need gradients in your forward method, consider using autograd.grad instead.",
+                stacklevel=2,
+            )
+
+        for p in self.parameters():
+            if p.grad is not None:  # parameters without a gradient are left untouched
+                if set_to_none:
+                    p.grad = None
+                else:
+                    if p.grad.grad_fn is not None:  # grad has a grad_fn: detach it in place first
+                        p.grad.detach_()
+                    else:
+                        p.grad.requires_grad_(False)
+                    p.grad.zero_()  # zero in place, keeping the same grad tensor
+
+    def share_memory(self) -> Self:
+        r"""Pass ``t.share_memory_()`` to ``self._apply`` and return its result; see :meth:`torch.Tensor.share_memory_`."""
+        return self._apply(lambda t: t.share_memory_())
+
+    def _get_name(self) -> str:  # class name; ``__repr__`` uses it as the leading token
+        return self.__class__.__name__
+
+    def extra_repr(self) -> str:
+        r"""Return the extra representation of the module.
+
+        To print customized extra information, you should re-implement
+        this method in your own modules. Both single-line and multi-line
+        strings are acceptable. The base implementation returns an empty string.
+        """
+        return ""
+
+    def __repr__(self) -> str:
+        # We treat the extra repr like the sub-module, one item per line
+        extra_lines = []
+        extra_repr = self.extra_repr()
+        # guard needed because "".split("\n") returns [''], which would add a blank line
+        if extra_repr:
+            extra_lines = extra_repr.split("\n")
+        child_lines = []
+        for key, module in self._modules.items():
+            mod_str = repr(module)
+            mod_str = _addindent(mod_str, 2)  # indent the child's repr under this module
+            child_lines.append("(" + key + "): " + mod_str)
+        lines = extra_lines + child_lines  # extra_repr lines first, then children
+
+        main_str = self._get_name() + "("
+        if lines:
+            # simple one-liner info, which most builtin Modules will use
+            if len(extra_lines) == 1 and not child_lines:
+                main_str += extra_lines[0]
+            else:
+                main_str += "\n  " + "\n  ".join(lines) + "\n"  # one item per line, indented by two spaces
+
+        main_str += ")"
+        return main_str
+
+    def __dir__(self) -> list[str]:
+        module_attrs = dir(self.__class__)
+        attrs = list(self.__dict__.keys())
+        parameters = list(self._parameters.keys())
+        modules = list(self._modules.keys())
+        buffers = list(self._buffers.keys())
+        keys = module_attrs + attrs + parameters + modules + buffers  # may hold duplicates; not de-duplicated here
+
+        # Drop names that start with a digit (e.g. a child registered as "0"); no other validity check is made
+        keys = [key for key in keys if not key[0].isdigit()]
+
+        return sorted(keys)
+
+    def _replicate_for_data_parallel(self):  # returns a shallow replica flagged with ``_is_replica``
+        replica = self.__new__(type(self))  # allocate without running __init__
+        replica.__dict__ = self.__dict__.copy()  # shallow copy: attribute values are shared
+
+        # replicas do not have parameters themselves, the replicas reference the original
+        # module.
+        replica._parameters = {}
+        replica._buffers = replica._buffers.copy()  # own mapping, same buffer objects
+        replica._modules = replica._modules.copy()  # own mapping, same child modules
+        replica._is_replica = True  # type: ignore[assignment]
+
+        return replica
+
+    def compile(self, *args, **kwargs) -> None:
+        """
+        Compile this Module's call path in place using :func:`torch.compile`.
+
+        ``self._call_impl`` is wrapped and stored on ``self._compiled_call_impl``; all arguments
+        are passed as-is to :func:`torch.compile`. Nothing is returned.
+
+        See :func:`torch.compile` for details on the arguments for this function.
+        """
+        self._compiled_call_impl = torch.compile(self._call_impl, *args, **kwargs)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..d492cdb3cf5a03c647760401fcc6f8709d87f1bd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/normalization.py
@@ -0,0 +1,430 @@
+# mypy: allow-untyped-defs
+import numbers
+from typing import Union
+
+import torch
+from torch import Size, Tensor
+from torch.nn import functional as F, init
+from torch.nn.parameter import Parameter
+
+from ._functions import CrossMapLRN2d as _cross_map_lrn2d
+from .module import Module
+
+
+__all__ = ["LocalResponseNorm", "CrossMapLRN2d", "LayerNorm", "GroupNorm", "RMSNorm"]  # public names of this module
+
+
+class LocalResponseNorm(Module):
+    r"""Applies local response normalization over an input signal.
+
+    The input signal is composed of several input planes, where channels occupy the second dimension.
+    Applies normalization across channels.
+
+    .. math::
+        b_{c} = a_{c}\left(k + \frac{\alpha}{n}
+        \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta}
+
+    Args:
+        size: number of neighbouring channels used for normalization
+        alpha: multiplicative factor. Default: 0.0001
+        beta: exponent. Default: 0.75
+        k: additive factor. Default: 1
+
+    Shape:
+        - Input: :math:`(N, C, *)`
+        - Output: :math:`(N, C, *)` (same shape as input)
+
+    Examples::
+
+        >>> lrn = nn.LocalResponseNorm(2)
+        >>> signal_2d = torch.randn(32, 5, 24, 24)
+        >>> signal_4d = torch.randn(16, 5, 7, 7, 7, 7)
+        >>> output_2d = lrn(signal_2d)
+        >>> output_4d = lrn(signal_4d)
+
+    """
+
+    __constants__ = ["size", "alpha", "beta", "k"]
+    size: int
+    alpha: float
+    beta: float
+    k: float
+
+    def __init__(
+        self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0
+    ) -> None:
+        super().__init__()
+        self.size = size
+        self.alpha = alpha
+        self.beta = beta
+        self.k = k
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Apply ``F.local_response_norm`` to ``input`` using this module's ``size``, ``alpha``, ``beta`` and ``k``.
+        """
+        return F.local_response_norm(input, self.size, self.alpha, self.beta, self.k)
+
+    def extra_repr(self) -> str:
+        """
+        Return ``size, alpha=..., beta=..., k=...`` formatted from the instance ``__dict__``.
+        """
+        return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__)
+
+
+class CrossMapLRN2d(Module):  # Module wrapper around the ``CrossMapLRN2d`` function imported from ``._functions``
+    size: int
+    alpha: float
+    beta: float
+    k: float
+
+    def __init__(
+        self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1
+    ) -> None:
+        super().__init__()
+        self.size = size
+        self.alpha = alpha
+        self.beta = beta
+        self.k = k
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Apply ``_cross_map_lrn2d`` to ``input`` using this module's ``size``, ``alpha``, ``beta`` and ``k``.
+        """
+        return _cross_map_lrn2d.apply(input, self.size, self.alpha, self.beta, self.k)
+
+    def extra_repr(self) -> str:
+        """
+        Return ``size, alpha=..., beta=..., k=...`` formatted from the instance ``__dict__``.
+        """
+        return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__)
+
+
+_shape_t = Union[int, list[int], Size]  # accepted forms of ``normalized_shape`` (LayerNorm, RMSNorm)
+
+
+class LayerNorm(Module):
+    r"""Applies Layer Normalization over a mini-batch of inputs.
+
+    This layer implements the operation as described in
+    the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__
+
+    .. math::
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated over the last `D` dimensions, where `D`
+    is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape`
+    is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over
+    the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``).
+    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
+    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
+    The variance is calculated via the biased estimator, equivalent to
+    `torch.var(input, correction=0)`.
+
+    .. note::
+        Unlike Batch Normalization and Instance Normalization, which applies
+        scalar scale and bias for each entire channel/plane with the
+        :attr:`affine` option, Layer Normalization applies per-element scale and
+        bias with :attr:`elementwise_affine`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        normalized_shape (int or list or torch.Size): input shape from an expected input
+            of size
+
+            .. math::
+                [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
+                    \times \ldots \times \text{normalized\_shape}[-1]]
+
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
+            :attr:`elementwise_affine` is ``True``). Default: ``True``.
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``
+            (otherwise ``None``). The values are initialized to 1.
+        bias:   the learnable bias of the module of shape
+                :math:`\text{normalized\_shape}` when both :attr:`elementwise_affine` and :attr:`bias`
+                are ``True`` (otherwise ``None``). The values are initialized to 0.
+
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+
+    Examples::
+
+        >>> # NLP Example
+        >>> batch, sentence_length, embedding_dim = 20, 5, 10
+        >>> embedding = torch.randn(batch, sentence_length, embedding_dim)
+        >>> layer_norm = nn.LayerNorm(embedding_dim)
+        >>> # Activate module
+        >>> layer_norm(embedding)
+        >>>
+        >>> # Image Example
+        >>> N, C, H, W = 20, 5, 10, 10
+        >>> input = torch.randn(N, C, H, W)
+        >>> # Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
+        >>> # as shown in the image below
+        >>> layer_norm = nn.LayerNorm([C, H, W])
+        >>> output = layer_norm(input)
+
+    .. image:: ../_static/img/nn/layer_norm.jpg
+        :scale: 50 %
+
+    """
+
+    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
+    normalized_shape: tuple[int, ...]
+    eps: float
+    elementwise_affine: bool
+
+    def __init__(
+        self,
+        normalized_shape: _shape_t,
+        eps: float = 1e-5,
+        elementwise_affine: bool = True,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        if isinstance(normalized_shape, numbers.Integral):
+            # mypy error: incompatible types in assignment
+            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
+        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
+        self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = Parameter(
+                torch.empty(self.normalized_shape, **factory_kwargs)  # uninitialized; filled by reset_parameters()
+            )
+            if bias:
+                self.bias = Parameter(
+                    torch.empty(self.normalized_shape, **factory_kwargs)
+                )
+            else:
+                self.register_parameter("bias", None)  # registered as None so the attribute still exists
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:  # weight -> ones, bias (if present) -> zeros; no-op without affine
+        if self.elementwise_affine:
+            init.ones_(self.weight)
+            if self.bias is not None:
+                init.zeros_(self.bias)
+
+    def forward(self, input: Tensor) -> Tensor:  # delegates to F.layer_norm; weight/bias may be None
+        return F.layer_norm(
+            input, self.normalized_shape, self.weight, self.bias, self.eps
+        )
+
+    def extra_repr(self) -> str:  # note: the ``bias`` setting is not part of this summary
+        return (
+            "{normalized_shape}, eps={eps}, "
+            "elementwise_affine={elementwise_affine}".format(**self.__dict__)
+        )
+
+
+class GroupNorm(Module):
+    r"""Applies Group Normalization over a mini-batch of inputs.
+
+    This layer implements the operation as described in
+    the paper `Group Normalization <https://arxiv.org/abs/1803.08494>`__
+
+    .. math::
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The input channels are separated into :attr:`num_groups` groups, each containing
+    ``num_channels / num_groups`` channels. :attr:`num_channels` must be divisible by
+    :attr:`num_groups`. The mean and standard-deviation are calculated
+    separately over each group. :math:`\gamma` and :math:`\beta` are learnable
+    per-channel affine transform parameter vectors of size :attr:`num_channels` if
+    :attr:`affine` is ``True``.
+    The variance is calculated via the biased estimator, equivalent to
+    `torch.var(input, correction=0)`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        num_groups (int): number of groups to separate the channels into
+        num_channels (int): number of channels expected in input; ``ValueError`` is raised if not divisible by ``num_groups``
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        affine: a boolean value that when set to ``True``, this module
+            has learnable per-channel affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}`
+        - Output: :math:`(N, C, *)` (same shape as input)
+
+    Examples::
+
+        >>> input = torch.randn(20, 6, 10, 10)
+        >>> # Separate 6 channels into 3 groups
+        >>> m = nn.GroupNorm(3, 6)
+        >>> # Separate 6 channels into 6 groups (equivalent to InstanceNorm)
+        >>> m = nn.GroupNorm(6, 6)
+        >>> # Put all 6 channels into a single group (equivalent to LayerNorm)
+        >>> m = nn.GroupNorm(1, 6)
+        >>> # Activating the module
+        >>> output = m(input)
+    """
+
+    __constants__ = ["num_groups", "num_channels", "eps", "affine"]
+    num_groups: int
+    num_channels: int
+    eps: float
+    affine: bool
+
+    def __init__(
+        self,
+        num_groups: int,
+        num_channels: int,
+        eps: float = 1e-5,
+        affine: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        if num_channels % num_groups != 0:  # checked before any attribute is set
+            raise ValueError(
+                f"num_channels ({num_channels}) must be divisible by num_groups ({num_groups})"
+            )
+
+        self.num_groups = num_groups
+        self.num_channels = num_channels
+        self.eps = eps
+        self.affine = affine
+        if self.affine:
+            self.weight = Parameter(torch.empty(num_channels, **factory_kwargs))  # uninitialized; filled by reset_parameters()
+            self.bias = Parameter(torch.empty(num_channels, **factory_kwargs))
+        else:
+            self.register_parameter("weight", None)  # registered as None so the attributes still exist
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:  # weight -> ones, bias -> zeros; no-op when affine is False
+        if self.affine:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def forward(self, input: Tensor) -> Tensor:  # delegates to F.group_norm; weight/bias may be None
+        return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)
+
+    def extra_repr(self) -> str:  # summary formatted from the instance ``__dict__``
+        return "{num_groups}, {num_channels}, eps={eps}, affine={affine}".format(
+            **self.__dict__
+        )
+
+
+class RMSNorm(Module):
+    r"""Applies Root Mean Square Layer Normalization over a mini-batch of inputs.
+
+    This layer implements the operation as described in
+    the paper `Root Mean Square Layer Normalization <https://arxiv.org/pdf/1910.07467.pdf>`__
+
+    .. math::
+        y_i = \frac{x_i}{\mathrm{RMS}(x)} * \gamma_i, \quad
+        \text{where} \quad \text{RMS}(x) = \sqrt{\epsilon + \frac{1}{n} \sum_{i=1}^{n} x_i^2}
+
+    The RMS is taken over the last ``D`` dimensions, where ``D``
+    is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape`
+    is ``(3, 5)`` (a 2-dimensional shape), the RMS is computed over
+    the last 2 dimensions of the input.
+
+    Args:
+        normalized_shape (int or list or torch.Size): input shape from an expected input
+            of size
+
+            .. math::
+                [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
+                    \times \ldots \times \text{normalized\_shape}[-1]]
+
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability. Default: ``torch.finfo(x.dtype).eps``
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+
+    Examples::
+
+        >>> rms_norm = nn.RMSNorm([2, 3])
+        >>> input = torch.randn(2, 2, 3)
+        >>> rms_norm(input)
+
+    """
+
+    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
+    normalized_shape: tuple[int, ...]
+    eps: float | None
+    elementwise_affine: bool
+
+    def __init__(
+        self,
+        normalized_shape: _shape_t,
+        eps: float | None = None,
+        elementwise_affine: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        if isinstance(normalized_shape, numbers.Integral):
+            # mypy error: incompatible types in assignment
+            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
+        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
+        self.eps = eps  # stored as given; ``None`` is forwarded unchanged to F.rms_norm
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = Parameter(
+                torch.empty(self.normalized_shape, **factory_kwargs)  # uninitialized; filled by reset_parameters()
+            )
+        else:
+            self.register_parameter("weight", None)  # registered as None so the attribute still exists
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """
+        Set ``weight`` to ones when ``elementwise_affine`` is ``True``; otherwise do nothing.
+        """
+        if self.elementwise_affine:
+            init.ones_(self.weight)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply ``F.rms_norm`` to ``x`` using this module's ``normalized_shape``, ``weight`` and ``eps``.
+        """
+        return F.rms_norm(x, self.normalized_shape, self.weight, self.eps)
+
+    def extra_repr(self) -> str:
+        """
+        Return ``normalized_shape, eps=..., elementwise_affine=...`` formatted from the instance ``__dict__``.
+        """
+        return (
+            "{normalized_shape}, eps={eps}, "
+            "elementwise_affine={elementwise_affine}".format(**self.__dict__)
+        )
+
+
+# TODO: ContrastiveNorm2d
+# TODO: DivisiveNorm2d
+# TODO: SubtractiveNorm2d
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5aa1e0d425548857d20b093041b190bc7f2f645
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/padding.py
@@ -0,0 +1,842 @@
+# mypy: allow-untyped-defs
+from collections.abc import Sequence
+
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.common_types import _size_2_t, _size_4_t, _size_6_t
+
+from .module import Module
+from .utils import _ntuple, _pair, _quadruple
+
+
+# TODO: grad_output size asserts in THNN
+
+# Public names exported by a star-import of this module: the padding layers
+# defined below. The underscore-prefixed ``_*PadNd`` base classes are not listed.
+__all__ = [
+    "CircularPad1d",
+    "CircularPad2d",
+    "CircularPad3d",
+    "ConstantPad1d",
+    "ConstantPad2d",
+    "ConstantPad3d",
+    "ReflectionPad1d",
+    "ReflectionPad2d",
+    "ReflectionPad3d",
+    "ReplicationPad1d",
+    "ReplicationPad2d",
+    "ReplicationPad3d",
+    "ZeroPad1d",
+    "ZeroPad2d",
+    "ZeroPad3d",
+]
+
+
+class _CircularPadNd(Module):
+    """Common implementation for the circular padding modules.
+
+    Subclasses assign ``padding`` and override :meth:`_check_input_dim`.
+    """
+
+    __constants__ = ["padding"]
+    padding: Sequence[int]
+
+    def _check_input_dim(self, input):
+        """Validate the input's dimensionality; not implemented in the base class."""
+        raise NotImplementedError
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Check the input's dimensionality, then pad it in ``"circular"`` mode."""
+        self._check_input_dim(input)
+        padded = F.pad(input, self.padding, mode="circular")
+        return padded
+
+    def extra_repr(self) -> str:
+        """Return the padding sizes as a string."""
+        return str(self.padding)
+
+
+class CircularPad1d(_CircularPadNd):
+    r"""Pad the input tensor by wrapping values around the input boundary.
+
+    The region added at the end of the dimension is filled with values from
+    its beginning, and the region added at the beginning with values from its
+    end. A negative padding size removes elements from that end of the tensor
+    instead.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 2-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
+            Note that padding size should be less than or equal to the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
+        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
+        >>> m = nn.CircularPad1d(2)
+        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
+        >>> input
+        tensor([[[0., 1., 2., 3.],
+                 [4., 5., 6., 7.]]])
+        >>> m(input)
+        tensor([[[2., 3., 0., 1., 2., 3., 0., 1.],
+                 [6., 7., 4., 5., 6., 7., 4., 5.]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.CircularPad1d((3, 1))
+        >>> m(input)
+        tensor([[[1., 2., 3., 0., 1., 2., 3., 0.],
+                 [5., 6., 7., 4., 5., 6., 7., 4.]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int]
+
+    def __init__(self, padding: _size_2_t) -> None:
+        super().__init__()
+        # Normalize an int or a pair into the stored (left, right) tuple.
+        self.padding = _pair(padding)
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is 2D or 3D."""
+        ndim = input.dim()
+        if not (ndim == 2 or ndim == 3):
+            raise ValueError(f"expected 2D or 3D input (got {ndim}D input)")
+
+
+class CircularPad2d(_CircularPadNd):
+    r"""Pad the input tensor by wrapping values around the input boundary.
+
+    The region added at the end of a dimension is filled with values from
+    its beginning, and the region added at the beginning with values from its
+    end. A negative padding size removes elements from that end of the tensor
+    instead.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 4-`tuple` is read as (:math:`\text{padding\_left}`,
+            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
+            Note that padding size should be less than or equal to the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> m = nn.CircularPad2d(2)
+        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
+        >>> input
+        tensor([[[[0., 1., 2.],
+                  [3., 4., 5.],
+                  [6., 7., 8.]]]])
+        >>> m(input)
+        tensor([[[[4., 5., 3., 4., 5., 3., 4.],
+                  [7., 8., 6., 7., 8., 6., 7.],
+                  [1., 2., 0., 1., 2., 0., 1.],
+                  [4., 5., 3., 4., 5., 3., 4.],
+                  [7., 8., 6., 7., 8., 6., 7.],
+                  [1., 2., 0., 1., 2., 0., 1.],
+                  [4., 5., 3., 4., 5., 3., 4.]]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.CircularPad2d((1, 1, 2, 0))
+        >>> m(input)
+        tensor([[[[5., 3., 4., 5., 3.],
+                  [8., 6., 7., 8., 6.],
+                  [2., 0., 1., 2., 0.],
+                  [5., 3., 4., 5., 3.],
+                  [8., 6., 7., 8., 6.]]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int]
+
+    def __init__(self, padding: _size_4_t) -> None:
+        super().__init__()
+        # Normalize an int or a 4-tuple into (left, right, top, bottom).
+        self.padding = _quadruple(padding)
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is 3D or 4D."""
+        ndim = input.dim()
+        if not (ndim == 3 or ndim == 4):
+            raise ValueError(f"expected 3D or 4D input (got {ndim}D input)")
+
+
+class CircularPad3d(_CircularPadNd):
+    r"""Pad the input tensor by wrapping values around the input boundary.
+
+    The region added at the end of a dimension is filled with values from
+    its beginning, and the region added at the beginning with values from its
+    end. A negative padding size removes elements from that end of the tensor
+    instead.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 6-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
+            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
+            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
+            Note that padding size should be less than or equal to the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
+          where
+
+          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.CircularPad3d(3)
+        >>> input = torch.randn(16, 3, 8, 320, 480)
+        >>> output = m(input)
+        >>> # using different paddings for different sides
+        >>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1))
+        >>> output = m(input)
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int, int, int]
+
+    def __init__(self, padding: _size_6_t) -> None:
+        super().__init__()
+        # Normalize an int or a 6-tuple into the stored 6-element padding.
+        to_six = _ntuple(6)
+        self.padding = to_six(padding)
+
+    def _check_input_dim(self, input) -> None:
+        """Raise ``ValueError`` unless ``input`` is 4D or 5D."""
+        ndim = input.dim()
+        if not (ndim == 4 or ndim == 5):
+            raise ValueError(f"expected 4D or 5D input (got {ndim}D input)")
+
+
+class _ConstantPadNd(Module):
+    """Common implementation for the constant-value padding modules.
+
+    Stores the fill ``value``; subclasses assign ``padding``.
+    """
+
+    __constants__ = ["padding", "value"]
+    value: float
+    padding: Sequence[int]
+
+    def __init__(self, value: float) -> None:
+        super().__init__()
+        self.value = value
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Pad ``input`` in ``"constant"`` mode, filling with ``self.value``."""
+        return F.pad(input, self.padding, mode="constant", value=self.value)
+
+    def extra_repr(self) -> str:
+        """Return the padding sizes and the fill value as a string."""
+        pad, fill = self.padding, self.value
+        return f"padding={pad}, value={fill}"
+
+
+class ConstantPad1d(_ConstantPadNd):
+    r"""Pad the boundaries of the input tensor with a constant value.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to both boundaries; a 2-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
+        value (float): the constant used to fill the padded positions.
+
+    Shape:
+        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
+        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.ConstantPad1d(2, 3.5)
+        >>> input = torch.randn(1, 2, 4)
+        >>> input
+        tensor([[[-1.0491, -0.7152, -0.0749,  0.8530],
+                 [-1.3287,  1.8966,  0.1466, -0.2771]]])
+        >>> m(input)
+        tensor([[[ 3.5000,  3.5000, -1.0491, -0.7152, -0.0749,  0.8530,  3.5000,
+                   3.5000],
+                 [ 3.5000,  3.5000, -1.3287,  1.8966,  0.1466, -0.2771,  3.5000,
+                   3.5000]]])
+        >>> m = nn.ConstantPad1d(2, 3.5)
+        >>> input = torch.randn(1, 2, 3)
+        >>> input
+        tensor([[[ 1.6616,  1.4523, -1.1255],
+                 [-3.6372,  0.1182, -1.8652]]])
+        >>> m(input)
+        tensor([[[ 3.5000,  3.5000,  1.6616,  1.4523, -1.1255,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000, -3.6372,  0.1182, -1.8652,  3.5000,  3.5000]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ConstantPad1d((3, 1), 3.5)
+        >>> m(input)
+        tensor([[[ 3.5000,  3.5000,  3.5000,  1.6616,  1.4523, -1.1255,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000, -3.6372,  0.1182, -1.8652,  3.5000]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int]
+
+    def __init__(self, padding: _size_2_t, value: float) -> None:
+        # The base class stores the fill value; only the padding is set here.
+        super().__init__(value)
+        self.padding = _pair(padding)
+
+
+class ConstantPad2d(_ConstantPadNd):
+    r"""Pad the boundaries of the input tensor with a constant value.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 4-`tuple` is read as (:math:`\text{padding\_left}`,
+            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
+        value (float): the constant used to fill the padded positions.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.ConstantPad2d(2, 3.5)
+        >>> input = torch.randn(1, 2, 2)
+        >>> input
+        tensor([[[ 1.6585,  0.4320],
+                 [-0.8701, -0.4649]]])
+        >>> m(input)
+        tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  1.6585,  0.4320,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000, -0.8701, -0.4649,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5)
+        >>> m(input)
+        tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
+                 [ 3.5000,  3.5000,  3.5000,  1.6585,  0.4320],
+                 [ 3.5000,  3.5000,  3.5000, -0.8701, -0.4649],
+                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000]]])
+    """
+
+    __constants__ = ["padding", "value"]
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int]
+
+    def __init__(self, padding: _size_4_t, value: float) -> None:
+        # The base class stores the fill value; only the padding is set here.
+        super().__init__(value)
+        self.padding = _quadruple(padding)
+
+
+class ConstantPad3d(_ConstantPadNd):
+    r"""Pad the boundaries of the input tensor with a constant value.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 6-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
+            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
+            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
+        value (float): the constant used to fill the padded positions.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
+          :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> m = nn.ConstantPad3d(3, 3.5)
+        >>> input = torch.randn(16, 3, 10, 20, 30)
+        >>> output = m(input)
+        >>> # using different paddings for different sides
+        >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5)
+        >>> output = m(input)
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int, int, int]
+
+    def __init__(self, padding: _size_6_t, value: float) -> None:
+        # The base class stores the fill value; only the padding is set here.
+        super().__init__(value)
+        to_six = _ntuple(6)
+        self.padding = to_six(padding)
+
+
+class _ReflectionPadNd(Module):
+    """Common implementation for the reflection padding modules.
+
+    Subclasses assign ``padding``.
+    """
+
+    __constants__ = ["padding"]
+    padding: Sequence[int]
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Pad ``input`` in ``"reflect"`` mode using ``self.padding``."""
+        padded = F.pad(input, self.padding, mode="reflect")
+        return padded
+
+    def extra_repr(self) -> str:
+        """Return the padding sizes as a string."""
+        return str(self.padding)
+
+
+class ReflectionPad1d(_ReflectionPadNd):
+    r"""Pad the input tensor with a reflection of the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 2-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
+            Note that padding size should be less than the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
+        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> m = nn.ReflectionPad1d(2)
+        >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles")
+        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
+        >>> input
+        tensor([[[0., 1., 2., 3.],
+                 [4., 5., 6., 7.]]])
+        >>> m(input)
+        tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
+                 [6., 5., 4., 5., 6., 7., 6., 5.]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ReflectionPad1d((3, 1))
+        >>> m(input)
+        tensor([[[3., 2., 1., 0., 1., 2., 3., 2.],
+                 [7., 6., 5., 4., 5., 6., 7., 6.]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int]
+
+    def __init__(self, padding: _size_2_t) -> None:
+        super().__init__()
+        # Normalize an int or a pair into the stored (left, right) tuple.
+        self.padding = _pair(padding)
+
+
+class ReflectionPad2d(_ReflectionPadNd):
+    r"""Pad the input tensor with a reflection of the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 4-`tuple` is read as (:math:`\text{padding\_left}`,
+            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
+            Note that padding size should be less than the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
+        >>> m = nn.ReflectionPad2d(2)
+        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
+        >>> input
+        tensor([[[[0., 1., 2.],
+                  [3., 4., 5.],
+                  [6., 7., 8.]]]])
+        >>> m(input)
+        tensor([[[[8., 7., 6., 7., 8., 7., 6.],
+                  [5., 4., 3., 4., 5., 4., 3.],
+                  [2., 1., 0., 1., 2., 1., 0.],
+                  [5., 4., 3., 4., 5., 4., 3.],
+                  [8., 7., 6., 7., 8., 7., 6.],
+                  [5., 4., 3., 4., 5., 4., 3.],
+                  [2., 1., 0., 1., 2., 1., 0.]]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ReflectionPad2d((1, 1, 2, 0))
+        >>> m(input)
+        tensor([[[[7., 6., 7., 8., 7.],
+                  [4., 3., 4., 5., 4.],
+                  [1., 0., 1., 2., 1.],
+                  [4., 3., 4., 5., 4.],
+                  [7., 6., 7., 8., 7.]]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int]
+
+    def __init__(self, padding: _size_4_t) -> None:
+        super().__init__()
+        # Normalize an int or a 4-tuple into (left, right, top, bottom).
+        self.padding = _quadruple(padding)
+
+
+class ReflectionPad3d(_ReflectionPadNd):
+    r"""Pad the input tensor with a reflection of the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 6-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
+            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
+            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
+            Note that padding size should be less than the corresponding input dimension.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
+          where
+
+          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
+        >>> m = nn.ReflectionPad3d(1)
+        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2)
+        >>> m(input)
+        tensor([[[[[7., 6., 7., 6.],
+                   [5., 4., 5., 4.],
+                   [7., 6., 7., 6.],
+                   [5., 4., 5., 4.]],
+                  [[3., 2., 3., 2.],
+                   [1., 0., 1., 0.],
+                   [3., 2., 3., 2.],
+                   [1., 0., 1., 0.]],
+                  [[7., 6., 7., 6.],
+                   [5., 4., 5., 4.],
+                   [7., 6., 7., 6.],
+                   [5., 4., 5., 4.]],
+                  [[3., 2., 3., 2.],
+                   [1., 0., 1., 0.],
+                   [3., 2., 3., 2.],
+                   [1., 0., 1., 0.]]]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int, int, int]
+
+    def __init__(self, padding: _size_6_t) -> None:
+        super().__init__()
+        # Normalize an int or a 6-tuple into the stored 6-element padding.
+        to_six = _ntuple(6)
+        self.padding = to_six(padding)
+
+
+class _ReplicationPadNd(Module):
+    """Common implementation for the replication padding modules.
+
+    Subclasses assign ``padding``.
+    """
+
+    __constants__ = ["padding"]
+    padding: Sequence[int]
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Pad ``input`` in ``"replicate"`` mode using ``self.padding``."""
+        padded = F.pad(input, self.padding, mode="replicate")
+        return padded
+
+    def extra_repr(self) -> str:
+        """Return the padding sizes as a string."""
+        return str(self.padding)
+
+
+class ReplicationPad1d(_ReplicationPadNd):
+    r"""Pad the input tensor by replicating the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 2-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
+            Note that the output dimensions must remain positive.
+
+    Shape:
+        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
+        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
+        >>> m = nn.ReplicationPad1d(2)
+        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
+        >>> input
+        tensor([[[0., 1., 2., 3.],
+                 [4., 5., 6., 7.]]])
+        >>> m(input)
+        tensor([[[0., 0., 0., 1., 2., 3., 3., 3.],
+                 [4., 4., 4., 5., 6., 7., 7., 7.]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ReplicationPad1d((3, 1))
+        >>> m(input)
+        tensor([[[0., 0., 0., 0., 1., 2., 3., 3.],
+                 [4., 4., 4., 4., 5., 6., 7., 7.]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int]
+
+    def __init__(self, padding: _size_2_t) -> None:
+        super().__init__()
+        # Normalize an int or a pair into the stored (left, right) tuple.
+        self.padding = _pair(padding)
+
+
+class ReplicationPad2d(_ReplicationPadNd):
+    r"""Pad the input tensor by replicating the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 4-`tuple` is read as (:math:`\text{padding\_left}`,
+            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
+            Note that the output dimensions must remain positive.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> m = nn.ReplicationPad2d(2)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
+        >>> input
+        tensor([[[[0., 1., 2.],
+                  [3., 4., 5.],
+                  [6., 7., 8.]]]])
+        >>> m(input)
+        tensor([[[[0., 0., 0., 1., 2., 2., 2.],
+                  [0., 0., 0., 1., 2., 2., 2.],
+                  [0., 0., 0., 1., 2., 2., 2.],
+                  [3., 3., 3., 4., 5., 5., 5.],
+                  [6., 6., 6., 7., 8., 8., 8.],
+                  [6., 6., 6., 7., 8., 8., 8.],
+                  [6., 6., 6., 7., 8., 8., 8.]]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ReplicationPad2d((1, 1, 2, 0))
+        >>> m(input)
+        tensor([[[[0., 0., 1., 2., 2.],
+                  [0., 0., 1., 2., 2.],
+                  [0., 0., 1., 2., 2.],
+                  [3., 3., 4., 5., 5.],
+                  [6., 6., 7., 8., 8.]]]])
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int]
+
+    def __init__(self, padding: _size_4_t) -> None:
+        super().__init__()
+        # Normalize an int or a 4-tuple into (left, right, top, bottom).
+        self.padding = _quadruple(padding)
+
+
+class ReplicationPad3d(_ReplicationPadNd):
+    r"""Pad the input tensor by replicating the input boundary.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 6-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
+            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
+            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
+            Note that the output dimensions must remain positive.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
+          where
+
+          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.ReplicationPad3d(3)
+        >>> input = torch.randn(16, 3, 8, 320, 480)
+        >>> output = m(input)
+        >>> # using different paddings for different sides
+        >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1))
+        >>> output = m(input)
+    """
+
+    # pyrefly: ignore [bad-override]
+    padding: tuple[int, int, int, int, int, int]
+
+    def __init__(self, padding: _size_6_t) -> None:
+        super().__init__()
+        # Normalize an int or a 6-tuple into the stored 6-element padding.
+        to_six = _ntuple(6)
+        self.padding = to_six(padding)
+
+
+class ZeroPad1d(ConstantPad1d):
+    r"""Pad the boundaries of the input tensor with zeros.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to both boundaries; a 2-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`).
+
+    Shape:
+        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
+        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.ZeroPad1d(2)
+        >>> input = torch.randn(1, 2, 4)
+        >>> input
+        tensor([[[-1.0491, -0.7152, -0.0749,  0.8530],
+                 [-1.3287,  1.8966,  0.1466, -0.2771]]])
+        >>> m(input)
+        tensor([[[ 0.0000,  0.0000, -1.0491, -0.7152, -0.0749,  0.8530,  0.0000,
+                   0.0000],
+                 [ 0.0000,  0.0000, -1.3287,  1.8966,  0.1466, -0.2771,  0.0000,
+                   0.0000]]])
+        >>> m = nn.ZeroPad1d(2)
+        >>> input = torch.randn(1, 2, 3)
+        >>> input
+        tensor([[[ 1.6616,  1.4523, -1.1255],
+                 [-3.6372,  0.1182, -1.8652]]])
+        >>> m(input)
+        tensor([[[ 0.0000,  0.0000,  1.6616,  1.4523, -1.1255,  0.0000,  0.0000],
+                 [ 0.0000,  0.0000, -3.6372,  0.1182, -1.8652,  0.0000,  0.0000]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ZeroPad1d((3, 1))
+        >>> m(input)
+        tensor([[[ 0.0000,  0.0000,  0.0000,  1.6616,  1.4523, -1.1255,  0.0000],
+                 [ 0.0000,  0.0000,  0.0000, -3.6372,  0.1182, -1.8652,  0.0000]]])
+    """
+
+    padding: tuple[int, int]
+
+    def __init__(self, padding: _size_2_t) -> None:
+        # Zero padding is constant padding with the fill value fixed at 0.0.
+        super().__init__(padding, value=0.0)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module: the padding sizes only.
+        """
+        return str(self.padding)
+
+
+class ZeroPad2d(ConstantPad2d):
+    r"""Pad the boundaries of the input tensor with zeros.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 4-`tuple` is read as (:math:`\text{padding\_left}`,
+            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`).
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = nn.ZeroPad2d(2)
+        >>> input = torch.randn(1, 1, 3, 3)
+        >>> input
+        tensor([[[[-0.1678, -0.4418,  1.9466],
+                  [ 0.9604, -0.4219, -0.5241],
+                  [-0.9162, -0.5436, -0.6446]]]])
+        >>> m(input)
+        tensor([[[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000, -0.1678, -0.4418,  1.9466,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000,  0.9604, -0.4219, -0.5241,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000, -0.9162, -0.5436, -0.6446,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]]])
+        >>> # using different paddings for different sides
+        >>> m = nn.ZeroPad2d((1, 1, 2, 0))
+        >>> m(input)
+        tensor([[[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                  [ 0.0000, -0.1678, -0.4418,  1.9466,  0.0000],
+                  [ 0.0000,  0.9604, -0.4219, -0.5241,  0.0000],
+                  [ 0.0000, -0.9162, -0.5436, -0.6446,  0.0000]]]])
+    """
+
+    padding: tuple[int, int, int, int]
+
+    def __init__(self, padding: _size_4_t) -> None:
+        # Zero padding is constant padding with the fill value fixed at 0.0.
+        super().__init__(padding, value=0.0)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module: the padding sizes only.
+        """
+        return str(self.padding)
+
+
+class ZeroPad3d(ConstantPad3d):
+    r"""Pad the boundaries of the input tensor with zeros.
+
+    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
+
+    Args:
+        padding (int, tuple): the size of the padding. An `int` applies the same
+            padding to all boundaries; a 6-`tuple` is read as
+            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
+            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
+            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`).
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
+          :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
+          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
+          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
+
+    Examples::
+
+        >>> m = nn.ZeroPad3d(3)
+        >>> input = torch.randn(16, 3, 10, 20, 30)
+        >>> output = m(input)
+        >>> # using different paddings for different sides
+        >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1))
+        >>> output = m(input)
+    """
+
+    padding: tuple[int, int, int, int, int, int]
+
+    def __init__(self, padding: _size_6_t) -> None:
+        # Zero padding is constant padding with the fill value fixed at 0.0.
+        super().__init__(padding, value=0.0)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module: the padding sizes only.
+        """
+        return str(self.padding)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c9e0878f0b5ecc48878c63115aafc2128b3afd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pixelshuffle.py
@@ -0,0 +1,127 @@
+import torch.nn.functional as F
+from torch import Tensor
+
+from .module import Module
+
+
+__all__ = ["PixelShuffle", "PixelUnshuffle"]
+
+
+class PixelShuffle(Module):
+    r"""Rearrange elements in a tensor according to an upscaling factor.
+
+    Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
+    to a tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is an upscale factor.
+
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of :math:`1/r`.
+
+    See the paper:
+    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
+    by Shi et al. (2016) for more details.
+
+    Args:
+        upscale_factor (int): factor to increase spatial resolution by
+
+    Shape:
+        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
+        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where
+
+    .. math::
+        C_{out} = C_{in} \div \text{upscale\_factor}^2
+
+    .. math::
+        H_{out} = H_{in} \times \text{upscale\_factor}
+
+    .. math::
+        W_{out} = W_{in} \times \text{upscale\_factor}
+
+    Examples::
+
+        >>> pixel_shuffle = nn.PixelShuffle(3)
+        >>> input = torch.randn(1, 9, 4, 4)
+        >>> output = pixel_shuffle(input)
+        >>> print(output.size())
+        torch.Size([1, 1, 12, 12])
+
+    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
+        https://arxiv.org/abs/1609.05158
+    """
+
+    __constants__ = ["upscale_factor"]
+    upscale_factor: int
+
+    def __init__(self, upscale_factor: int) -> None:
+        super().__init__()
+        self.upscale_factor = upscale_factor
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
+        return F.pixel_shuffle(input, self.upscale_factor)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        return f"upscale_factor={self.upscale_factor}"
+
+
+class PixelUnshuffle(Module):
+    r"""Reverse the PixelShuffle operation.
+
+    Reverses the :class:`~torch.nn.PixelShuffle` operation by rearranging elements
+    in a tensor of shape :math:`(*, C, H \times r, W \times r)` to a tensor of shape
+    :math:`(*, C \times r^2, H, W)`, where r is a downscale factor.
+
+    See the paper:
+    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
+    by Shi et al. (2016) for more details.
+
+    Args:
+        downscale_factor (int): factor to decrease spatial resolution by
+
+    Shape:
+        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
+        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where
+
+    .. math::
+        C_{out} = C_{in} \times \text{downscale\_factor}^2
+
+    .. math::
+        H_{out} = H_{in} \div \text{downscale\_factor}
+
+    .. math::
+        W_{out} = W_{in} \div \text{downscale\_factor}
+
+    Examples::
+
+        >>> pixel_unshuffle = nn.PixelUnshuffle(3)
+        >>> input = torch.randn(1, 1, 12, 12)
+        >>> output = pixel_unshuffle(input)
+        >>> print(output.size())
+        torch.Size([1, 9, 4, 4])
+
+    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
+        https://arxiv.org/abs/1609.05158
+    """
+
+    __constants__ = ["downscale_factor"]
+    downscale_factor: int
+
+    def __init__(self, downscale_factor: int) -> None:
+        super().__init__()
+        self.downscale_factor = downscale_factor
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
+        return F.pixel_unshuffle(input, self.downscale_factor)
+
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        return f"downscale_factor={self.downscale_factor}"
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc57c25b168396fa9ceff5b32fd368befa094af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/pooling.py
@@ -0,0 +1,1550 @@
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.common_types import (
+    _ratio_2_t,
+    _ratio_3_t,
+    _size_1_t,
+    _size_2_opt_t,
+    _size_2_t,
+    _size_3_opt_t,
+    _size_3_t,
+    _size_any_opt_t,
+    _size_any_t,
+)
+
+from .module import Module
+from .utils import _pair, _single, _triple
+
+
+__all__ = [
+    "MaxPool1d",
+    "MaxPool2d",
+    "MaxPool3d",
+    "MaxUnpool1d",
+    "MaxUnpool2d",
+    "MaxUnpool3d",
+    "AvgPool1d",
+    "AvgPool2d",
+    "AvgPool3d",
+    "FractionalMaxPool2d",
+    "FractionalMaxPool3d",
+    "LPPool1d",
+    "LPPool2d",
+    "LPPool3d",
+    "AdaptiveMaxPool1d",
+    "AdaptiveMaxPool2d",
+    "AdaptiveMaxPool3d",
+    "AdaptiveAvgPool1d",
+    "AdaptiveAvgPool2d",
+    "AdaptiveAvgPool3d",
+]
+
+
+class _MaxPoolNd(Module):
+    __constants__ = [
+        "kernel_size",
+        "stride",
+        "padding",
+        "dilation",
+        "return_indices",
+        "ceil_mode",
+    ]
+    return_indices: bool
+    ceil_mode: bool
+
+    def __init__(
+        self,
+        kernel_size: _size_any_t,
+        stride: _size_any_t | None = None,
+        padding: _size_any_t = 0,
+        dilation: _size_any_t = 1,
+        return_indices: bool = False,
+        ceil_mode: bool = False,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if (stride is not None) else kernel_size
+        self.padding = padding
+        self.dilation = dilation
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+
+    def extra_repr(self) -> str:
+        return (
+            "kernel_size={kernel_size}, stride={stride}, padding={padding}"
+            ", dilation={dilation}, ceil_mode={ceil_mode}".format(**self.__dict__)
+        )
+
+
+class MaxPool1d(_MaxPoolNd):
+    r"""Applies a 1D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, L)`
+    and output :math:`(N, C, L_{out})` can be precisely described as:
+
+    .. math::
+        out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1}
+                input(N_i, C_j, stride \times k + m)
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
+    for :attr:`padding` number of points. :attr:`dilation` is the stride between the elements within the
+    sliding window. This `link`_ has a nice visualization of the pooling parameters.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Args:
+        kernel_size: The size of the sliding window, must be > 0.
+        stride: The stride of the sliding window, must be > 0. Default value is :attr:`kernel_size`.
+        padding: Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2.
+        dilation: The stride between elements within a sliding window, must be > 0.
+        return_indices: If ``True``, will return the argmax along with the max values.
+                        Useful for :class:`torch.nn.MaxUnpool1d` later
+        ceil_mode: If ``True``, will use `ceil` instead of `floor` to compute the output shape. This
+                   ensures that every element in the input tensor is covered by a sliding window.
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`,
+
+          where ``ceil_mode = False``
+
+          .. math::
+              L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation}
+                   \times (\text{kernel\_size} - 1) - 1}{\text{stride}}\right\rfloor + 1
+
+          where ``ceil_mode = True``
+
+          .. math::
+              L_{out} = \left\lceil \frac{L_{in} + 2 \times \text{padding} - \text{dilation}
+                    \times (\text{kernel\_size} - 1) - 1}{\text{stride}}\right\rceil + 1
+
+        - Ensure that the last pooling starts inside the image, make :math:`L_{out} = L_{out} - 1`
+          when :math:`(L_{out} - 1) * \text{stride} >= L_{in} + \text{padding}`.
+
+    Examples::
+
+        >>> # pool of size=3, stride=2
+        >>> m = nn.MaxPool1d(3, stride=2)
+        >>> input = torch.randn(20, 16, 50)
+        >>> output = m(input)
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    kernel_size: _size_1_t
+    stride: _size_1_t
+    padding: _size_1_t
+    dilation: _size_1_t
+
+    def forward(self, input: Tensor):
+        """Apply :func:`torch.nn.functional.max_pool1d` to ``input`` using this module's settings."""
+        return F.max_pool1d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
+
+
+class MaxPool2d(_MaxPoolNd):
+    r"""Applies a 2D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \begin{aligned}
+            out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: Implicit negative infinity padding to be added on both sides
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool2d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]}
+                    \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]}
+                    \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor
+
+    Examples::
+
+        >>> # pool of square window of size=3, stride=2
+        >>> m = nn.MaxPool2d(3, stride=2)
+        >>> # pool of non-square window
+        >>> m = nn.MaxPool2d((3, 2), stride=(2, 1))
+        >>> input = torch.randn(20, 16, 50, 32)
+        >>> output = m(input)
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    kernel_size: _size_2_t
+    stride: _size_2_t
+    padding: _size_2_t
+    dilation: _size_2_t
+
+    def forward(self, input: Tensor):
+        """Apply :func:`torch.nn.functional.max_pool2d` to ``input`` using this module's settings."""
+        return F.max_pool2d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
+
+
+class MaxPool3d(_MaxPoolNd):
+    r"""Applies a 3D max pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \begin{aligned}
+            \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\
+                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
+                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly padded with negative infinity on both sides
+    for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points.
+    It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window to take a max over
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: Implicit negative infinity padding to be added on both sides of all three dimensions
+        dilation: a parameter that controls the stride of elements in the window
+        return_indices: if ``True``, will return the max indices along with the outputs.
+                        Useful for :class:`torch.nn.MaxUnpool3d` later
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times
+                (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times
+                (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times
+                (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor
+
+    Examples::
+
+        >>> # pool of square window of size=3, stride=2
+        >>> m = nn.MaxPool3d(3, stride=2)
+        >>> # pool of non-square window
+        >>> m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2))
+        >>> input = torch.randn(20, 16, 50, 44, 31)
+        >>> output = m(input)
+
+    .. _link:
+        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+    """
+
+    kernel_size: _size_3_t
+    stride: _size_3_t
+    padding: _size_3_t
+    dilation: _size_3_t
+
+    def forward(self, input: Tensor):
+        """Apply :func:`torch.nn.functional.max_pool3d` to ``input`` using this module's settings."""
+        return F.max_pool3d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
+
+
+class _MaxUnpoolNd(Module):  # shared base of MaxUnpool1d/2d/3d; contributes only ``extra_repr``
+    def extra_repr(self) -> str:  # reads attributes that the subclasses assign in ``__init__``
+        return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}"
+
+
+class MaxUnpool1d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool1d`.
+
+    :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost.
+
+    :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices contain repeated values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool1d` can map several input sizes to the same output
+              sizes. Hence, the inversion process can get ambiguous.
+              To accommodate this, you can provide the needed output size
+              as an additional argument :attr:`output_size` in the forward call.
+              See the Inputs and Example below.
+
+    Args:
+        kernel_size (int or tuple): Size of the max pooling window.
+        stride (int or tuple): Stride of the max pooling window.
+            It is set to :attr:`kernel_size` by default.
+        padding (int or tuple): Padding that was added to the input
+
+    Inputs:
+        - `input`: the input Tensor to invert
+        - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d`
+        - `output_size` (optional): the targeted output size
+
+    Shape:
+        - Input: :math:`(N, C, H_{in})` or :math:`(C, H_{in})`.
+        - Output: :math:`(N, C, H_{out})` or :math:`(C, H_{out})`, where
+
+          .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0]
+
+          or as given by :attr:`output_size` in the call operator
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?")
+        >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True)
+        >>> unpool = nn.MaxUnpool1d(2, stride=2)
+        >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices)
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]])
+
+        >>> # Example showcasing the use of output_size
+        >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices, output_size=input.size())
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.,  0.]]])
+
+        >>> unpool(output, indices)
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]])
+    """
+
+    kernel_size: _size_1_t
+    stride: _size_1_t
+    padding: _size_1_t
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: _size_1_t | None = None,
+        padding: _size_1_t = 0,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride if (stride is not None) else kernel_size)
+        self.padding = _single(padding)
+
+    def forward(
+        self, input: Tensor, indices: Tensor, output_size: list[int] | None = None
+    ) -> Tensor:
+        """Call :func:`torch.nn.functional.max_unpool1d` with the inputs and the stored settings."""
+        return F.max_unpool1d(
+            input, indices, self.kernel_size, self.stride, self.padding, output_size
+        )
+
+
+class MaxUnpool2d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool2d`.
+
+    :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost.
+
+    :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices contain repeated values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool2d` can map several input sizes to the same output
+              sizes. Hence, the inversion process can get ambiguous.
+              To accommodate this, you can provide the needed output size
+              as an additional argument :attr:`output_size` in the forward call.
+              See the Inputs and Example below.
+
+    Args:
+        kernel_size (int or tuple): Size of the max pooling window.
+        stride (int or tuple): Stride of the max pooling window.
+            It is set to :attr:`kernel_size` by default.
+        padding (int or tuple): Padding that was added to the input
+
+    Inputs:
+        - `input`: the input Tensor to invert
+        - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d`
+        - `output_size` (optional): the targeted output size
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          .. math::
+            H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]}
+
+          .. math::
+            W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]}
+
+          or as given by :attr:`output_size` in the call operator
+
+    Example::
+
+        >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True)
+        >>> unpool = nn.MaxUnpool2d(2, stride=2)
+        >>> input = torch.tensor([[[[ 1.,  2.,  3.,  4.],
+                                    [ 5.,  6.,  7.,  8.],
+                                    [ 9., 10., 11., 12.],
+                                    [13., 14., 15., 16.]]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices)
+        tensor([[[[  0.,   0.,   0.,   0.],
+                  [  0.,   6.,   0.,   8.],
+                  [  0.,   0.,   0.,   0.],
+                  [  0.,  14.,   0.,  16.]]]])
+        >>> # Now using output_size to resolve an ambiguous size for the inverse
+        >>> input = torch.tensor([[[[ 1.,  2.,  3.,  4.,  5.],
+                                    [ 6.,  7.,  8.,  9., 10.],
+                                    [11., 12., 13., 14., 15.],
+                                    [16., 17., 18., 19., 20.]]]])
+        >>> output, indices = pool(input)
+        >>> # This call will not work without specifying output_size
+        >>> unpool(output, indices, output_size=input.size())
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  7.,  0.,  9.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.],
+                  [ 0., 17.,  0., 19.,  0.]]]])
+
+
+    """
+
+    kernel_size: _size_2_t
+    stride: _size_2_t
+    padding: _size_2_t
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: _size_2_t | None = None,
+        padding: _size_2_t = 0,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride if (stride is not None) else kernel_size)
+        self.padding = _pair(padding)
+
+    def forward(
+        self, input: Tensor, indices: Tensor, output_size: list[int] | None = None
+    ) -> Tensor:
+        """Call :func:`torch.nn.functional.max_unpool2d` with the inputs and the stored settings."""
+        return F.max_unpool2d(
+            input, indices, self.kernel_size, self.stride, self.padding, output_size
+        )
+
+
+class MaxUnpool3d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool3d`.
+
+    :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost.
+    :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices contain repeated values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool3d` can map several input sizes to the same output
+              sizes. Hence, the inversion process can get ambiguous.
+              To accommodate this, you can provide the needed output size
+              as an additional argument :attr:`output_size` in the forward call.
+              See the Inputs section below.
+
+    Args:
+        kernel_size (int or tuple): Size of the max pooling window.
+        stride (int or tuple): Stride of the max pooling window.
+            It is set to :attr:`kernel_size` by default.
+        padding (int or tuple): Padding that was added to the input
+
+    Inputs:
+        - `input`: the input Tensor to invert
+        - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d`
+        - `output_size` (optional): the targeted output size
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]}
+
+          .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]}
+
+          .. math::
+              W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]}
+
+          or as given by :attr:`output_size` in the call operator
+
+    Example::
+
+        >>> # pool of square window of size=3, stride=2
+        >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True)
+        >>> unpool = nn.MaxUnpool3d(3, stride=2)
+        >>> output, indices = pool(torch.randn(20, 16, 51, 33, 15))
+        >>> unpooled_output = unpool(output, indices)
+        >>> unpooled_output.size()
+        torch.Size([20, 16, 51, 33, 15])
+    """
+
+    kernel_size: _size_3_t
+    stride: _size_3_t
+    padding: _size_3_t
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: _size_3_t | None = None,
+        padding: _size_3_t = 0,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _triple(kernel_size)
+        self.stride = _triple(stride if (stride is not None) else kernel_size)
+        self.padding = _triple(padding)
+
+    def forward(
+        self, input: Tensor, indices: Tensor, output_size: list[int] | None = None
+    ) -> Tensor:
+        """Call :func:`torch.nn.functional.max_unpool3d` with the inputs and the stored settings."""
+        return F.max_unpool3d(
+            input, indices, self.kernel_size, self.stride, self.padding, output_size
+        )
+
+
+class _AvgPoolNd(Module):  # shared base of the AvgPool modules: constants list plus ``extra_repr``
+    __constants__ = [
+        "kernel_size",
+        "stride",
+        "padding",
+        "ceil_mode",
+        "count_include_pad",
+    ]
+
+    def extra_repr(self) -> str:  # reports kernel_size, stride and padding only
+        return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}"
+
+
+class AvgPool1d(_AvgPoolNd):
+    r"""Applies a 1D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, L)`,
+    output :math:`(N, C, L_{out})` and :attr:`kernel_size` :math:`k`
+    can be precisely described as:
+
+    .. math::
+
+        \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k-1}
+                               \text{input}(N_i, C_j, \text{stride} \times l + m)
+
+    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
+    for :attr:`padding` number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    .. note::
+        pad should be at most half of effective kernel size.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be
+    an ``int`` or a one-element tuple.
+
+    Args:
+        kernel_size: the size of the window
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit zero padding to be added on both sides
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad: when True, will include the zero-padding in the averaging calculation
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
+
+          .. math::
+              L_{out} = \left\lfloor \frac{L_{in} +
+              2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor
+
+          Per the note above, if ``ceil_mode`` is True and :math:`(L_{out} - 1) \times \text{stride} \geq L_{in}
+          + \text{padding}`, we skip the last window as it would start in the right padded region, resulting in
+          :math:`L_{out}` being reduced by one.
+
+    Examples::
+
+        >>> # pool with window of size=3, stride=2
+        >>> m = nn.AvgPool1d(3, stride=2)
+        >>> m(torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]]))
+        tensor([[[2., 4., 6.]]])
+    """
+
+    kernel_size: _size_1_t
+    stride: _size_1_t
+    padding: _size_1_t
+    ceil_mode: bool
+    count_include_pad: bool
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: _size_1_t | None = None,
+        padding: _size_1_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: bool = True,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride if stride is not None else kernel_size)
+        self.padding = _single(padding)
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply :func:`torch.nn.functional.avg_pool1d` to ``input`` using this module's settings."""
+        return F.avg_pool1d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.ceil_mode,
+            self.count_include_pad,
+        )
+
+
+class AvgPool2d(_AvgPoolNd):
+    r"""Applies a 2D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
+    output :math:`(N, C, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kH, kW)`
+    can be precisely described as:
+
+    .. math::
+
+        out(N_i, C_j, h, w)  = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+
+    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
+    for :attr:`padding` number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    .. note::
+        :attr:`padding` should be at most half of the effective kernel size.
+
+    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be:
+
+        - a single ``int`` or a single-element tuple -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit zero padding to be added on both sides
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad: when True, will include the zero-padding in the averaging calculation
+        divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used.
+
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding}[0] -
+                \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding}[1] -
+                \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor
+
+          Per the note above, if ``ceil_mode`` is True and :math:`(H_{out} - 1)\times \text{stride}[0]\geq H_{in}
+          + \text{padding}[0]`, we skip the last window as it would start in the bottom padded region,
+          resulting in :math:`H_{out}` being reduced by one.
+
+          The same applies for :math:`W_{out}`.
+
+    Examples::
+
+        >>> # pool of square window of size=3, stride=2
+        >>> m = nn.AvgPool2d(3, stride=2)
+        >>> # pool of non-square window
+        >>> m = nn.AvgPool2d((3, 2), stride=(2, 1))
+        >>> input = torch.randn(20, 16, 50, 32)
+        >>> output = m(input)
+    """
+
+    __constants__ = [
+        "kernel_size",
+        "stride",
+        "padding",
+        "ceil_mode",
+        "count_include_pad",
+        "divisor_override",
+    ]
+
+    kernel_size: _size_2_t
+    stride: _size_2_t
+    padding: _size_2_t
+    ceil_mode: bool
+    count_include_pad: bool
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: _size_2_t | None = None,
+        padding: _size_2_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: bool = True,
+        divisor_override: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if (stride is not None) else kernel_size  # stride falls back to kernel_size
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor_override = divisor_override
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply 2D average pooling to ``input`` via ``F.avg_pool2d`` using the stored settings."""
+        return F.avg_pool2d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.ceil_mode,
+            self.count_include_pad,
+            self.divisor_override,
+        )
+
+
+class AvgPool3d(_AvgPoolNd):
+    r"""Applies a 3D average pooling over an input signal composed of several input planes.
+
+    In the simplest case, the output value of the layer with input size :math:`(N, C, D, H, W)`,
+    output :math:`(N, C, D_{out}, H_{out}, W_{out})` and :attr:`kernel_size` :math:`(kD, kH, kW)`
+    can be precisely described as:
+
+    .. math::
+        \begin{aligned}
+            \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\
+                                              & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k,
+                                                      \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)}
+                                                     {kD \times kH \times kW}
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides
+    for :attr:`padding` number of points.
+
+    Note:
+        When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding
+        or the input. Sliding windows that would start in the right padded region are ignored.
+
+    .. note::
+        :attr:`padding` should be at most half of the effective kernel size.
+
+    The parameters :attr:`kernel_size`, :attr:`stride` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    Args:
+        kernel_size: the size of the window
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        padding: implicit zero padding to be added on all three sides
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad: when True, will include the zero-padding in the averaging calculation
+        divisor_override: if specified, it will be used as divisor, otherwise :attr:`kernel_size` will be used
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
+          :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] -
+                    \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] -
+                    \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] -
+                    \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor
+
+          Per the note above, if ``ceil_mode`` is True and :math:`(D_{out} - 1)\times \text{stride}[0]\geq D_{in}
+          + \text{padding}[0]`, we skip the last window as it would start in the padded region,
+          resulting in :math:`D_{out}` being reduced by one.
+
+          The same applies for :math:`H_{out}` and :math:`W_{out}`.
+
+    Examples::
+
+        >>> # pool of cubic window of size=3, stride=2
+        >>> m = nn.AvgPool3d(3, stride=2)
+        >>> # pool of non-cubic window
+        >>> m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2))
+        >>> input = torch.randn(20, 16, 50, 44, 31)
+        >>> output = m(input)
+    """
+
+    __constants__ = [
+        "kernel_size",
+        "stride",
+        "padding",
+        "ceil_mode",
+        "count_include_pad",
+        "divisor_override",
+    ]
+
+    kernel_size: _size_3_t
+    stride: _size_3_t
+    padding: _size_3_t
+    ceil_mode: bool
+    count_include_pad: bool
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        stride: _size_3_t | None = None,
+        padding: _size_3_t = 0,
+        ceil_mode: bool = False,
+        count_include_pad: bool = True,
+        divisor_override: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride if (stride is not None) else kernel_size  # stride falls back to kernel_size
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor_override = divisor_override
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply 3D average pooling to ``input`` via ``F.avg_pool3d`` using the stored settings."""
+        return F.avg_pool3d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.ceil_mode,
+            self.count_include_pad,
+            self.divisor_override,
+        )
+
+    def __setstate__(self, d):
+        super().__setstate__(d)
+        # Fill in any of these attributes missing from the restored state,
+        # using the same values as the __init__ defaults.
+        self.__dict__.setdefault("padding", 0)
+        self.__dict__.setdefault("ceil_mode", False)
+        self.__dict__.setdefault("count_include_pad", True)
+
+
+class FractionalMaxPool2d(Module):
+    r"""Applies a 2D fractional max pooling over an input signal composed of several input planes.
+
+    Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham.
+
+    The max-pooling operation is applied in :math:`kH \times kW` regions by a stochastic
+    step size determined by the target output size.
+    The number of output features is equal to the number of input planes.
+
+    .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined.
+
+    Args:
+        kernel_size: the size of the window to take a max over.
+                     Can be a single number k (for a square kernel of k x k) or a tuple `(kh, kw)`
+        output_size: the target output size of the image of the form `oH x oW`.
+                     Can be a tuple `(oH, oW)` or a single number oH for a square image `oH x oH`.
+                     Note that we must have :math:`kH + oH - 1 <= H_{in}` and :math:`kW + oW - 1 <= W_{in}`
+        output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given.
+                      This has to be a number or tuple in the range (0, 1).
+                      Note that we must have :math:`kH + (output\_ratio\_H * H_{in}) - 1 <= H_{in}`
+                      and :math:`kW + (output\_ratio\_W * W_{in}) - 1 <= W_{in}`
+        return_indices: if ``True``, will return the indices along with the outputs.
+                        Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+          :math:`(H_{out}, W_{out})=\text{output\_size}` or
+          :math:`(H_{out}, W_{out})=\text{output\_ratio} \times (H_{in}, W_{in})`.
+
+    Examples:
+        >>> # pool of square window of size=3, and target output size 13x12
+        >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12))
+        >>> # pool of square window and target output size being half of input image size
+        >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5))
+        >>> input = torch.randn(20, 16, 50, 32)
+        >>> output = m(input)
+
+    .. _Fractional MaxPooling:
+        https://arxiv.org/abs/1412.6071
+    """
+
+    __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"]
+
+    kernel_size: _size_2_t
+    return_indices: bool
+    output_size: _size_2_t
+    output_ratio: _ratio_2_t
+
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        output_size: _size_2_t | None = None,
+        output_ratio: _ratio_2_t | None = None,
+        return_indices: bool = False,
+        _random_samples=None,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.return_indices = return_indices
+        self.register_buffer("_random_samples", _random_samples)  # may be None; read back in forward()
+        self.output_size = _pair(output_size) if output_size is not None else None
+        self.output_ratio = _pair(output_ratio) if output_ratio is not None else None
+        if output_size is None and output_ratio is None:  # neither given
+            raise ValueError(
+                "FractionalMaxPool2d requires specifying either "
+                "an output size, or a pooling ratio"
+            )
+        if output_size is not None and output_ratio is not None:  # both given
+            raise ValueError(
+                "only one of output_size and output_ratio may be specified"
+            )
+        if self.output_ratio is not None:
+            # Both ratio components must lie strictly between 0 and 1.
+            if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1):
+                raise ValueError(
+                    f"output_ratio must be between 0 and 1 (got {output_ratio})"
+                )
+
+    def forward(self, input: Tensor):  # returns the result of F.fractional_max_pool2d unchanged
+        return F.fractional_max_pool2d(
+            input,
+            self.kernel_size,
+            self.output_size,
+            self.output_ratio,
+            self.return_indices,
+            _random_samples=self._random_samples,
+        )
+
+
+class FractionalMaxPool3d(Module):
+    r"""Applies a 3D fractional max pooling over an input signal composed of several input planes.
+
+    Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham.
+
+    The max-pooling operation is applied in :math:`kT \times kH \times kW` regions by a stochastic
+    step size determined by the target output size.
+    The number of output features is equal to the number of input planes.
+
+    .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined.
+
+    Args:
+        kernel_size: the size of the window to take a max over.
+                     Can be a single number `k` (for a cubic kernel of `k x k x k`) or a tuple `(kt x kh x kw)`,
+                     `k` must be greater than 0.
+        output_size: the target output size of the image of the form `oT x oH x oW`.
+                     Can be a tuple `(oT, oH, oW)` or a single number oH for a cubic volume `oH x oH x oH`
+        output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given.
+                      This has to be a number or tuple in the range (0, 1)
+        return_indices: if ``True``, will return the indices along with the outputs.
+                        Useful to pass to :meth:`nn.MaxUnpool3d`. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, T_{in}, H_{in}, W_{in})` or :math:`(C, T_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, T_{out}, H_{out}, W_{out})` or :math:`(C, T_{out}, H_{out}, W_{out})`, where
+          :math:`(T_{out}, H_{out}, W_{out})=\text{output\_size}` or
+          :math:`(T_{out}, H_{out}, W_{out})=\text{output\_ratio} \times (T_{in}, H_{in}, W_{in})`
+
+    Examples:
+        >>> # pool of cubic window of size=3, and target output size 13x12x11
+        >>> m = nn.FractionalMaxPool3d(3, output_size=(13, 12, 11))
+        >>> # pool of cubic window and target output size being half of input size
+        >>> m = nn.FractionalMaxPool3d(3, output_ratio=(0.5, 0.5, 0.5))
+        >>> input = torch.randn(20, 16, 50, 32, 16)
+        >>> output = m(input)
+
+    .. _Fractional MaxPooling:
+        https://arxiv.org/abs/1412.6071
+    """
+
+    __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"]
+    kernel_size: _size_3_t
+    return_indices: bool
+    output_size: _size_3_t
+    output_ratio: _ratio_3_t
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        output_size: _size_3_t | None = None,
+        output_ratio: _ratio_3_t | None = None,
+        return_indices: bool = False,
+        _random_samples=None,
+    ) -> None:
+        super().__init__()
+        # Reject a non-positive int, or a tuple/list containing any non-positive entry.
+        if (isinstance(kernel_size, int) and kernel_size <= 0) or (
+            isinstance(kernel_size, (tuple, list))
+            and not all(k > 0 for k in kernel_size)
+        ):
+            raise ValueError(f"kernel_size must greater than 0, but got {kernel_size}")
+        self.kernel_size = _triple(kernel_size)
+        self.return_indices = return_indices
+        self.register_buffer("_random_samples", _random_samples)  # may be None; read back in forward()
+        self.output_size = _triple(output_size) if output_size is not None else None
+        self.output_ratio = _triple(output_ratio) if output_ratio is not None else None
+        if output_size is None and output_ratio is None:  # neither given
+            raise ValueError(
+                "FractionalMaxPool3d requires specifying either "
+                "an output size, or a pooling ratio"
+            )
+        if output_size is not None and output_ratio is not None:  # both given
+            raise ValueError(
+                "only one of output_size and output_ratio may be specified"
+            )
+        if self.output_ratio is not None:
+            # All three ratio components must lie strictly between 0 and 1.
+            if not (
+                0 < self.output_ratio[0] < 1
+                and 0 < self.output_ratio[1] < 1
+                and 0 < self.output_ratio[2] < 1
+            ):
+                raise ValueError(
+                    f"output_ratio must be between 0 and 1 (got {output_ratio})"
+                )
+
+    def forward(self, input: Tensor):  # returns the result of F.fractional_max_pool3d unchanged
+        return F.fractional_max_pool3d(
+            input,
+            self.kernel_size,
+            self.output_size,
+            self.output_ratio,
+            self.return_indices,
+            _random_samples=self._random_samples,
+        )
+
+
+class _LPPoolNd(Module):
+    __constants__ = ["norm_type", "kernel_size", "stride", "ceil_mode"]
+
+    norm_type: float
+    ceil_mode: bool
+
+    def __init__(
+        self,
+        norm_type: float,
+        kernel_size: _size_any_t,
+        stride: _size_any_t | None = None,
+        ceil_mode: bool = False,
+    ) -> None:
+        super().__init__()
+        self.norm_type = norm_type
+        self.kernel_size = kernel_size
+        self.stride = stride  # stored as given (may be None); subclasses forward it unchanged
+        self.ceil_mode = ceil_mode
+
+    def extra_repr(self) -> str:
+        return (  # the four placeholders are filled from the instance __dict__
+            "norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, "
+            "ceil_mode={ceil_mode}".format(**self.__dict__)
+        )
+
+
+class LPPool1d(_LPPoolNd):
+    r"""Applies a 1D power-average pooling over an input signal composed of several input planes.
+
+    On each window, the function computed, with :math:`p` given by :attr:`norm_type`, is:
+
+    .. math::
+        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}
+
+    - At p = :math:`\infty`, one gets Max Pooling
+    - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling)
+
+    .. note:: If the sum to the power of `p` is zero, the gradient of this function is
+              not defined. This implementation will set the gradient to zero in this case.
+
+    Args:
+        kernel_size: a single int, the size of the window
+        stride: a single int, the stride of the window. Default value is :attr:`kernel_size`
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Note:
+        When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the
+        left padding or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
+
+          .. math::
+              L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor
+
+    Examples::
+        >>> # power-2 pool of window of length 3, with stride 2.
+        >>> m = nn.LPPool1d(2, 3, stride=2)
+        >>> input = torch.randn(20, 16, 50)
+        >>> output = m(input)
+    """
+
+    kernel_size: _size_1_t
+    stride: _size_1_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply 1D power-average pooling via ``F.lp_pool1d``; ``norm_type`` is passed as a ``float``."""
+        return F.lp_pool1d(
+            input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode
+        )
+
+
+class LPPool2d(_LPPoolNd):
+    r"""Applies a 2D power-average pooling over an input signal composed of several input planes.
+
+    On each window, the function computed, with :math:`p` given by :attr:`norm_type`, is:
+
+    .. math::
+        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}
+
+    - At p = :math:`\infty`, one gets Max Pooling
+    - At p = 1, one gets Sum Pooling (which is proportional to average pooling)
+
+    The parameters :attr:`kernel_size`, :attr:`stride` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the height and width dimension
+        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+          and the second `int` for the width dimension
+
+    .. note:: If the sum to the power of `p` is zero, the gradient of this function is
+              not defined. This implementation will set the gradient to zero in this case.
+
+    Args:
+        kernel_size: the size of the window
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Note:
+        When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the
+        left padding or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor
+
+    Examples::
+
+        >>> # power-2 pool of square window of size=3, stride=2
+        >>> m = nn.LPPool2d(2, 3, stride=2)
+        >>> # pool of non-square window of power 1.2
+        >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1))
+        >>> input = torch.randn(20, 16, 50, 32)
+        >>> output = m(input)
+
+    """
+
+    kernel_size: _size_2_t
+    stride: _size_2_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply 2D power-average pooling via ``F.lp_pool2d``; ``norm_type`` is passed as a ``float``."""
+        return F.lp_pool2d(
+            input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode
+        )
+
+
+class LPPool3d(_LPPoolNd):
+    r"""Applies a 3D power-average pooling over an input signal composed of several input planes.
+
+    On each window, the function computed, with :math:`p` given by :attr:`norm_type`, is:
+
+    .. math::
+        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}
+
+    - At p = :math:`\infty`, one gets Max Pooling
+    - At p = 1, one gets Sum Pooling (which is proportional to average pooling)
+
+    The parameters :attr:`kernel_size`, :attr:`stride` can either be:
+
+        - a single ``int`` -- in which case the same value is used for the depth, height and width dimension
+        - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension,
+          the second `int` for the height dimension and the third `int` for the width dimension
+
+    .. note:: If the sum to the power of `p` is zero, the gradient of this function is
+              not defined. This implementation will set the gradient to zero in this case.
+
+    Args:
+        kernel_size: the size of the window
+        stride: the stride of the window. Default value is :attr:`kernel_size`
+        ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
+
+    Note:
+        When :attr:`ceil_mode` is ``True``, sliding windows may go off-bounds if they start within the
+        left padding or the input. Sliding windows that would start in the right padded region are ignored.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
+          :math:`(C, D_{out}, H_{out}, W_{out})`, where
+
+          .. math::
+              D_{out} = \left\lfloor\frac{D_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor
+
+          .. math::
+              H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor
+
+          .. math::
+              W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor
+
+    Examples::
+
+        >>> # power-2 pool of cubic window of size=3, stride=2
+        >>> m = nn.LPPool3d(2, 3, stride=2)
+        >>> # pool of non-cubic window of power 1.2
+        >>> m = nn.LPPool3d(1.2, (3, 2, 2), stride=(2, 1, 2))
+        >>> input = torch.randn(20, 16, 50, 44, 31)
+        >>> output = m(input)
+
+    """
+
+    kernel_size: _size_3_t
+    stride: _size_3_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply 3D power-average pooling via ``F.lp_pool3d``; ``norm_type`` is passed as a ``float``."""
+        return F.lp_pool3d(
+            input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode
+        )
+
+
+class _AdaptiveMaxPoolNd(Module):
+    __constants__ = ["output_size", "return_indices"]
+    return_indices: bool
+
+    def __init__(
+        self, output_size: _size_any_opt_t, return_indices: bool = False
+    ) -> None:
+        super().__init__()
+        self.output_size = output_size  # stored as given; no normalisation happens here
+        self.return_indices = return_indices
+
+    def extra_repr(self) -> str:
+        return f"output_size={self.output_size}"  # return_indices is not included in the repr
+
+
+# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and
+#   output shapes are, and how the operation computes output.
+
+
+class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd):
+    r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes.
+
+    The output size is :math:`L_{out}`, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size :math:`L_{out}`.
+        return_indices: if ``True``, will return the indices along with the outputs.
+                        Useful to pass to :meth:`nn.MaxUnpool1d`. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
+          :math:`L_{out}=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5
+        >>> m = nn.AdaptiveMaxPool1d(5)
+        >>> input = torch.randn(1, 64, 8)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_1_t
+
+    def forward(self, input: Tensor):
+        """Call ``F.adaptive_max_pool1d`` with the stored ``output_size`` and ``return_indices``."""
+        return F.adaptive_max_pool1d(input, self.output_size, self.return_indices)
+
+
+class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd):
+    r"""Applies a 2D adaptive max pooling over an input signal composed of several input planes.
+
+    The output is of size :math:`H_{out} \times W_{out}`, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the image of the form :math:`H_{out} \times W_{out}`.
+                     Can be a tuple :math:`(H_{out}, W_{out})` or a single :math:`H_{out}` for a
+                     square image :math:`H_{out} \times H_{out}`. :math:`H_{out}` and :math:`W_{out}`
+                     can be either an ``int``, or ``None`` which means the size will be the same as that
+                     of the input.
+        return_indices: if ``True``, will return the indices along with the outputs.
+                        Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+          :math:`(H_{out}, W_{out})=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5x7
+        >>> m = nn.AdaptiveMaxPool2d((5, 7))
+        >>> input = torch.randn(1, 64, 8, 9)
+        >>> output = m(input)
+        >>> # target output size of 7x7 (square)
+        >>> m = nn.AdaptiveMaxPool2d(7)
+        >>> input = torch.randn(1, 64, 10, 9)
+        >>> output = m(input)
+        >>> # target output size of 10x7
+        >>> m = nn.AdaptiveMaxPool2d((None, 7))
+        >>> input = torch.randn(1, 64, 10, 9)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_2_opt_t
+
+    def forward(self, input: Tensor):
+        """Call ``F.adaptive_max_pool2d`` with the stored ``output_size`` and ``return_indices``."""
+        return F.adaptive_max_pool2d(input, self.output_size, self.return_indices)
+
+
+class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd):
+    r"""Applies a 3D adaptive max pooling over an input signal composed of several input planes.
+
+    The output is of size :math:`D_{out} \times H_{out} \times W_{out}`, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the image of the form :math:`D_{out} \times H_{out} \times W_{out}`.
+                     Can be a tuple :math:`(D_{out}, H_{out}, W_{out})` or a single
+                     :math:`D_{out}` for a cube :math:`D_{out} \times D_{out} \times D_{out}`.
+                     :math:`D_{out}`, :math:`H_{out}` and :math:`W_{out}` can be either an
+                     ``int``, or ``None`` which means the size will be the same as that of the input.
+
+        return_indices: if ``True``, will return the indices along with the outputs.
+                        Useful to pass to :meth:`nn.MaxUnpool3d`. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
+          where :math:`(D_{out}, H_{out}, W_{out})=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5x7x9
+        >>> m = nn.AdaptiveMaxPool3d((5, 7, 9))
+        >>> input = torch.randn(1, 64, 8, 9, 10)
+        >>> output = m(input)
+        >>> # target output size of 7x7x7 (cube)
+        >>> m = nn.AdaptiveMaxPool3d(7)
+        >>> input = torch.randn(1, 64, 10, 9, 8)
+        >>> output = m(input)
+        >>> # target output size of 7x9x8
+        >>> m = nn.AdaptiveMaxPool3d((7, None, None))
+        >>> input = torch.randn(1, 64, 10, 9, 8)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_3_opt_t
+
+    def forward(self, input: Tensor):
+        """Call ``F.adaptive_max_pool3d`` with the stored ``output_size`` and ``return_indices``."""
+        return F.adaptive_max_pool3d(input, self.output_size, self.return_indices)
+
+
+class _AdaptiveAvgPoolNd(Module):
+    __constants__ = ["output_size"]
+
+    def __init__(self, output_size: _size_any_opt_t) -> None:
+        super().__init__()
+        self.output_size = output_size  # stored as given; no normalisation happens here
+
+    def extra_repr(self) -> str:
+        return f"output_size={self.output_size}"  # the only configured attribute
+
+
+class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd):
+    r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes.
+
+    The output size is :math:`L_{out}`, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size :math:`L_{out}`.
+
+    Shape:
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
+          :math:`L_{out}=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5
+        >>> m = nn.AdaptiveAvgPool1d(5)
+        >>> input = torch.randn(1, 64, 8)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_1_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Call ``F.adaptive_avg_pool1d`` on ``input`` with the stored ``output_size``.
+        """
+        return F.adaptive_avg_pool1d(input, self.output_size)
+
+
+class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd):
+    r"""Applies a 2D adaptive average pooling over an input signal composed of several input planes.
+
+    The output is of size H x W, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the image of the form H x W.
+                     Can be a tuple (H, W) or a single H for a square image H x H.
+                     H and W can be either an ``int``, or ``None`` which means the size will
+                     be the same as that of the input.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`, where
+          :math:`S=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5x7
+        >>> m = nn.AdaptiveAvgPool2d((5, 7))
+        >>> input = torch.randn(1, 64, 8, 9)
+        >>> output = m(input)
+        >>> # target output size of 7x7 (square)
+        >>> m = nn.AdaptiveAvgPool2d(7)
+        >>> input = torch.randn(1, 64, 10, 9)
+        >>> output = m(input)
+        >>> # target output size of 10x7
+        >>> m = nn.AdaptiveAvgPool2d((None, 7))
+        >>> input = torch.randn(1, 64, 10, 9)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_2_opt_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Call ``F.adaptive_avg_pool2d`` on ``input`` with the stored ``output_size``."""
+        return F.adaptive_avg_pool2d(input, self.output_size)
+
+
+class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd):
+    r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes.
+
+    The output is of size D x H x W, for any input size.
+    The number of output features is equal to the number of input planes.
+
+    Args:
+        output_size: the target output size of the form D x H x W.
+                     Can be a tuple (D, H, W) or a single number D for a cube D x D x D.
+                     D, H and W can be either an ``int``, or ``None`` which means the size will
+                     be the same as that of the input.
+
+    Shape:
+        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
+        - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`,
+          where :math:`S=\text{output\_size}`.
+
+    Examples:
+        >>> # target output size of 5x7x9
+        >>> m = nn.AdaptiveAvgPool3d((5, 7, 9))
+        >>> input = torch.randn(1, 64, 8, 9, 10)
+        >>> output = m(input)
+        >>> # target output size of 7x7x7 (cube)
+        >>> m = nn.AdaptiveAvgPool3d(7)
+        >>> input = torch.randn(1, 64, 10, 9, 8)
+        >>> output = m(input)
+        >>> # target output size of 7x9x8
+        >>> m = nn.AdaptiveAvgPool3d((7, None, None))
+        >>> input = torch.randn(1, 64, 10, 9, 8)
+        >>> output = m(input)
+
+    """
+
+    output_size: _size_3_opt_t
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Call ``F.adaptive_avg_pool3d`` on ``input`` with the stored ``output_size``."""
+        return F.adaptive_avg_pool3d(input, self.output_size)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..68e8292870fc8a8d19ce3307294377b162c8b6fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/rnn.py
@@ -0,0 +1,1850 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import math
+import numbers
+import warnings
+import weakref
+from typing import overload
+from typing_extensions import deprecated
+
+import torch
+from torch import _VF, Tensor
+from torch.nn import init
+from torch.nn.parameter import Parameter
+from torch.nn.utils.rnn import PackedSequence
+
+from .module import Module
+
+
+__all__ = [
+    "RNNBase",
+    "RNN",
+    "LSTM",
+    "GRU",
+    "RNNCellBase",
+    "RNNCell",
+    "LSTMCell",
+    "GRUCell",
+]
+
+_rnn_impls = {  # RNNBase ``mode`` string -> the matching function in ``torch._VF``
+    "RNN_TANH": _VF.rnn_tanh,
+    "RNN_RELU": _VF.rnn_relu,
+}
+
+
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return torch.index_select(tensor, dim, permutation)  # entries of `dim` taken in `permutation` order
+
+
+@deprecated(
+    "`apply_permutation` is deprecated, please use `tensor.index_select(dim, permutation)` instead",
+    category=FutureWarning,
+)
+def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return tensor.index_select(dim, permutation)  # exactly what the private `_apply_permutation` computes
+
+
+class RNNBase(Module):
+    r"""Base class for RNN modules (RNN, LSTM, GRU).
+
+    Implements aspects of RNNs shared by the RNN, LSTM, and GRU classes, such as module initialization
+    and utility methods for parameter storage management.
+
+    .. note::
+        The forward method is not implemented by the RNNBase class.
+
+    .. note::
+        LSTM and GRU classes override some methods implemented by RNNBase.
+    """
+
+    __constants__ = [
+        "mode",
+        "input_size",
+        "hidden_size",
+        "num_layers",
+        "bias",
+        "batch_first",
+        "dropout",
+        "bidirectional",
+        "proj_size",
+    ]
+    __jit_unused_properties__ = ["all_weights"]
+
+    mode: str
+    input_size: int
+    hidden_size: int
+    num_layers: int
+    bias: bool
+    batch_first: bool
+    dropout: float
+    bidirectional: bool
+    proj_size: int
+
+    def __init__(
+        self,
+        mode: str,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        proj_size: int = 0,
+        device=None,
+        dtype=None,
+    ) -> None:
+        """Validate the configuration, then create and attach the per-layer, per-direction parameters."""
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.mode = mode
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = float(dropout)  # NOTE(review): converted before the dropout validation below runs
+        self.bidirectional = bidirectional
+        self.proj_size = proj_size
+        self._flat_weight_refs: list[weakref.ReferenceType[Parameter] | None] = []
+        num_directions = 2 if bidirectional else 1
+
+        if (
+            not isinstance(dropout, numbers.Number)
+            or not 0 <= dropout <= 1
+            or isinstance(dropout, bool)
+        ):
+            raise ValueError(
+                "dropout should be a number in range [0, 1] "
+                "representing the probability of an element being "
+                "zeroed"
+            )
+        if dropout > 0 and num_layers == 1:
+            warnings.warn(
+                "dropout option adds dropout after all but last "
+                "recurrent layer, so non-zero dropout expects "
+                f"num_layers greater than 1, but got dropout={dropout} and "
+                f"num_layers={num_layers}",
+                stacklevel=2,
+            )
+
+        if not isinstance(hidden_size, int):
+            raise TypeError(
+                f"hidden_size should be of type int, got: {type(hidden_size).__name__}"
+            )
+        if hidden_size <= 0:
+            raise ValueError("hidden_size must be greater than zero")
+        if num_layers <= 0:
+            raise ValueError("num_layers must be greater than zero")
+        if proj_size < 0:
+            raise ValueError(
+                "proj_size should be a positive integer or zero to disable projections"
+            )
+        if proj_size >= hidden_size:
+            raise ValueError("proj_size has to be smaller than hidden_size")
+
+        if mode == "LSTM":  # gate_size is the row count of every w_ih / w_hh / bias tensor
+            gate_size = 4 * hidden_size
+        elif mode == "GRU":
+            gate_size = 3 * hidden_size
+        elif mode == "RNN_TANH":
+            gate_size = hidden_size
+        elif mode == "RNN_RELU":
+            gate_size = hidden_size
+        else:
+            raise ValueError("Unrecognized RNN mode: " + mode)
+
+        self._flat_weights_names = []  # every attached parameter name, in creation order
+        self._all_weights = []  # the same names, one inner list per (layer, direction)
+        for layer in range(num_layers):
+            for direction in range(num_directions):
+                real_hidden_size = proj_size if proj_size > 0 else hidden_size
+                layer_input_size = (
+                    input_size if layer == 0 else real_hidden_size * num_directions
+                )
+
+                w_ih = Parameter(
+                    torch.empty((gate_size, layer_input_size), **factory_kwargs)
+                )
+                w_hh = Parameter(
+                    torch.empty((gate_size, real_hidden_size), **factory_kwargs)
+                )
+                b_ih = Parameter(torch.empty(gate_size, **factory_kwargs))  # built even if bias=False; only attached when bias=True
+                # Second bias vector is included for cuDNN compatibility; the standard definition needs only one.
+                b_hh = Parameter(torch.empty(gate_size, **factory_kwargs))
+                layer_params: tuple[Tensor, ...] = ()
+                if self.proj_size == 0:
+                    if bias:
+                        layer_params = (w_ih, w_hh, b_ih, b_hh)
+                    else:
+                        layer_params = (w_ih, w_hh)
+                else:
+                    w_hr = Parameter(
+                        torch.empty((proj_size, hidden_size), **factory_kwargs)
+                    )
+                    if bias:
+                        layer_params = (w_ih, w_hh, b_ih, b_hh, w_hr)
+                    else:
+                        layer_params = (w_ih, w_hh, w_hr)
+
+                suffix = "_reverse" if direction == 1 else ""
+                param_names = ["weight_ih_l{}{}", "weight_hh_l{}{}"]  # must stay in the same order as layer_params
+                if bias:
+                    param_names += ["bias_ih_l{}{}", "bias_hh_l{}{}"]
+                if self.proj_size > 0:
+                    param_names += ["weight_hr_l{}{}"]
+                param_names = [x.format(layer, suffix) for x in param_names]
+
+                for name, param in zip(param_names, layer_params, strict=True):
+                    setattr(self, name, param)
+                self._flat_weights_names.extend(param_names)
+                self._all_weights.append(param_names)
+
+        self._init_flat_weights()
+
+        self.reset_parameters()
+
+    def _init_flat_weights(self) -> None:
+        """Rebuild ``_flat_weights`` and its weakrefs from the named attributes, then flatten."""
+        # A name with no matching attribute is recorded as None in both lists.
+        self._flat_weights = [getattr(self, name, None) for name in self._flat_weights_names]
+        refs: list[weakref.ReferenceType[Parameter] | None] = []
+        for tensor in self._flat_weights:
+            refs.append(None if tensor is None else weakref.ref(tensor))
+        self._flat_weight_refs = refs
+        self.flatten_parameters()
+
+    def __setattr__(self, attr, value) -> None:
+        """Assign ``attr``; for a tracked weight name, also refresh its ``_flat_weights`` slot."""
+        tracked_names = getattr(self, "_flat_weights_names", ())
+        if attr in tracked_names:
+            self._flat_weights[tracked_names.index(attr)] = value
+        super().__setattr__(attr, value)
+
+    def flatten_parameters(self) -> None:
+        """Reset parameter data pointers so that they can use faster code paths.
+
+        Right now, this works only if the module is on the GPU and cuDNN is enabled.
+        Otherwise, it's a no-op.
+        """
+        # Short-circuits if _flat_weights is only partially instantiated
+        if len(self._flat_weights) != len(self._flat_weights_names):
+            return
+
+        for w in self._flat_weights:  # bail out if any slot is None or otherwise not a Tensor
+            if not isinstance(w, Tensor):
+                return
+        # The loop below short-circuits if any tensor in self._flat_weights is not
+        # acceptable to cuDNN, is not on CUDA, or differs in dtype from the first one.
+
+        first_fw = self._flat_weights[0]  # type: ignore[union-attr]
+        dtype = first_fw.dtype  # type: ignore[union-attr]
+        for fw in self._flat_weights:
+            if (
+                not isinstance(fw, Tensor)
+                or fw.dtype != dtype
+                or not fw.is_cuda
+                or not torch.backends.cudnn.is_acceptable(fw)
+            ):
+                return
+
+        # If any parameters alias, we fall back to the slower, copying code path. This is
+        # a sufficient check, because overlapping parameter buffers that don't completely
+        # alias would break the assumptions of the uniqueness check in
+        # Module.named_parameters().
+        unique_data_ptrs = {
+            p.data_ptr()  # type: ignore[union-attr]
+            for p in self._flat_weights
+        }
+        if len(unique_data_ptrs) != len(self._flat_weights):  # at least two weights share a data pointer
+            return
+
+        with torch.cuda.device_of(first_fw):
+            import torch.backends.cudnn.rnn as rnn
+
+            # Note: no_grad() is necessary since _cudnn_rnn_flatten_weight is
+            # an inplace operation on self._flat_weights
+            with torch.no_grad():
+                if torch._use_cudnn_rnn_flatten_weight():
+                    num_weights = 4 if self.bias else 2  # tensors per (layer, direction) before the optional projection
+                    if self.proj_size > 0:
+                        num_weights += 1
+                    torch._cudnn_rnn_flatten_weight(
+                        self._flat_weights,  # type: ignore[arg-type]
+                        num_weights,
+                        self.input_size,
+                        rnn.get_cudnn_mode(self.mode),
+                        self.hidden_size,
+                        self.proj_size,
+                        self.num_layers,
+                        self.batch_first,
+                        bool(self.bidirectional),
+                    )
+
+    def _apply(self, fn, recurse=True):
+        """Run ``Module._apply`` with ``fn``, then rebuild the flat-weight caches."""
+        self._flat_weight_refs = []
+        ret = super()._apply(fn, recurse)
+
+        # Resets _flat_weights. Be very careful before removing this, as 3rd party
+        # device types likely rely on this behavior to properly .to() modules like LSTM.
+        self._init_flat_weights()
+
+        return ret
+
+    def reset_parameters(self) -> None:
+        bound = 0 if self.hidden_size <= 0 else 1.0 / math.sqrt(self.hidden_size)  # every parameter ~ U(-bound, bound)
+        for param in self.parameters():
+            init.uniform_(param, -bound, bound)
+
+    def check_input(self, input: Tensor, batch_sizes: Tensor | None) -> None:
+        if not torch.jit.is_scripting():  # the dtype check is skipped under TorchScript
+            if (
+                input.dtype != self._flat_weights[0].dtype  # type: ignore[union-attr]
+                and not torch._C._is_any_autocast_enabled()  # a dtype mismatch is tolerated while autocast is on
+            ):
+                raise ValueError(
+                    f"input must have the type {self._flat_weights[0].dtype}, got type {input.dtype}"  # type: ignore[union-attr]
+                )
+        expected_input_dim = 2 if batch_sizes is not None else 3  # 2-D when batch_sizes is given, else 3-D
+        if input.dim() != expected_input_dim:
+            raise RuntimeError(
+                f"input must have {expected_input_dim} dimensions, got {input.dim()}"
+            )
+        if self.input_size != input.size(-1):  # the last dimension must hold input_size features
+            raise RuntimeError(
+                f"input.size(-1) must be equal to input_size. Expected {self.input_size}, got {input.size(-1)}"
+            )
+
+    def get_expected_hidden_size(
+        self, input: Tensor, batch_sizes: Tensor | None
+    ) -> tuple[int, int, int]:
+        """Return the ``(layers * directions, batch, features)`` shape expected of ``hx``.
+
+        ``batch`` is ``batch_sizes[0]`` when ``batch_sizes`` is given; otherwise it is
+        read from the dimension of ``input`` selected by ``batch_first``.
+        ``features`` is ``proj_size`` when projections are enabled, else ``hidden_size``.
+        """
+        if batch_sizes is None:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)
+        else:
+            mini_batch = int(batch_sizes[0])
+        directions = 2 if self.bidirectional else 1
+        # With projections the state carries proj_size features, not hidden_size.
+        if self.proj_size > 0:
+            features = self.proj_size
+        else:
+            features = self.hidden_size
+        stacked = self.num_layers * directions
+        return (stacked, mini_batch, features)
+
+    def check_hidden_size(
+        self, hx: Tensor, expected_hidden_size: tuple[int, int, int],
+        msg: str = "Expected hidden size {}, got {}",
+    ) -> None:
+        """Raise ``RuntimeError`` built from ``msg`` unless ``hx.size()`` equals the expected shape."""
+        actual = hx.size()
+        if actual != expected_hidden_size:
+            raise RuntimeError(msg.format(expected_hidden_size, list(actual)))
+
+    def _weights_have_changed(self):
+        """Report whether any tracked weight attribute differs from its recorded weakref.
+
+        This is the case when the weight tensors were swapped after the references
+        were recorded, e.g. when used with torch.func.functional_call().
+        """
+        pairs = zip(self._flat_weight_refs, self._flat_weights_names, strict=True)
+        for ref, name in pairs:
+            current = getattr(self, name, None)
+            if current is not None and ref is not None and ref() is not current:
+                return True
+        return False
+
+    def check_forward_args(
+        self, input: Tensor, hidden: Tensor, batch_sizes: Tensor | None
+    ) -> None:
+        """Validate ``input`` first, then the shape of ``hidden`` against the expected size."""
+        self.check_input(input, batch_sizes)
+        wanted = self.get_expected_hidden_size(input, batch_sizes)
+        self.check_hidden_size(hidden, wanted)
+
+    def permute_hidden(self, hx: Tensor, permutation: Tensor | None):
+        if permutation is not None:  # reorder hx along dim 1 (the helper's default dim)
+            hx = _apply_permutation(hx, permutation)
+        return hx
+
+    def extra_repr(self) -> str:
+        """Describe the module's arguments; optional ones appear only when non-default."""
+        optional = (
+            (self.proj_size != 0, ", proj_size={proj_size}"),
+            (self.num_layers != 1, ", num_layers={num_layers}"),
+            (self.bias is not True, ", bias={bias}"),
+            (self.batch_first is not False, ", batch_first={batch_first}"),
+            (self.dropout != 0, ", dropout={dropout}"),
+            (self.bidirectional is not False, ", bidirectional={bidirectional}"),
+        )
+        # The placeholders are filled from the instance __dict__ in one pass.
+        template = "{input_size}, {hidden_size}" + "".join(
+            piece for shown, piece in optional if shown
+        )
+        return template.format(**self.__dict__)
+
+    def _update_flat_weights(self) -> None:
+        if not torch.jit.is_scripting():  # the refresh below is skipped under TorchScript
+            if self._weights_have_changed():
+                self._init_flat_weights()  # re-read the weights by name and re-record their weakrefs
+
+    def __getstate__(self):
+        """Return a copy of ``__dict__`` without ``_flat_weight_refs``, refreshing stale weights first."""
+        self._update_flat_weights()
+        # The weak references are left out of the serialized state.
+        state = dict(self.__dict__)
+        state.pop("_flat_weight_refs")
+        return state
+
+    def __setstate__(self, d):
+        """Restore pickled state, upgrading legacy layouts and rebuilding weight caches.
+
+        Three legacy cases are handled: the name table stored under the
+        ``all_weights`` key, a missing ``proj_size`` entry, and an ``_all_weights``
+        table whose entries are not strings (the parameter-name lists are then
+        regenerated from ``num_layers``, ``bidirectional``, ``bias`` and
+        ``proj_size``). ``_flat_weight_refs`` is always rebuilt from ``_flat_weights``.
+        """
+        super().__setstate__(d)
+        if "all_weights" in d:
+            self._all_weights = d["all_weights"]
+        # In PyTorch 1.8 we added a proj_size member variable to LSTM.
+        # LSTMs that were serialized via torch.save(module) before PyTorch 1.8
+        # don't have it, so to preserve compatibility we set proj_size here.
+        if "proj_size" not in d:
+            self.proj_size = 0
+
+        if not isinstance(self._all_weights[0][0], str):
+            num_layers = self.num_layers
+            num_directions = 2 if self.bidirectional else 1
+            self._flat_weights_names = []
+            self._all_weights = []
+            for layer in range(num_layers):
+                for direction in range(num_directions):
+                    suffix = "_reverse" if direction == 1 else ""
+                    weights = [
+                        "weight_ih_l{}{}",
+                        "weight_hh_l{}{}",
+                        "bias_ih_l{}{}",
+                        "bias_hh_l{}{}",
+                        "weight_hr_l{}{}",
+                    ]
+                    weights = [x.format(layer, suffix) for x in weights]
+                    # Same order as __init__: w_ih, w_hh, [b_ih, b_hh], [w_hr].
+                    names = weights[:4] if self.bias else weights[:2]
+                    if self.proj_size > 0:
+                        # Concatenate the one-element slice so the projection name
+                        # is a plain string in the same flat list as the others.
+                        names = names + weights[-1:]
+                    self._all_weights += [names]
+                    self._flat_weights_names.extend(names)
+            self._flat_weights = [
+                getattr(self, wn) if hasattr(self, wn) else None
+                for wn in self._flat_weights_names
+            ]
+
+        self._flat_weight_refs = [
+            weakref.ref(w) if w is not None else None for w in self._flat_weights
+        ]
+
+    @property
+    def all_weights(self) -> list[list[Parameter]]:
+        grouped: list[list[Parameter]] = []  # one inner list per entry of self._all_weights
+        for names in self._all_weights:
+            grouped.append([getattr(self, name) for name in names])
+        return grouped
+
+    def _replicate_for_data_parallel(self):
+        """Return the parent class's replica, given its own copies of the flat-weight lists."""
+        replica = super()._replicate_for_data_parallel()
+        # Without these copies the replica would share the same list objects.
+        replica._flat_weights = list(replica._flat_weights)
+        replica._flat_weights_names = list(replica._flat_weights_names)
+        return replica
+
+
+class RNN(RNNBase):
+    r"""__init__(input_size,hidden_size,num_layers=1,nonlinearity='tanh',bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)
+
+    Apply a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}`
+    non-linearity to an input sequence. For each element in the input sequence,
+    each layer computes the following function:
+
+    .. math::
+        h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})
+
+    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
+    the input at time `t`, and :math:`h_{t-1}` is the hidden state of the
+    same layer at time `t-1` or the initial hidden state at time `0`.
+    If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.
+
+    .. code-block:: python
+
+        # Efficient implementation equivalent to the following with bidirectional=False
+        rnn = nn.RNN(input_size, hidden_size, num_layers)
+        params = dict(rnn.named_parameters())
+        def forward(x, hx=None, batch_first=False):
+            if batch_first:
+                x = x.transpose(0, 1)
+            seq_len, batch_size, _ = x.size()
+            if hx is None:
+                hx = torch.zeros(rnn.num_layers, batch_size, rnn.hidden_size)
+            h_t_minus_1 = hx.clone()
+            h_t = hx.clone()
+            output = []
+            for t in range(seq_len):
+                for layer in range(rnn.num_layers):
+                    input_t = x[t] if layer == 0 else h_t[layer - 1]
+                    h_t[layer] = torch.tanh(
+                        input_t @ params[f"weight_ih_l{layer}"].T
+                        + h_t_minus_1[layer] @ params[f"weight_hh_l{layer}"].T
+                        + params[f"bias_hh_l{layer}"]
+                        + params[f"bias_ih_l{layer}"]
+                    )
+                output.append(h_t[-1].clone())
+                h_t_minus_1 = h_t.clone()
+            output = torch.stack(output)
+            if batch_first:
+                output = output.transpose(0, 1)
+            return output, h_t
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
+            would mean stacking two RNNs together to form a `stacked RNN`,
+            with the second RNN taking in outputs of the first RNN and
+            computing the final results. Default: 1
+        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        batch_first: If ``True``, then the input and output tensors are provided
+            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
+            Note that this does not apply to hidden or cell states. See the
+            Inputs/Outputs sections below for details.  Default: ``False``
+        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
+            RNN layer except the last layer, with dropout probability equal to
+            :attr:`dropout`. Default: 0
+        bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``
+
+    Inputs: input, hx
+        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
+          :math:`(L, N, H_{in})` when ``batch_first=False`` or
+          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
+          the input sequence.  The input can also be a packed variable length sequence.
+          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
+          :func:`torch.nn.utils.rnn.pack_sequence` for details.
+        * **hx**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden
+          state for the input sequence batch. Defaults to zeros if not provided.
+
+        where:
+
+        .. math::
+            \begin{aligned}
+                N ={} & \text{batch size} \\
+                L ={} & \text{sequence length} \\
+                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
+                H_{in} ={} & \text{input\_size} \\
+                H_{out} ={} & \text{hidden\_size}
+            \end{aligned}
+
+    Outputs: output, h_n
+        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
+          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
+          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
+          `(h_t)` from the last layer of the RNN, for each `t`. If a
+          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
+          will also be a packed sequence.
+        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
+          for each element in the batch.
+
+    Attributes:
+        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
+            of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
+            `(hidden_size, num_directions * hidden_size)`
+        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
+            of shape `(hidden_size, hidden_size)`
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
+            of shape `(hidden_size)`
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
+            of shape `(hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    .. note::
+        For bidirectional RNNs, forward and backward are directions 0 and 1 respectively.
+        Example of splitting the output layers when ``batch_first=False``:
+        ``output.view(seq_len, batch, num_directions, hidden_size)``.
+
+    .. note::
+        ``batch_first`` argument is ignored for unbatched inputs.
+
+    .. include:: ../cudnn_rnn_determinism.rst
+
+    .. include:: ../cudnn_persistent_rnn.rst
+
+    Examples::
+
+        >>> rnn = nn.RNN(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> output, hn = rnn(input, h0)
+    """
+
+    @overload
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        nonlinearity: str = "tanh",
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None: ...
+
+    @overload
+    def __init__(self, *args, **kwargs) -> None: ...
+
+    def __init__(self, *args, **kwargs):
+        """Map ``nonlinearity`` onto an ``RNNBase`` mode and forward the remaining arguments."""
+        if "proj_size" in kwargs:
+            raise ValueError(
+                "proj_size argument is only supported for LSTM, not RNN or GRU"
+            )
+        if len(args) <= 3:
+            self.nonlinearity = kwargs.pop("nonlinearity", "tanh")
+        else:
+            # nonlinearity arrived positionally at index 3: record it and drop it
+            # from args, since the mode string is what is passed to RNNBase instead.
+            self.nonlinearity, args = args[3], args[:3] + args[4:]
+        use_tanh = self.nonlinearity == "tanh"
+        if not use_tanh and not (self.nonlinearity == "relu"):
+            raise ValueError(
+                f"Unknown nonlinearity '{self.nonlinearity}'. Select from 'tanh' or 'relu'."
+            )
+        mode = "RNN_TANH" if use_tanh else "RNN_RELU"
+        super().__init__(mode, *args, **kwargs)
+
+    @overload
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: Tensor,
+        hx: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor]:
+        pass
+
+    @overload
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: PackedSequence,
+        hx: Tensor | None = None,
+    ) -> tuple[PackedSequence, Tensor]:
+        pass
+
+    def forward(self, input, hx=None):  # noqa: F811
+        """
+        Run the RNN over ``input`` (a tensor or a ``PackedSequence``) and return ``(output, h_n)``.
+        """
+        self._update_flat_weights()
+
+        num_directions = 2 if self.bidirectional else 1
+        orig_input = input
+
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = batch_sizes[0]
+            # script() is unhappy when max_batch_size is different type in cond branches, so we duplicate
+            if hx is None:
+                hx = torch.zeros(
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+            else:
+                # Reorder hx with sorted_indices so that each hidden state lines up
+                # with the sequence the caller passed it for.
+                hx = self.permute_hidden(hx, sorted_indices)
+        else:
+            batch_sizes = None
+            if input.dim() not in (2, 3):
+                raise ValueError(
+                    f"RNN: Expected input to be 2D or 3D, got {input.dim()}D tensor instead"
+                )
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)  # insert a batch dimension of size 1
+                if hx is not None:
+                    if hx.dim() != 2:
+                        raise RuntimeError(
+                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor"
+                        )
+                    hx = hx.unsqueeze(1)  # hidden state keeps its batch at dim 1 regardless of batch_first
+            else:
+                if hx is not None and hx.dim() != 3:
+                    raise RuntimeError(
+                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor"
+                    )
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+            if hx is None:
+                hx = torch.zeros(
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+            else:
+                # sorted_indices is None on this path, so permute_hidden returns hx
+                # unchanged.
+                hx = self.permute_hidden(hx, sorted_indices)
+
+        assert hx is not None
+        self.check_forward_args(input, hx, batch_sizes)
+        assert self.mode == "RNN_TANH" or self.mode == "RNN_RELU"
+        if batch_sizes is None:  # tensor input: the call takes batch_first and no batch_sizes
+            if self.mode == "RNN_TANH":
+                result = _VF.rnn_tanh(
+                    input,
+                    hx,
+                    self._flat_weights,  # type: ignore[arg-type]
+                    self.bias,
+                    self.num_layers,
+                    self.dropout,
+                    self.training,
+                    self.bidirectional,
+                    self.batch_first,
+                )
+            else:
+                result = _VF.rnn_relu(
+                    input,
+                    hx,
+                    self._flat_weights,  # type: ignore[arg-type]
+                    self.bias,
+                    self.num_layers,
+                    self.dropout,
+                    self.training,
+                    self.bidirectional,
+                    self.batch_first,
+                )
+        else:  # packed input: the call takes batch_sizes and no batch_first
+            if self.mode == "RNN_TANH":
+                result = _VF.rnn_tanh(
+                    input,
+                    batch_sizes,
+                    hx,
+                    self._flat_weights,  # type: ignore[arg-type]
+                    self.bias,
+                    self.num_layers,
+                    self.dropout,
+                    self.training,
+                    self.bidirectional,
+                )
+            else:
+                result = _VF.rnn_relu(
+                    input,
+                    batch_sizes,
+                    hx,
+                    self._flat_weights,  # type: ignore[arg-type]
+                    self.bias,
+                    self.num_layers,
+                    self.dropout,
+                    self.training,
+                    self.bidirectional,
+                )
+
+        output = result[0]
+        hidden = result[1]
+
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(
+                output,
+                batch_sizes,
+                sorted_indices,
+                unsorted_indices,
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+
+        if not is_batched:  # type: ignore[possibly-undefined]
+            output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+            hidden = hidden.squeeze(1)  # undo the unsqueeze(1) applied to hx for unbatched input
+
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+
+# XXX: LSTM and GRU are implemented differently from RNNBase because:
+# 1. we want to support nn.LSTM and nn.GRU in TorchScript, and TorchScript in
+#    its current state cannot support the Python Union type or Any type
+# 2. TorchScript static typing does not allow a Function or Callable type in
+#    Dict values, so we have to call _VF separately instead of using _rnn_impls
+# 3. This is only a temporary, transitional arrangement, adopted so that the
+#    change could land in time for the release
+#
+# More discussion details in https://github.com/pytorch/pytorch/pull/23266
+#
+# TODO: remove the overriding implementations for LSTM and GRU when TorchScript
+# supports expressing these two modules generally.
+
+
+class LSTM(RNNBase):
+    r"""__init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,proj_size=0,device=None,dtype=None)
+
+    Apply a multi-layer long short-term memory (LSTM) RNN to an input sequence.
+    For each element in the input sequence, each layer computes the following
+    function:
+
+    .. math::
+        \begin{array}{ll} \\
+            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
+            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
+            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
+            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
+            c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
+            h_t = o_t \odot \tanh(c_t) \\
+        \end{array}
+
+    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
+    state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}`
+    is the hidden state of the layer at time `t-1` or the initial hidden
+    state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
+    :math:`o_t` are the input, forget, cell, and output gates, respectively.
+    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
+    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
+    variable which is :math:`0` with probability :attr:`dropout`.
+
+    If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes
+    the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from
+    ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly).
+    Second, the output hidden state of each layer will be multiplied by a learnable projection
+    matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output
+    of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact
+    dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
+            would mean stacking two LSTMs together to form a `stacked LSTM`,
+            with the second LSTM taking in outputs of the first LSTM and
+            computing the final results. Default: 1
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        batch_first: If ``True``, then the input and output tensors are provided
+            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
+            Note that this does not apply to hidden or cell states. See the
+            Inputs/Outputs sections below for details.  Default: ``False``
+        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
+            LSTM layer except the last layer, with dropout probability equal to
+            :attr:`dropout`. Default: 0
+        bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
+        proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0
+
+    Inputs: input, (h_0, c_0)
+        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
+          :math:`(L, N, H_{in})` when ``batch_first=False`` or
+          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
+          the input sequence.  The input can also be a packed variable length sequence.
+          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
+          :func:`torch.nn.utils.rnn.pack_sequence` for details.
+        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
+          initial hidden state for each element in the input sequence.
+          Defaults to zeros if (h_0, c_0) is not provided.
+        * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
+          initial cell state for each element in the input sequence.
+          Defaults to zeros if (h_0, c_0) is not provided.
+
+        where:
+
+        .. math::
+            \begin{aligned}
+                N ={} & \text{batch size} \\
+                L ={} & \text{sequence length} \\
+                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
+                H_{in} ={} & \text{input\_size} \\
+                H_{cell} ={} & \text{hidden\_size} \\
+                H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\
+            \end{aligned}
+
+    Outputs: output, (h_n, c_n)
+        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
+          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
+          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
+          `(h_t)` from the last layer of the LSTM, for each `t`. If a
+          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
+          will also be a packed sequence. When ``bidirectional=True``, `output` will contain
+          a concatenation of the forward and reverse hidden states at each time step in the sequence.
+        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
+          final hidden state for each element in the sequence. When ``bidirectional=True``,
+          `h_n` will contain a concatenation of the final forward and reverse hidden states, respectively.
+        * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
+          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
+          final cell state for each element in the sequence. When ``bidirectional=True``,
+          `c_n` will contain a concatenation of the final forward and reverse cell states, respectively.
+
+    Attributes:
+        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
+            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`. If
+            ``proj_size > 0`` was specified, the shape will be
+            `(4*hidden_size, num_directions * proj_size)` for `k > 0`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
+            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0``
+            was specified, the shape will be `(4*hidden_size, proj_size)`.
+        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
+            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
+            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
+        weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer
+            of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was
+            specified.
+        weight_ih_l[k]_reverse: Analogous to `weight_ih_l[k]` for the reverse direction.
+            Only present when ``bidirectional=True``.
+        weight_hh_l[k]_reverse:  Analogous to `weight_hh_l[k]` for the reverse direction.
+            Only present when ``bidirectional=True``.
+        bias_ih_l[k]_reverse:  Analogous to `bias_ih_l[k]` for the reverse direction.
+            Only present when ``bidirectional=True``.
+        bias_hh_l[k]_reverse:  Analogous to `bias_hh_l[k]` for the reverse direction.
+            Only present when ``bidirectional=True``.
+        weight_hr_l[k]_reverse:  Analogous to `weight_hr_l[k]` for the reverse direction.
+            Only present when ``bidirectional=True`` and ``proj_size > 0`` was specified.
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    .. note::
+        For bidirectional LSTMs, forward and backward are directions 0 and 1 respectively.
+        Example of splitting the output layers when ``batch_first=False``:
+        ``output.view(seq_len, batch, num_directions, hidden_size)``.
+
+    .. note::
+        For bidirectional LSTMs, `h_n` is not equivalent to the last element of `output`; the
+        former contains the final forward and reverse hidden states, while the latter contains the
+        final forward hidden state and the initial reverse hidden state.
+
+    .. note::
+        ``batch_first`` argument is ignored for unbatched inputs.
+
+    .. note::
+        ``proj_size`` should be smaller than ``hidden_size``.
+
+    .. include:: ../cudnn_rnn_determinism.rst
+
+    .. include:: ../cudnn_persistent_rnn.rst
+
+    Examples::
+
+        >>> rnn = nn.LSTM(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> c0 = torch.randn(2, 3, 20)
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+    """
+
+    @overload  # typing-only signature that spells out the accepted constructor arguments
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        proj_size: int = 0,
+        device=None,
+        dtype=None,
+    ) -> None: ...
+
+    @overload  # typing-only catch-all signature for arbitrary positional/keyword use
+    def __init__(self, *args, **kwargs) -> None: ...
+
+    def __init__(self, *args, **kwargs):  # runtime constructor behind both overloads
+        super().__init__("LSTM", *args, **kwargs)  # delegate to RNNBase with "LSTM" as the leading argument
+
+    def get_expected_cell_size(  # shape that check_forward_args requires of the cell state hidden[1]
+        self, input: Tensor, batch_sizes: Tensor | None
+    ) -> tuple[int, int, int]:
+        if batch_sizes is not None:  # forward only sets batch_sizes for PackedSequence input
+            mini_batch = int(batch_sizes[0])  # batch size is read from the first entry
+        else:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)  # batch axis depends on batch_first
+        num_directions = 2 if self.bidirectional else 1
+        expected_hidden_size = (
+            self.num_layers * num_directions,
+            mini_batch,
+            self.hidden_size,  # cell width is hidden_size (forward allocates c_0 the same way)
+        )
+        return expected_hidden_size
+
+    # Validates `input` and both state tensors. In the future, we should prevent mypy from applying
+    # contravariance rules here (hence the ignore). See torch/nn/modules/module.py::_forward_unimplemented
+    def check_forward_args(
+        self,
+        input: Tensor,
+        hidden: tuple[Tensor, Tensor],  # type: ignore[override]
+        batch_sizes: Tensor | None,
+    ) -> None:
+        self.check_input(input, batch_sizes)  # input check; helper is not defined in this class body
+        self.check_hidden_size(
+            hidden[0],  # hidden state h
+            self.get_expected_hidden_size(input, batch_sizes),
+            "Expected hidden[0] size {}, got {}",
+        )
+        self.check_hidden_size(
+            hidden[1],  # cell state c
+            self.get_expected_cell_size(input, batch_sizes),
+            "Expected hidden[1] size {}, got {}",
+        )
+
+    # Applies `permutation` to both tensors of the (h, c) pair. Override ignore: see torch/nn/modules/module.py::_forward_unimplemented
+    def permute_hidden(  # type: ignore[override]
+        self,
+        hx: tuple[Tensor, Tensor],
+        permutation: Tensor | None,
+    ) -> tuple[Tensor, Tensor]:
+        if permutation is None:  # no permutation supplied: hand the pair back untouched
+            return hx
+        return _apply_permutation(hx[0], permutation), _apply_permutation(  # same permutation for h and c
+            hx[1], permutation
+        )
+
+    # Typing/TorchScript overload for plain Tensor input; see torch/nn/modules/module.py::_forward_unimplemented
+    @overload  # type: ignore[override]
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: Tensor,
+        hx: tuple[Tensor, Tensor] | None = None,
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:  # noqa: F811
+        pass
+
+    # Typing/TorchScript overload for PackedSequence input; see torch/nn/modules/module.py::_forward_unimplemented
+    @overload
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: PackedSequence,
+        hx: tuple[Tensor, Tensor] | None = None,
+    ) -> tuple[PackedSequence, tuple[Tensor, Tensor]]:  # noqa: F811
+        pass
+
+    def forward(self, input, hx=None):  # noqa: F811
+        self._update_flat_weights()  # helper defined outside this chunk; presumably refreshes self._flat_weights
+
+        orig_input = input  # keep the caller's object: `input` is rebound below
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        batch_sizes = None  # stays None unless a PackedSequence is unpacked below
+        num_directions = 2 if self.bidirectional else 1
+        real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size  # width used for h_0
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input  # unpack the four PackedSequence fields
+            max_batch_size = batch_sizes[0]  # batch dimension used for the default zero states
+            if hx is None:
+                h_zeros = torch.zeros(  # default h_0: zeros of width real_hidden_size
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    real_hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+                c_zeros = torch.zeros(  # default c_0: zeros of width hidden_size
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+                hx = (h_zeros, c_zeros)
+            else:
+                # Each batch of the hidden state should match the input sequence that
+                # the user believes he/she is passing in.
+                hx = self.permute_hidden(hx, sorted_indices)
+        else:
+            if input.dim() not in (2, 3):  # only unbatched (2-D) or batched (3-D) tensors are accepted
+                raise ValueError(
+                    f"LSTM: Expected input to be 2D or 3D, got {input.dim()}D instead"
+                )
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)  # insert a batch dimension of size 1
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None  # no packing, hence no reordering indices
+            unsorted_indices = None
+            if hx is None:
+                h_zeros = torch.zeros(  # default h_0: zeros of width real_hidden_size
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    real_hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+                c_zeros = torch.zeros(  # default c_0: zeros of width hidden_size
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+                hx = (h_zeros, c_zeros)
+                self.check_forward_args(input, hx, batch_sizes)
+            else:
+                if is_batched:
+                    if hx[0].dim() != 3 or hx[1].dim() != 3:  # states must match the input's batchedness
+                        msg = (
+                            "For batched 3-D input, hx and cx should "
+                            f"also be 3-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors"
+                        )
+                        raise RuntimeError(msg)
+                else:
+                    if hx[0].dim() != 2 or hx[1].dim() != 2:
+                        msg = (
+                            "For unbatched 2-D input, hx and cx should "
+                            f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors"
+                        )
+                        raise RuntimeError(msg)
+                    hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1))  # give both states a batch dimension of size 1
+                # Each batch of the hidden state should match the input sequence that
+                # the user believes he/she is passing in.
+                self.check_forward_args(input, hx, batch_sizes)
+                hx = self.permute_hidden(hx, sorted_indices)  # returns hx unchanged here: sorted_indices is None
+
+        if batch_sizes is None:  # plain tensor input: call form that takes batch_first
+            result = _VF.lstm(
+                input,
+                hx,
+                self._flat_weights,  # type: ignore[arg-type]
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+                self.batch_first,
+            )
+        else:  # packed input: call form that takes batch_sizes instead of batch_first
+            result = _VF.lstm(
+                input,
+                batch_sizes,
+                hx,
+                self._flat_weights,  # type: ignore[arg-type]
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+            )
+        output = result[0]
+        hidden = result[1:]  # remaining entries are used below as the (h, c) state pair
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(  # re-wrap the output with the input's packing metadata
+                output,
+                batch_sizes,
+                sorted_indices,
+                unsorted_indices,
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1))  # drop the batch dimension added above
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+
+class GRU(RNNBase):
+    r"""__init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)
+
+    Apply a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
+    For each element in the input sequence, each layer computes the following
+    function:
+
+    .. math::
+        \begin{array}{ll}
+            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+            n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
+            h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)}
+        \end{array}
+
+    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
+    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
+    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
+    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
+    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
+    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
+    variable which is :math:`0` with probability :attr:`dropout`.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
+            would mean stacking two GRUs together to form a `stacked GRU`,
+            with the second GRU taking in outputs of the first GRU and
+            computing the final results. Default: 1
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        batch_first: If ``True``, then the input and output tensors are provided
+            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
+            Note that this does not apply to hidden or cell states. See the
+            Inputs/Outputs sections below for details.  Default: ``False``
+        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
+            GRU layer except the last layer, with dropout probability equal to
+            :attr:`dropout`. Default: 0
+        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``
+
+    Inputs: input, h_0
+        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
+          :math:`(L, N, H_{in})` when ``batch_first=False`` or
+          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
+          the input sequence.  The input can also be a packed variable length sequence.
+          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
+          :func:`torch.nn.utils.rnn.pack_sequence` for details.
+        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
+          :math:`(D * \text{num\_layers}, N, H_{out})`
+          containing the initial hidden state for the input sequence. Defaults to zeros if not provided.
+
+        where:
+
+        .. math::
+            \begin{aligned}
+                N ={} & \text{batch size} \\
+                L ={} & \text{sequence length} \\
+                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
+                H_{in} ={} & \text{input\_size} \\
+                H_{out} ={} & \text{hidden\_size}
+            \end{aligned}
+
+    Outputs: output, h_n
+        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
+          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
+          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
+          `(h_t)` from the last layer of the GRU, for each `t`. If a
+          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
+          will also be a packed sequence.
+        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
+          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
+          for the input sequence.
+
+    Attributes:
+        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
+        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    .. note::
+        For bidirectional GRUs, forward and backward are directions 0 and 1 respectively.
+        Example of splitting the output layers when ``batch_first=False``:
+        ``output.view(seq_len, batch, num_directions, hidden_size)``.
+
+    .. note::
+        ``batch_first`` argument is ignored for unbatched inputs.
+
+    .. note::
+        The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks.
+        In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the
+        previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix
+        `W` and addition of bias:
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn})
+            \end{aligned}
+
+        This is in contrast to PyTorch implementation, which is done after :math:`W_{hn} h_{(t-1)}`
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn}))
+            \end{aligned}
+
+        This implementation differs on purpose for efficiency.
+
+    .. include:: ../cudnn_persistent_rnn.rst
+
+    Examples::
+
+        >>> rnn = nn.GRU(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> output, hn = rnn(input, h0)
+    """
+
+    @overload  # typing-only signature that spells out the accepted constructor arguments
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = False,
+        dropout: float = 0.0,
+        bidirectional: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None: ...
+
+    @overload  # typing-only catch-all signature for arbitrary positional/keyword use
+    def __init__(self, *args, **kwargs) -> None: ...
+
+    def __init__(self, *args, **kwargs):  # runtime constructor behind both overloads
+        if "proj_size" in kwargs:  # rejects any keyword proj_size; positional arguments are not inspected here
+            raise ValueError(
+                "proj_size argument is only supported for LSTM, not RNN or GRU"
+            )
+        super().__init__("GRU", *args, **kwargs)  # delegate to RNNBase with "GRU" as the leading argument
+
+    @overload  # type: ignore[override]
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: Tensor,
+        hx: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor]:  # noqa: F811
+        pass
+
+    @overload
+    @torch._jit_internal._overload_method  # noqa: F811
+    def forward(
+        self,
+        input: PackedSequence,
+        hx: Tensor | None = None,
+    ) -> tuple[PackedSequence, Tensor]:  # noqa: F811
+        pass
+
+    def forward(self, input, hx=None):  # noqa: F811
+        self._update_flat_weights()  # helper defined outside this chunk; presumably refreshes self._flat_weights
+
+        orig_input = input  # keep the caller's object: `input` is rebound below
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input  # unpack the four PackedSequence fields
+            max_batch_size = batch_sizes[0]  # batch dimension used for the default zero state
+            if hx is None:
+                num_directions = 2 if self.bidirectional else 1
+                hx = torch.zeros(  # default h_0: zeros matching input dtype/device
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+            else:
+                # Each batch of the hidden state should match the input sequence that
+                # the user believes he/she is passing in.
+                hx = self.permute_hidden(hx, sorted_indices)
+        else:
+            batch_sizes = None  # marks the non-packed path for the _VF.gru dispatch below
+            if input.dim() not in (2, 3):  # only unbatched (2-D) or batched (3-D) tensors are accepted
+                raise ValueError(
+                    f"GRU: Expected input to be 2D or 3D, got {input.dim()}D instead"
+                )
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)  # insert a batch dimension of size 1
+                if hx is not None:
+                    if hx.dim() != 2:  # an unbatched input requires an unbatched state
+                        raise RuntimeError(
+                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor"
+                        )
+                    hx = hx.unsqueeze(1)  # give the state a batch dimension of size 1
+            else:
+                if hx is not None and hx.dim() != 3:  # a batched input requires a batched state
+                    raise RuntimeError(
+                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor"
+                    )
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None  # no packing, hence no reordering indices
+            unsorted_indices = None
+            if hx is None:
+                num_directions = 2 if self.bidirectional else 1
+                hx = torch.zeros(  # default h_0: zeros matching input dtype/device
+                    self.num_layers * num_directions,
+                    max_batch_size,
+                    self.hidden_size,
+                    dtype=input.dtype,
+                    device=input.device,
+                )
+            else:
+                # Each batch of the hidden state should match the input sequence that
+                # the user believes he/she is passing in.
+                hx = self.permute_hidden(hx, sorted_indices)  # called with sorted_indices set to None on this path
+
+        self.check_forward_args(input, hx, batch_sizes)  # runs for both packed and non-packed input
+        if batch_sizes is None:  # plain tensor input: call form that takes batch_first
+            result = _VF.gru(
+                input,
+                hx,
+                self._flat_weights,  # type: ignore[arg-type]
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+                self.batch_first,
+            )
+        else:  # packed input: call form that takes batch_sizes instead of batch_first
+            result = _VF.gru(
+                input,
+                batch_sizes,
+                hx,
+                self._flat_weights,  # type: ignore[arg-type]
+                self.bias,
+                self.num_layers,
+                self.dropout,
+                self.training,
+                self.bidirectional,
+            )
+        output = result[0]
+        hidden = result[1]  # single state tensor (no cell state for GRU)
+
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(  # re-wrap the output with the input's packing metadata
+                output,
+                batch_sizes,
+                sorted_indices,
+                unsorted_indices,
+            )
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = hidden.squeeze(1)  # drop the batch dimension added above
+
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+
+class RNNCellBase(Module):  # allocates and initialises the weight/bias parameters; RNNCell subclasses it
+    __constants__ = ["input_size", "hidden_size", "bias"]
+
+    input_size: int
+    hidden_size: int
+    bias: bool
+    weight_ih: Tensor
+    weight_hh: Tensor
+    # WARNING: bias_ih and bias_hh purposely not defined here.
+    # See https://github.com/pytorch/pytorch/issues/39670
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool,
+        num_chunks: int,  # multiplier on hidden_size for the first dimension of every parameter
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every torch.empty call below
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        self.weight_ih = Parameter(  # shape (num_chunks * hidden_size, input_size)
+            torch.empty((num_chunks * hidden_size, input_size), **factory_kwargs)
+        )
+        self.weight_hh = Parameter(  # shape (num_chunks * hidden_size, hidden_size)
+            torch.empty((num_chunks * hidden_size, hidden_size), **factory_kwargs)
+        )
+        if bias:
+            self.bias_ih = Parameter(  # shape (num_chunks * hidden_size,)
+                torch.empty(num_chunks * hidden_size, **factory_kwargs)
+            )
+            self.bias_hh = Parameter(  # shape (num_chunks * hidden_size,)
+                torch.empty(num_chunks * hidden_size, **factory_kwargs)
+            )
+        else:
+            self.register_parameter("bias_ih", None)  # register the bias names with a None value
+            self.register_parameter("bias_hh", None)
+
+        self.reset_parameters()  # torch.empty leaves values uninitialised, so fill them now
+
+    def extra_repr(self) -> str:  # builds the argument summary string from instance attributes
+        s = "{input_size}, {hidden_size}"
+        if "bias" in self.__dict__ and self.bias is not True:  # mention bias only when it is not the literal True
+            s += ", bias={bias}"
+        if "nonlinearity" in self.__dict__ and self.nonlinearity != "tanh":  # only when set and different from "tanh"
+            s += ", nonlinearity={nonlinearity}"
+        return s.format(**self.__dict__)  # placeholders are filled from the instance __dict__
+
+    def reset_parameters(self) -> None:  # re-draws every parameter from a uniform range
+        stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0  # 0 avoids dividing by zero
+        for weight in self.parameters():  # iterates all parameters, biases included
+            init.uniform_(weight, -stdv, stdv)
+
+
+class RNNCell(RNNCellBase):
+    r"""An Elman RNN cell with tanh or ReLU non-linearity.
+
+    .. math::
+
+        h' = \tanh(W_{ih} x + b_{ih}  +  W_{hh} h + b_{hh})
+
+    If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
+
+    Inputs: input, hidden
+        - **input**: tensor containing input features
+        - **hidden**: tensor containing the initial hidden state
+          Defaults to zero if not provided.
+
+    Outputs: h'
+        - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
+          for each element in the batch
+
+    Shape:
+        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
+          :math:`H_{in}` = `input_size`.
+        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
+          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
+        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.
+
+    Attributes:
+        weight_ih: the learnable input-hidden weights, of shape
+            `(hidden_size, input_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(hidden_size, hidden_size)`
+        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
+        bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    Examples::
+
+        >>> rnn = nn.RNNCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+
+    __constants__ = ["input_size", "hidden_size", "bias", "nonlinearity"]
+    nonlinearity: str
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        nonlinearity: str = "tanh",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
+        self.nonlinearity = nonlinearity
+
+    def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor:
+        if input.dim() not in (1, 2):
+            raise ValueError(
+                f"RNNCell: Expected input to be 1D or 2D, got {input.dim()}D instead"
+            )
+        if hx is not None and hx.dim() not in (1, 2):
+            raise ValueError(
+                f"RNNCell: Expected hidden to be 1D or 2D, got {hx.dim()}D instead"
+            )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+        else:
+            hx = hx.unsqueeze(0) if not is_batched else hx
+
+        if self.nonlinearity == "tanh":
+            ret = _VF.rnn_tanh_cell(
+                input,
+                hx,
+                self.weight_ih,
+                self.weight_hh,
+                self.bias_ih,
+                self.bias_hh,
+            )
+        elif self.nonlinearity == "relu":
+            ret = _VF.rnn_relu_cell(
+                input,
+                hx,
+                self.weight_ih,
+                self.weight_hh,
+                self.bias_ih,
+                self.bias_hh,
+            )
+        else:
+            ret = input  # TODO: remove when jit supports exception flow
+            raise RuntimeError(f"Unknown nonlinearity: {self.nonlinearity}")
+
+        if not is_batched:
+            ret = ret.squeeze(0)
+
+        return ret
+
+
+class LSTMCell(RNNCellBase):
+    r"""A long short-term memory (LSTM) cell.
+
+    .. math::
+
+        \begin{array}{ll}
+        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
+        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
+        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
+        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
+        c' = f \odot c + i \odot g \\
+        h' = o \odot \tanh(c') \\
+        \end{array}
+
+    where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and
+            `b_hh`. Default: ``True``
+
+    Inputs: input, (h_0, c_0)
+        - **input** of shape `(batch, input_size)` or `(input_size)`: tensor containing input features
+        - **h_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial hidden state
+        - **c_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial cell state
+
+          If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.
+
+    Outputs: (h_1, c_1)
+        - **h_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next hidden state
+        - **c_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next cell state
+
+    Attributes:
+        weight_ih: the learnable input-hidden weights, of shape
+            `(4*hidden_size, input_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(4*hidden_size, hidden_size)`
+        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
+        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    Examples::
+
+        >>> rnn = nn.LSTMCell(10, 20)  # (input_size, hidden_size)
+        >>> input = torch.randn(2, 3, 10)  # (time_steps, batch, input_size)
+        >>> hx = torch.randn(3, 20)  # (batch, hidden_size)
+        >>> cx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(input.size()[0]):
+        ...     hx, cx = rnn(input[i], (hx, cx))
+        ...     output.append(hx)
+        >>> output = torch.stack(output, dim=0)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)
+
+    def forward(
+        self, input: Tensor, hx: tuple[Tensor, Tensor] | None = None
+    ) -> tuple[Tensor, Tensor]:
+        if input.dim() not in (1, 2):
+            raise ValueError(
+                f"LSTMCell: Expected input to be 1D or 2D, got {input.dim()}D instead"
+            )
+        if hx is not None:
+            for idx, value in enumerate(hx):
+                if value.dim() not in (1, 2):
+                    raise ValueError(
+                        f"LSTMCell: Expected hx[{idx}] to be 1D or 2D, got {value.dim()}D instead"
+                    )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            zeros = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+            hx = (zeros, zeros)
+        else:
+            hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx
+
+        ret = _VF.lstm_cell(
+            input,
+            hx,
+            self.weight_ih,
+            self.weight_hh,
+            self.bias_ih,
+            self.bias_hh,
+        )
+
+        if not is_batched:
+            ret = (ret[0].squeeze(0), ret[1].squeeze(0))
+        return ret
+
+
+class GRUCell(RNNCellBase):
+    r"""A gated recurrent unit (GRU) cell.
+
+    .. math::
+
+        \begin{array}{ll}
+        r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\
+        z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\
+        n = \tanh(W_{in} x + b_{in} + r \odot (W_{hn} h + b_{hn})) \\
+        h' = (1 - z) \odot n + z \odot h
+        \end{array}
+
+    where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and
+            `b_hh`. Default: ``True``
+
+    Inputs: input, hidden
+        - **input** : tensor containing input features
+        - **hidden** : tensor containing the initial hidden
+          state for each element in the batch.
+          Defaults to zero if not provided.
+
+    Outputs: h'
+        - **h'** : tensor containing the next hidden state
+          for each element in the batch
+
+    Shape:
+        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
+          :math:`H_{in}` = `input_size`.
+        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
+          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
+        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.
+
+    Attributes:
+        weight_ih: the learnable input-hidden weights, of shape
+            `(3*hidden_size, input_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(3*hidden_size, hidden_size)`
+        bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
+        bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+
+    Examples::
+
+        >>> rnn = nn.GRUCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)
+
+    def forward(self, input: Tensor, hx: Tensor | None = None) -> Tensor:
+        if input.dim() not in (1, 2):
+            raise ValueError(
+                f"GRUCell: Expected input to be 1D or 2D, got {input.dim()}D instead"
+            )
+        if hx is not None and hx.dim() not in (1, 2):
+            raise ValueError(
+                f"GRUCell: Expected hidden to be 1D or 2D, got {hx.dim()}D instead"
+            )
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            hx = torch.zeros(
+                input.size(0), self.hidden_size, dtype=input.dtype, device=input.device
+            )
+        else:
+            hx = hx.unsqueeze(0) if not is_batched else hx
+
+        ret = _VF.gru_cell(
+            input,
+            hx,
+            self.weight_ih,
+            self.weight_hh,
+            self.bias_ih,
+            self.bias_hh,
+        )
+
+        if not is_batched:
+            ret = ret.squeeze(0)
+
+        return ret
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec531abce695374b919fbf92d4863ce73da515f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/sparse.py
@@ -0,0 +1,549 @@
+# mypy: allow-untyped-defs
+
+import torch
+from torch import Tensor
+from torch.nn import functional as F, init
+from torch.nn.parameter import Parameter
+
+from .module import Module
+
+
+__all__ = ["Embedding", "EmbeddingBag"]
+
+
+class Embedding(Module):
+    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+    This module is often used to store word embeddings and retrieve them using indices.
+    The input to the module is a list of indices, and the output is the corresponding
+    word embeddings.
+
+    Args:
+        num_embeddings (int): size of the dictionary of embeddings
+        embedding_dim (int): the size of each embedding vector
+        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
+                                     therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
+                                     i.e. it remains as a fixed "pad". For a newly constructed Embedding,
+                                     the embedding vector at :attr:`padding_idx` will default to all zeros,
+                                     but can be updated to another value to be used as the padding vector.
+        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
+                                    is renormalized to have norm :attr:`max_norm`.
+        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse of frequency of
+                                                the words in the mini-batch. Default ``False``.
+        sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor.
+                                 See Notes for more details regarding sparse gradients.
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
+                         initialized from :math:`\mathcal{N}(0, 1)`
+
+    Shape:
+        - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract
+        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
+
+    .. note::
+        Keep in mind that only a limited number of optimizers support
+        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
+        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
+
+    .. note::
+        When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the
+        :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be
+        modified in-place, performing a differentiable operation on ``Embedding.weight`` before
+        calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when
+        :attr:`max_norm` is not ``None``. For example::
+
+            n, d, m = 3, 5, 7
+            embedding = nn.Embedding(n, d, max_norm=1.0)
+            W = torch.randn((m, d), requires_grad=True)
+            idx = torch.tensor([1, 2])
+            a = (
+                embedding.weight.clone() @ W.t()
+            )  # weight must be cloned for this to be differentiable
+            b = embedding(idx) @ W.t()  # modifies weight in-place
+            out = a.unsqueeze(0) + b.unsqueeze(1)
+            loss = out.sigmoid().prod()
+            loss.backward()
+
+    Examples::
+
+        >>> # an Embedding module containing 10 tensors of size 3
+        >>> embedding = nn.Embedding(10, 3)
+        >>> # a batch of 2 samples of 4 indices each
+        >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> embedding(input)
+        tensor([[[-0.0251, -1.6902,  0.7172],
+                 [-0.6431,  0.0748,  0.6969],
+                 [ 1.4970,  1.3448, -0.9685],
+                 [-0.3677, -2.7265, -0.1685]],
+
+                [[ 1.4970,  1.3448, -0.9685],
+                 [ 0.4362, -0.4004,  0.9400],
+                 [-0.6431,  0.0748,  0.6969],
+                 [ 0.9124, -2.3616,  1.1151]]])
+
+
+        >>> # example with padding_idx
+        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
+        >>> input = torch.LongTensor([[0, 2, 0, 5]])
+        >>> embedding(input)
+        tensor([[[ 0.0000,  0.0000,  0.0000],
+                 [ 0.1535, -2.0309,  0.9315],
+                 [ 0.0000,  0.0000,  0.0000],
+                 [-0.1655,  0.9897,  0.0635]]])
+
+        >>> # example of changing `pad` vector
+        >>> padding_idx = 0
+        >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
+        >>> embedding.weight
+        Parameter containing:
+        tensor([[ 0.0000,  0.0000,  0.0000],
+                [-0.7895, -0.7089, -0.0364],
+                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
+        >>> with torch.no_grad():
+        ...     embedding.weight[padding_idx] = torch.ones(3)
+        >>> embedding.weight
+        Parameter containing:
+        tensor([[ 1.0000,  1.0000,  1.0000],
+                [-0.7895, -0.7089, -0.0364],
+                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
+    """
+
+    __constants__ = [
+        "num_embeddings",
+        "embedding_dim",
+        "padding_idx",
+        "max_norm",
+        "norm_type",
+        "scale_grad_by_freq",
+        "sparse",
+    ]
+
+    num_embeddings: int
+    embedding_dim: int
+    padding_idx: int | None
+    max_norm: float | None
+    norm_type: float
+    scale_grad_by_freq: bool
+    weight: Tensor
+    freeze: bool
+    sparse: bool
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: int | None = None,
+        max_norm: float | None = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        _weight: Tensor | None = None,
+        _freeze: bool = False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert padding_idx < self.num_embeddings, (
+                    "Padding_idx must be within num_embeddings"
+                )
+            elif padding_idx < 0:
+                assert padding_idx >= -self.num_embeddings, (
+                    "Padding_idx must be within num_embeddings"
+                )
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        if _weight is None:
+            self.weight = Parameter(
+                torch.empty((num_embeddings, embedding_dim), **factory_kwargs),
+                requires_grad=not _freeze,
+            )
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+            self.weight = Parameter(_weight, requires_grad=not _freeze)
+
+        self.sparse = sparse
+
+    def reset_parameters(self) -> None:
+        init.normal_(self.weight)
+        self._fill_padding_idx_with_zero()
+
+    def _fill_padding_idx_with_zero(self) -> None:
+        if self.padding_idx is not None:
+            with torch.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(self, input: Tensor) -> Tensor:
+        return F.embedding(
+            input,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+
+    def extra_repr(self) -> str:
+        s = "{num_embeddings}, {embedding_dim}"
+        if self.padding_idx is not None:
+            s += ", padding_idx={padding_idx}"
+        if self.max_norm is not None:
+            s += ", max_norm={max_norm}"
+        if self.norm_type != 2:
+            s += ", norm_type={norm_type}"
+        if self.scale_grad_by_freq is not False:
+            s += ", scale_grad_by_freq={scale_grad_by_freq}"
+        if self.sparse is not False:
+            s += ", sparse=True"
+        return s.format(**self.__dict__)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        embeddings,
+        freeze=True,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        sparse=False,
+    ):
+        r"""Create Embedding instance from given 2-dimensional FloatTensor.
+
+        Args:
+            embeddings (Tensor): FloatTensor containing weights for the Embedding.
+                First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
+            freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process.
+                Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
+            padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
+                                         therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
+                                         i.e. it remains as a fixed "pad".
+            max_norm (float, optional): See module initialization documentation.
+            norm_type (float, optional): See module initialization documentation. Default ``2``.
+            scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``.
+            sparse (bool, optional): See module initialization documentation.
+
+        Examples::
+
+            >>> # FloatTensor containing pretrained weights
+            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
+            >>> embedding = nn.Embedding.from_pretrained(weight)
+            >>> # Get embeddings for index 1
+            >>> input = torch.LongTensor([1])
+            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+            >>> embedding(input)
+            tensor([[ 4.0000,  5.1000,  6.3000]])
+        """
+        assert embeddings.dim() == 2, (
+            "Embeddings parameter is expected to be 2-dimensional"
+        )
+        rows, cols = embeddings.shape
+        embedding = cls(
+            num_embeddings=rows,
+            embedding_dim=cols,
+            _weight=embeddings,
+            _freeze=freeze,
+            padding_idx=padding_idx,
+            max_norm=max_norm,
+            norm_type=norm_type,
+            scale_grad_by_freq=scale_grad_by_freq,
+            sparse=sparse,
+        )
+        return embedding
+
+
+class EmbeddingBag(Module):
+    r"""Compute sums or means of 'bags' of embeddings, without instantiating the intermediate embeddings.
+
+    For bags of constant length, no :attr:`per_sample_weights`, no indices equal to :attr:`padding_idx`,
+    and with 2D inputs, this class
+
+        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
+        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
+        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.
+
+    However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
+    operations.
+
+    EmbeddingBag also supports per-sample weights as an argument to the forward
+    pass. This scales the output of the Embedding before performing a weighted
+    reduction as specified by ``mode``. If :attr:`per_sample_weights` is passed, the
+    only supported ``mode`` is ``"sum"``, which computes a weighted sum according to
+    :attr:`per_sample_weights`.
+
+    Args:
+        num_embeddings (int): size of the dictionary of embeddings
+        embedding_dim (int): the size of each embedding vector
+        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
+                                    is renormalized to have norm :attr:`max_norm`.
+        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
+        scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of
+                                                the words in the mini-batch. Default ``False``.
+                                                Note: this option is not supported when ``mode="max"``.
+        mode (str, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
+                                 ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights`
+                                 into consideration. ``"mean"`` computes the average of the values
+                                 in the bag, ``"max"`` computes the max value over each bag.
+                                 Default: ``"mean"``
+        sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See
+                                 Notes for more details regarding sparse gradients. Note: this option is not
+                                 supported when ``mode="max"``.
+        include_last_offset (bool, optional): if ``True``, the size of offsets is equal to the number of bags + 1.
+                                              The last element is the size of the input, or the ending index position
+                                              of the last bag (sequence). This matches the CSR format. Ignored when
+                                              input is 2D. Default ``False``.
+        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the
+                                     gradient; therefore, the embedding vector at :attr:`padding_idx` is not updated
+                                     during training, i.e. it remains as a fixed "pad". For a newly constructed
+                                     EmbeddingBag, the embedding vector at :attr:`padding_idx` will default to all
+                                     zeros, but can be updated to another value to be used as the padding vector.
+                                     Note that the embedding vector at :attr:`padding_idx` is excluded from the
+                                     reduction.
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)`
+                         initialized from :math:`\mathcal{N}(0, 1)`.
+
+    Examples::
+
+        >>> # an EmbeddingBag module containing 10 tensors of size 3
+        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
+        >>> # a batch of 2 samples of 4 indices each
+        >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long)
+        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> embedding_sum(input, offsets)
+        tensor([[-0.8861, -5.4350, -0.0523],
+                [ 1.1306, -2.5798, -1.0044]])
+
+        >>> # Example with padding_idx
+        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2)
+        >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long)
+        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
+        >>> embedding_sum(input, offsets)
+        tensor([[ 0.0000,  0.0000,  0.0000],
+                [-0.7082,  3.2145, -2.6251]])
+
+        >>> # An EmbeddingBag can be loaded from an Embedding like so
+        >>> embedding = nn.Embedding(10, 3, padding_idx=2)
+        >>> embedding_sum = nn.EmbeddingBag.from_pretrained(
+                embedding.weight,
+                padding_idx=embedding.padding_idx,
+                mode='sum')
+    """
+
+    __constants__ = [
+        "num_embeddings",
+        "embedding_dim",
+        "max_norm",
+        "norm_type",
+        "scale_grad_by_freq",
+        "mode",
+        "sparse",
+        "include_last_offset",
+        "padding_idx",
+    ]
+
+    num_embeddings: int
+    embedding_dim: int
+    max_norm: float | None
+    norm_type: float
+    scale_grad_by_freq: bool
+    weight: Tensor
+    mode: str
+    sparse: bool
+    include_last_offset: bool
+    padding_idx: int | None
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        max_norm: float | None = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        mode: str = "mean",
+        sparse: bool = False,
+        _weight: Tensor | None = None,
+        include_last_offset: bool = False,
+        padding_idx: int | None = None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert padding_idx < self.num_embeddings, (
+                    "padding_idx must be within num_embeddings"
+                )
+            elif padding_idx < 0:
+                assert padding_idx >= -self.num_embeddings, (
+                    "padding_idx must be within num_embeddings"
+                )
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        if _weight is None:
+            self.weight = Parameter(
+                torch.empty((num_embeddings, embedding_dim), **factory_kwargs)
+            )
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+            self.weight = Parameter(_weight)
+        self.mode = mode
+        self.sparse = sparse
+        self.include_last_offset = include_last_offset
+
+    def reset_parameters(self) -> None:  # (re)initialize the weight in place
+        init.normal_(self.weight)  # called with init.normal_'s default arguments
+        self._fill_padding_idx_with_zero()  # must run after the random fill so the padding row ends up zero
+
+    def _fill_padding_idx_with_zero(self) -> None:  # zero the padding row of the weight, if one is configured
+        if self.padding_idx is not None:
+            with torch.no_grad():  # in-place edit of the parameter must not be recorded by autograd
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(
+        self,
+        input: Tensor,
+        offsets: Tensor | None = None,
+        per_sample_weights: Tensor | None = None,
+    ) -> Tensor:
+        """Forward pass of EmbeddingBag.
+
+        Args:
+            input (Tensor): Tensor containing bags of indices into the embedding matrix.
+            offsets (Tensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines
+                the starting index position of each bag (sequence) in :attr:`input`.
+            per_sample_weights (Tensor, optional): a tensor of float / double weights, or None
+                to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights`
+                must have exactly the same shape as input and is treated as having the same
+                :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``.
+
+        Returns:
+            Tensor output shape of `(B, embedding_dim)`.
+
+        .. note::
+
+            A few notes about ``input`` and ``offsets``:
+
+            - :attr:`input` and :attr:`offsets` have to be of the same type, either int or long
+
+            - If :attr:`input` is 2D of shape `(B, N)`, it will be treated as ``B`` bags (sequences)
+              each of fixed length ``N``, and this will return ``B`` values aggregated in a way
+              depending on the :attr:`mode`. :attr:`offsets` is ignored and required to be ``None`` in this case.
+
+            - If :attr:`input` is 1D of shape `(N)`, it will be treated as a concatenation of
+              multiple bags (sequences).  :attr:`offsets` is required to be a 1D tensor containing the
+              starting index positions of each bag in :attr:`input`. Therefore, for :attr:`offsets` of shape `(B)`,
+              :attr:`input` will be viewed as having ``B`` bags. Empty bags (i.e., having 0-length) will have
+              returned vectors filled by zeros.
+        """
+        return F.embedding_bag(  # all arguments are positional; order must match F.embedding_bag — TODO confirm
+            input,
+            self.weight,
+            offsets,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.mode,
+            self.sparse,
+            per_sample_weights,
+            self.include_last_offset,
+            self.padding_idx,  # already normalized to a non-negative index (or None) in __init__
+        )
+
+    def extra_repr(self) -> str:  # build the argument summary string for this module
+        s = "{num_embeddings}, {embedding_dim}"  # always shown
+        if self.max_norm is not None:  # the optional fields below are appended only when set / non-default
+            s += ", max_norm={max_norm}"
+        if self.norm_type != 2:
+            s += ", norm_type={norm_type}"
+        if self.scale_grad_by_freq is not False:
+            s += ", scale_grad_by_freq={scale_grad_by_freq}"
+        s += ", mode={mode}"  # mode is always shown, even at its default
+        if self.padding_idx is not None:
+            s += ", padding_idx={padding_idx}"
+        return s.format(**{k: repr(v) for k, v in self.__dict__.items()})  # placeholders filled with repr() of instance attributes
+
+    @classmethod  # alternate constructor: build the module around an existing weight matrix
+    def from_pretrained(
+        cls,
+        embeddings: Tensor,
+        freeze: bool = True,
+        max_norm: float | None = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        mode: str = "mean",
+        sparse: bool = False,
+        include_last_offset: bool = False,
+        padding_idx: int | None = None,
+    ) -> "EmbeddingBag":
+        r"""Create EmbeddingBag instance from given 2-dimensional FloatTensor.
+
+        Args:
+            embeddings (Tensor): FloatTensor containing weights for the EmbeddingBag.
+                First dimension is being passed to EmbeddingBag as 'num_embeddings', second as 'embedding_dim'.
+            freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process.
+                Equivalent to ``embeddingbag.weight.requires_grad = False``. Default: ``True``
+            max_norm (float, optional): See module initialization documentation. Default: ``None``
+            norm_type (float, optional): See module initialization documentation. Default ``2``.
+            scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``.
+            mode (str, optional): See module initialization documentation. Default: ``"mean"``
+            sparse (bool, optional): See module initialization documentation. Default: ``False``.
+            include_last_offset (bool, optional): See module initialization documentation. Default: ``False``.
+            padding_idx (int, optional): See module initialization documentation. Default: ``None``.
+
+        Examples::
+
+            >>> # FloatTensor containing pretrained weights
+            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
+            >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight)
+            >>> # Get embeddings for index 1
+            >>> input = torch.LongTensor([[1, 0]])
+            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+            >>> embeddingbag(input)
+            tensor([[ 2.5000,  3.7000,  4.6500]])
+        """
+        assert embeddings.dim() == 2, (  # NOTE: assert-based check; stripped under `python -O`
+            "Embeddings parameter is expected to be 2-dimensional"
+        )
+        rows, cols = embeddings.shape  # rows -> num_embeddings, cols -> embedding_dim
+        embeddingbag = cls(
+            num_embeddings=rows,
+            embedding_dim=cols,
+            _weight=embeddings,  # adopt the given tensor; __init__ then skips random initialization
+            max_norm=max_norm,
+            norm_type=norm_type,
+            scale_grad_by_freq=scale_grad_by_freq,
+            mode=mode,
+            sparse=sparse,
+            include_last_offset=include_last_offset,
+            padding_idx=padding_idx,
+        )
+        embeddingbag.weight.requires_grad = not freeze  # freeze=True turns off gradients for the weight
+        return embeddingbag
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6841e85ed6d2e423aa30e95b5b1d3e62f30ec9fb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/transformer.py
@@ -0,0 +1,1256 @@
+# mypy: allow-untyped-defs
+import copy
+import warnings
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.init import xavier_uniform_
+
+from .activation import MultiheadAttention
+from .container import ModuleList
+from .dropout import Dropout
+from .linear import Linear
+from .module import Module
+from .normalization import LayerNorm
+
+
+__all__ = [
+    "Transformer",
+    "TransformerEncoder",
+    "TransformerDecoder",
+    "TransformerEncoderLayer",
+    "TransformerDecoderLayer",
+]
+
+
+def _generate_square_subsequent_mask(
+    sz: int,  # side length of the square mask
+    device: torch.device | None = None,
+    dtype: torch.dtype | None = None,
+) -> Tensor:
+    r"""Generate a square causal mask for the sequence.
+
+    The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
+    """
+    return torch.triu(  # keeps -inf strictly above the main diagonal; everything else becomes 0
+        torch.full((sz, sz), float("-inf"), dtype=dtype, device=device),
+        diagonal=1,  # diagonal=1 excludes the main diagonal, so a position is never masked from itself
+    )
+
+
+def _get_seq_len(src: Tensor, batch_first: bool) -> int | None:  # sequence length of src, or None for nested input
+    if src.is_nested:  # no single sequence length is reported for a nested tensor
+        return None
+    else:
+        src_size = src.size()
+        if len(src_size) == 2:
+            # unbatched: S, E
+            return src_size[0]
+        else:
+            # batched: B, S, E if batch_first else S, B, E
+            seq_len_pos = 1 if batch_first else 0  # index of the sequence dimension
+            return src_size[seq_len_pos]
+
+
+class Transformer(Module):
+    r"""A basic transformer layer.
+
+
+    This Transformer layer implements the original Transformer architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer Transformer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build an efficient transformer layer from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    Args:
+        d_model: the number of expected features in the encoder/decoder inputs (default=512).
+        nhead: the number of heads in the multiheadattention models (default=8).
+        num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
+        num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
+        dim_feedforward: the dimension of the feedforward network model (default=2048).
+        dropout: the dropout value (default=0.1).
+        activation: the activation function of encoder/decoder intermediate layer, can be a string
+            ("relu" or "gelu") or a unary callable. Default: relu
+        custom_encoder: custom encoder (default=None).
+        custom_decoder: custom decoder (default=None).
+        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+        norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before
+            other attention and feedforward operations, otherwise after. Default: ``False`` (after).
+        bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
+            bias. Default: ``True``.
+
+    Examples:
+        >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
+        >>> src = torch.rand((10, 32, 512))
+        >>> tgt = torch.rand((20, 32, 512))
+        >>> out = transformer_model(src, tgt)
+
+    Note: A full example of applying the nn.Transformer module to a word language model is available at
+    https://github.com/pytorch/examples/tree/master/word_language_model
+    """
+
+    def __init__(  # build default encoder/decoder stacks unless custom ones are supplied
+        self,
+        d_model: int = 512,
+        nhead: int = 8,
+        num_encoder_layers: int = 6,
+        num_decoder_layers: int = 6,
+        dim_feedforward: int = 2048,
+        dropout: float = 0.1,
+        activation: str | Callable[[Tensor], Tensor] = F.relu,
+        custom_encoder: Any | None = None,
+        custom_decoder: Any | None = None,
+        layer_norm_eps: float = 1e-5,
+        batch_first: bool = False,
+        norm_first: bool = False,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every layer built below
+        super().__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
+
+        if custom_encoder is not None:  # a supplied encoder is used as-is
+            self.encoder = custom_encoder
+        else:
+            encoder_layer = TransformerEncoderLayer(
+                d_model,
+                nhead,
+                dim_feedforward,
+                dropout,
+                activation,
+                layer_norm_eps,
+                batch_first,
+                norm_first,
+                bias,
+                **factory_kwargs,
+            )
+            encoder_norm = LayerNorm(
+                d_model,
+                eps=layer_norm_eps,
+                bias=bias,
+                # pyrefly: ignore [bad-argument-type]
+                **factory_kwargs,
+            )
+            self.encoder = TransformerEncoder(  # num_encoder_layers layers followed by a final LayerNorm
+                encoder_layer, num_encoder_layers, encoder_norm
+            )
+
+        if custom_decoder is not None:  # a supplied decoder is used as-is
+            self.decoder = custom_decoder
+        else:
+            decoder_layer = TransformerDecoderLayer(
+                d_model,
+                nhead,
+                dim_feedforward,
+                dropout,
+                activation,
+                layer_norm_eps,
+                batch_first,
+                norm_first,
+                bias,
+                **factory_kwargs,
+            )
+            decoder_norm = LayerNorm(
+                d_model,
+                eps=layer_norm_eps,
+                bias=bias,
+                # pyrefly: ignore [bad-argument-type]
+                **factory_kwargs,
+            )
+            self.decoder = TransformerDecoder(  # num_decoder_layers layers with decoder_norm as the stack's norm
+                decoder_layer, num_decoder_layers, decoder_norm
+            )
+
+        self._reset_parameters()  # Xavier-uniform init of all params with dim > 1; presumably also hits custom encoder/decoder params — TODO confirm
+
+        self.d_model = d_model  # d_model and batch_first are read by forward() for its shape checks
+        self.nhead = nhead
+
+        self.batch_first = batch_first
+
+    def forward(
+        self,
+        src: Tensor,
+        tgt: Tensor,
+        src_mask: Tensor | None = None,
+        tgt_mask: Tensor | None = None,
+        memory_mask: Tensor | None = None,
+        src_key_padding_mask: Tensor | None = None,
+        tgt_key_padding_mask: Tensor | None = None,
+        memory_key_padding_mask: Tensor | None = None,
+        src_is_causal: bool | None = None,
+        tgt_is_causal: bool | None = None,
+        memory_is_causal: bool = False,
+    ) -> Tensor:
+        r"""Take in and process masked source/target sequences.
+
+        .. note::
+
+            If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a ``True`` value are
+            not allowed to participate in the attention,
+            which is the opposite of the definition for :attr:`attn_mask`
+            in :func:`torch.nn.functional.scaled_dot_product_attention`.
+
+        Args:
+            src: the sequence to the encoder (required).
+            tgt: the sequence to the decoder (required).
+            src_mask: the additive mask for the src sequence (optional).
+            tgt_mask: the additive mask for the tgt sequence (optional).
+            memory_mask: the additive mask for the encoder output (optional).
+            src_key_padding_mask: the Tensor mask for src keys per batch (optional).
+            tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional).
+            memory_key_padding_mask: the Tensor mask for memory keys per batch (optional).
+            src_is_causal: If specified, applies a causal mask as ``src_mask``.
+                Default: ``None``; try to detect a causal mask.
+                Warning:
+                ``src_is_causal`` provides a hint that ``src_mask`` is
+                the causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+            tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``.
+                Default: ``None``; try to detect a causal mask.
+                Warning:
+                ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
+                the causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+            memory_is_causal: If specified, applies a causal mask as
+                ``memory_mask``.
+                Default: ``False``.
+                Warning:
+                ``memory_is_causal`` provides a hint that
+                ``memory_mask`` is the causal mask. Providing incorrect
+                hints can result in incorrect execution, including
+                forward and backward compatibility.
+
+        Shape:
+            - src: :math:`(S, E)` for unbatched input, :math:`(S, N, E)` if `batch_first=False` or
+              `(N, S, E)` if `batch_first=True`.
+            - tgt: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
+              `(N, T, E)` if `batch_first=True`.
+            - src_mask: :math:`(S, S)` or :math:`(N\cdot\text{num\_heads}, S, S)`.
+            - tgt_mask: :math:`(T, T)` or :math:`(N\cdot\text{num\_heads}, T, T)`.
+            - memory_mask: :math:`(T, S)`.
+            - src_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
+            - tgt_key_padding_mask: :math:`(T)` for unbatched input otherwise :math:`(N, T)`.
+            - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
+
+            Note: [src/tgt/memory]_mask ensures that position :math:`i` is allowed to attend the unmasked
+            positions. If a BoolTensor is provided, positions with ``True``
+            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+            is provided, it will be added to the attention weight.
+            [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
+            the attention. If a BoolTensor is provided, the positions with the
+            value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+
+            - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
+              `(N, T, E)` if `batch_first=True`.
+
+            Note: Due to the multi-head attention architecture in the transformer model,
+            the output sequence length of a transformer is same as the input sequence
+            (i.e. target) length of the decoder.
+
+            where :math:`S` is the source sequence length, :math:`T` is the target sequence length, :math:`N` is the
+            batch size, :math:`E` is the feature number
+
+        Examples:
+            >>> # xdoctest: +SKIP
+            >>> output = transformer_model(
+            ...     src, tgt, src_mask=src_mask, tgt_mask=tgt_mask
+            ... )
+        """
+        is_batched = src.dim() == 3  # batch-size checks below apply only to 3-D (batched) src
+        if not self.batch_first and src.size(1) != tgt.size(1) and is_batched:  # batch dim is 1 here
+            raise RuntimeError("the batch number of src and tgt must be equal")
+        elif self.batch_first and src.size(0) != tgt.size(0) and is_batched:  # batch dim is 0 here
+            raise RuntimeError("the batch number of src and tgt must be equal")
+
+        if src.size(-1) != self.d_model or tgt.size(-1) != self.d_model:  # last dim is the feature dim
+            raise RuntimeError(
+                "the feature number of src and tgt must be equal to d_model"
+            )
+
+        memory = self.encoder(  # encoder output; handed to the decoder as `memory`
+            src,
+            mask=src_mask,
+            src_key_padding_mask=src_key_padding_mask,
+            is_causal=src_is_causal,
+        )
+        output = self.decoder(
+            tgt,
+            memory,
+            tgt_mask=tgt_mask,
+            memory_mask=memory_mask,
+            tgt_key_padding_mask=tgt_key_padding_mask,
+            memory_key_padding_mask=memory_key_padding_mask,
+            tgt_is_causal=tgt_is_causal,
+            memory_is_causal=memory_is_causal,
+        )
+        return output
+
+    @staticmethod  # public entry point; needs no instance state
+    def generate_square_subsequent_mask(
+        sz: int,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ) -> Tensor:
+        r"""Generate a square causal mask for the sequence.
+
+        The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
+        """
+        return _generate_square_subsequent_mask(sz, dtype=dtype, device=device)  # delegates to the module-level helper
+
+    def _reset_parameters(self) -> None:
+        r"""Initiate parameters in the transformer model."""
+        for p in self.parameters():  # walks every parameter registered on this module and its submodules
+            if p.dim() > 1:  # parameters with 0 or 1 dimensions keep their existing values
+                xavier_uniform_(p)  # in-place re-initialization
+
+
+class TransformerEncoder(Module):
+    r"""TransformerEncoder is a stack of N encoder layers.
+
+    This TransformerEncoder layer implements the original architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer Transformer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build efficient layers from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    .. warning::
+        All layers in the TransformerEncoder are initialized with the same parameters.
+        It is recommended to manually initialize the layers after creating the TransformerEncoder instance.
+
+    Args:
+        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
+        num_layers: the number of sub-encoder-layers in the encoder (required).
+        norm: the layer normalization component (optional).
+        enable_nested_tensor: if True, input will automatically be converted to a nested tensor
+            (and convert back on output). This will improve the overall performance of
+            TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
+
+    Examples:
+        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+        >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
+        >>> src = torch.rand(10, 32, 512)
+        >>> out = transformer_encoder(src)
+    """
+
+    __constants__ = ["norm"]
+
+    def __init__(  # store the layer stack and decide once whether the nested-tensor path may be used
+        self,
+        encoder_layer: "TransformerEncoderLayer",
+        num_layers: int,
+        norm: Module | None = None,
+        enable_nested_tensor: bool = True,
+        mask_check: bool = True,  # read in forward(): gates the left-alignment check of the padding mask
+    ) -> None:
+        super().__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        # this attribute saves the value provided at object construction
+        self.enable_nested_tensor = enable_nested_tensor
+        # this attribute controls whether nested tensors are used
+        self.use_nested_tensor = enable_nested_tensor
+        self.mask_check = mask_check
+
+        enc_layer = "encoder_layer"  # label used only to build the diagnostic messages below
+        why_not_sparsity_fast_path = ""  # stays empty only if encoder_layer passes every check below
+        if not isinstance(encoder_layer, torch.nn.TransformerEncoderLayer):
+            why_not_sparsity_fast_path = f"{enc_layer} was not TransformerEncoderLayer"
+        elif encoder_layer.norm_first:
+            why_not_sparsity_fast_path = f"{enc_layer}.norm_first was True"
+        elif not encoder_layer.self_attn.batch_first:
+            why_not_sparsity_fast_path = (
+                f"{enc_layer}.self_attn.batch_first was not True"
+                + "(use batch_first for better inference performance)"
+            )
+        elif not encoder_layer.self_attn._qkv_same_embed_dim:
+            why_not_sparsity_fast_path = (
+                f"{enc_layer}.self_attn._qkv_same_embed_dim was not True"
+            )
+        elif encoder_layer.self_attn.in_proj_bias is None:
+            why_not_sparsity_fast_path = f"{enc_layer}.self_attn was passed bias=False"
+        elif not encoder_layer.activation_relu_or_gelu:
+            why_not_sparsity_fast_path = (
+                f"{enc_layer}.activation_relu_or_gelu was not True"
+            )
+        elif encoder_layer.norm1.eps != encoder_layer.norm2.eps:
+            why_not_sparsity_fast_path = (
+                f"{enc_layer}.norm1.eps was not equal to {enc_layer}.norm2.eps"
+            )
+        elif encoder_layer.self_attn.num_heads % 2 == 1:
+            why_not_sparsity_fast_path = f"{enc_layer}.self_attn.num_heads is odd"
+
+        if enable_nested_tensor and why_not_sparsity_fast_path:  # requested but not possible: warn and disable
+            warnings.warn(
+                f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}",
+                stacklevel=2,
+            )
+            self.use_nested_tensor = False
+
+    def forward(
+        self,
+        src: Tensor,
+        mask: Tensor | None = None,
+        src_key_padding_mask: Tensor | None = None,
+        is_causal: bool | None = None,
+    ) -> Tensor:
+        r"""Pass the input through the encoder layers in turn.
+
+        Args:
+            src: the sequence to the encoder (required).
+            mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+            is_causal: If specified, applies a causal mask as ``mask``.
+                Default: ``None``; try to detect a causal mask.
+                Warning:
+                ``is_causal`` provides a hint that ``mask`` is the
+                causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+
+        Shape:
+            see the docs in :class:`~torch.nn.Transformer`.
+        """
+        src_key_padding_mask = F._canonical_mask(  # normalized via F._canonical_mask with target_type=src.dtype
+            mask=src_key_padding_mask,
+            mask_name="src_key_padding_mask",
+            other_type=F._none_or_dtype(mask),
+            other_name="mask",
+            target_type=src.dtype,
+        )
+
+        mask = F._canonical_mask(
+            mask=mask,
+            mask_name="mask",
+            other_type=None,
+            other_name="",
+            target_type=src.dtype,
+            check_other=False,
+        )
+
+        output = src
+        convert_to_nested = False  # set True only if the input is converted to a nested tensor below
+        first_layer = self.layers[0]  # the eligibility checks below inspect only the first layer
+        src_key_padding_mask_for_layers = src_key_padding_mask
+        why_not_sparsity_fast_path = ""  # empty string means the nested-tensor path is still eligible
+        str_first_layer = "self.layers[0]"
+        batch_first = first_layer.self_attn.batch_first
+        is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()
+        do_mask_check = getattr(self, "mask_check", True)  # defaults to True if the attribute is missing
+
+        if not is_fastpath_enabled:  # first failing condition wins; its text is only a diagnostic
+            why_not_sparsity_fast_path = (
+                "torch.backends.mha.get_fastpath_enabled() was not True"
+            )
+        elif not hasattr(self, "use_nested_tensor"):
+            why_not_sparsity_fast_path = "use_nested_tensor attribute not present"
+        elif not self.use_nested_tensor:
+            why_not_sparsity_fast_path = (
+                "self.use_nested_tensor (set in init) was not True"
+            )
+        elif first_layer.training:
+            why_not_sparsity_fast_path = f"{str_first_layer} was in training mode"
+        elif src.dim() != 3:
+            why_not_sparsity_fast_path = (
+                f"input not batched; expected src.dim() of 3 but got {src.dim()}"
+            )
+        elif src_key_padding_mask is None:
+            why_not_sparsity_fast_path = "src_key_padding_mask was None"
+        # This check avoids a call to torch._nested_tensor_from_mask_left_aligned() that
+        # breaks in torch.compile.
+        elif do_mask_check and torch.compiler.is_compiling():
+            why_not_sparsity_fast_path = (
+                "mask_check enabled with torch.compile or torch.export"
+            )
+        elif do_mask_check and not torch._nested_tensor_from_mask_left_aligned(
+            src, src_key_padding_mask.logical_not()
+        ):
+            why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned"
+        elif output.is_nested:
+            why_not_sparsity_fast_path = "NestedTensor input is not supported"
+        elif mask is not None:
+            why_not_sparsity_fast_path = (
+                "src_key_padding_mask and mask were both supplied"
+            )
+        elif torch.is_autocast_enabled():
+            why_not_sparsity_fast_path = "autocast is enabled"
+
+        if not why_not_sparsity_fast_path:  # second round of checks on the tensors involved
+            tensor_args = (
+                src,
+                first_layer.self_attn.in_proj_weight,
+                first_layer.self_attn.in_proj_bias,
+                first_layer.self_attn.out_proj.weight,
+                first_layer.self_attn.out_proj.bias,
+                first_layer.norm1.weight,
+                first_layer.norm1.bias,
+                first_layer.norm2.weight,
+                first_layer.norm2.bias,
+                first_layer.linear1.weight,
+                first_layer.linear1.bias,
+                first_layer.linear2.weight,
+                first_layer.linear2.bias,
+            )
+            _supported_device_type = [
+                "cpu",
+                "cuda",
+                "xpu",
+                torch.utils.backend_registration._privateuse1_backend_name,
+            ]
+            if torch.overrides.has_torch_function(tensor_args):
+                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
+            elif src.device.type not in _supported_device_type:
+                why_not_sparsity_fast_path = (
+                    f"src device is neither one of {_supported_device_type}"
+                )
+            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
+                why_not_sparsity_fast_path = (
+                    "grad is enabled and at least one of query or the "
+                    "input/output projection weights or biases requires_grad"
+                )
+
+            if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None):
+                convert_to_nested = True
+                output = torch._nested_tensor_from_mask(  # mask is inverted first via logical_not()
+                    output, src_key_padding_mask.logical_not(), mask_check=False
+                )
+                src_key_padding_mask_for_layers = None  # layers get no padding mask once the input is nested
+
+        seq_len = _get_seq_len(src, batch_first)
+        is_causal = _detect_is_causal_mask(mask, is_causal, seq_len)
+
+        for mod in self.layers:  # each layer consumes the previous layer's output
+            output = mod(
+                output,
+                src_mask=mask,
+                is_causal=is_causal,
+                src_key_padding_mask=src_key_padding_mask_for_layers,
+            )
+
+        if convert_to_nested:
+            output = output.to_padded_tensor(0.0, src.size())  # back to a padded tensor sized like src, fill value 0.0
+
+        if self.norm is not None:  # optional final normalization supplied at construction
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerDecoder(Module):
+    r"""TransformerDecoder is a stack of N decoder layers.
+
+    This TransformerDecoder layer implements the original architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer Transformer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build efficient layers from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    .. warning::
+        Every layer is a deep copy of ``decoder_layer``, so all layers start with the same parameters.
+        It is recommended to manually initialize the layers after creating the TransformerDecoder instance.
+
+    Args:
+        decoder_layer: an instance of the TransformerDecoderLayer() class; it is cloned ``num_layers`` times (required).
+        num_layers: the number of sub-decoder-layers in the decoder (required).
+        norm: module applied to the output of the last layer, e.g. a layer normalization (optional).
+
+    Examples:
+        >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
+        >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
+        >>> memory = torch.rand(10, 32, 512)
+        >>> tgt = torch.rand(20, 32, 512)
+        >>> out = transformer_decoder(tgt, memory)
+    """
+
+    __constants__ = ["norm"]
+
+    def __init__(
+        self,
+        decoder_layer: "TransformerDecoderLayer",
+        num_layers: int,
+        norm: Module | None = None,
+    ) -> None:
+        super().__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(
+        self,
+        tgt: Tensor,
+        memory: Tensor,
+        tgt_mask: Tensor | None = None,
+        memory_mask: Tensor | None = None,
+        tgt_key_padding_mask: Tensor | None = None,
+        memory_key_padding_mask: Tensor | None = None,
+        tgt_is_causal: bool | None = None,
+        memory_is_causal: bool = False,
+    ) -> Tensor:
+        r"""Pass the inputs (and masks) through each decoder layer in turn.
+
+        Args:
+            tgt: the sequence to the decoder (required).
+            memory: the sequence from the last layer of the encoder (required).
+            tgt_mask: the mask for the tgt sequence (optional).
+            memory_mask: the mask for the memory sequence (optional).
+            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
+            memory_key_padding_mask: the mask for the memory keys per batch (optional).
+            tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``.
+                Default: ``None``; try to detect a causal mask from ``tgt_mask``.
+                Warning:
+                ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
+                the causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+            memory_is_causal: If specified, applies a causal mask as
+                ``memory_mask``.
+                Default: ``False``.
+                Warning:
+                ``memory_is_causal`` provides a hint that
+                ``memory_mask`` is the causal mask. Providing incorrect
+                hints can result in incorrect execution, including
+                forward and backward compatibility.
+
+        Shape:
+            see the docs in :class:`~torch.nn.Transformer`.
+        """
+        output = tgt
+        # Resolve the causal hint once; every layer below receives the same value.
+        seq_len = _get_seq_len(tgt, self.layers[0].self_attn.batch_first)
+        tgt_is_causal = _detect_is_causal_mask(tgt_mask, tgt_is_causal, seq_len)
+
+        for mod in self.layers:
+            output = mod(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
+                memory_mask=memory_mask,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                memory_key_padding_mask=memory_key_padding_mask,
+                tgt_is_causal=tgt_is_causal,
+                memory_is_causal=memory_is_causal,
+            )
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerEncoderLayer(Module):
+    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
+
+    This TransformerEncoderLayer implements the original architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer Transformer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build efficient layers from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    TransformerEncoderLayer can handle either traditional torch.tensor inputs,
+    or Nested Tensor inputs.  Derived classes are expected to similarly accept
+    both input formats.  (Not all combinations of inputs are currently
+    supported by TransformerEncoderLayer while Nested Tensor is in prototype
+    state.)
+
+    If you are implementing a custom layer, you may derive it either from
+    the Module or TransformerEncoderLayer class.  If your custom layer
+    supports both torch.Tensors and Nested Tensors inputs, make its
+    implementation a derived class of TransformerEncoderLayer. If your custom
+    Layer supports only torch.Tensor inputs, derive its implementation from
+    Module.
+
+    Args:
+        d_model: the number of expected features in the input (required).
+        nhead: the number of heads in the multiheadattention models (required).
+        dim_feedforward: the dimension of the feedforward network model (default=2048).
+        dropout: the dropout value (default=0.1).
+        activation: the activation function of the intermediate layer, can be a string
+            ("relu" or "gelu") or a unary callable. Default: relu
+        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+        norm_first: if ``True``, layer norm is done prior to attention and feedforward
+            operations, respectively. Otherwise it's done after. Default: ``False`` (after).
+        bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
+            bias. Default: ``True``.
+
+    Examples:
+        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+        >>> src = torch.rand(10, 32, 512)
+        >>> out = encoder_layer(src)
+
+    Alternatively, when ``batch_first`` is ``True``:
+        >>> encoder_layer = nn.TransformerEncoderLayer(
+        ...     d_model=512, nhead=8, batch_first=True
+        ... )
+        >>> src = torch.rand(32, 10, 512)
+        >>> out = encoder_layer(src)
+
+    Fast path:
+        forward() will use a special optimized implementation described in
+        `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following
+        conditions are met (the checks in ``forward()`` are authoritative):
+
+        - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor
+          argument ``requires_grad``
+        - training is disabled (using ``.eval()``)
+        - batch_first is ``True`` and the input is batched (i.e., ``src.dim() == 3``)
+        - activation is one of: ``"relu"``, ``"gelu"``, ``torch.nn.functional.relu``, or ``torch.nn.functional.gelu``
+        - at most one of ``src_mask`` and ``src_key_padding_mask`` is passed
+        - if src is a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_, neither ``src_mask``
+          nor ``src_key_padding_mask`` is passed
+        - the two ``LayerNorm`` instances have a consistent ``eps`` value (this will naturally be the case
+          unless the caller has manually modified one without modifying the other)
+
+        If the optimized implementation is in use, a
+        `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be
+        passed for ``src`` to represent padding more efficiently than using a padding
+        mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ will be
+        returned, and an additional speedup proportional to the fraction of the input that
+        is padding can be expected.
+
+        .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
+         https://arxiv.org/abs/2205.14135
+
+    """
+
+    __constants__ = ["norm_first"]
+
+    def __init__(
+        self,
+        d_model: int,
+        nhead: int,
+        dim_feedforward: int = 2048,
+        dropout: float = 0.1,
+        activation: str | Callable[[Tensor], Tensor] = F.relu,
+        layer_norm_eps: float = 1e-5,
+        batch_first: bool = False,
+        norm_first: bool = False,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.self_attn = MultiheadAttention(
+            d_model,
+            nhead,
+            dropout=dropout,
+            bias=bias,
+            batch_first=batch_first,
+            **factory_kwargs,
+        )
+        # Implementation of Feedforward model
+        self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
+        self.dropout = Dropout(dropout)
+        self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)
+
+        self.norm_first = norm_first
+        # pyrefly: ignore [bad-argument-type]
+        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
+        # pyrefly: ignore [bad-argument-type]
+        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
+        self.dropout1 = Dropout(dropout)
+        self.dropout2 = Dropout(dropout)
+
+        # Legacy string support for activation function.
+        if isinstance(activation, str):
+            activation = _get_activation_fn(activation)
+
+        # We can't test self.activation in forward() in TorchScript, so stash a
+        # code for it instead: 1 = relu, 2 = gelu, 0 = any other activation.
+        if activation is F.relu or isinstance(activation, torch.nn.ReLU):
+            self.activation_relu_or_gelu = 1
+        elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
+            self.activation_relu_or_gelu = 2
+        else:
+            self.activation_relu_or_gelu = 0
+        self.activation = activation
+
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        if not hasattr(self, "activation"):
+            self.activation = F.relu  # default when the restored state has none
+
+    def forward(
+        self,
+        src: Tensor,
+        src_mask: Tensor | None = None,
+        src_key_padding_mask: Tensor | None = None,
+        is_causal: bool = False,
+    ) -> Tensor:
+        r"""Pass the input through the encoder layer.
+
+        Args:
+            src: the sequence to the encoder layer (required).
+            src_mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+            is_causal: If specified, applies a causal mask as ``src_mask``.
+                Default: ``False``.
+                Warning:
+                ``is_causal`` provides a hint that ``src_mask`` is the
+                causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+
+        Shape:
+            see the docs in :class:`~torch.nn.Transformer`.
+        """
+        src_key_padding_mask = F._canonical_mask(
+            mask=src_key_padding_mask,
+            mask_name="src_key_padding_mask",
+            other_type=F._none_or_dtype(src_mask),
+            other_name="src_mask",
+            target_type=src.dtype,
+        )
+
+        src_mask = F._canonical_mask(
+            mask=src_mask,
+            mask_name="src_mask",
+            other_type=None,
+            other_name="",
+            target_type=src.dtype,
+            check_other=False,
+        )
+
+        is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()
+        # Holds the first reason the fused fast path cannot be used; "" means none found so far.
+        why_not_sparsity_fast_path = ""
+        if not is_fastpath_enabled:
+            why_not_sparsity_fast_path = (
+                "torch.backends.mha.get_fastpath_enabled() was not True"
+            )
+        elif src.dim() != 3:
+            why_not_sparsity_fast_path = (
+                f"input not batched; expected src.dim() of 3 but got {src.dim()}"
+            )
+        elif self.training:
+            why_not_sparsity_fast_path = "training is enabled"
+        elif not self.self_attn.batch_first:
+            why_not_sparsity_fast_path = "self_attn.batch_first was not True"
+        elif self.self_attn.in_proj_bias is None:
+            why_not_sparsity_fast_path = "self_attn was passed bias=False"
+        elif not self.self_attn._qkv_same_embed_dim:
+            why_not_sparsity_fast_path = "self_attn._qkv_same_embed_dim was not True"
+        elif not self.activation_relu_or_gelu:
+            why_not_sparsity_fast_path = "activation_relu_or_gelu was not True"
+        elif self.norm1.eps != self.norm2.eps:
+            why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps"
+        elif src.is_nested and (
+            src_key_padding_mask is not None or src_mask is not None
+        ):  # NOTE(review): the reason string below reads as a double negative
+            why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input"
+        elif self.self_attn.num_heads % 2 == 1:
+            why_not_sparsity_fast_path = "num_head is odd"
+        elif torch.is_autocast_enabled():
+            why_not_sparsity_fast_path = "autocast is enabled"
+        elif any(
+            len(getattr(m, "_forward_hooks", {}))
+            + len(getattr(m, "_forward_pre_hooks", {}))
+            for m in self.modules()
+        ):
+            why_not_sparsity_fast_path = "forward pre-/hooks are attached to the module"
+        if not why_not_sparsity_fast_path:
+            tensor_args = (
+                src,
+                self.self_attn.in_proj_weight,
+                self.self_attn.in_proj_bias,
+                self.self_attn.out_proj.weight,
+                self.self_attn.out_proj.bias,
+                self.norm1.weight,
+                self.norm1.bias,
+                self.norm2.weight,
+                self.norm2.bias,
+                self.linear1.weight,
+                self.linear1.bias,
+                self.linear2.weight,
+                self.linear2.bias,
+            )
+
+            # NOTE(review): this comment used to say list comprehensions are required here because
+            # TorchScript lacks generator expressions, yet the checks below use generators -- confirm.
+            _supported_device_type = [
+                "cpu",
+                "cuda",
+                "xpu",
+                torch.utils.backend_registration._privateuse1_backend_name,
+            ]
+            if torch.overrides.has_torch_function(tensor_args):
+                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
+            elif not all(
+                (x.device.type in _supported_device_type) for x in tensor_args
+            ):
+                why_not_sparsity_fast_path = (
+                    "some Tensor argument's device is neither one of "
+                    f"{_supported_device_type}"
+                )
+            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
+                why_not_sparsity_fast_path = (
+                    "grad is enabled and at least one of query or the "
+                    "input/output projection weights or biases requires_grad"
+                )
+
+            if not why_not_sparsity_fast_path:
+                merged_mask, mask_type = self.self_attn.merge_masks(
+                    src_mask, src_key_padding_mask, src
+                )
+                return torch._transformer_encoder_layer_fwd(
+                    src,
+                    self.self_attn.embed_dim,
+                    self.self_attn.num_heads,
+                    self.self_attn.in_proj_weight,
+                    self.self_attn.in_proj_bias,
+                    self.self_attn.out_proj.weight,
+                    self.self_attn.out_proj.bias,
+                    self.activation_relu_or_gelu == 2,
+                    self.norm_first,
+                    self.norm1.eps,
+                    self.norm1.weight,
+                    self.norm1.bias,
+                    self.norm2.weight,
+                    self.norm2.bias,
+                    self.linear1.weight,
+                    self.linear1.bias,
+                    self.linear2.weight,
+                    self.linear2.bias,
+                    merged_mask,
+                    mask_type,
+                )
+
+        # Regular path: norm before each sub-block if norm_first, else after the residual add; see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
+        x = src
+        if self.norm_first:
+            x = x + self._sa_block(
+                self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal
+            )
+            x = x + self._ff_block(self.norm2(x))
+        else:
+            x = self.norm1(
+                x
+                + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal)
+            )
+            x = self.norm2(x + self._ff_block(x))
+
+        return x
+
+    # self-attention block: self_attn(x, x, x) without attention weights, then dropout1
+    def _sa_block(
+        self,
+        x: Tensor,
+        attn_mask: Tensor | None,
+        key_padding_mask: Tensor | None,
+        is_causal: bool = False,
+    ) -> Tensor:
+        x = self.self_attn(
+            x,
+            x,
+            x,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+            need_weights=False,
+            is_causal=is_causal,
+        )[0]
+        return self.dropout1(x)
+
+    # feed forward block: linear1 -> activation -> dropout -> linear2, then dropout2
+    def _ff_block(self, x: Tensor) -> Tensor:
+        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+        return self.dropout2(x)
+
+
+class TransformerDecoderLayer(Module):
+    r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
+
+    This TransformerDecoderLayer implements the original architecture described
+    in the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper. The
+    intent of this layer is as a reference implementation for foundational understanding
+    and thus it contains only limited features relative to newer Transformer architectures.
+    Given the fast pace of innovation in transformer-like architectures, we recommend
+    exploring this `tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+    to build efficient layers from building blocks in core or using higher
+    level libraries from the `PyTorch Ecosystem <https://landscape.pytorch.org/>`_.
+
+    Args:
+        d_model: the number of expected features in the input (required).
+        nhead: the number of heads in the multiheadattention models (required).
+        dim_feedforward: the dimension of the feedforward network model (default=2048).
+        dropout: the dropout value (default=0.1).
+        activation: the activation function of the intermediate layer, can be a string
+            ("relu" or "gelu") or a unary callable. Default: relu
+        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+        norm_first: if ``True``, layer norm is done prior to self attention, multihead
+            attention and feedforward operations, respectively. Otherwise it's done after.
+            Default: ``False`` (after).
+        bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
+            bias. Default: ``True``.
+
+    Examples:
+        >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
+        >>> memory = torch.rand(10, 32, 512)
+        >>> tgt = torch.rand(20, 32, 512)
+        >>> out = decoder_layer(tgt, memory)
+
+    Alternatively, when ``batch_first`` is ``True``:
+        >>> decoder_layer = nn.TransformerDecoderLayer(
+        ...     d_model=512, nhead=8, batch_first=True
+        ... )
+        >>> memory = torch.rand(32, 10, 512)
+        >>> tgt = torch.rand(32, 20, 512)
+        >>> out = decoder_layer(tgt, memory)
+    """
+
+    __constants__ = ["norm_first"]
+
+    def __init__(
+        self,
+        d_model: int,
+        nhead: int,
+        dim_feedforward: int = 2048,
+        dropout: float = 0.1,
+        activation: str | Callable[[Tensor], Tensor] = F.relu,
+        layer_norm_eps: float = 1e-5,
+        batch_first: bool = False,
+        norm_first: bool = False,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.self_attn = MultiheadAttention(
+            d_model,
+            nhead,
+            dropout=dropout,
+            batch_first=batch_first,
+            bias=bias,
+            **factory_kwargs,
+        )
+        self.multihead_attn = MultiheadAttention(
+            d_model,
+            nhead,
+            dropout=dropout,
+            batch_first=batch_first,
+            bias=bias,
+            **factory_kwargs,
+        )
+        # Implementation of Feedforward model
+        self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
+        self.dropout = Dropout(dropout)
+        self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)
+
+        self.norm_first = norm_first
+        # pyrefly: ignore [bad-argument-type]
+        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
+        # pyrefly: ignore [bad-argument-type]
+        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
+        # pyrefly: ignore [bad-argument-type]
+        self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
+        self.dropout1 = Dropout(dropout)
+        self.dropout2 = Dropout(dropout)
+        self.dropout3 = Dropout(dropout)
+
+        # Legacy string support for activation function.
+        if isinstance(activation, str):
+            self.activation = _get_activation_fn(activation)
+        else:
+            self.activation = activation
+
+    def __setstate__(self, state):
+        if "activation" not in state:
+            state["activation"] = F.relu  # default when the incoming state has none
+        super().__setstate__(state)
+
+    def forward(
+        self,
+        tgt: Tensor,
+        memory: Tensor,
+        tgt_mask: Tensor | None = None,
+        memory_mask: Tensor | None = None,
+        tgt_key_padding_mask: Tensor | None = None,
+        memory_key_padding_mask: Tensor | None = None,
+        tgt_is_causal: bool = False,
+        memory_is_causal: bool = False,
+    ) -> Tensor:
+        r"""Pass the inputs (and mask) through the decoder layer.
+
+        Args:
+            tgt: the sequence to the decoder layer (required).
+            memory: the sequence from the last layer of the encoder (required).
+            tgt_mask: the mask for the tgt sequence (optional).
+            memory_mask: the mask for the memory sequence (optional).
+            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
+            memory_key_padding_mask: the mask for the memory keys per batch (optional).
+            tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``.
+                Default: ``False``.
+                Warning:
+                ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
+                the causal mask. Providing incorrect hints can result in
+                incorrect execution, including forward and backward
+                compatibility.
+            memory_is_causal: If specified, applies a causal mask as
+                ``memory_mask``.
+                Default: ``False``.
+                Warning:
+                ``memory_is_causal`` provides a hint that
+                ``memory_mask`` is the causal mask. Providing incorrect
+                hints can result in incorrect execution, including
+                forward and backward compatibility.
+
+        Shape:
+            see the docs in :class:`~torch.nn.Transformer`.
+        """
+        # Norm before each sub-block if norm_first, else after the residual add; see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
+
+        x = tgt
+        if self.norm_first:
+            x = x + self._sa_block(
+                self.norm1(x), tgt_mask, tgt_key_padding_mask, tgt_is_causal
+            )
+            x = x + self._mha_block(
+                self.norm2(x),
+                memory,
+                memory_mask,
+                memory_key_padding_mask,
+                memory_is_causal,
+            )
+            x = x + self._ff_block(self.norm3(x))
+        else:
+            x = self.norm1(
+                x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal)
+            )
+            x = self.norm2(
+                x
+                + self._mha_block(
+                    x, memory, memory_mask, memory_key_padding_mask, memory_is_causal
+                )
+            )
+            x = self.norm3(x + self._ff_block(x))
+
+        return x
+
+    # self-attention block: self_attn(x, x, x) without attention weights, then dropout1
+    def _sa_block(
+        self,
+        x: Tensor,
+        attn_mask: Tensor | None,
+        key_padding_mask: Tensor | None,
+        is_causal: bool = False,
+    ) -> Tensor:
+        x = self.self_attn(
+            x,
+            x,
+            x,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            need_weights=False,
+        )[0]
+        return self.dropout1(x)
+
+    # multihead attention block: x is the query, mem is both key and value, then dropout2
+    def _mha_block(
+        self,
+        x: Tensor,
+        mem: Tensor,
+        attn_mask: Tensor | None,
+        key_padding_mask: Tensor | None,
+        is_causal: bool = False,
+    ) -> Tensor:
+        x = self.multihead_attn(
+            x,
+            mem,
+            mem,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            need_weights=False,
+        )[0]
+        return self.dropout2(x)
+
+    # feed forward block: linear1 -> activation -> dropout -> linear2, then dropout3
+    def _ff_block(self, x: Tensor) -> Tensor:
+        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+        return self.dropout3(x)
+
+
+def _get_clones(module, N):
+    """Return a ModuleList of ``N`` deep copies of ``module``. FIXME: copy.deepcopy() is not defined on nn.module."""
+    return ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+
+def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
+    """Return the functional op for a legacy activation name; raise RuntimeError if it is unknown."""
+    for name, fn in (("relu", F.relu), ("gelu", F.gelu)):
+        if activation == name:
+            return fn
+
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}")
+
+
+def _detect_is_causal_mask(
+    mask: Tensor | None,
+    is_causal: bool | None = None,
+    size: int | None = None,
+) -> bool:
+    """Decide whether ``mask`` should be treated as a causal attention mask.
+
+    An explicit ``is_causal`` hint (``True`` or ``False``) is returned as is,
+    without inspecting ``mask``.  Detection only runs when ``is_causal`` is
+    ``None`` and a ``mask`` is given: the mask is then compared against a
+    freshly generated square subsequent mask.
+
+    Warning:
+        An incorrect hint is not corrected here:
+        - ``is_causal=False`` for a mask that is in fact causal may lead to
+          reduced performance relative to ``is_causal=True``;
+        - ``is_causal=True`` for a mask that is in fact not causal may lead
+          to incorrect and unpredictable execution: depending on factors
+          such as alignment or hardware SKU, either a causal mask is
+          applied based on the hint or the specified mask is used.
+
+    Args:
+        mask: attention mask to inspect, or ``None``.
+        is_causal: caller-supplied hint; ``None`` requests detection.
+        size: side length of the reference mask; ``None`` means ``mask.size(-2)``.
+    """
+    # Prevent type refinement
+    detected = is_causal is True
+
+    if is_causal is None and mask is not None:
+        side = mask.size(-2) if size is None else size
+        reference = _generate_square_subsequent_mask(
+            side, device=mask.device, dtype=mask.dtype
+        )
+        # A mask whose shape differs from the reference is reported as not causal.
+        if mask.size() != reference.size():
+            detected = False
+        else:
+            detected = bool((mask == reference).all())
+
+    return detected
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..29e58bc6a9f3779924584e2934874a1333b3e501
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/upsampling.py
@@ -0,0 +1,298 @@
+# mypy: allow-untyped-defs
+
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.common_types import _ratio_2_t, _ratio_any_t, _size_2_t, _size_any_t
+
+from .module import Module
+
+
+__all__ = ["Upsample", "UpsamplingNearest2d", "UpsamplingBilinear2d"]
+
+
+class Upsample(Module):
+    r"""Upsample multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data.
+
+    The input is expected to be laid out as
+    `minibatch x channels x [optional depth] x [optional height] x width`,
+    i.e. a 3D Tensor for temporal, 4D for spatial and 5D for volumetric inputs.
+
+    Supported algorithms are nearest neighbor for any of these inputs, plus
+    linear (3D input), bilinear and bicubic (4D input) and trilinear
+    (5D input).
+
+    The output size is derived from either :attr:`scale_factor` or an explicit
+    target :attr:`size`. (Passing both is ambiguous and therefore not allowed.)
+
+    Args:
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
+            output spatial sizes
+        scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
+            multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
+            ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
+            Default: ``'nearest'``
+        align_corners (bool, optional): if ``True``, the corner pixels of the input
+            and output tensors are aligned, and thus preserving the values at
+            those pixels. This only has effect when :attr:`mode` is
+            ``'linear'``, ``'bilinear'``, ``'bicubic'``, or ``'trilinear'``.
+            Default: ``False``
+        recompute_scale_factor (bool, optional): recompute the scale_factor for use in the
+            interpolation calculation. If `recompute_scale_factor` is ``True``, then
+            `scale_factor` must be passed in and `scale_factor` is used to compute the
+            output `size`. The computed output `size` will be used to infer new scales for
+            the interpolation. Note that when `scale_factor` is floating-point, it may differ
+            from the recomputed `scale_factor` due to rounding and precision issues.
+            If `recompute_scale_factor` is ``False``, then `size` or `scale_factor` will
+            be used directly for interpolation.
+
+    Shape:
+        - Input: :math:`(N, C, W_{in})`, :math:`(N, C, H_{in}, W_{in})` or :math:`(N, C, D_{in}, H_{in}, W_{in})`
+        - Output: :math:`(N, C, W_{out})`, :math:`(N, C, H_{out}, W_{out})`
+          or :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
+
+    .. math::
+        D_{out} = \left\lfloor D_{in} \times \text{scale\_factor} \right\rfloor
+
+    .. math::
+        H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor
+
+    .. math::
+        W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor
+
+    .. warning::
+        With ``align_corners = True``, the linearly interpolating modes
+        (`linear`, `bilinear`, `bicubic`, and `trilinear`) don't proportionally
+        align the output and input pixels, and thus the output values can depend
+        on the input size. This was the default behavior for these modes up to
+        version 0.3.1. Since then, the default behavior is
+        ``align_corners = False``. See below for concrete examples on how this
+        affects the outputs.
+
+    .. note::
+        If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`.
+
+    Examples::
+
+        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
+        >>> input
+        tensor([[[[1., 2.],
+                  [3., 4.]]]])
+
+        >>> m = nn.Upsample(scale_factor=2, mode='nearest')
+        >>> m(input)
+        tensor([[[[1., 1., 2., 2.],
+                  [1., 1., 2., 2.],
+                  [3., 3., 4., 4.],
+                  [3., 3., 4., 4.]]]])
+
+        >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles")
+        >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False
+        >>> m(input)
+        tensor([[[[1.0000, 1.2500, 1.7500, 2.0000],
+                  [1.5000, 1.7500, 2.2500, 2.5000],
+                  [2.5000, 2.7500, 3.2500, 3.5000],
+                  [3.0000, 3.2500, 3.7500, 4.0000]]]])
+
+        >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+        >>> m(input)
+        tensor([[[[1.0000, 1.3333, 1.6667, 2.0000],
+                  [1.6667, 2.0000, 2.3333, 2.6667],
+                  [2.3333, 2.6667, 3.0000, 3.3333],
+                  [3.0000, 3.3333, 3.6667, 4.0000]]]])
+
+        >>> # Try scaling the same data in a larger tensor
+        >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3)
+        >>> input_3x3[:, :, :2, :2].copy_(input)
+        tensor([[[[1., 2.],
+                  [3., 4.]]]])
+        >>> input_3x3
+        tensor([[[[1., 2., 0.],
+                  [3., 4., 0.],
+                  [0., 0., 0.]]]])
+
+        >>> # xdoctest: +IGNORE_WANT("seems to fail when other tests are run in the same session")
+        >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False
+        >>> # Notice that values in top left corner are the same with the small input (except at boundary)
+        >>> m(input_3x3)
+        tensor([[[[1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000],
+                  [1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000],
+                  [2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000],
+                  [2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000],
+                  [0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000],
+                  [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])
+
+        >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+        >>> # Notice that values in top left corner are now changed
+        >>> m(input_3x3)
+        tensor([[[[1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000],
+                  [1.8000, 2.2000, 2.6000, 2.2400, 1.1200, 0.0000],
+                  [2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000],
+                  [2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000],
+                  [1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000],
+                  [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])
+    """
+
+    __constants__ = [
+        "size",
+        "scale_factor",
+        "mode",
+        "align_corners",
+        "name",
+        "recompute_scale_factor",
+    ]
+    name: str
+    size: _size_any_t | None
+    scale_factor: _ratio_any_t | None
+    mode: str
+    align_corners: bool | None
+    recompute_scale_factor: bool | None
+
+    def __init__(
+        self,
+        size: _size_any_t | None = None,
+        scale_factor: _ratio_any_t | None = None,
+        mode: str = "nearest",
+        align_corners: bool | None = None,
+        recompute_scale_factor: bool | None = None,
+    ) -> None:
+        super().__init__()
+        self.name = type(self).__name__
+        self.size = size
+        # Normalize scale factors to floats; a falsy scalar is stored as ``None``.
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(map(float, scale_factor))
+        elif scale_factor:
+            self.scale_factor = float(scale_factor)
+        else:
+            self.scale_factor = None
+        self.mode = mode
+        self.align_corners = align_corners
+        self.recompute_scale_factor = recompute_scale_factor
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Interpolate ``input`` with the settings stored on this module."""
+        return F.interpolate(
+            input,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+            recompute_scale_factor=self.recompute_scale_factor,
+        )
+
+    def __setstate__(self, state):
+        # States lacking the flag are restored with it enabled.
+        state.setdefault("recompute_scale_factor", True)
+        super().__setstate__(state)
+
+    def extra_repr(self) -> str:
+        """Describe the module for ``repr``.
+
+        Shows ``scale_factor`` when it is set, otherwise ``size``, then ``mode``.
+        """
+        if self.scale_factor is None:
+            shape_info = f"size={self.size!r}"
+        else:
+            shape_info = f"scale_factor={self.scale_factor!r}"
+        return f"{shape_info}, mode={self.mode!r}"
+
+
+class UpsamplingNearest2d(Upsample):
+    r"""Upsample a multi-channel 2D input signal with nearest neighbor interpolation.
+
+    The scale is given through a constructor argument: either :attr:`size`
+    or :attr:`scale_factor`.
+
+    A given :attr:`size` is interpreted as the output image size `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+          H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor
+
+    .. math::
+          W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor
+
+    Examples::
+
+        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
+        >>> input
+        tensor([[[[1., 2.],
+                  [3., 4.]]]])
+
+        >>> m = nn.UpsamplingNearest2d(scale_factor=2)
+        >>> m(input)
+        tensor([[[[1., 1., 2., 2.],
+                  [1., 1., 2., 2.],
+                  [3., 3., 4., 4.],
+                  [3., 3., 4., 4.]]]])
+    """
+
+    def __init__(
+        self,
+        size: _size_2_t | None = None,
+        scale_factor: _ratio_2_t | None = None,
+    ) -> None:
+        super().__init__(size=size, scale_factor=scale_factor, mode="nearest")
+
+
+class UpsamplingBilinear2d(Upsample):
+    r"""Upsample a multi-channel 2D input signal with bilinear interpolation.
+
+    The scale is given through a constructor argument: either :attr:`size`
+    or :attr:`scale_factor`.
+
+    A given :attr:`size` is interpreted as the output image size `(h, w)`.
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial sizes
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            spatial size.
+
+    .. warning::
+        This class is deprecated in favor of :func:`~nn.functional.interpolate`. It is
+        equivalent to ``nn.functional.interpolate(..., mode='bilinear', align_corners=True)``.
+
+    Shape:
+        - Input: :math:`(N, C, H_{in}, W_{in})`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
+    .. math::
+        H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor
+
+    .. math::
+        W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor
+
+    Examples::
+
+        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
+        >>> input
+        tensor([[[[1., 2.],
+                  [3., 4.]]]])
+
+        >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?")
+        >>> m = nn.UpsamplingBilinear2d(scale_factor=2)
+        >>> m(input)
+        tensor([[[[1.0000, 1.3333, 1.6667, 2.0000],
+                  [1.6667, 2.0000, 2.3333, 2.6667],
+                  [2.3333, 2.6667, 3.0000, 3.3333],
+                  [3.0000, 3.3333, 3.6667, 4.0000]]]])
+    """
+
+    def __init__(
+        self,
+        size: _size_2_t | None = None,
+        scale_factor: _ratio_2_t | None = None,
+    ) -> None:
+        super().__init__(size=size, scale_factor=scale_factor, mode="bilinear", align_corners=True)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dffadefe152d527090aef870f87a7a7565eac25
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/modules/utils.py
@@ -0,0 +1,83 @@
+# mypy: allow-untyped-defs
+import collections
+from itertools import repeat
+from typing import Any
+
+
+__all__ = ["consume_prefix_in_state_dict_if_present"]
+
+
+def _ntuple(n, name="parse"):
+    """Build a converter named ``name`` that expands a non-iterable into an ``n``-tuple."""
+    def parse(x):
+        # Iterables are passed through as tuples; anything else is repeated ``n`` times.
+        return tuple(x) if isinstance(x, collections.abc.Iterable) else (x,) * n
+
+    parse.__name__ = name
+    return parse
+
+
+_single = _ntuple(1, "_single")  # non-iterable -> 1-tuple; iterables pass through
+_pair = _ntuple(2, "_pair")  # non-iterable -> 2-tuple; iterables pass through
+_triple = _ntuple(3, "_triple")  # non-iterable -> 3-tuple; iterables pass through
+_quadruple = _ntuple(4, "_quadruple")  # non-iterable -> 4-tuple; iterables pass through
+
+
+def _reverse_repeat_tuple(t, n):
+    r"""Return `t` in reverse order with every element repeated `n` times in a row.
+
+    Conv and Pooling modules can use this to translate their padding arg into
+    the form used by `F.pad`.
+    """
+    return sum(((item,) * n for item in reversed(t)), ())
+
+
+def _list_with_default(out_size: list[int], defaults: list[int]) -> list[int]:
+    """Replace the ``None`` entries of ``out_size`` with the trailing entries of ``defaults``."""
+    import torch
+
+    if isinstance(out_size, (int, torch.SymInt)):
+        # pyrefly: ignore [bad-return]
+        return out_size  # a bare (symbolic) int is handed back untouched
+    rank = len(out_size)
+    if len(defaults) <= rank:
+        raise ValueError(f"Input dimension should be at least {rank + 1}")
+    tail = defaults[-rank:]
+    return [d if v is None else v for v, d in zip(out_size, tail, strict=False)]
+
+
+def consume_prefix_in_state_dict_if_present(
+    state_dict: dict[str, Any],
+    prefix: str,
+) -> None:
+    r"""Strip ``prefix`` from the keys of ``state_dict`` in place, if present.
+
+    .. note::
+        Given a `state_dict` from a DP/DDP model, a local model can load it by applying
+        `consume_prefix_in_state_dict_if_present(state_dict, "module.")` before calling
+        :meth:`torch.nn.Module.load_state_dict`.
+
+    Args:
+        state_dict (OrderedDict): state-dict to rewrite; its ``_metadata``, if any, too.
+        prefix (str): leading text to remove from every key that starts with it.
+    """
+    prefix_len = len(prefix)
+    for key in list(state_dict):
+        if key.startswith(prefix):
+            state_dict[key[prefix_len:]] = state_dict.pop(key)
+
+    # also strip the prefix in metadata if any.
+    metadata = getattr(state_dict, "_metadata", None)
+    if metadata is not None:
+        # Only the trailing dot separates the prefix from a module path; dropping
+        # every dot would turn 'a.b.' into 'ab' and hit an unrelated module.
+        module_path = prefix.removesuffix(".")
+        for key in list(metadata):
+            # for the metadata dict, the key can be:
+            # '': for the DDP module, which we want to remove.
+            # 'module': for the actual model.
+            # 'module.xx.xx': for the rest.
+            if not key:
+                continue
+            if key == module_path or key.startswith(prefix):
+                metadata[key[prefix_len:]] = metadata.pop(key)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad8648d10aadc8dec59ea7ebc54aa77cd60ee4f5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/__init__.py
@@ -0,0 +1,27 @@
+from typing_extensions import deprecated
+
+from torch.nn.parallel.data_parallel import data_parallel, DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.scatter_gather import gather, scatter
+
+
+__all__ = [
+    "replicate",
+    "scatter",
+    "parallel_apply",
+    "gather",
+    "data_parallel",
+    "DataParallel",
+    "DistributedDataParallel",
+]
+
+
+@deprecated(
+    "`torch.nn.parallel.DistributedDataParallelCPU` is deprecated, "
+    "please use `torch.nn.parallel.DistributedDataParallel` instead.",
+    category=FutureWarning,
+)
+class DistributedDataParallelCPU(DistributedDataParallel):
+    """Deprecated subclass of :class:`DistributedDataParallel` that adds no behavior."""
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..70a2eace9eff15b06df7958588afd8e1580bb8a7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py
@@ -0,0 +1,131 @@
+import warnings
+from itertools import chain
+
+import torch
+from torch._utils import _get_device_index
+from torch.autograd import Function
+from torch.nn.parallel import comm
+
+
+class Broadcast(Function):
+    """Autograd function that broadcasts ``inputs`` to every device in ``target_gpus``."""
+
+    @staticmethod
+    def forward(ctx, target_gpus, *inputs):
+        assert all(i.device.type != "cpu" for i in inputs), (
+            "Broadcast function not implemented for CPU tensors"
+        )
+        ctx.target_gpus = [_get_device_index(gpu, True) for gpu in target_gpus]
+        if not inputs:
+            return ()
+        ctx.num_inputs = len(inputs)
+        ctx.input_device = inputs[0].get_device()
+        copies = comm.broadcast_coalesced(inputs, ctx.target_gpus)
+        frozen = []
+        for idx, needs_grad in enumerate(ctx.needs_input_grad[1:]):
+            if not needs_grad:
+                frozen.extend(per_device[idx] for per_device in copies)
+        ctx.mark_non_differentiable(*frozen)
+        return tuple(chain.from_iterable(copies))
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        reduced = ReduceAddCoalesced.apply(ctx.input_device, ctx.num_inputs, *grad_outputs)
+        return (None,) + reduced
+
+
+class ReduceAddCoalesced(Function):
+    """Autograd function that sums groups of ``num_inputs`` gradients onto ``destination``."""
+
+    @staticmethod
+    def forward(ctx, destination, num_inputs, *grads):
+        # ``grads`` is cut into consecutive groups of ``num_inputs`` tensors.
+        starts = range(0, len(grads), num_inputs)
+        ctx.target_gpus = [grads[start].get_device() for start in starts]
+        grouped = [grads[start : start + num_inputs] for start in starts]
+        return comm.reduce_add_coalesced(grouped, destination)
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        # ``destination`` and ``num_inputs`` receive no gradient.
+        rebroadcast = Broadcast.apply(ctx.target_gpus, *grad_outputs)
+        return (None, None) + rebroadcast
+
+
+class Gather(Function):
+    """Autograd function that gathers ``inputs`` along ``dim`` onto ``target_device``."""
+
+    @staticmethod
+    def forward(ctx, target_device, dim, *inputs):
+        assert all(i.device.type != "cpu" for i in inputs), (
+            "Gather function not implemented for CPU tensors"
+        )
+        # The string "cpu" is stored as is; anything else becomes a device index.
+        ctx.target_device = (
+            "cpu" if target_device == "cpu" else _get_device_index(target_device, True)
+        )
+        ctx.dim = dim
+        ctx.input_gpus = tuple(t.get_device() for t in inputs)
+        # Scalars gathered along dim 0 are viewed as 1-element vectors (with a warning).
+        ctx.unsqueezed_scalar = all(t.dim() == 0 for t in inputs) and dim == 0
+        if ctx.unsqueezed_scalar:
+            inputs = tuple(t.view(1) for t in inputs)
+            warnings.warn(
+                "Was asked to gather along dimension 0, but all "
+                "input tensors were scalars; will instead unsqueeze "
+                "and return a vector.",
+                stacklevel=2,
+            )
+        ctx.input_sizes = tuple(t.size(ctx.dim) for t in inputs)
+        return comm.gather(inputs, ctx.dim, ctx.target_device)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        pieces = Scatter.apply(ctx.input_gpus, ctx.input_sizes, ctx.dim, grad_output)
+        if ctx.unsqueezed_scalar:
+            # Index away the extra dimension introduced by ``view(1)`` in forward.
+            pieces = tuple(piece[0] for piece in pieces)
+        # ``target_device`` and ``dim`` receive no gradient.
+        return (None, None) + pieces
+
+
+class Scatter(Function):
+    """Autograd function that scatters chunks of ``input`` along ``dim`` to ``target_gpus``."""
+
+    @staticmethod
+    def forward(ctx, target_gpus, chunk_sizes, dim, input):
+        target_gpus = [_get_device_index(gpu, True) for gpu in target_gpus]
+        ctx.dim = dim
+        ctx.input_device = -1 if input.device.type == "cpu" else input.get_device()
+        # Perform CPU to GPU copies in a background stream
+        background = torch.accelerator.is_available() and ctx.input_device == -1
+        streams = [_get_stream(torch.device(g)) for g in target_gpus] if background else None
+        outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
+        if streams is not None:  # Synchronize with the copy stream
+            for idx, chunk in enumerate(outputs):
+                with torch.accelerator.device_index(target_gpus[idx]):
+                    current = torch.accelerator.current_stream()
+                    current.wait_stream(streams[idx])
+                    chunk.record_stream(current)
+        return outputs
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        return None, None, None, Gather.apply(ctx.input_device, ctx.dim, *grad_output)
+
+
+_streams: list[torch.Stream | None] | None = None  # background copy streams, by device index
+
+
+def _get_stream(device: torch.device):
+    """Return the cached background copy stream for ``device``, or ``None`` when unavailable."""
+    global _streams
+    if device.type == "cpu" or not torch.accelerator.is_available():
+        return None
+    assert torch.accelerator.current_accelerator().type == device.type
+    if _streams is None:
+        _streams = [None] * torch.accelerator.device_count()
+    slot = device.index
+    if _streams[slot] is None:
+        _streams[slot] = torch.Stream(slot)
+    return _streams[slot]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..255c0c4b332712a714610801f11c8e2b33df3671
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/comm.py
@@ -0,0 +1,261 @@
+# mypy: allow-untyped-defs
+import warnings
+
+import torch
+from torch._utils import (
+    _flatten_dense_tensors,
+    _get_device_index,
+    _handle_complex,
+    _reorder_tensors_as,
+    _take_tensors,
+    _unflatten_dense_tensors,
+)
+from torch.cuda import nccl
+
+
+def broadcast(tensor, devices=None, *, out=None):
+    r"""Broadcast one tensor to the specified GPU devices.
+
+    Args:
+        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
+        devices (Iterable[torch.device, str or int], optional): an iterable of
+          GPU devices that should each receive the tensor.
+        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
+          store output results.
+
+    .. note::
+        Exactly one of :attr:`devices` and :attr:`out` must be given, else ``RuntimeError``.
+
+    Returns:
+        - If :attr:`devices` is specified,
+            a tuple containing copies of :attr:`tensor`, placed on
+            :attr:`devices`.
+        - If :attr:`out` is specified,
+            a tuple containing :attr:`out` tensors, each containing a copy of
+            :attr:`tensor`.
+    """
+    tensor = _handle_complex(tensor)
+    # Passing both arguments, or neither of them, is rejected.
+    if (devices is None) == (out is None):
+        raise RuntimeError(
+            f"Exactly one of 'devices' and 'out' must be specified, but got devices={devices} and out={out}"
+        )
+    if devices is None:
+        # pyrefly: ignore [bad-argument-type]
+        return torch._C._broadcast_out(tensor, out)
+    device_indices = [_get_device_index(d) for d in devices]
+    return torch._C._broadcast(tensor, device_indices)
+
+
+def broadcast_coalesced(tensors, devices, buffer_size=10485760):
+    """Broadcast several tensors to the specified GPUs in one call.
+
+    Small tensors are first coalesced into a buffer to reduce the number of synchronizations.
+
+    Args:
+        tensors (sequence): tensors to broadcast. Must be on the same device,
+          either CPU or GPU.
+        devices (Iterable[torch.device, str or int]): an iterable of GPU
+          devices, among which to broadcast.
+        buffer_size (int): maximum size of the buffer used for coalescing
+
+    Returns:
+        A tuple containing copies of :attr:`tensors`, placed on :attr:`devices`.
+    """
+    device_indices = list(map(_get_device_index, devices))
+    prepared = list(map(_handle_complex, tensors))
+    return torch._C._broadcast_coalesced(prepared, device_indices, buffer_size)
+
+
+def reduce_add(inputs, destination=None):
+    """Add up tensors that live on multiple GPUs.
+
+    All inputs should have matching shapes, dtype, and layout. The output tensor
+    will be of the same shape, dtype, and layout.
+
+    Args:
+        inputs (Iterable[Tensor]): an iterable of tensors to add.
+        destination (int, optional): a device on which the output will be
+            placed (default: current device).
+
+    Returns:
+        A tensor containing an elementwise sum of all inputs, placed on the
+        :attr:`destination` device.
+    """
+    destination = _get_device_index(destination, optional=True)
+    expected_size = inputs[0].size()
+    root_index = None  # position of the input that already is on ``destination``
+    for i, inp in enumerate(inputs):
+        assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs"
+        if inp.get_device() == destination:
+            root_index = i
+        if inp.size() != expected_size:
+            got = "x".join(map(str, inp.size()))
+            expected = "x".join(map(str, expected_size))
+            raise ValueError(
+                f"input {i} has invalid size: got {got}, but expected {expected}"
+            )
+    if root_index is None:
+        raise RuntimeError(
+            "reduce_add expects destination to be on the same GPU with one of the tensors"
+        )
+
+    if len(inputs) == 1:
+        return inputs[0]
+
+    if nccl.is_available(inputs):
+        result = torch.empty_like(inputs[root_index])
+        nccl.reduce(inputs, output=result, root=root_index)
+        return result
+
+    root = inputs[root_index]
+    target = torch.device(root.device.type, destination)
+    others = [t for i, t in enumerate(inputs) if i != root_index]
+    # The out-of-place add makes a new tensor w/o clone.
+    result = root + others[0].to(device=target, non_blocking=True)
+    for extra in others[1:]:
+        result.add_(extra.to(device=target, non_blocking=True))
+    return result
+
+
+def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
+    """Sum tensors from multiple GPUs.
+
+    Small tensors are first coalesced into a buffer to reduce the number
+    of synchronizations.
+
+    Args:
+        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
+            contain tensors from a single device. One-shot iterators are fine.
+        destination (int, optional): a device on which the output will be
+            placed (default: current device).
+        buffer_size (int): maximum size of the buffer used for coalescing
+
+    Returns:
+        A tuple of tensors containing an elementwise sum of each group of
+        inputs, placed on the ``destination`` device.
+    """
+    # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just
+    #       return `inputs`.
+    inputs = list(inputs)  # traversed twice below; an iterator would be exhausted
+    dense_tensors: list[list] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
+    output = []
+    ref_order = []
+    # process sparse ones first since they may have different sizes on different gpus
+    for tensor_at_gpus in zip(*inputs, strict=True):
+        if all(t.is_sparse for t in tensor_at_gpus):
+            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
+            output.append(result)
+            ref_order.append(tensor_at_gpus[0])
+        else:
+            for coll, t in zip(dense_tensors, tensor_at_gpus, strict=True):
+                coll.append(t.to_dense() if t.is_sparse else t)
+            ref_order.append(dense_tensors[0][-1])
+    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
+    # now the dense ones, which have consistent sizes
+    for chunks in zip(*itrs, strict=True):
+        flat_tensors = [
+            _flatten_dense_tensors(chunk) for chunk in chunks
+        ]  # (num_gpus,)
+        flat_result = reduce_add(flat_tensors, destination)
+        # The unflattened tensors do not share storage, and we don't expose
+        # base flat tensor anyways, so give them different version counters.
+        # See NOTE [ Version Counter in comm.*_coalesced ]
+        output.extend(t.data for t in _unflatten_dense_tensors(flat_result, chunks[0]))
+    return tuple(_reorder_tensors_as(output, ref_order))
+
+
+def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
+    """Scatters tensor across multiple GPUs.
+
+    Args:
+        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
+        devices (Iterable[torch.device, str or int], optional): an iterable of
+          GPU devices, among which to scatter.
+        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
+          each device. It should match :attr:`devices` in length and sums to
+          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
+          into equal chunks.
+        dim (int, optional): A dimension along which to chunk :attr:`tensor`. Default: ``0``.
+        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among which
+          to execute the scatter. If not specified, the default stream will be utilized.
+        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
+          store output results. Sizes of these tensors must match that of
+          :attr:`tensor`, except for :attr:`dim`, where the total size must
+          sum to ``tensor.size(dim)``.
+
+    .. note::
+        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
+        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
+        will be inferred from sizes of :attr:`out`. Violations raise ``RuntimeError``.
+
+    Returns:
+        - If :attr:`devices` is specified,
+            a tuple containing chunks of :attr:`tensor`, placed on
+            :attr:`devices`.
+        - If :attr:`out` is specified,
+            a tuple containing :attr:`out` tensors, each containing a chunk of
+            :attr:`tensor`.
+    """
+    tensor = _handle_complex(tensor)
+    if out is None:
+        if devices is None:
+            raise RuntimeError(
+                f"Exactly one of 'devices' and 'out' must be specified, but got devices={devices} and out={out}"
+            )
+        devices = [_get_device_index(d) for d in devices]
+        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
+    if devices is not None:
+        raise RuntimeError(
+            f"'devices' must not be specified when 'out' is specified, but got devices={devices}"
+        )
+    if chunk_sizes is not None:
+        raise RuntimeError(
+            f"'chunk_sizes' must not be specified when 'out' is specified, but got chunk_sizes={chunk_sizes}"
+        )
+    return tuple(torch._C._scatter_out(tensor, out, dim, streams))
+
+
+def gather(tensors, dim=0, destination=None, *, out=None):
+    r"""Concatenate tensors from multiple GPU devices into a single tensor.
+
+    Args:
+        tensors (Iterable[Tensor]): an iterable of tensors to gather.
+          Tensor sizes in all dimensions other than :attr:`dim` have to match.
+        dim (int, optional): a dimension along which the tensors will be
+          concatenated. Default: ``0``.
+        destination (torch.device, str, or int, optional): the output device.
+          Can be CPU or CUDA. Default: the current CUDA device.
+        out (Tensor, optional, keyword-only): the tensor to store gather result.
+          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
+          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
+          Can be on CPU or CUDA.
+
+    .. note::
+        :attr:`destination` must not be specified when :attr:`out` is specified.
+
+    Returns:
+        - If :attr:`destination` is specified,
+            a tensor located on :attr:`destination` device, that is a result of
+            concatenating :attr:`tensors` along :attr:`dim`.
+        - If :attr:`out` is specified,
+            the :attr:`out` tensor, now containing results of concatenating
+            :attr:`tensors` along :attr:`dim`.
+    """
+    tensors = [_handle_complex(t) for t in tensors]
+    if out is not None:
+        # ``out`` and ``destination`` are mutually exclusive.
+        if destination is not None:
+            raise RuntimeError(
+                f"'destination' must not be specified when 'out' is specified, but got destination={destination}"
+            )
+        return torch._C._gather_out(tensors, out, dim)
+    if destination == -1:
+        warnings.warn(
+            "Using -1 to represent CPU tensor is deprecated. Please use a "
+            'device object or string instead, e.g., "cpu".',
+            FutureWarning,
+            stacklevel=2,
+        )
+    destination = _get_device_index(destination, allow_cpu=True, optional=True)
+    return torch._C._gather(tensors, dim, destination)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2319439f092bed9a4277838dcb3b794de64b97
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/data_parallel.py
@@ -0,0 +1,289 @@
+# mypy: allow-untyped-defs
+import operator
+import warnings
+from collections.abc import Sequence
+from itertools import chain
+from typing import Any, Generic, TypeVar
+
+import torch
+from torch._utils import (
+    _get_all_device_indices,
+    _get_available_device_type,
+    _get_device_index,
+    _get_devices_properties,
+)
+from torch.nn.modules import Module
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.scatter_gather import gather, scatter_kwargs
+
+
+__all__ = ["DataParallel", "data_parallel"]
+
+
+def _check_balance(device_ids: Sequence[int | torch.device]) -> None:
+    """Warn when the given devices look unbalanced.
+
+    Total memory is compared first, then the multiprocessor count. For each
+    property a warning is issued when the smallest value is below 75% of the
+    largest one. At most one warning is emitted per call.
+
+    Args:
+        device_ids: devices to compare, given as indices or ``torch.device``
+            objects.
+    """
+    imbalance_warn = """
+    There is an imbalance between your GPUs. You may want to exclude GPU {} which
+    has less than 75% of the memory or cores of GPU {}. You can do so by setting
+    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
+    environment variable."""
+    indices = [_get_device_index(x, True) for x in device_ids]
+    properties = _get_devices_properties(indices)
+
+    def warn_imbalance(get_prop) -> bool:
+        # Returns True when a warning was emitted for this property.
+        values = [get_prop(entry) for entry in properties]
+        positions = range(len(values))
+        # Ties resolve to the first occurrence for both min and max.
+        min_pos = min(positions, key=values.__getitem__)
+        max_pos = max(positions, key=values.__getitem__)
+        if not (values[min_pos] / values[max_pos] < 0.75):
+            return False
+        warnings.warn(
+            imbalance_warn.format(indices[min_pos], indices[max_pos]),
+            stacklevel=2,
+        )
+        return True
+
+    # Stop after the first property that triggers a warning.
+    for get_prop in (
+        operator.attrgetter("total_memory"),
+        operator.attrgetter("multi_processor_count"),
+    ):
+        if warn_imbalance(get_prop):
+            return
+
+
+T = TypeVar("T", bound=Module)
+
+
+class DataParallel(Module, Generic[T]):
+    r"""Implements data parallelism at the module level.
+
+    This container parallelizes the application of the given :attr:`module` by
+    splitting the input across the specified devices by chunking in the batch
+    dimension (other objects will be copied once per device). In the forward
+    pass, the module is replicated on each device, and each replica handles a
+    portion of the input. During the backwards pass, gradients from each replica
+    are summed into the original module.
+
+    The batch size should be larger than the number of GPUs used.
+
+    .. warning::
+        It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`,
+        instead of this class, to do multi-GPU training, even if there is only a single
+        node. See: :ref:`cuda-nn-ddp-instead` and :ref:`ddp`.
+
+    Arbitrary positional and keyword inputs are allowed to be passed into
+    DataParallel but some types are specially handled. Tensors will be
+    **scattered** on dim specified (default 0). tuple, list and dict types will
+    be shallow copied. The other types will be shared among different threads
+    and can be corrupted if written to in the model's forward pass.
+
+    The parallelized :attr:`module` must have its parameters and buffers on
+    ``device_ids[0]`` before running this :class:`~torch.nn.DataParallel`
+    module.
+
+    .. warning::
+        In each forward, :attr:`module` is **replicated** on each device, so any
+        updates to the running module in ``forward`` will be lost. For example,
+        if :attr:`module` has a counter attribute that is incremented in each
+        ``forward``, it will always stay at the initial value because the update
+        is done on the replicas which are destroyed after ``forward``. However,
+        :class:`~torch.nn.DataParallel` guarantees that the replica on
+        ``device[0]`` will have its parameters and buffers sharing storage with
+        the base parallelized :attr:`module`. So **in-place** updates to the
+        parameters or buffers on ``device[0]`` will be recorded. E.g.,
+        :class:`~torch.nn.BatchNorm2d` and :func:`~torch.nn.utils.spectral_norm`
+        rely on this behavior to update the buffers.
+
+    .. warning::
+        Forward and backward hooks defined on :attr:`module` and its submodules
+        will be invoked ``len(device_ids)`` times, each with inputs located on
+        a particular device. Particularly, the hooks are only guaranteed to be
+        executed in correct order with respect to operations on corresponding
+        devices. For example, it is not guaranteed that hooks set via
+        :meth:`~torch.nn.Module.register_forward_pre_hook` be executed before
+        `all` ``len(device_ids)`` :meth:`~torch.nn.Module.forward` calls, but
+        that each such hook be executed before the corresponding
+        :meth:`~torch.nn.Module.forward` call of that device.
+
+    .. warning::
+        When :attr:`module` returns a scalar (i.e., 0-dimensional tensor) in
+        :func:`forward`, this wrapper will return a vector of length equal to
+        number of devices used in data parallelism, containing the result from
+        each device.
+
+    .. note::
+        There is a subtlety in using the
+        ``pack sequence -> recurrent network -> unpack sequence`` pattern in a
+        :class:`~torch.nn.Module` wrapped in :class:`~torch.nn.DataParallel`.
+        See :ref:`pack-rnn-unpack-with-data-parallelism` section in FAQ for
+        details.
+
+
+    Args:
+        module (Module): module to be parallelized
+        device_ids (list of int or torch.device): CUDA devices (default: all devices)
+        output_device (int or torch.device): device location of output (default: device_ids[0])
+        dim (int): dimension along which inputs are scattered and outputs are
+            gathered (default: 0)
+
+    Attributes:
+        module (Module): the module to be parallelized
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
+        >>> output = net(input_var)  # input_var can be on any device, including CPU
+    """
+
+    # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
+
+    def __init__(
+        self,
+        module: T,
+        device_ids: Sequence[int | torch.device] | None = None,
+        output_device: int | torch.device | None = None,
+        dim: int = 0,
+    ) -> None:
+        """Wrap ``module`` and resolve the devices it will run on.
+
+        Raises:
+            RuntimeError: if a device type is available but no device indices
+                could be found.
+        """
+        super().__init__()
+        torch._C._log_api_usage_once("torch.nn.parallel.DataParallel")
+        # Record ``dim`` before the fallback return below so that the
+        # attribute (and the ``scatter``/``gather`` helpers that read it)
+        # exists on every instance, including the pass-through configuration.
+        self.dim = dim
+        device_type = _get_available_device_type()
+        if device_type is None or device_type == "mps":
+            # Pass-through mode: an empty ``device_ids`` makes ``forward``
+            # call the wrapped module directly.
+            self.module = module
+            self.device_ids = []
+            return
+
+        if device_ids is None:
+            device_ids = _get_all_device_indices()
+
+        if device_ids is None:
+            raise RuntimeError("no available devices were found")
+
+        if output_device is None:
+            output_device = device_ids[0]
+
+        self.module = module
+        self.device_ids = [_get_device_index(x, True) for x in device_ids]
+        self.output_device = _get_device_index(output_device, True)
+        # pyrefly: ignore [read-only]
+        self.src_device_obj = torch.device(device_type, self.device_ids[0])
+
+        if device_type == "cuda":
+            _check_balance(self.device_ids)
+
+        if len(self.device_ids) == 1:
+            self.module.to(self.src_device_obj)
+
+    def forward(self, *inputs: Any, **kwargs: Any) -> Any:
+        """Scatter the inputs, run one replica per device, and gather the outputs.
+
+        Raises:
+            RuntimeError: if any parameter or buffer of the wrapped module is
+                not on ``device_ids[0]``.
+        """
+        with torch.autograd.profiler.record_function("DataParallel.forward"):
+            if not self.device_ids:
+                return self.module(*inputs, **kwargs)
+
+            # pyrefly: ignore [bad-argument-type]
+            for t in chain(self.module.parameters(), self.module.buffers()):
+                if t.device != self.src_device_obj:
+                    raise RuntimeError(
+                        "module must have its parameters and buffers "
+                        f"on device {self.src_device_obj} (device_ids[0]) but found one of "
+                        f"them on device: {t.device}"
+                    )
+
+            inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
+            # for forward function without any inputs, empty list and dict will be created
+            # so the module can be executed on one device which is the first one in device_ids
+            if not inputs and not module_kwargs:
+                inputs = ((),)
+                module_kwargs = ({},)
+
+            if len(self.device_ids) == 1:
+                return self.module(*inputs[0], **module_kwargs[0])
+            # Only as many replicas as there are input chunks are created.
+            replicas = self.replicate(self.module, self.device_ids[: len(inputs)])
+            outputs = self.parallel_apply(replicas, inputs, module_kwargs)
+            return self.gather(outputs, self.output_device)
+
+    def replicate(self, module: T, device_ids: Sequence[int | torch.device]) -> list[T]:
+        """Replicate ``module`` onto ``device_ids``.
+
+        ``not torch.is_grad_enabled()`` is forwarded as the third argument of
+        :func:`replicate`.
+        """
+        return replicate(module, device_ids, not torch.is_grad_enabled())
+
+    def scatter(
+        self,
+        inputs: tuple[Any, ...],
+        kwargs: dict[str, Any] | None,
+        device_ids: Sequence[int | torch.device],
+    ) -> Any:
+        """Scatter positional and keyword inputs across ``device_ids`` along ``self.dim``."""
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
+
+    def parallel_apply(
+        self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any
+    ) -> list[Any]:
+        """Apply each replica to its inputs on the matching prefix of ``self.device_ids``."""
+        return parallel_apply(
+            replicas, inputs, kwargs, self.device_ids[: len(replicas)]
+        )
+
+    def gather(self, outputs: Any, output_device: int | torch.device) -> Any:
+        """Gather ``outputs`` onto ``output_device`` along ``self.dim``."""
+        return gather(outputs, output_device, dim=self.dim)
+
+
+def data_parallel(
+    module: Module,
+    inputs: Any,
+    device_ids: Sequence[int | torch.device] | None = None,
+    output_device: int | torch.device | None = None,
+    dim: int = 0,
+    module_kwargs: Any | None = None,
+) -> torch.Tensor:
+    r"""Evaluate module(input) in parallel across the GPUs given in device_ids.
+
+    This is the functional version of the DataParallel module.
+
+    Args:
+        module (Module): the module to evaluate in parallel
+        inputs (Tensor): inputs to the module; a non-tuple value is wrapped
+            into a one-element tuple and ``None`` means no positional inputs
+        device_ids (list of int or torch.device): GPU ids on which to replicate module
+        output_device (int or torch.device): GPU location of the output. Use -1 to
+            indicate the CPU. (default: device_ids[0])
+        dim (int): dimension along which inputs are scattered and outputs are
+            gathered (default: 0)
+        module_kwargs (optional): keyword arguments scattered together with
+            ``inputs`` and passed to the module
+    Returns:
+        a Tensor containing the result of module(input) located on
+        output_device
+    Raises:
+        RuntimeError: if no device type or no devices are available, or if a
+            parameter or buffer of ``module`` is not on ``device_ids[0]``.
+    """
+    # Normalize the positional inputs to a tuple.
+    if inputs is None:
+        inputs = ()
+    elif not isinstance(inputs, tuple):
+        inputs = (inputs,)
+
+    device_type = _get_available_device_type()
+    if device_type is None:
+        raise RuntimeError("device type could not be determined")
+
+    if device_ids is None:
+        device_ids = _get_all_device_indices()
+    if device_ids is None:
+        raise RuntimeError("no available devices were found")
+
+    # The default output device is taken before the ids are normalized.
+    output_device = device_ids[0] if output_device is None else output_device
+
+    device_ids = [_get_device_index(x, True) for x in device_ids]
+    output_device = _get_device_index(output_device, True)
+    # pyrefly: ignore [no-matching-overload]
+    src_device_obj = torch.device(device_type, device_ids[0])
+
+    # Find the first parameter or buffer that lives on the wrong device.
+    # pyrefly: ignore [bad-argument-type]
+    module_tensors = chain(module.parameters(), module.buffers())
+    misplaced = next(
+        (t for t in module_tensors if t.device != src_device_obj), None
+    )
+    if misplaced is not None:
+        raise RuntimeError(
+            "module must have its parameters and buffers "
+            f"on device {src_device_obj} (device_ids[0]) but found one of "
+            f"them on device: {misplaced.device}"
+        )
+
+    inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim)
+    # for module without any inputs, empty list and dict will be created
+    # so the module can be executed on one device which is the first one in device_ids
+    if not inputs and not module_kwargs:
+        inputs, module_kwargs = ((),), ({},)
+
+    assert module_kwargs is not None
+
+    if len(device_ids) == 1:
+        return module(*inputs[0], **module_kwargs[0])
+
+    # Use only as many devices as there are input chunks.
+    used_device_ids = device_ids[: len(inputs)]
+    outputs = parallel_apply(
+        replicate(module, used_device_ids), inputs, module_kwargs, used_device_ids
+    )
+    return gather(outputs, output_device, dim)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..4899d123e80a124f31e45ed832bba195af32c353
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py
@@ -0,0 +1,2434 @@
+# mypy: allow-untyped-defs
+import copy
+import functools
+import inspect
+import itertools
+import logging
+import os
+import sys
+import warnings
+import weakref
+from collections import defaultdict, deque
+from collections.abc import Callable
+from contextlib import contextmanager
+from dataclasses import dataclass, fields, is_dataclass
+from enum import auto, Enum
+from typing import Any, Optional, TYPE_CHECKING
+
+import torch
+import torch.distributed as dist
+from torch._utils import _get_device_index
+from torch.autograd import Function, Variable
+from torch.distributed.algorithms.join import Join, Joinable, JoinHook
+from torch.nn.modules import Module
+from torch.nn.parallel.scatter_gather import gather, scatter_kwargs
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+
+RPC_AVAILABLE = False
+if dist.is_available():
+    from torch.distributed.distributed_c10d import (
+        _get_default_group,
+        _rank_not_in_group,
+        ReduceOp,
+    )
+    from torch.distributed.utils import (
+        _alloc_storage,
+        _cast_forward_inputs,
+        _free_storage,
+        _sync_module_states,
+        _to_kwargs,
+        _verify_param_shape_across_processes,
+    )
+if dist.rpc.is_available():
+    RPC_AVAILABLE = True
+    from torch.distributed.rpc import RRef
+
+if TYPE_CHECKING:
+    from torch.utils.hooks import RemovableHandle
+
+
+__all__ = ["DistributedDataParallel"]
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class _MixedPrecision:
+    """
+    Configuration for DDP-native mixed precision training.
+
+    Attributes:
+        param_dtype (torch.dtype): dtype used for model parameters and, when
+            ``cast_forward_inputs`` is set to ``True``, for inputs; it is
+            therefore the dtype for computation. Outside the forward and
+            backward passes parameters stay in full precision, and model
+            checkpointing always happens in full precision.
+        reduce_dtype (torch.dtype): dtype used for gradient reduction; it is
+            allowed to differ from ``param_dtype``.
+        buffer_dtype (torch.dtype): dtype used for buffers.
+
+    .. note:: This API is experimental and subject to change.
+
+    .. note:: Only floating point tensors are cast to their specified dtypes.
+
+    .. note:: ``state_dict`` checkpoints parameters and buffers in full
+        precision.
+
+    .. note:: Every low precision dtype has to be given explicitly. For
+        example, ``_MixedPrecision(reduce_dtype=torch.float16)`` only sets the
+        reduction dtype to low precision, and DDP will not cast parameters or
+        buffers.
+
+    .. note:: When ``reduce_dtype`` is not given, gradient reduction happens
+        in ``param_dtype`` if that is specified, or in the original parameter
+        dtype otherwise. For example,
+        ``_MixedPrecision(param_dtype=torch.float16)`` would result in
+        communication occurring in fp16.
+    """
+
+    # Each field defaults to ``None``, i.e. no dtype configured.
+    param_dtype: torch.dtype | None = None
+    reduce_dtype: torch.dtype | None = None
+    buffer_dtype: torch.dtype | None = None
+    # TODO (rohan-varma): keep_low_precision_grads: bool = False
+    # TODO (rohan-varma): APIs to allow users to run batchnorm and layernorm
+    # in full precision. For DDP, this can be implemented by not performing the
+    # parameter cast for BN and LN units.
+
+
+def _cast_buffers(mixed_precision_config, root_module):
+    """Cast every buffer of ``root_module`` to ``mixed_precision_config.buffer_dtype``.
+
+    Buffers carrying a truthy ``_ddp_ignored`` attribute are left untouched.
+    The cast result is written back through ``buf.data``.
+    """
+    for buf in root_module.buffers():
+        if not getattr(buf, "_ddp_ignored", False):
+            buf.data = buf.to(dtype=mixed_precision_config.buffer_dtype)
+
+
+def _setup_mixed_precision_params(mixed_precision_config, root_module):
+    """Attach a storage-freed low precision twin to each parameter.
+
+    For every parameter of ``root_module`` that is not DDP-ignored and does not
+    already have ``_mp_param``, a zero tensor of the same shape with dtype
+    ``mixed_precision_config.param_dtype`` is created, stored as
+    ``param._mp_param`` and passed to ``_free_storage``. ``param._fp_param`` is
+    set to the parameter's current ``.data``.
+    """
+    for param in root_module.parameters():
+        # Do not setup mixed precision for DDP ignored parameters.
+        if getattr(param, "_ddp_ignored", False):
+            continue
+        # Parameters that were already set up are left as they are.
+        if hasattr(param, "_mp_param"):
+            continue
+
+        low_precision = torch.zeros_like(
+            param,
+            device=param.device,
+            dtype=mixed_precision_config.param_dtype,
+            requires_grad=param.requires_grad,
+        )
+        param._mp_param = low_precision
+        _free_storage(low_precision)
+        # _fp_param will point to the full precision param so it can be switched
+        # back to at the end of forward / backward.
+        param._fp_param = param.data
+
+
+def _tree_flatten_with_rref(output):
+    """Flatten ``output``, unwrapping it first when it is an ``RRef``.
+
+    Returns:
+        A ``(leaves, treespec, output_is_rref)`` triple: the flattened leaves,
+        the spec needed to re-pack them, and whether the original value was an
+        ``RRef`` so the caller can rebuild it.
+    """
+    output_is_rref = RPC_AVAILABLE and isinstance(output, RRef)
+    # For an RRef, flatten the locally held value instead of the wrapper.
+    payload = output.local_value() if output_is_rref else output
+    output_tensor_list, treespec = tree_flatten(payload)
+    return output_tensor_list, treespec, output_is_rref
+
+
+def _tree_unflatten_with_rref(output, treespec, output_is_rref):
+    """Re-pack flattened leaves with ``treespec``, re-wrapping in an ``RRef`` if asked.
+
+    Args:
+        output: flattened leaves to re-pack.
+        treespec: spec describing the original structure.
+        output_is_rref: when truthy, the rebuilt value is wrapped in ``RRef``.
+    """
+    rebuilt = tree_unflatten(output, treespec)
+    return RRef(rebuilt) if output_is_rref else rebuilt
+
+
+def _find_tensors(obj):
+    r"""Recursively find all tensors contained in the specified object.
+
+    Lists, tuples, dict values and dataclass fields are searched. A tensor
+    yields a one-element list, containers yield a lazy chained iterator, and
+    anything else yields an empty list.
+    """
+    # If the current node is the owner of the RRef, unwrap it and try to
+    # find Tensors.
+    # TODO: Expand to remote RRefs.
+    if RPC_AVAILABLE and isinstance(obj, RRef) and obj.is_owner():
+        return _find_tensors(obj.local_value())
+
+    if isinstance(obj, torch.Tensor):
+        return [obj]
+
+    # Pick the children to recurse into based on the container kind.
+    if isinstance(obj, (list, tuple)):
+        children = obj
+    elif isinstance(obj, dict):
+        children = obj.values()
+    elif is_dataclass(obj):
+        children = (getattr(obj, f.name) for f in fields(obj))
+    else:
+        return []
+
+    return itertools.chain.from_iterable(map(_find_tensors, children))
+
+
+def _dump_DDP_relevant_env_vars():
+    """Print the DDP-related environment variables, one ``env:NAME=value`` per line.
+
+    Variables that are not set are reported with the value ``N/A``.
+    """
+    relevant_env_vars = (
+        "RANK",
+        "LOCAL_RANK",
+        "WORLD_SIZE",
+        "MASTER_PORT",
+        "MASTER_ADDR",
+        "CUDA_VISIBLE_DEVICES",
+        "GLOO_SOCKET_IFNAME",
+        "GLOO_DEVICE_TRANSPORT",
+        "NCCL_SOCKET_IFNAME",
+        "TORCH_NCCL_BLOCKING_WAIT",
+        "NCCL_DEBUG",
+        "NCCL_DEBUG_SUBSYS",
+        "NCCL_IB_DISABLE",
+        # More NCCL env vars:
+        "NCCL_P2P_DISABLE",
+        "NCCL_P2P_LEVEL",
+        "NCCL_SHM_DISABLE",
+        "NCCL_SOCKET_NTHREADS",
+        "NCCL_NSOCKS_PERTHREAD",
+        "NCCL_BUFFSIZE",
+        "NCCL_NTHREADS",
+        "NCCL_RINGS",
+        "NCCL_MAX_NCHANNELS",
+        "NCCL_MIN_NCHANNELS",
+        "NCCL_CHECKS_DISABLE",
+        "NCCL_CHECK_POINTERS",
+        "NCCL_LAUNCH_MODE",
+        "NCCL_IB_HCA",
+        "NCCL_IB_TIMEOUT",
+        "NCCL_IB_RETRY_CNT",
+        "NCCL_IB_GID_INDEX",
+        "NCCL_IB_SL",
+        "NCCL_IB_TC",
+        "NCCL_IB_AR_THRESHOLD",
+        "NCCL_IB_CUDA_SUPPORT",
+        "NCCL_NET_GDR_LEVEL",
+        "NCCL_NET_GDR_READ",
+        "NCCL_SINGLE_RING_THRESHOLD",
+        "NCCL_LL_THRESHOLD",
+        "NCCL_TREE_THRESHOLD",
+        "NCCL_ALGO",
+        "NCCL_PROTO",
+        "NCCL_IGNORE_CPU_AFFINITY",
+        "NCCL_DEBUG_FILE",
+        "NCCL_COLLNET_ENABLE",
+        "NCCL_TOPO_FILE",
+        "NCCL_TOPO_DUMP_FILE",
+        "TORCH_NCCL_ASYNC_ERROR_HANDLING",
+    )
+    environ = os.environ
+    # Every entry ends with a newline; ``print`` then appends one more.
+    report = "".join(
+        f"env:{name}={environ.get(name, 'N/A')}\n" for name in relevant_env_vars
+    )
+    print(report)
+
+
+class _BufferCommHookLocation(Enum):
+    # Location of a buffer communication hook; values are assigned by auto().
+    # NOTE(review): the names suggest "before" vs "after" the forward pass;
+    # confirm against the code that consumes this enum.
+    PRE_FORWARD = auto()
+    POST_FORWARD = auto()
+
+
+@dataclass
+class _BufferCommHook:
+    # Groups a buffer communication hook with its state and location.
+    # The hook callable; its expected signature is not visible in this block.
+    buffer_comm_hook: Callable
+    # Opaque state object kept alongside the hook.
+    buffer_comm_hook_state: Any
+    # One of the _BufferCommHookLocation members.
+    buffer_comm_hook_location: _BufferCommHookLocation
+
+
+# _DDPSink is an autograd Function used to run work when the backward pass
+# starts, such as queueing a callback on the outermost backward/graph task.
+# Queueing it this way helps ensure the callback fires after all gradients
+# have been computed.
+class _DDPSink(Function):
+    @staticmethod
+    # pyrefly: ignore [bad-override]
+    def forward(ctx, ddp_weakref, *inputs):
+        """Pass ``inputs`` through, remembering the DDP weak reference on ``ctx``.
+
+        Tensor inputs are cloned when the referenced DDP instance has a truthy
+        ``_ddp_sink_clone``; otherwise the inputs are returned as given.
+        """
+        # set_materialize_grads(False) will ensure that None gradients stay as
+        # None and are not filled with zeros.
+        ctx.set_materialize_grads(False)
+        ctx.ddp_weakref = ddp_weakref
+        ret = inputs
+        if ddp_weakref()._ddp_sink_clone:
+            ret = tuple(
+                inp.clone() if isinstance(inp, torch.Tensor) else inp for inp in inputs
+            )
+        return ret
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        """Return the incoming gradients, enqueueing the delayed allreduce once.
+
+        The leading ``None`` in the result is the gradient slot for the
+        ``ddp_weakref`` argument of ``forward``.
+        """
+        # Enqueue delay allreduce for static graph training on the first
+        # iteration.
+        # Despite its name, ``ddp_weakref`` below holds the dereferenced object.
+        ddp_weakref = ctx.ddp_weakref()
+        reducer = ddp_weakref.reducer
+        static_graph = ddp_weakref.static_graph
+        delay_ar_enqueued = (
+            static_graph and ddp_weakref._static_graph_delay_allreduce_enqueued
+        )
+        if static_graph and not delay_ar_enqueued:
+            Variable._execution_engine.queue_callback(  # type: ignore[call-arg,misc]
+                reducer._delay_all_reduce
+            )
+            # Remember that the callback was queued so it is not queued again.
+            ddp_weakref._static_graph_delay_allreduce_enqueued = True
+
+        return (None, *grad_outputs)
+
+
+class _DDPJoinHook(JoinHook):
+    """Join hook that delegates to a ``DistributedDataParallel`` instance."""
+
+    def __init__(self, ddp, divide_by_initial_world_size):
+        """Set config variables for internal usage.
+
+        Args:
+            ddp: the ``DistributedDataParallel`` instance this hook acts on;
+                its ``logger`` must not be ``None``.
+            divide_by_initial_world_size: value stored on
+                ``ddp._divide_by_initial_world_size``.
+        """
+        assert isinstance(ddp, DistributedDataParallel), (
+            "DDP join hook requires passing in a DistributedDataParallel "
+            "instance as the state"
+        )
+        assert ddp.logger is not None
+        ddp.logger._set_uneven_input_join()
+        self.ddp = ddp
+        self.ddp._divide_by_initial_world_size = divide_by_initial_world_size
+        super().__init__()
+
+    def main_hook(self):
+        """Shadow the DDP collective communication operations in the forward and backward passes."""
+        ddp = self.ddp
+        # Buckets are rebuilt only once during a training period
+        ddp.reducer._rebuild_buckets()
+
+        # Schedule a broadcast if we are syncing module buffers in the
+        # forward pass
+        # TODO: make DDP uneven inputs context manager support buffer
+        # comm hook (https://github.com/pytorch/pytorch/issues/65436)
+        ddp._check_and_sync_module_buffers()
+
+        # Check if need to sync in the backward pass
+        should_sync_backwards = ddp._check_global_requires_backward_grad_sync(
+            is_joined_rank=True
+        )
+        # Forward parameter sync is disabled in the next iteration if we
+        # are skipping gradient sync this iteration, so set
+        # `require_forward_param_sync` accordingly
+        ddp.require_forward_param_sync = should_sync_backwards
+        if not should_sync_backwards:
+            # Nothing further to shadow when gradient sync is skipped.
+            return
+
+        # Schedule one allreduce per gradient bucket to match the backward
+        # pass allreduce
+        ddp._match_all_reduce_for_bwd_pass()
+
+        # Check if we need to allreduce locally unused parameters
+        if ddp.find_unused_parameters:
+            ddp._match_unused_params_allreduce()
+
+        # Rebuilt parameters are pushed only once during a training period
+        ddp.reducer._push_all_rebuilt_params()
+
+    def post_hook(self, is_last_joiner: bool):
+        """Sync the final model to ensure that the model is the same across all processes.
+
+        Args:
+            is_last_joiner: forwarded unchanged to ``ddp._sync_final_model``.
+        """
+        self.ddp._sync_final_model(is_last_joiner)
+
+
+class DistributedDataParallel(Module, Joinable):
+    r"""Implement distributed data parallelism based on ``torch.distributed`` at module level.
+
+    This container provides data parallelism by synchronizing gradients
+    across each model replica. The devices to synchronize across are
+    specified by the input ``process_group``, which is the entire world
+    by default. Note that ``DistributedDataParallel`` does not chunk or
+    otherwise shard the input across participating GPUs; the user is
+    responsible for defining how to do so, for example through the use
+    of a :class:`DistributedSampler`.
+
+    See also: :ref:`distributed-basics` and :ref:`cuda-nn-ddp-instead`.
+    The same constraints on input as in :class:`torch.nn.DataParallel` apply.
+
+    Creation of this class requires that ``torch.distributed`` to be already
+    initialized, by calling :func:`torch.distributed.init_process_group`.
+
+    ``DistributedDataParallel`` is proven to be significantly faster than
+    :class:`torch.nn.DataParallel` for single-node multi-GPU data
+    parallel training.
+
+    To use ``DistributedDataParallel`` on a host with N GPUs, you should spawn
+    up ``N`` processes, ensuring that each process exclusively works on a single
+    GPU from 0 to N-1. This can be done by either setting
+    ``CUDA_VISIBLE_DEVICES`` for every process or by calling the following API for GPUs,
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.cuda.set_device(i)
+
+    or calling the unified API for :ref:`accelerator<accelerators>`,
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.accelerator.set_device_index(i)
+
+    where i is from 0 to N-1. In each process, you should refer the following
+    to construct this module:
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> if torch.accelerator.is_available():
+        >>>     device_type = torch.accelerator.current_accelerator().type
+        >>>     vendor_backend = torch.distributed.get_default_backend_for_device(device_type)
+        >>>
+        >>> torch.distributed.init_process_group(
+        >>>     backend=vendor_backend, world_size=N, init_method='...'
+        >>> )
+        >>> model = DistributedDataParallel(model, device_ids=[i], output_device=i)
+
+    Or you can use the latest API for initialization:
+
+        >>> torch.distributed.init_process_group(device_id=i)
+
+    In order to spawn up multiple processes per node, you can use either
+    ``torch.distributed.launch`` or ``torch.multiprocessing.spawn``.
+
+    .. note::
+        Please refer to `PyTorch Distributed Overview <https://pytorch.org/tutorials/beginner/dist_overview.html>`__
+        for a brief introduction to all features related to distributed training.
+
+    .. note::
+        ``DistributedDataParallel`` can be used in conjunction with
+        :class:`torch.distributed.optim.ZeroRedundancyOptimizer` to reduce
+        per-rank optimizer states memory footprint. Please refer to
+        `ZeroRedundancyOptimizer recipe <https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html>`__
+        for more details.
+
+    .. note:: ``nccl`` backend is currently the fastest and highly recommended
+        backend when using GPUs. This applies to both single-node and
+        multi-node distributed training.
+
+    .. note:: This module also supports mixed-precision distributed training.
+        This means that your model can have different types of parameters such
+        as mixed types of ``fp16`` and ``fp32``, the gradient reduction on these
+        mixed types of parameters will just work fine.
+
+    .. note:: If you use ``torch.save`` on one process to checkpoint the module,
+        and ``torch.load`` on some other processes to recover it, make sure that
+        ``map_location`` is configured properly for every process. Without
+        ``map_location``, ``torch.load`` would recover the module to devices
+        where the module was saved from.
+
+    .. note:: When a model is trained on ``M`` nodes with ``batch=N``, the
+        gradient will be ``M`` times smaller when compared to the same model
+        trained on a single node with ``batch=M*N`` if the loss is summed (NOT
+        averaged as usual) across instances in a batch (because the gradients
+        between different nodes are averaged). You should take this into
+        consideration when you want to obtain a mathematically equivalent
+        training process compared to the local training counterpart. But in most
+        cases, you can just treat a DistributedDataParallel wrapped model, a
+        DataParallel wrapped model and an ordinary model on a single GPU as the
+        same (E.g. using the same learning rate for equivalent batch size).
+
+    .. note::
+        Parameters are never broadcast between processes. The module performs
+        an all-reduce step on gradients and assumes that they will be modified
+        by the optimizer in all processes in the same way. Buffers
+        (e.g. BatchNorm stats) are broadcast from the module in process of rank
+        0, to all other replicas in the system in every iteration.
+
+    .. note::
+        If you are using DistributedDataParallel in conjunction with the
+        :ref:`distributed-rpc-framework`, you should always use
+        :meth:`torch.distributed.autograd.backward` to compute gradients and
+        :class:`torch.distributed.optim.DistributedOptimizer` for optimizing
+        parameters.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> import torch.distributed.autograd as dist_autograd
+            >>> from torch.nn.parallel import DistributedDataParallel as DDP
+            >>> import torch
+            >>> from torch import optim
+            >>> from torch.distributed.optim import DistributedOptimizer
+            >>> import torch.distributed.rpc as rpc
+            >>> from torch.distributed.rpc import RRef
+            >>>
+            >>> t1 = torch.rand((3, 3), requires_grad=True)
+            >>> t2 = torch.rand((3, 3), requires_grad=True)
+            >>> rref = rpc.remote("worker1", torch.add, args=(t1, t2))
+            >>> ddp_model = DDP(my_model)
+            >>>
+            >>> # Setup optimizer
+            >>> optimizer_params = [rref]
+            >>> for param in ddp_model.parameters():
+            >>>     optimizer_params.append(RRef(param))
+            >>>
+            >>> dist_optim = DistributedOptimizer(
+            >>>     optim.SGD,
+            >>>     optimizer_params,
+            >>>     lr=0.05,
+            >>> )
+            >>>
+            >>> with dist_autograd.context() as context_id:
+            >>>     pred = ddp_model(rref.to_here())
+            >>>     loss = loss_func(pred, target)
+            >>>     dist_autograd.backward(context_id, [loss])
+            >>>     dist_optim.step(context_id)
+
+    .. note::
+        DistributedDataParallel currently offers limited support for gradient
+        checkpointing with :meth:`torch.utils.checkpoint`.
+        If the checkpoint is done with use_reentrant=False (recommended), DDP
+        will work as expected without any limitations.
+        If, however, the checkpoint is done with use_reentrant=True (the default),
+        DDP will work as expected when there are no unused parameters in the model
+        and each layer is checkpointed at most once (make sure you are not passing
+        `find_unused_parameters=True` to DDP). We currently do not support the
+        case where a layer is checkpointed multiple times, or when there are unused
+        parameters in the checkpointed model.
+
+    .. note::
+        To let a non-DDP model load a state dict from a DDP model,
+        :meth:`~torch.nn.modules.utils.consume_prefix_in_state_dict_if_present`
+        needs to be applied to strip the prefix "module." in the DDP state dict before loading.
+
+    .. warning::
+        Constructor, forward method, and differentiation of the output (or a
+        function of the output of this module) are distributed synchronization
+        points. Take that into account in case different processes might be
+        executing different code.
+
+    .. warning::
+        This module assumes all parameters are registered in the model by the
+        time it is created. No parameters should be added nor removed later.
+        Same applies to buffers.
+
+    .. warning::
+        This module assumes that the parameters registered in the model of each
+        distributed process are in the same order. The module itself will
+        conduct gradient ``allreduce`` following the reverse order of the
+        registered parameters of the model. In other words, it is users'
+        responsibility to ensure that each distributed process has the exact
+        same model and thus the exact same parameter registration order.
+
+    .. warning::
+        This module allows parameters with non-rowmajor-contiguous strides.
+        For example, your model may contain some parameters whose
+        :class:`torch.memory_format` is ``torch.contiguous_format``
+        and others whose format is ``torch.channels_last``.  However,
+        corresponding parameters in different processes must have the
+        same strides.
+
+    .. warning::
+        This module doesn't work with :func:`torch.autograd.grad` (i.e. it will
+        only work if gradients are to be accumulated in ``.grad`` attributes of
+        parameters).
+
+    .. warning::
+        If you plan on using this module with a ``nccl`` backend or a ``gloo``
+        backend (that uses Infiniband), together with a DataLoader that uses
+        multiple workers, please change the multiprocessing start method to
+        ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately
+        Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will
+        likely experience deadlocks if you don't change this setting.
+
+    .. warning::
+        You should never try to change your model's parameters after wrapping
+        up your model with ``DistributedDataParallel``. Because, when
+        wrapping up your model with ``DistributedDataParallel``, the constructor
+        of ``DistributedDataParallel`` will register the additional gradient
+        reduction functions on all the parameters of the model itself at the
+        time of construction. If you change the model's parameters afterwards,
+        gradient reduction functions no longer match the correct set of
+        parameters.
+
+    .. warning::
+        Using ``DistributedDataParallel`` in conjunction with the
+        :ref:`distributed-rpc-framework` is experimental and subject to change.
+
+    Args:
+        module (Module): module to be parallelized
+        device_ids (list of int or torch.device): CUDA devices.
+                   1) For single-device modules, ``device_ids`` can
+                   contain exactly one device id, which represents the only
+                   CUDA device where the input module corresponding to this process resides.
+                   Alternatively, ``device_ids`` can also be ``None``.
+                   2) For multi-device modules and CPU modules,
+                   ``device_ids`` must be ``None``.
+
+                   When ``device_ids`` is ``None`` for both cases,
+                   both the input data for the forward pass and the actual module
+                   must be placed on the correct device.
+                   (default: ``None``)
+        output_device (int or torch.device): Device location of output for
+                      single-device CUDA modules. For multi-device modules and
+                      CPU modules, it must be ``None``, and the module itself
+                      dictates the output location. (default: ``device_ids[0]``
+                      for single-device modules)
+        broadcast_buffers (bool): Flag that enables syncing (broadcasting)
+                          buffers of the module at beginning of the ``forward``
+                          function. (default: ``True``)
+        init_sync (bool): Whether to sync during initialization to verify param
+                          shapes and broadcast parameters and buffers.
+                          WARNING: if this is set to False the user is required
+                          to ensure themselves that the weights are the same on
+                          all ranks.
+                          (default: ``True``)
+        process_group: The process group to be used for distributed data
+                       all-reduction. If ``None``, the default process group, which
+                       is created by :func:`torch.distributed.init_process_group`,
+                       will be used. (default: ``None``)
+        bucket_cap_mb: ``DistributedDataParallel`` will bucket parameters into
+                       multiple buckets so that gradient reduction of each
+                       bucket can potentially overlap with backward computation.
+                       :attr:`bucket_cap_mb` controls the bucket size in
+                       MebiBytes (MiB). If ``None``, a default size of 25 MiB
+                       will be used. (default: ``None``)
+        find_unused_parameters (bool): Traverse the autograd graph from all
+                               tensors contained in the return value of the
+                               wrapped module's ``forward`` function. Parameters
+                               that don't receive gradients as part of this
+                               graph are preemptively marked as being ready to
+                               be reduced. In addition, parameters that may have
+                               been used in the wrapped module's ``forward``
+                               function but were not part of loss computation and
+                               thus would also not receive gradients are
+                               preemptively marked as ready to be reduced.
+                               (default: ``False``)
+        check_reduction: This argument is deprecated.
+        gradient_as_bucket_view (bool): When set to ``True``, gradients will be views
+                      pointing to different offsets of ``allreduce`` communication
+                      buckets. This can reduce peak memory usage, where the
+                      saved memory size will be equal to the total gradients
+                      size. Moreover, it avoids the overhead of copying between
+                      gradients and ``allreduce`` communication buckets. When
+                      gradients are views, ``detach_()`` cannot be called on the
+                      gradients. If hitting such errors, please fix it by
+                      referring to the :meth:`~torch.optim.Optimizer.zero_grad`
+                      function in ``torch/optim/optimizer.py`` as a solution.
+                      Note that gradients will be views after first iteration, so
+                      the peak memory saving should be checked after first iteration.
+        static_graph (bool): When set to ``True``, DDP knows the trained graph is
+                     static. Static graph means 1) The set of used and unused
+                     parameters will not change during the whole training loop; in
+                     this case, it does not matter whether users set
+                     ``find_unused_parameters = True`` or not. 2) How the graph is trained
+                     will not change during the whole training loop (meaning there is
+                     no control flow depending on iterations).
+                     When static_graph is set to be ``True``, DDP will support cases that
+                     could not be supported in the past:
+                     1) Reentrant backwards.
+                     2) Activation checkpointing multiple times.
+                     3) Activation checkpointing when model has unused parameters.
+                     4) There are model parameters that are outside of forward function.
+                     5) Potentially improve performance when there are unused parameters,
+                     as DDP will not search graph in each iteration to detect unused
+                     parameters when static_graph is set to be ``True``.
+                     To check whether you can set static_graph to be ``True``, one way is to
+                     check ddp logging data at the end of your previous model training,
+                     if ``ddp_logging_data.get("can_set_static_graph") == True``, mostly you
+                     can set ``static_graph = True`` as well.
+
+                     Example::
+                         >>> # xdoctest: +SKIP("undefined variables")
+                         >>> model_DDP = torch.nn.parallel.DistributedDataParallel(model)
+                         >>> # Training loop
+                         >>> ...
+                         >>> ddp_logging_data = model_DDP._get_ddp_logging_data()
+                         >>> static_graph = ddp_logging_data.get("can_set_static_graph")
+        delay_all_reduce_named_params (list of tuple of str and torch.nn.Parameter): a list
+                    of named parameters whose all reduce will be delayed when the gradient of
+                    the parameter specified in ``param_to_hook_all_reduce`` is ready. Other
+                    arguments of DDP do not apply to named params specified in this argument
+                    as these named params will be ignored by DDP reducer.
+        param_to_hook_all_reduce (torch.nn.Parameter): a parameter to hook delayed all reduce
+                    of parameters specified in ``delay_all_reduce_named_params``.
+        skip_all_reduce_unused_params: When set to True, DDP will skip reducing unused parameters.
+                    This requires that unused parameters remain the same across all ranks throughout
+                    the entire training process. If this condition is not met, it may cause
+                    desynchronization and result in a training hang.
+
+
+    Attributes:
+        module (Module): the module to be parallelized.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
+        >>> net = torch.nn.parallel.DistributedDataParallel(model)
+    """
+
+    # Class-level slot used to track whether the given thread is inside DDP forward
+    # for torchdynamo purposes. Defaults to ``None``.
+    _active_ddp_module: Optional["DistributedDataParallel"] = None
+
+    def __init__(
+        self,
+        module,
+        device_ids=None,
+        output_device=None,
+        dim=0,
+        broadcast_buffers=True,
+        init_sync=True,
+        process_group=None,
+        bucket_cap_mb=None,
+        find_unused_parameters=False,
+        check_reduction=False,
+        gradient_as_bucket_view=False,
+        static_graph=False,
+        delay_all_reduce_named_params=None,
+        param_to_hook_all_reduce=None,
+        mixed_precision: _MixedPrecision | None = None,
+        device_mesh=None,
+        skip_all_reduce_unused_params=False,
+    ):
+        """
+        Validate the arguments, resolve the process group and devices, and build the reducer.
+
+        Most arguments are described in the ``Args`` section of the class
+        docstring; ``dim``, ``mixed_precision`` and ``device_mesh`` are not
+        listed there. ``dim`` is stored as ``self.dim``. ``mixed_precision``,
+        when not ``None``, enables the low-precision copy hooks and the upcast
+        communication hook set up near the end of this method. ``device_mesh``
+        must be one-dimensional and is mutually exclusive with
+        ``process_group``.
+
+        Raises:
+            ValueError: if only one of ``delay_all_reduce_named_params`` and
+                ``param_to_hook_all_reduce`` is given, if ``device_ids`` holds
+                more than one element, if the managed parameters do not live
+                on exactly one device type, or if ``device_ids`` /
+                ``output_device`` are passed for a CPU or multi-device module.
+            RuntimeError: if both ``process_group`` and ``device_mesh`` are
+                given, if ``device_mesh`` is not 1D, if no managed parameter
+                requires grad (and nothing is delayed), or if the module still
+                has uninitialized parameters.
+        """
+        super().__init__()
+        Joinable.__init__(self)
+        self._use_python_reducer = (
+            torch._dynamo.utils.get_optimize_ddp_mode() == "python_reducer"
+        )
+        # ``_log_and_throw`` checks ``self.logger``, so it must exist before the
+        # first validation below.
+        self.logger: dist.Logger | None = None
+        # The two delayed-allreduce arguments are only accepted as a pair.
+        if bool(delay_all_reduce_named_params is not None) != bool(
+            param_to_hook_all_reduce is not None
+        ):
+            self._log_and_throw(
+                ValueError,
+                "delay_all_reduce_named_params and param_to_hook_all_reduce "
+                "need to be set at the same time.",
+            )
+
+        # Resolve the process group from, in order: the default group, the
+        # explicit ``process_group``, or dimension 0 of a 1D ``device_mesh``.
+        if process_group and device_mesh is not None:
+            raise RuntimeError(
+                "Cannot specify both process_group and device_mesh arguments."
+            )
+        elif process_group is None and device_mesh is None:
+            self.process_group = _get_default_group()
+        elif device_mesh is None:
+            # pyrefly: ignore [bad-assignment]
+            self.process_group = process_group
+        else:
+            if device_mesh.ndim != 1:
+                raise RuntimeError(
+                    f"Only 1D device mesh is supported, but got {device_mesh}."
+                )
+            self.device_mesh = device_mesh
+            self.process_group = device_mesh.get_group(mesh_dim=0)
+
+            root_mesh = device_mesh._get_root_mesh()
+            # if a root mesh is not the same as device_mesh,
+            # meaning the device_mesh is sliced out from the root mesh.
+            if root_mesh != device_mesh:
+                # TODO: This is a temporary work around to enable DDP + TP.
+                # We should do the logic in DDP so that the 2D implementation is
+                # sound and the state_dict works out of the box.
+                # This has to be done before check UninitializedParameter.
+                from torch.distributed.tensor.parallel.ddp import (
+                    _pre_dp_module_transform,
+                )
+
+                _pre_dp_module_transform(module)
+
+        # Collect the names to ignore: those declared on the module itself plus
+        # the names of every parameter whose all-reduce is delayed.
+        self._delay_all_reduce_params = []
+        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
+            self.parameters_to_ignore = set(module._ddp_params_and_buffers_to_ignore)
+        else:
+            self.parameters_to_ignore = set()
+        if delay_all_reduce_named_params is not None:
+            for name, param in delay_all_reduce_named_params:
+                self.parameters_to_ignore.add(name)
+                self._delay_all_reduce_params.append(param)
+
+        # Parameters managed by this instance: everything returned by
+        # ``named_parameters()`` whose name is not in the ignore set.
+        self._module_parameters = [
+            p
+            for n, p in module.named_parameters()
+            if n not in self.parameters_to_ignore
+        ]
+        if not any(p.requires_grad for p in self._module_parameters):
+            if len(self._delay_all_reduce_params):
+                logger.info("Delay the AllReduce of all parameters.")
+            else:
+                self._log_and_throw(
+                    RuntimeError,
+                    "DistributedDataParallel is not needed when a module "
+                    "doesn't have any parameter that requires a gradient.",
+                )
+
+        if device_ids is not None and len(device_ids) > 1:
+            self._log_and_throw(
+                ValueError,
+                "device_ids can only be None or contain a single element.",
+            )
+
+        self.is_multi_device_module = (
+            len({p.device for p in self._module_parameters}) > 1
+        )
+        distinct_device_types = {
+            p.device.type for p in self._module_parameters if p.device is not None
+        }
+        # Exactly one device type is required; an empty set (no managed
+        # parameters) is rejected by this check as well.
+        if len(distinct_device_types) != 1:
+            self._log_and_throw(
+                ValueError,
+                "DistributedDataParallel's input module must be on "
+                f"the same type of devices, but input module parameters locate in {distinct_device_types}.",
+            )
+
+        self.device_type = next(iter(distinct_device_types))
+
+        # ``device_ids`` / ``output_device`` are only honoured for a
+        # single-device, non-CPU module; in every other case they must be
+        # unset and both attributes are stored as ``None``.
+        if (
+            device_ids is None
+            or len(device_ids) == 0  # For backward compatibility.
+            or self.device_type == "cpu"
+            or self.is_multi_device_module
+        ):
+            if device_ids or output_device:
+                self._log_and_throw(
+                    ValueError,
+                    "DistributedDataParallel device_ids and output_device arguments "
+                    "only work with single-device/multiple-device GPU modules or CPU modules, "
+                    f"but got device_ids {device_ids}, output_device {output_device}, "
+                    f"and module parameters { ({p.device for p in self._module_parameters}) }.",
+                )
+
+            self.device_ids = None
+            self.output_device = None
+        else:
+            # pyrefly: ignore [bad-assignment]
+            self.device_ids = [_get_device_index(x, True) for x in device_ids]
+
+            if output_device is None:
+                output_device = device_ids[0]
+
+            # pyrefly: ignore [bad-assignment]
+            self.output_device = _get_device_index(output_device, True)
+
+        # Always initialised to False here; ``_set_static_graph()`` is invoked
+        # further down when the ``static_graph`` argument is truthy.
+        self.static_graph = False
+        self.dim = dim
+        self.module = module
+        self.device = next(iter(self._module_parameters)).device
+        self.broadcast_buffers = broadcast_buffers
+        self.find_unused_parameters = find_unused_parameters
+        self.require_backward_grad_sync = True
+        self.require_forward_param_sync = True
+        self.gradient_as_bucket_view = gradient_as_bucket_view
+        self.mixed_precision = mixed_precision
+        if self.mixed_precision is not None:
+            logger.warning("Received mixed precision config %s", self.mixed_precision)
+
+        if check_reduction:
+            # This argument is no longer used since the reducer
+            # will ensure reduction completes even if some parameters
+            # do not receive gradients.
+            warnings.warn(
+                "The `check_reduction` argument in `DistributedDataParallel` "
+                "module is deprecated. Please avoid using it.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
+        # Check that a module does not have Uninitialized parameters
+        for param in self._module_parameters:
+            if isinstance(param, torch.nn.parameter.UninitializedParameter):
+                self._log_and_throw(
+                    RuntimeError,
+                    "Modules with uninitialized parameters can't be used with `DistributedDataParallel`. "
+                    "Run a dummy forward pass to correctly initialize the modules",
+                )
+        # used for intra-node param sync and inter-node sync as well
+        # (250 MiB, expressed in bytes)
+        self.broadcast_bucket_size = 250 * 1024 * 1024
+
+        # reduction bucket size
+        if bucket_cap_mb is None:
+            # default case (bucket cap is 25 MiB)
+            bucket_cap_mb = 25
+            self.bucket_bytes_cap_default = True
+        else:
+            self.bucket_bytes_cap_default = False
+        self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
+
+        # Whether to perform input tensor CPU to GPU copies on a side-stream
+        # (enabled unless PYTORCH_DDP_USE_SIDE_STREAM is set to something other than "1")
+        self.use_side_stream_for_tensor_copies = (
+            os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1"
+        )
+
+        # Initialize gradient buffers and register all reduce hook
+        self._delay_grad_buffer: torch.Tensor | None = None
+        self._delay_grad_views: list[torch.Tensor] = []
+        self._delay_all_reduce_all_params = False
+        if len(self._delay_all_reduce_params) != 0:
+            self._register_delay_all_reduce_hook(
+                bucket_cap_mb=bucket_cap_mb,
+                param_to_hook_all_reduce=param_to_hook_all_reduce,
+                device_ids=device_ids,
+            )
+            # Early exit: when the helper reports that every parameter requiring
+            # grad is delayed, none of the reducer setup below is performed.
+            if self._delay_all_reduce_all_params:
+                return
+
+        # NOTE(review): assigned after the early return above, so this attribute
+        # (like everything else below) is not set on the delay-everything path.
+        self.skip_all_reduce_unused_params = skip_all_reduce_unused_params
+
+        # Build parameters for reducer.
+        parameters, expect_sparse_gradient = self._build_params_for_reducer()
+
+        # All collectives during initialization are gated by this flag.
+        if init_sync:
+            # Verify model equivalence.
+            _verify_param_shape_across_processes(self.process_group, parameters)
+            # Sync params and buffers. Ensures all DDP models start off at the same value.
+            _sync_module_states(
+                module=self.module,
+                process_group=self.process_group,
+                broadcast_bucket_size=self.broadcast_bucket_size,
+                src=0,
+                params_and_buffers_to_ignore=self.parameters_to_ignore,
+                broadcast_buffers=self.broadcast_buffers,
+            )
+
+        # In debug mode, build a mapping of parameter index -> parameter.
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters)
+
+        # Builds reducer.
+        self._ddp_init_helper(
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            static_graph,
+        )
+        self._comm_hooks: list[tuple[Callable, object]] = []
+
+        if self.mixed_precision is not None:
+            _setup_mixed_precision_params(self.mixed_precision, self.module)
+            _cast_buffers(self.mixed_precision, self.module)
+            # Stream used for async low precision copies.
+            self._mp_stream = torch.Stream()
+            self._submodule_to_event = defaultdict(deque)  # type: ignore[var-annotated]
+            # Add forward pre-hook to root module to kick off copies to lower
+            # precision.
+            self.module.register_forward_pre_hook(
+                self._root_copy_hook, prepend=False, with_kwargs=True
+            )
+            # Add forward pre hook to all submodules to wait for copy events
+            # before running computation.
+            for module in self.module.modules():
+                module.register_forward_pre_hook(
+                    self._module_wait_for_copy_hook,
+                    prepend=False,
+                    with_kwargs=True,
+                )
+            # Set up callbacks in backward to upcast and use full precision
+            # params. TODO (rohan-varma): Make this compose with general
+            # comm hooks and apply_optimizer_in_backward. Importing inline to
+            # avoid circular import issue.
+            from torch.distributed.algorithms.ddp_comm_hooks.mixed_precision_hooks import (
+                _AllreduceUpcastHookState,
+                _reducer_allreduce_and_upcast_hook,
+            )
+
+            upcast_hook_state = _AllreduceUpcastHookState(
+                ddp_weakref=weakref.ref(self),
+                upcast_stream=torch.Stream(),
+            )
+            self.register_comm_hook(
+                upcast_hook_state,
+                _reducer_allreduce_and_upcast_hook,
+            )
+            # Inform reducer of reduced precision param dtype for correctness
+            # of type checks between gradient and bucket.
+            self.reducer._set_mixed_precision_param_dtype(  # type: ignore[attr-defined]
+                self.mixed_precision.param_dtype
+            )
+
+        self._has_rebuilt_buckets = False
+
+        if static_graph:
+            self._set_static_graph()
+
+        self._lazy_init_ran = False
+
+        # Register the AccumulateGrad post hooks if optimize_ddp is
+        # True. The hooks will be deregistered if compiled_autograd is not
+        # enabled.
+        self._accum_grad_hooks: list[RemovableHandle] = []
+        if self._use_python_reducer:
+            # NOTE(review): the assignments below mutate process-wide
+            # inductor/dynamo configuration, not just this instance.
+            # pyrefly: ignore [bad-assignment]
+            torch._inductor.config._fuse_ddp_communication = True
+            torch._inductor.config._fuse_ddp_bucket_size = bucket_cap_mb
+            # Directly adding this to the trace rule will disturb the users
+            # who are using DDPOptimizer.
+            torch._dynamo.trace_rules.LEGACY_MOD_INLINELIST.add(
+                "torch.nn.parallel.distributed"
+            )
+            torch._dynamo.trace_rules.get_legacy_mod_inlinelist.cache_clear()
+            # NOTE: we should init these lazily
+            self._register_accum_grad_hook()
+
+        # Whether or not DDPSink performs a clone.
+        self._ddp_sink_clone = True
+
+    def _register_accum_grad_hook(self):
+        """
+        Attach a post-accumulate-grad hook to every managed parameter that requires grad.
+
+        Each hook either hands ``(param.grad, param)`` to the hooks stored in
+        ``self._comm_hooks`` or, when that list is empty, divides the gradient
+        by the process-group size, all-reduces it through functional
+        collectives and copies the result back into ``param.grad``. The
+        handles returned by the registration are collected in
+        ``self._accum_grad_hooks``.
+        """
+        import torch.distributed._functional_collectives as fcol
+
+        def compiled_accum_grad_hook(
+            param,
+            *,
+            param_index: int,
+        ):
+            # Nothing to do while gradient sync is switched off or when the
+            # parameter has no gradient.
+            if not self.require_backward_grad_sync or param.grad is None:
+                return
+
+            if not self._comm_hooks:
+                # Default path: divide by the group size, reduce with "sum",
+                # then write the result into the existing gradient in place.
+                scaled = param.grad / self.process_group.size()
+                reduced = fcol.all_reduce(scaled, "sum", self.process_group)
+                param.grad.copy_(reduced)
+                return
+
+            for comm_hook, hook_state in self._comm_hooks:
+                comm_hook(hook_state, (param.grad, param))
+
+        # ``param_index`` is the position inside ``self._module_parameters``;
+        # parameters skipped for not requiring grad still consume an index.
+        for position, managed_param in enumerate(self._module_parameters):
+            if managed_param.requires_grad:
+                bound_hook = functools.partial(
+                    compiled_accum_grad_hook,
+                    param_index=position,
+                )
+                handle = managed_param.register_post_accumulate_grad_hook(bound_hook)
+                self._accum_grad_hooks.append(handle)
+
+    def _delayed_all_reduce_hook(self, grad):
+        """
+        Tensor hook that scales the delayed-gradient buffer and launches its all-reduce.
+
+        The incoming ``grad`` is returned unchanged; the hook only uses the
+        call as the trigger for reducing ``self._delay_grad_buffer``.
+        """
+        group = self.process_group
+        # Divide in place by the world size before reducing.
+        self._delay_grad_buffer.div_(dist.get_world_size(group))  # type: ignore[union-attr]
+        # Launched asynchronously; the returned work handle is not kept.
+        dist.all_reduce(self._delay_grad_buffer, group=group, async_op=True)
+        return grad
+
+    def _register_delay_all_reduce_hook(
+        self,
+        bucket_cap_mb,
+        param_to_hook_all_reduce,
+        device_ids,
+    ):
+        """
+        Prepare the flat gradient buffer and the hook used for delayed all-reduce.
+
+        The method allocates one zero-filled buffer covering every parameter in
+        ``self._delay_all_reduce_params``, broadcasts those parameters from
+        rank 0, registers ``self._delayed_all_reduce_hook`` on
+        ``param_to_hook_all_reduce`` and records one buffer view per delayed
+        parameter in ``self._delay_grad_views``. Finally it sets
+        ``self._delay_all_reduce_all_params`` to ``True`` when every parameter
+        of ``self.module`` that requires grad is named in
+        ``self.parameters_to_ignore``.
+
+        Args:
+            bucket_cap_mb: passed through to ``dist._broadcast_coalesced``.
+            param_to_hook_all_reduce: parameter whose gradient hook triggers
+                the delayed all-reduce.
+            device_ids: the ``device_ids`` constructor argument; the buffer is
+                placed on its first entry, or on CPU when it is ``None`` or
+                empty.
+        """
+        # 1. Create gradient buffer
+        # ``device_ids=[]`` is accepted by the constructor for backward
+        # compatibility and treated like ``None``; do the same here instead of
+        # failing on ``device_ids[0]``.
+        device = torch.device("cpu") if not device_ids else device_ids[0]
+        self._delay_grad_buffer = torch.zeros(
+            sum(p.numel() for p in self._delay_all_reduce_params),
+            device=device,
+        )
+
+        # 2. Broadcast the parameters
+        detached_params = [p.detach() for p in self._delay_all_reduce_params]
+        dist._broadcast_coalesced(self.process_group, detached_params, bucket_cap_mb, 0)
+
+        # 3. Hook all reduce to the specified parameter
+        param_to_hook_all_reduce.register_hook(self._delayed_all_reduce_hook)
+
+        # 4. Build tensor views for gradients
+        offset = 0
+        for param in self._delay_all_reduce_params:
+            grad_view = self._delay_grad_buffer[offset : (offset + param.numel())].view(
+                param.shape
+            )
+            self._delay_grad_views.append(grad_view)
+            offset = offset + param.numel()
+
+        # 5. Check whether the all reduce of all params requiring grad is delayed.
+        for module_name, module in self.module.named_modules():
+            for param_name, param in module.named_parameters(recurse=False):
+                if not param.requires_grad:
+                    continue
+                # ``named_modules()`` reports the root module with an empty
+                # prefix. Joining unconditionally would produce ".weight",
+                # which can never match the ``named_parameters()``-style names
+                # held in ``parameters_to_ignore``.
+                full_name = (
+                    f"{module_name}.{param_name}" if module_name else param_name
+                )
+                if full_name not in self.parameters_to_ignore:
+                    # There is at least a param whose all reduce will not be delayed.
+                    # In this case, we should not set self._delay_all_reduce_all_params
+                    # to True.
+                    return
+        self._delay_all_reduce_all_params = True
+
+    def _setup_in_backward_optimizers(self):
+        """
+        Integrate optimizers registered through ``apply_optim_in_backward`` with DDP.
+
+        Does nothing unless at least one managed parameter carries an
+        ``_in_backward_optimizers`` attribute. Current constraints:
+
+        1. Only allreduce is supported at the moment, no custom communication.
+        2. For DDP-managed parameters that have their optimizer run in
+           backward, their gradients are set to ``None``. If your use case
+           requires DDP parameters grad not to be set to ``None`` after their
+           in-backward optimizer runs, please ping
+           https://github.com/pytorch/pytorch/issues/90052.
+        """
+        # NOTE: self._module_parameters is used instead of .parameters() since
+        # the former excludes ignored (non-DDP managed) parameters.
+        managed_params = self._module_parameters
+        if not any(hasattr(p, "_in_backward_optimizers") for p in managed_params):
+            return
+
+        torch._C._log_api_usage_once("ddp.optimizer_in_backward")
+
+        # Drop the hooks that apply_optim_in_backward registered: DDP
+        # customizes how the optimizer is overlapped with backward because of
+        # the allreduce.
+        handle_map = (
+            dist.optim.apply_optimizer_in_backward.param_to_optim_hook_handle_map
+        )
+        for managed in managed_params:
+            for hook_handle in handle_map.get(managed, []):
+                hook_handle.remove()
+
+        # Imported here rather than at module level, otherwise this would
+        # cause a circular import.
+        from torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks import (
+            _apply_optim_in_backward_hook,
+        )
+
+        # The hook state is a weakref to this DDP instance, needed to run
+        # all_reduce (from the reducer) and to reach the managed parameters.
+        overlap_hook = _apply_optim_in_backward_hook(
+            gradient_is_bucket_view=self.gradient_as_bucket_view
+        )
+        self.register_comm_hook(weakref.ref(self), overlap_hook)
+
+        self.reducer._set_optimizer_in_backward()  # type: ignore[attr-defined]
+
+    def _fire_reducer_autograd_hook(self, idx, *unused):
+        """
+        Fire the reducer's autograd hook to allreduce params in a Reducer bucket.
+
+        Note that this is only used during mixed precision training as the
+        Reducer's hooks installed during construction time would not be called
+        as we're working in the low precision parameter setting.
+
+        Args:
+            idx: index forwarded unchanged to ``self.reducer._autograd_hook``.
+            *unused: extra positional arguments supplied by the caller of the
+                hook; they are ignored.
+        """
+        self.reducer._autograd_hook(idx)  # type: ignore[attr-defined]
+
+    def _root_copy_hook(self, *args: Any, **kwargs: Any) -> None:
+        """
+        For DDP mixed precision, put low precision copies on separate stream and create events to wait for them.
+
+        When training with DDP mixed precision, this root pre-forward hook kicks
+        off low precision copies on a separate stream and creates respective
+        events to wait for them.
+
+        The positional and keyword arguments passed by the hook machinery are
+        accepted but not used. One event is recorded per submodule and queued
+        in ``self._submodule_to_event``.
+        """
+        # Clear out previous iteration submodule to event. This is because we
+        # may have populated some events for modules that didn't end up being
+        # used.
+        self._submodule_to_event = defaultdict(deque)  # type: ignore[var-annotated]
+        # Everything below is issued under the ``self._mp_stream`` context.
+        with self._mp_stream:
+            for submodule in self.module.modules():
+                for param in submodule.parameters(recurse=False):
+                    # Do not cast DDP ignored parameters.
+                    if hasattr(param, "_ddp_ignored") and param._ddp_ignored:
+                        continue
+                    # NOTE(review): presumably (re)allocates the storage of the
+                    # low precision shadow ``_mp_param`` to the parameter's
+                    # size - confirm against ``_alloc_storage``.
+                    _alloc_storage(param._mp_param, param.size())
+                    # copy() implicitly casts to low precision
+                    with torch.no_grad():
+                        param._mp_param.copy_(param.data)
+                        # TODO: when zero_grad(set_to_none=False) or in grad
+                        # accumulation case, accumulated grads can be in fp32
+                        # which can cause errors when running DDP backwards due
+                        # to mismatched incoming and accumulated gradient types.
+                        # So we manually cast the accumulated grad down for now,
+                        # in the future we may shift to FSDP style gradient
+                        # accumulation management where the accumulated gradient
+                        # is saved and .grad field is set to None, bypassing
+                        # this issue.
+                        if param.grad is not None:
+                            param.grad.data = param.grad.to(
+                                self.mixed_precision.param_dtype  # type: ignore[union-attr]
+                            )
+                    # From here on the parameter's data is the low precision copy.
+                    param.data = param._mp_param
+                # Record an event after this submodule's copies were enqueued;
+                # this happens for every submodule, including those without
+                # direct parameters.
+                copy_event = torch.Event()
+                copy_event.record()
+                self._submodule_to_event[submodule].append(copy_event)
+
+    def _module_wait_for_copy_hook(
+        self,
+        module,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Before carrying out computation, wait on the appropriate event to ensure low precision copies have finished.
+
+        Pops the oldest event queued for ``module`` in
+        ``self._submodule_to_event``, makes the current accelerator stream
+        wait on it, and then registers the reducer autograd hook on the
+        gradient accumulator of each eligible parameter owned directly by
+        ``module``.
+
+        Args:
+            module: The submodule about to run; used as the key into
+                ``self._submodule_to_event``.
+            *args: Not used by this hook.
+            **kwargs: Not used by this hook.
+        """
+        try:
+            event = self._submodule_to_event[module].popleft()
+        except IndexError:
+            # copy event has already been waited on
+            return
+
+        event.wait(stream=torch.accelerator.current_stream())
+        for p in module.parameters(recurse=False):
+            # Don't register hooks if param does not require grad
+            if not p.requires_grad or (hasattr(p, "_ddp_ignored") and p._ddp_ignored):
+                continue
+            # We need to register autograd hook here instead of DDP's ctor
+            # since we're working with the low precision param. Register them
+            # via obtaining the gradient accumulator.
+            # ``expand_as`` yields a tensor with a grad_fn; the first entry of
+            # its ``next_functions`` is taken as the accumulator for ``p``.
+            tmp = p.expand_as(p)
+            grad_acc = tmp.grad_fn.next_functions[0][0]
+
+            # ``p._idx`` is the parameter index assigned in ``_ddp_init_helper``
+            # when mixed precision is enabled.
+            hook = grad_acc.register_hook(
+                functools.partial(self._fire_reducer_autograd_hook, p._idx)
+            )
+            # NOTE(review): presumably stored so the accumulator and hook
+            # handle stay referenced and can be removed later - confirm
+            # against the code that reads ``_ddp_mp_hook_state``.
+            p._ddp_mp_hook_state = (grad_acc, hook)
+
+    def _log_and_throw(self, err_type, err_msg):
+        if self.logger is not None:
+            self.logger.set_error_and_log(f"{str(err_type)}: {err_msg}")
+        raise err_type(err_msg)
+
+    def _ddp_init_helper(
+        self,
+        parameters,
+        expect_sparse_gradient,
+        param_to_name_mapping,
+        static_graph,
+    ):
+        """
+        DDP init helper function to manage parameters, grad hooks, logging, and SyncBatchNorm.
+
+        Initialization helper function that does the following:
+        (1) bucketing the parameters for reductions
+        (2) resetting the bucketing states
+        (3) registering the grad hooks
+        (4) Logging construction-time DDP logging data
+        (5) passing a handle of DDP to SyncBatchNorm Layer
+
+        Args:
+            parameters: Parameters handed to the bucket assignment and to
+                ``dist.Reducer``. When mixed precision is configured, each one
+                additionally gets its position stored as ``_idx``.
+            expect_sparse_gradient: Forwarded to the bucket assignment and to
+                ``dist.Reducer`` alongside ``parameters``.
+            param_to_name_mapping: Forwarded to ``dist.Reducer``.
+            static_graph: When ``True`` a single unbounded bucket is used for
+                the initial assignment; also forwarded to the construction-time
+                log data.
+
+        Side effects:
+            Sets ``self.reducer`` and ``self.logger``.
+        """
+        # Notice, the parameters order is not in the order in which they are used,
+        # especially in models with control flow.
+        #
+        # Besides the parameters not being presented in the real execution order,
+        # if a certain model happens to also
+        #   1) have other collectives comm ops in its backward graph.
+        #   2) have unused parameter in subset ranks of the whole world.
+        # bucketing could insert ALL-REDUCE comm op too early on the rank with unused parameter,
+        # matching up with other collectives comm ops on other ranks unexpectedly.
+        #
+        # In order to handle this corner case, when the parameters are not in the real execution order,
+        # we don't do bucketing, thus only one ALL-REDUCE is inserted after all the gradients
+        # of the whole graph are computed.
+        #
+        # Notice, here we only disable bucketing for the first iteration.
+        # After the first iteration, it's OK to rebuild buckets,
+        # because "bucket rebuild" bucketizes parameters based on its real execution order in backward graph.
+
+        # Can remove this branching once #73732 is landed.
+        if static_graph is True or self.find_unused_parameters is False:
+            # A single limit of sys.maxsize: no size-based splitting.
+            bucket_size_limits = [sys.maxsize]
+        else:
+            if self.bucket_bytes_cap_default:
+                bucket_size_limits = [
+                    dist._DEFAULT_FIRST_BUCKET_BYTES,
+                    self.bucket_bytes_cap,
+                ]
+            else:
+                bucket_size_limits = [self.bucket_bytes_cap]
+        (
+            bucket_indices,
+            per_bucket_size_limits,
+        ) = dist._compute_bucket_assignment_by_size(
+            parameters,
+            bucket_size_limits,
+            expect_sparse_gradient,
+        )
+
+        # Remember index for parameters if we are in mixed precision, as we
+        # need to pass in index to Reducer's autograd hook via python.
+        if self.mixed_precision is not None:
+            for i, p in enumerate(parameters):
+                p._idx = i
+
+        # Note: reverse list of buckets because we want to approximate the
+        # order in which their gradients are produced, and assume they
+        # are used in the forward pass in the order they are defined.
+        self.reducer = dist.Reducer(
+            parameters,
+            list(reversed(bucket_indices)),
+            list(reversed(per_bucket_size_limits)),
+            self.process_group,
+            expect_sparse_gradient,
+            # The bucket size limit is specified in the constructor.
+            # Additionally, we allow for a single small bucket for parameters
+            # that are defined first, such that their gradients don't spill into
+            # a much larger bucket, adding unnecessary latency after gradient
+            # computation finishes. Experiments showed 1MB is a reasonable value.
+            self.bucket_bytes_cap,
+            self.find_unused_parameters,
+            self.gradient_as_bucket_view,
+            param_to_name_mapping,
+            # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first
+            # bucket.
+            (
+                dist._DEFAULT_FIRST_BUCKET_BYTES
+                if self.bucket_bytes_cap_default
+                else self.bucket_bytes_cap
+            ),
+            self.skip_all_reduce_unused_params,
+            self._use_python_reducer,
+        )
+
+        self.logger = dist.Logger(self.reducer)
+        # Set as a weak reference to avoid reference cycle between
+        # logger and reducer.
+        self.reducer.set_logger(self.logger)
+
+        # Only used for the construction-time log entry below.
+        has_sync_bn = False
+        for submodule in self.module.modules():
+            if isinstance(submodule, torch.nn.SyncBatchNorm):
+                has_sync_bn = True
+                break
+
+        # Set logging data that can be got during construction time.
+        self.logger.set_construction_data_and_log(
+            self.module.__class__.__name__,
+            [] if self.device_ids is None else self.device_ids,
+            -1 if self.output_device is None else self.output_device,
+            self.broadcast_buffers,
+            has_sync_bn,
+            static_graph,
+        )
+
+        # passing a handle to torch.nn.SyncBatchNorm layer
+        self._passing_sync_batchnorm_handle(self.module)
+
+    def __getstate__(self):
+        self._check_default_group()
+        attrs = copy.copy(self.__dict__)
+        del attrs["process_group"]
+        del attrs["reducer"]
+        del attrs["logger"]
+        return attrs
+
+    def __setstate__(self, state):
+        # If serializable, then the process group should be the default one
+        self.process_group = _get_default_group()
+        super().__setstate__(state)
+        self.__dict__.setdefault("require_forward_param_sync", True)
+        self.__dict__.setdefault("require_backward_grad_sync", True)
+        parameters, expect_sparse_gradient = self._build_params_for_reducer()
+        # In debug mode, build a mapping of parameter index -> parameter.
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters)
+        # Builds reducer.
+        self._ddp_init_helper(
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            self.static_graph,
+        )
+        if self.static_graph:
+            self.reducer._set_static_graph()
+            assert self.logger is not None
+            self.logger._set_static_graph()
+
+    def _build_params_for_reducer(self):
+        # Build tuple of (module, parameter) for all parameters that require grads.
+        modules_and_parameters = [
+            (module, parameter)
+            for module_name, module in self.module.named_modules()
+            for parameter in [
+                param
+                # Note that we access module.named_parameters instead of
+                # parameters(module). parameters(module) is only needed in the
+                # single-process multi device case, where it accesses replicated
+                # parameters through _former_parameters.
+                for param_name, param in module.named_parameters(recurse=False)
+                if param.requires_grad
+                and f"{module_name}.{param_name}" not in self.parameters_to_ignore
+            ]
+        ]
+
+        # Deduplicate any parameters that might be shared across child modules.
+        memo = set()
+        modules_and_parameters = [
+            # "p not in memo" is the deduplication check.
+            # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
+            (m, p)
+            for m, p in modules_and_parameters
+            if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
+        ]
+
+        # Build list of parameters.
+        parameters = [parameter for _, parameter in modules_and_parameters]
+
+        # Checks if a module will produce a sparse gradient.
+        def produces_sparse_gradient(module):
+            if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
+                return module.sparse
+            return False
+
+        # Build list of booleans indicating whether or not to expect sparse
+        # gradients for the corresponding parameters.
+        expect_sparse_gradient = [
+            produces_sparse_gradient(module) for module, _ in modules_and_parameters
+        ]
+
+        self._assign_modules_buffers()
+
+        return parameters, expect_sparse_gradient
+
+    def _assign_modules_buffers(self):
+        """
+        Assign self.module.named_buffers to self.modules_buffers.
+
+        Assigns module buffers to self.modules_buffers which are then used to
+        broadcast across ranks when broadcast_buffers=True. Note that this
+        must be called every time buffers need to be synced because buffers can
+        be reassigned by user module,
+        see https://github.com/pytorch/pytorch/issues/63916.
+        """
+        # Collect buffers for modules, filtering out buffers that should be ignored.
+        named_module_buffers = [
+            (buffer, buffer_name)
+            for buffer_name, buffer in self.module.named_buffers()
+            if buffer_name not in self.parameters_to_ignore
+        ]
+        self.modules_buffers = [
+            buffer for (buffer, buffer_name) in named_module_buffers
+        ]
+        # Dict[str, tensor] representing module buffers not ignored by DDP.
+        self.named_module_buffers = {
+            buffer_name: buffer for (buffer, buffer_name) in named_module_buffers
+        }
+
+    def _build_debug_param_to_name_mapping(self, parameters):
+        param_to_param_index = {parameters[i]: i for i in range(len(parameters))}
+        param_set = set(parameters)
+        param_index_to_param_fqn = {}
+        for module_name, module in self.module.named_modules():
+            for param_name, param in module.named_parameters(recurse=False):
+                fqn = f"{module_name}.{param_name}"
+                # Bypass ignored parameters since those are not reduced by DDP
+                # to begin with.
+                if fqn not in self.parameters_to_ignore and param.requires_grad:
+                    if param not in param_set:
+                        self._log_and_throw(
+                            ValueError,
+                            f"Param with name {fqn} found in module parameters, but not DDP parameters."
+                            " This indicates a bug in DDP, please report an issue to PyTorch.",
+                        )
+                    param_index = param_to_param_index[param]
+                    param_index_to_param_fqn[param_index] = fqn
+
+        # Ensure we covered all parameters
+        if len(param_set) != len(param_index_to_param_fqn):
+            self._log_and_throw(
+                ValueError,
+                (
+                    "Expected param to name mapping to cover all parameters, but"
+                    f" got conflicting lengths: {len(param_set)} vs "
+                    f"{len(param_index_to_param_fqn)}. This indicates a bug in DDP"
+                    ", please report an issue to PyTorch."
+                ),
+            )
+
+        return param_index_to_param_fqn
+
+    def _get_parameters(self, m, recurse=True):
+        """Return a generator of module parameters."""
+
+        def model_parameters(m):
+            ps = (
+                m._former_parameters.values()
+                if hasattr(m, "_former_parameters")
+                else m.parameters(recurse=False)
+            )
+            yield from ps
+
+        for mod in m.modules() if recurse else [m]:
+            yield from model_parameters(mod)
+
+    def _check_default_group(self):
+        pickle_not_supported = False
+        try:
+            if self.process_group != _get_default_group():
+                pickle_not_supported = True
+        except RuntimeError:
+            pickle_not_supported = True
+
+        if pickle_not_supported:
+            self._log_and_throw(
+                RuntimeError,
+                "DDP Pickling/Unpickling are only supported "
+                "when using DDP with the default process "
+                "group. That is, when you have called "
+                "init_process_group and have not passed "
+                "process_group argument to DDP constructor",
+            )
+
+    @contextmanager
+    def no_sync(self):
+        r"""
+        Context manager to disable gradient synchronizations across DDP processes.
+
+        Within this context, gradients will be accumulated on module
+        variables, which will later be synchronized in the first
+        forward-backward pass exiting the context.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> ddp = torch.nn.parallel.DistributedDataParallel(model, pg)
+            >>> with ddp.no_sync():
+            >>>     for input in inputs:
+            >>>         ddp(input).backward()  # no synchronization, accumulate grads
+            >>> ddp(another_input).backward()  # synchronize grads
+
+        .. warning::
+            The forward pass should be included inside the context manager, or
+            else gradients will still be synchronized.
+        """
+        old_require_backward_grad_sync = self.require_backward_grad_sync
+        self.require_backward_grad_sync = False
+        try:
+            yield
+        finally:
+            self.require_backward_grad_sync = old_require_backward_grad_sync
+
+    @classmethod
+    def _get_active_ddp_module(cls):
+        """`TorchDynamo` requires DDP's status and module for cooperative optimization."""
+        return cls._active_ddp_module
+
+    # note, this ctxmgr function is marked 'skip' in torchdynamo, so dynamo only kicks in
+    # for the 'module_to_run' underneath
+    # see torch._dynamo/eval_frame.py TorchPatcher.patch for more details
+    @contextmanager
+    @torch._disable_dynamo(recursive=False)
+    def _inside_ddp_forward(self):
+        """Mark this instance as the active DDP module for the duration of the context.
+
+        Sets the class attribute ``DistributedDataParallel._active_ddp_module``
+        to ``self`` on entry and resets it to ``None`` on exit, also when the
+        body raises. The previous value is not restored.
+        """
+        DistributedDataParallel._active_ddp_module = self
+        try:
+            yield
+        finally:
+            DistributedDataParallel._active_ddp_module = None
+
+    def _run_ddp_forward(self, *inputs, **kwargs):
+        if self._use_python_reducer:
+            return self.module(*inputs, **kwargs)  # type: ignore[index]
+        else:
+            with self._inside_ddp_forward():
+                return self.module(*inputs, **kwargs)  # type: ignore[index]
+
+    def _clear_grad_buffer(self):
+        # Making param.grad points to the grad buffers before backward is based on the
+        # assumption that the grad accumulation is done in place in autograd engine,
+        # for some edge cases, if the grad accumulation in autograd engine is not in
+        # place, then the param.grad and grad buffers are detached.
+        if self._delay_grad_buffer is not None:
+            # We batch zero_grad for all params by resetting the whole grad
+            # buffer when the grad of all params is set to None.
+            all_param_grad_none = all(
+                param.grad is None for param in self._delay_all_reduce_params
+            )
+
+            for index, param in enumerate(self._delay_all_reduce_params):
+                if param.grad is None:
+                    param.grad = self._delay_grad_views[index]
+                    if not all_param_grad_none:
+                        param.grad.zero_()
+
+            if all_param_grad_none:
+                self._delay_grad_buffer.zero_()
+
+    def _lazy_init(self):
+        # Initialization for DDP that occurs after construction, but lazily
+        # before the first forward pass.
+        self._setup_in_backward_optimizers()
+        self._lazy_init_ran = True
+
+    def _pre_forward(self, *inputs, **kwargs):
+        """Run DDP's pre-forward bookkeeping and return the inputs for the wrapped module.
+
+        Args:
+            *inputs: Positional arguments intended for the wrapped module.
+            **kwargs: Keyword arguments intended for the wrapped module.
+
+        Returns:
+            Tuple ``(inputs, kwargs)``. They are returned unchanged on the two
+            early-return paths (python reducer, or all-reduce delayed for all
+            params). Otherwise they are moved via ``_to_kwargs`` to the device
+            built from ``self.device_type`` and ``self.device_ids[0]`` when
+            ``self.device_ids`` is truthy, and cast via
+            ``_cast_forward_inputs`` to ``self.mixed_precision.param_dtype``
+            when mixed precision is configured.
+        """
+        # The python reducer path skips all of the bookkeeping below.
+        if self._use_python_reducer:
+            return inputs, kwargs
+
+        # One-time lazy initialization, skipped while compiling.
+        if not self._lazy_init_ran and not torch.compiler.is_compiling():
+            self._lazy_init()
+
+        if self._delay_all_reduce_all_params:
+            return inputs, kwargs
+
+        if torch.is_grad_enabled() and self.require_backward_grad_sync:
+            assert self.logger is not None
+            self.logger.set_runtime_stats_and_log()
+            self.reducer.prepare_for_forward()
+
+        # Notify the join context that this process has not joined, if
+        # needed
+        work = Join.notify_join_context(self)
+        if work:
+            self.reducer._set_forward_pass_work_handle(
+                work,
+                self._divide_by_initial_world_size,  # type: ignore[arg-type]
+            )
+
+        # Calling _rebuild_buckets before forward computation,
+        # It may allocate new buckets before deallocating old buckets
+        # inside _rebuild_buckets. To save peak memory usage,
+        # call _rebuild_buckets before the peak memory usage increases
+        # during forward computation.
+        # This should be called only once during whole training period.
+        if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
+            logger.info("Reducer buckets have been rebuilt in this iteration.")
+            self._has_rebuilt_buckets = True
+
+        # sync params according to location (before/after forward) user
+        # specified as part of hook, if hook was specified.
+        if self._check_sync_bufs_pre_fwd():
+            self._sync_buffers()
+
+        if self._join_config.enable:
+            # Notify joined ranks whether they should sync in backwards pass or not.
+            self._check_global_requires_backward_grad_sync(is_joined_rank=False)
+
+        if self.device_ids:
+            moved_inputs, moved_kwargs = _to_kwargs(
+                inputs,
+                kwargs,
+                torch.device(self.device_type, self.device_ids[0]),
+                self.use_side_stream_for_tensor_copies,
+            )
+            # Only the first entry of each result is used.
+            args, kwargs = moved_inputs[0], moved_kwargs[0]
+            # Cast inputs to reduced precision if needed.
+            if self.mixed_precision is not None:
+                args, kwargs = _cast_forward_inputs(
+                    self.mixed_precision.param_dtype,
+                    *args,
+                    **kwargs,
+                )
+            return args, kwargs
+        else:
+            # Cast inputs to reduced precision if needed.
+            # TODO (rohan-varma) test this codepath.
+            if self.mixed_precision is not None:
+                inputs, kwargs = _cast_forward_inputs(
+                    self.mixed_precision.param_dtype,
+                    *inputs,
+                    **kwargs,
+                )
+            return inputs, kwargs
+
+    def _post_forward(self, output):
+        """Run DDP's post-forward bookkeeping and return the forward output.
+
+        Args:
+            output: Value produced by the wrapped module's forward.
+
+        Returns:
+            ``output`` itself, or, when the ``_DDPSink`` branch runs, a
+            structure rebuilt by ``_tree_unflatten_with_rref`` in which entries
+            have been passed through ``_DDPSink.apply``. Tensors without a
+            ``grad_fn`` keep their original object in that structure.
+        """
+        if self._use_python_reducer:
+            return output
+
+        if self._delay_all_reduce_all_params:
+            self._clear_grad_buffer()
+            return output
+
+        # sync params according to location (before/after forward) user
+        # specified as part of hook, if hook was specified.
+        if self._check_sync_bufs_post_fwd():
+            self._sync_buffers()
+
+        if torch.is_grad_enabled() and self.require_backward_grad_sync:
+            self.require_forward_param_sync = True
+            # We'll return the output object verbatim since it is a freeform
+            # object. We need to find any tensors in this object, though,
+            # because we need to figure out which parameters were used during
+            # this forward pass, to ensure we short circuit reduction for any
+            # unused parameters. Only if `find_unused_parameters` is set.
+            if self.find_unused_parameters and not self.static_graph:
+                # Do not need to populate this for static graph.
+                self.reducer.prepare_for_backward(list(_find_tensors(output)))
+            else:
+                self.reducer.prepare_for_backward([])
+        else:
+            self.require_forward_param_sync = False
+
+        # TODO: DDPSink is currently enabled for unused parameter detection and
+        # static graph training for first iteration.
+        if (self.find_unused_parameters and not self.static_graph) or (
+            self.static_graph and not self._static_graph_delay_allreduce_enqueued
+        ):
+            (
+                output_tensor_list,
+                treespec,
+                output_is_rref,
+            ) = _tree_flatten_with_rref(output)
+            output_placeholders: list[torch.Tensor | None] = [
+                None for _ in range(len(output_tensor_list))
+            ]
+            # Do not touch tensors that have no grad_fn, which can cause issues
+            # such as https://github.com/pytorch/pytorch/issues/60733
+            # Note: the loop variable below reuses the name ``output``; the
+            # name is rebound to the rebuilt structure at the end of this branch.
+            for i, output in enumerate(output_tensor_list):
+                if torch.is_tensor(output) and output.grad_fn is None:
+                    output_placeholders[i] = output
+
+            # When find_unused_parameters=True, makes tensors which require grad
+            # run through the DDPSink backward pass. When not all outputs are
+            # used in loss, this makes those corresponding tensors receive
+            # undefined gradient which the reducer then handles to ensure
+            # param.grad field is not touched and we don't error out.
+            passthrough_tensor_list = _DDPSink.apply(
+                weakref.ref(self),
+                *output_tensor_list,
+            )
+            # Fill every slot not already pinned to its original tensor above.
+            for i in range(len(output_placeholders)):
+                if output_placeholders[i] is None:
+                    output_placeholders[i] = passthrough_tensor_list[i]
+
+            # Reconstruct output data structure.
+            output = _tree_unflatten_with_rref(
+                output_placeholders, treespec, output_is_rref
+            )
+
+        # At the end of the forward pass, reset the grad buffer and grad views
+        self._clear_grad_buffer()
+        return output
+
+    def forward(self, *inputs, **kwargs):
+        with torch.autograd.profiler.record_function("DistributedDataParallel.forward"):
+            inputs, kwargs = self._pre_forward(*inputs, **kwargs)
+            output = (
+                self.module.forward(*inputs, **kwargs)
+                if self._delay_all_reduce_all_params
+                else self._run_ddp_forward(*inputs, **kwargs)
+            )
+            return self._post_forward(output)
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
+
+    def to_kwargs(self, inputs, kwargs, device_id):
+        # Kept for BC
+        return _to_kwargs(
+            inputs,
+            kwargs,
+            torch.device(self.device_type, device_id),
+            self.use_side_stream_for_tensor_copies,
+        )
+
+    def gather(self, outputs, output_device):
+        return gather(outputs, output_device, dim=self.dim)
+
+    def train(self, mode=True):
+        super().train(mode)
+        return self
+
+    # When running in join mode, schedules an allreduce to notify joined ranks
+    # of whether backwards pass synchronization will run this iteration or not.
+    def _check_global_requires_backward_grad_sync(self, is_joined_rank):
+        if not is_joined_rank and self.require_backward_grad_sync:
+            requires_sync_tensor = torch.ones(1, device=self.device)
+        else:
+            requires_sync_tensor = torch.zeros(1, device=self.device)
+
+        work = dist.all_reduce(
+            requires_sync_tensor, group=self.process_group, async_op=True
+        )
+
+        # (kwen2501) This if condition is a plain translation of previous
+        # behavior, i.e. in the `is_joined_rank=False` case, `work.wait()`
+        # is not called and it doesn't care about the result. I am guessing
+        # that it just wants to fire a matching all-reduce and does not want
+        # the main stream to wait.
+        if is_joined_rank:
+            work.wait()
+            should_sync_backwards = requires_sync_tensor.item() != 0
+            return should_sync_backwards
+        else:
+            return None  # Return value is not/should not be used.
+
+    # When running in join mode, checks and performs sync of module buffers if
+    # the models have buffers that should be synchronized in the forward pass.
+    def _check_and_sync_module_buffers(self):
+        if self._check_sync_bufs_pre_fwd():
+            authoritative_rank = self._find_common_rank(self._distributed_rank, False)
+            self._sync_module_buffers(authoritative_rank)
+
+    # When running in join model, agrees upon a common rank and broadcast model
+    # parameters to all other ranks.
+    def _sync_final_model(self, is_last_joiner):
+        # Agree upon the process that will be the authoritative model copy.
+        # The current rank is a candidate for being the authoritative copy if
+        # is_last_joiner=True. We break ties via picking the larger rank.
+        self._authoritative_rank = self._find_common_rank(
+            self._distributed_rank, is_last_joiner
+        )
+        _sync_module_states(
+            module=self.module,
+            process_group=self.process_group,
+            broadcast_bucket_size=self.broadcast_bucket_size,
+            src=self._authoritative_rank,
+            params_and_buffers_to_ignore=self.parameters_to_ignore,
+            broadcast_buffers=self.broadcast_buffers,
+        )
+
+    # Schedule comm ops to match those scheduled in the reducer's backward
+    # pass.
+    def _match_all_reduce_for_bwd_pass(self):
+        comm_work = []
+        # Schedule comm in the same order as Reducer schedules them, i.e.
+        # the order of the buckets. Retrieving the bucket order from the reducer
+        # ensures that we keep the same order in join mode, such as when bucket
+        # order is rebuilt dynamically.
+
+        # Returns grad_buckets in order, but real tensors are substituted with
+        # zero tensors of the same shape.
+        grad_buckets = self.reducer._get_zeros_like_grad_buckets()
+        for grad_bucket in grad_buckets:
+            # Joined processes contribute zero gradient. In the case that
+            # divide_by_initial_world_size=True, we divide grads by the static
+            # world size, if not, the dividing factor is reduced by the number
+            # of joined processes.
+            work = self.reducer._run_comm_hook(grad_bucket)
+            comm_work.append(work)
+        for work in comm_work:
+            work.wait()
+
+    # Allreduces the used parameter mapping across ranks.
+    def _match_unused_params_allreduce(self):
+        locally_used_param_map = self.reducer._get_local_used_map()
+        self.process_group.allreduce(locally_used_param_map)
+
+    def join(
+        self,
+        divide_by_initial_world_size: bool = True,
+        enable: bool = True,
+        throw_on_early_termination: bool = False,
+    ):
+        r"""
+        Context manager for training with uneven inputs across processes in DDP.
+
+        This context manager will keep track of already-joined DDP processes,
+        and "shadow" the forward and backward passes by inserting collective
+        communication operations to match with the ones created by non-joined
+        DDP processes. This will ensure each collective call has a corresponding
+        call by already-joined DDP processes, preventing hangs or errors that
+        would otherwise happen when training with uneven inputs across
+        processes. Alternatively, if the flag ``throw_on_early_termination`` is
+        specified to be ``True``, all trainers will throw an error once one rank
+        runs out of inputs, allowing these errors to be caught and handled
+        according to application logic.
+
+        Once all DDP processes have joined, the context manager will broadcast
+        the model corresponding to the last joined process to all processes to
+        ensure the model is the same across all processes
+        (which is guaranteed by DDP).
+
+        To use this to enable training with uneven inputs across processes,
+        simply wrap this context manager around your training loop. No further
+        modifications to the model or data loading is required.
+
+        .. warning::
+            If the model or training loop this context manager is wrapped around
+            has additional distributed collective operations, such as
+            ``SyncBatchNorm`` in the model's forward pass, then the flag
+            ``throw_on_early_termination`` must be enabled. This is because this
+            context manager is not aware of non-DDP collective communication.
+            This flag will cause all ranks to throw when any one rank
+            exhausts inputs, allowing these errors to be caught and recovered
+            from across all ranks.
+
+        Args:
+            divide_by_initial_world_size (bool): If ``True``, will divide
+                gradients by the initial ``world_size`` DDP training was launched
+                with. If ``False``, will compute the effective world size
+                (number of ranks that have not depleted their inputs yet) and
+                divide gradients by that during allreduce. Set
+                ``divide_by_initial_world_size=True`` to ensure every input
+                sample including the uneven inputs have equal weight in terms of
+                how much they contribute to the global gradient. This is
+                achieved by always dividing the gradient by the initial
+                ``world_size`` even when we encounter uneven inputs. If you set
+                this to ``False``, we divide the gradient by the remaining
+                number of nodes. This ensures parity with training on a smaller
+                ``world_size`` although it also means the uneven inputs would
+                contribute more towards the global gradient. Typically, you
+                would want to set this to ``True`` for cases where the last few
+                inputs of your training job are uneven. In extreme cases, where
+                there is a large discrepancy in the number of inputs, setting
+                this to ``False`` might provide better results.
+            enable (bool): Whether to enable uneven input detection or not. Pass
+                in ``enable=False`` to disable in cases where you know that
+                inputs are even across participating processes. Default is
+                ``True``.
+            throw_on_early_termination (bool): Whether to throw an error
+                or continue training when at least one rank has exhausted
+                inputs. If ``True``, will throw upon the first rank reaching end
+                of data. If ``False``, will continue training with a smaller
+                effective world size until all ranks are joined. Note that if
+                this flag is specified, then the flag
+                ``divide_by_initial_world_size`` would be ignored. Default
+                is ``False``.
+
+
+        Example::
+
+            >>> # xdoctest: +SKIP("Distributed")
+            >>> import torch
+            >>> import torch.distributed as dist
+            >>> import os
+            >>> import torch.multiprocessing as mp
+            >>> import torch.nn as nn
+            >>> # On each spawned worker
+            >>> def worker(rank):
+            >>>     dist.init_process_group("nccl", rank=rank, world_size=2)
+            >>>     torch.cuda.set_device(rank)
+            >>>     model = nn.Linear(1, 1, bias=False).to(rank)
+            >>>     model = torch.nn.parallel.DistributedDataParallel(
+            >>>         model, device_ids=[rank], output_device=rank
+            >>>     )
+            >>>     # Rank 1 gets one more input than rank 0.
+            >>>     inputs = [torch.tensor([1]).float() for _ in range(10 + rank)]
+            >>>     with model.join():
+            >>>         for _ in range(5):
+            >>>             for inp in inputs:
+            >>>                 loss = model(inp).sum()
+            >>>                 loss.backward()
+            >>>     # Without the join() API, the below synchronization will hang
+            >>>     # blocking for rank 1's allreduce to complete.
+            >>>     torch.cuda.synchronize(device=rank)
+        """
+        return Join(
+            [self],
+            enable,
+            throw_on_early_termination,
+            divide_by_initial_world_size=divide_by_initial_world_size,
+        )
+
+    def join_hook(
+        self,
+        **kwargs,
+    ):
+        r"""
+        DDP join hook enables training on uneven inputs by mirroring communications in forward and backward passes.
+
+        Arguments:
+            kwargs (dict): a :class:`dict` containing any keyword arguments
+                to modify the behavior of the join hook at run time; all
+                :class:`Joinable` instances sharing the same join context
+                manager are forwarded the same value for ``kwargs``.
+
+        The hook supports the following keyword arguments:
+            divide_by_initial_world_size (bool, optional):
+                If ``True``, then gradients are divided by the initial world
+                size that DDP was launched with.
+                If ``False``, then gradients are divided by the effective world
+                size (i.e. the number of non-joined processes), meaning that
+                the uneven inputs contribute more toward the global gradient.
+                Typically, this should be set to ``True`` if the degree of
+                unevenness is small but can be set to ``False`` in extreme
+                cases for possibly better results.
+                Default is ``True``.
+        """
+        divide_by_initial_world_size = kwargs.get("divide_by_initial_world_size", True)
+        return _DDPJoinHook(
+            self, divide_by_initial_world_size=divide_by_initial_world_size
+        )
+
+    @property
+    def join_device(self):
+        return self.device
+
+    @property
+    def join_process_group(self):
+        return self.process_group
+
+    def _register_buffer_comm_hook(
+        self,
+        state,
+        hook: Callable,
+        comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
+    ):
+        r"""
+        Allow custom registration of hooks that define how buffers are synchronized across ranks.
+
+        The hook takes in an optional state and is passed in a Dict[str, Tensor]
+        corresponding to buffer names and the buffers, and can run arbitrary reductions
+        on buffers as opposed to DDP's default broadcast from rank 0. This is useful for
+        example if a counter needs to be summed or averaged across ranks every iteration.
+
+        Args:
+            state (Any): Optional state that is passed to the hook.
+            hook (Callable): Callable with the following signature:
+                         ``hook(state: object, buffers: Dict[str, torch.Tensor]) -> Optional[List[torch.futures.Future[torch.Tensor]]]``
+            comm_hook_location (_BufferCommHookLocation): Enum value indicating
+                            where to run the hook.
+                            _BufferCommHookLocation.PRE_FORWARD means that the
+                            hook will run _before_ the forward pass, and
+                            _BufferCommHookLocation.POST_FORWARD means that the
+                            hook will run _after_ the forward pass.
+
+            NOTE: To maximize performance, users can return a
+                List[torch.futures.Future] from their hook, and DDP will
+                install and await these hooks appropriately at the end of
+                the backward pass. This will ensure all buffers are
+                synchronized by the end of the backward pass. If this
+                setting is used, it is recommended to pass
+                comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
+                which will trigger the hook after the forward pass.
+                If _BufferCommHookLocation.PRE_FORWARD is used, users must
+                ensure appropriate synchronization when manipulating GPU
+                buffers in the forward pass.
+        """
+        assert callable(hook)
+        self.buffer_hook = _BufferCommHook(
+            buffer_comm_hook=hook,
+            buffer_comm_hook_state=state,
+            buffer_comm_hook_location=comm_hook_location,
+        )
+
+    def register_comm_hook(self, state: object, hook: Callable):
+        r"""
+        Register communication hook for user-defined DDP aggregation of gradients across multiple workers.
+
+        This hook would be very useful for researchers to try out new ideas. For
+        example, this hook can be used to implement several algorithms like GossipGrad
+        and gradient compression which involve different communication strategies for
+        parameter syncs while running Distributed DataParallel training.
+
+        Args:
+            state (object): Passed to the hook to maintain any state information during the training process.
+                            Examples include error feedback in gradient compression,
+                            peers to communicate with next in GossipGrad, etc.
+
+                            It is locally stored by each worker
+                            and shared by all the gradient tensors on the worker.
+            hook (Callable): Callable with the following signature:
+                             ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
+
+                             This function is called once the bucket is ready. The
+                             hook can perform whatever processing is needed and return
+                             a Future indicating completion of any async work (ex: allreduce).
+                             If the hook doesn't perform any communication, it still
+                             must return a completed Future. The Future should hold the
+                             new value of grad bucket's tensors. Once a bucket is ready,
+                             c10d reducer would call this hook and use the tensors returned
+                             by the Future and copy grads to individual parameters.
+                             Note that the future's return type must be a single tensor.
+
+                             We also provide an API called ``get_future`` to retrieve a
+                             Future associated with the completion of ``c10d.ProcessGroup.Work``.
+                             ``get_future`` is currently supported for NCCL and also supported for most
+                             operations on GLOO and MPI, except for peer to peer operations (send/recv).
+
+        .. warning ::
+            Grad bucket's tensors will not be predivided by world_size. The user is
+            responsible for dividing by the world_size in case of operations like allreduce.
+
+        .. warning ::
+            DDP communication hook can only be registered once and should be registered
+            before calling backward.
+
+        .. warning ::
+            The Future object that the hook returns should contain a single tensor
+            that has the same shape as the tensors inside the grad bucket.
+
+        .. warning ::
+            ``get_future`` API supports NCCL, and partially GLOO and MPI backends (no support
+            for peer-to-peer operations like send/recv) and will return a ``torch.futures.Future``.
+
+        Example::
+            Below is an example of a noop hook that returns the same tensor.
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+            >>>     fut = torch.futures.Future()
+            >>>     fut.set_result(bucket.buffer())
+            >>>     return fut
+            >>> ddp.register_comm_hook(state=None, hook=noop)
+
+        Example::
+            Below is an example of a Parallel SGD algorithm where gradients are encoded before
+            allreduce, and then decoded after allreduce.
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+            >>>     encoded_tensor = encode(bucket.buffer())  # encode gradients
+            >>>     fut = torch.distributed.all_reduce(encoded_tensor, async_op=True).get_future()
+            >>>     # Define the then callback to decode.
+            >>>     def decode_callback(fut):
+            >>>         decoded_tensor = decode(fut.value()[0])  # decode gradients
+            >>>         return decoded_tensor
+            >>>     return fut.then(decode_callback)
+            >>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
+        """
+        self._check_comm_hook(hook)
+        assert self.logger is not None
+        self.logger._set_comm_hook_name(hook.__qualname__)
+        self._comm_hooks.append((hook, state))
+        dist._register_comm_hook(self.reducer, state, hook)
+
+    def _register_builtin_comm_hook(self, comm_hook_type):
+        r"""
+        Register a built-in communication hook that specifies how DDP aggregates gradients across multiple workers.
+
+        The built-in hooks aim to provide efficient C++ implementations for certain hooks,
+        which might not be as efficient if implemented in Python using a Python communication hook.
+
+        Args:
+            comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as ALLREDUCE, FP16_COMPRESS, etc.
+
+        .. warning ::
+            DDP communication hook can only be registered once and should be registered
+            before calling backward.
+
+        Example::
+            Below is an example of FP16 compression where gradients are
+            compressed into 16-bit floating-point numbers before allreduce, and
+            then decompressed after allreduce.
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
+
+        """
+        assert self.logger is not None
+        self.logger._set_comm_hook_name(str(comm_hook_type))
+        dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
+
+    def _register_fused_optim(self, optim: type, *args, optim_params=None, **kwargs):
+        r"""
+        Register an optimizer in DDP to optimize parameter immediately after its gradient reduction.
+
+        Registers an optimizer with DDP such that the optimization for a
+        parameter will run immediately when that parameter's gradient is
+        finished with reduction, instead of waiting for all parameters'
+        gradients to finish reduction. This can result in a training speedup
+        depending on your workload since the optimizer can run while gradient
+        reduction for other parameters is still ongoing. In addition, this has
+        the potential to reduce peak memory consumption during training, as it
+        only needs to load the per-parameter optimizer states of a single
+        parameter at a time, instead of loading all per-parameter optimizer
+        states at once.
+
+        Args:
+            optim (Type): a ``torch.optim.Optimizer`` class to be registered
+            as a fused optimizer.
+            *args (Sequence[Any]): Arguments to forward to `optim`.
+            optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
+            to optimize, similar to `params` argument of traditional `torch.optim`
+            Optimizers. If this is omitted, all DDP model parameters will be
+            optimized.
+            **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim`.
+
+        .. warning ::
+            _register_fused_optim should only be called once on a DDP instance,
+            and registering multiple fused optimizers for the same DDP model
+            is not currently supported. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        .. warning ::
+            _register_fused_optim and register_comm_hook currently do not
+            compose together, meaning that custom DDP communication hooks are
+            not supported with overlapped optimizers. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        .. warning ::
+            Gradient accumulation and DDP `no_sync` are currently not supported
+            with overlapped optimizer. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("No rendezvous handler")
+            >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
+            >>> net = torch.nn.parallel.DistributedDataParallel(model, pg)
+            >>> lr = 1e-2
+            >>> betas = (0.9, 0.99)
+            >>> eps = 1e-6
+            >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps)
+            >>> # Example with subset of parameters
+            >>> params_to_opt = [list(net.parameters())[0]]
+            >>> net._register_fused_optim(
+            ...   torch.optim.Adam, lr, optim_params=params_to_opt,  betas=betas, eps=eps
+            ... )
+        """
+        # Note: importing in function, otherwise this will cause a circular
+        # import as optimizer_overlap module needs to import DistributedDataParallel.
+        from torch.distributed.algorithms._optimizer_overlap import _as_overlapped_optim
+
+        overlapped_optim = _as_overlapped_optim(optim, optim_params, *args, **kwargs)
+        try:
+            overlapped_optim.register_ddp(self)
+        except NotImplementedError as e:
+            raise RuntimeError(
+                f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}."
+            ) from e
+
+    def _distributed_broadcast_coalesced(
+        self, tensors, buffer_size, authoritative_rank=0
+    ):
+        dist._broadcast_coalesced(
+            self.process_group, tensors, buffer_size, authoritative_rank
+        )
+
+    def _check_sync_bufs_post_fwd(self):
+        return (
+            self.will_sync_module_buffers()
+            and hasattr(self, "buffer_hook")
+            and self.buffer_hook.buffer_comm_hook_location
+            == _BufferCommHookLocation.POST_FORWARD
+        )
+
+    def _check_sync_bufs_pre_fwd(self):
+        return self.will_sync_module_buffers() and (
+            not hasattr(self, "buffer_hook")
+            or self.buffer_hook.buffer_comm_hook_location
+            == _BufferCommHookLocation.PRE_FORWARD
+        )
+
+    def will_sync_module_buffers(self):
+        return (
+            self.require_forward_param_sync
+            and self.broadcast_buffers
+            and len(self.modules_buffers) > 0
+        )
+
+    def _find_common_rank(self, input_rank, rank_cond):
+        # -1 indicates that this rank is not under consideration to be the
+        # common_rank
+        rank_to_use = torch.tensor(
+            [input_rank if rank_cond else -1],
+            device=self.device,
+        )
+        dist.all_reduce(rank_to_use, op=ReduceOp.MAX, group=self.process_group)
+        if rank_to_use.item() == -1:
+            self._log_and_throw(
+                ValueError,
+                "BUG! Expected rank_cond to be true for at least one process."
+                " This indicates a bug in PyTorch, please report an issue.",
+            )
+        return rank_to_use.item()
+
+    def _sync_buffers(self):
+        with torch.no_grad():
+            # module buffer sync
+            # Synchronize buffers across processes.
+            # If we are running DDP with the join manager, we have to agree
+            # upon a rank to sync module buffers from, since rank 0 may
+            # already have been joined and have stale module buffers.
+            if self._join_config.enable:
+                authoritative_rank = self._find_common_rank(
+                    self._distributed_rank, True
+                )
+            else:
+                # The process with rank 0 is considered the authoritative copy.
+                authoritative_rank = 0
+            # Update self.modules_buffers in case any buffers were
+            # reassigned.
+            self._assign_modules_buffers()
+            self._sync_module_buffers(authoritative_rank)
+
+    def _sync_module_buffers(self, authoritative_rank):
+        if not hasattr(self, "buffer_hook"):
+            self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+        else:
+            hook = self.buffer_hook.buffer_comm_hook
+            state = self.buffer_hook.buffer_comm_hook_state
+            futs = hook(state, self.named_module_buffers)
+            if futs is not None:
+                self.reducer._install_post_backward_futures(futs)
+
+    def _default_broadcast_coalesced(
+        self, bufs=None, bucket_size=None, authoritative_rank=0
+    ):
+        """
+        Broadcast buffers from ``authoritative_rank`` (rank 0 by default) to the rest of the workers.
+
+        If bufs, bucket_size are None, default values self.modules_buffers
+        and self.broadcast_bucket_size are used instead.
+        """
+        if bufs is None:
+            bufs = self.modules_buffers
+        if bucket_size is None:
+            bucket_size = self.broadcast_bucket_size
+
+        self._distributed_broadcast_coalesced(bufs, bucket_size, authoritative_rank)
+
+    def _passing_sync_batchnorm_handle(self, module):
+        for layer in module.modules():
+            if isinstance(layer, torch.nn.modules.SyncBatchNorm):
+                if self.device_type == "cpu":
+                    self._log_and_throw(
+                        ValueError,
+                        "SyncBatchNorm layers only work with GPU modules",
+                    )
+
+    def _check_comm_hook(self, hook):
+        if not callable(hook):
+            self._log_and_throw(TypeError, "Communication hook must be callable.")
+
+        sig = inspect.signature(hook)
+        if (
+            sig.parameters["bucket"].annotation != inspect._empty
+            and sig.parameters["bucket"].annotation != dist.GradBucket
+        ):
+            self._log_and_throw(
+                ValueError,
+                "Communication hook: bucket annotation should be dist.GradBucket.",
+            )
+
+        if (
+            sig.return_annotation != inspect._empty
+            and sig.return_annotation != torch.futures.Future[torch.Tensor]
+        ):
+            self._log_and_throw(
+                ValueError,
+                "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].",
+            )
+
+        if hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"]:
+            cuda_supported = (
+                torch.version.cuda is not None
+            ) or torch.version.hip is not None
+            nccl_supported = (
+                dist.is_available()
+                and dist.is_nccl_available()
+                and torch.cuda.nccl.version() >= (2, 10)
+            )
+            xpu_xccl_supported = (
+                dist.is_available()
+                and dist.is_xccl_available()
+                and torch.xpu.is_available()
+            )
+
+            if not ((cuda_supported and nccl_supported) or xpu_xccl_supported):
+                self._log_and_throw(
+                    TypeError,
+                    "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+ or XPU and XCCL",
+                )
+
+    @property
+    def _distributed_rank(self):
+        return dist.get_rank(self.process_group)
+
+    @staticmethod
+    def _get_data_parallel_params(module, named_params=False):
+        """Return a generator of parameters managed by a given DDP unit."""
+        for param in (
+            module.parameters() if not named_params else module.named_parameters()
+        ):
+            if not hasattr(param, "_ddp_ignored"):
+                yield param
+
+    @staticmethod
+    def _set_params_and_buffers_to_ignore_for_model(
+        module, params_and_buffers_to_ignore
+    ):
+        """
+        Set parameters and buffers to be ignored by DDP.
+
+        Expected format for parameters is the fully qualified name: {module_name}.{param_name}, and
+        similarly, {module_name}.{buffer_name} for buffers. For example:
+        params_to_ignore = []
+        # NB: model here is vanilla PyTorch module, not yet wrapped with DDP.
+        for module_name, module in model.named_modules():
+            for param_name, param in module.named_parameters(recurse=False):
+                if should_ignore(param):
+                    # Create expected format
+                    fqn = f"{module_name}.{param_name}"
+                    params_to_ignore.append(fqn)
+        torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
+            model,
+            params_to_ignore
+        )
+        """
+        # This is a workaround to set parameters and buffers DDP should ignore
+        # during synchronization. It will be removed when the API is finalized
+        # as part of addressing https://github.com/pytorch/pytorch/issues/43690.
+        module._ddp_params_and_buffers_to_ignore = params_and_buffers_to_ignore
+        for name, param in module.named_parameters():
+            if name in params_and_buffers_to_ignore:
+                param._ddp_ignored = True
+        for name, buffer in module.named_buffers():
+            if name in params_and_buffers_to_ignore:
+                buffer._ddp_ignored = True
+
+    def _get_ddp_logging_data(self):
+        r"""
+        Return a dictionary of logging data for debugging and analysis.
+
+        This interface can be called after DistributedDataParallel() is
+        constructed. It returns a dictionary of logging data. It could help
+        for debugging and analysis. The logging data includes DistributedDataParallel
+        constructor input parameters, some internal states of DistributedDataParallel
+        and performance metrics. Simply print the dictionary and see what
+        these metrics are.
+        This is a prototype interface and subject to change in the future.
+        """
+        assert self.logger is not None
+        ddp_logging_data = self.logger._get_ddp_logging_data()
+        return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map}
+
+    def _set_ddp_runtime_logging_sample_rate(self, sample_rate):
+        r"""
+        Set sample_rate of collecting runtime stats.
+
+        This interface allows users to set sample_rate of collecting
+        runtime stats. The runtime stats will be recorded for the
+        first 10 iterations, after 10 iterations runtime stats will be
+        recorded once every "sample_rate" training iterations. By
+        default, runtime stats are recorded for the first 10 iterations,
+        after 10 iterations runtime stats are recorded once every
+        "kDDPRuntimeLoggingSampleRate=100" training iterations.
+        This is a prototype interface and subject to change in the future.
+        """
+        if sample_rate < 1:
+            self._log_and_throw(
+                ValueError,
+                "DDP runtime logging sample rate should be equal or greater than 1",
+            )
+        self.reducer._set_ddp_runtime_logging_sample_rate(sample_rate)
+
+    def _set_static_graph(self):
+        """
+        Set static graph for DDP.
+
+        It is recommended to set static graph in the DDP constructor, which will
+        call this private API internally.
+        """
+        # If self.static_graph has been set, no need to set it again
+        if self.static_graph:
+            warnings.warn(
+                "You've set static_graph to be True, no need to set it again.",
+                stacklevel=2,
+            )
+            return
+        self.static_graph = True
+        self._static_graph_delay_allreduce_enqueued = False
+        self.reducer._set_static_graph()
+        assert self.logger is not None
+        self.logger._set_static_graph()
+        if self.find_unused_parameters:
+            warnings.warn(
+                "You passed find_unused_parameters=true to DistributedDataParallel, "
+                "`_set_static_graph` will detect unused parameters automatically, so "
+                "you do not need to set find_unused_parameters=true, just be sure these "
+                "unused parameters will not change during training loop while calling "
+                "`_set_static_graph`.",
+                stacklevel=2,
+            )
+
+    def _remove_autograd_hooks(self):
+        """Remove autograd hooks registered by the reducer on the model parameters."""
+        self.reducer._remove_autograd_hooks()
+
+    def _check_reducer_finalized(self):
+        """
+        Check if the reducer has processed all buckets and finalized the backward appropriately.
+
+        It is useful to call this method after calling .backward() in your training loop
+        in order to avoid subsequent hard to debug errors down the road due to the
+        reducer not finalizing backward.
+        """
+        self.reducer._check_reducer_finalized()
+
+    def _set_sparse_metadata(self, global_unique_ids):
+        self.reducer._set_sparse_metadata(global_unique_ids)
+
+    def _update_process_group(self, new_process_group):
+        """
+        Dynamically updates the process group for DDP so that we can shrink/expand DDP
+        world size without having to reinitialize DDP.
+
+        NOTE: If you are using custom communication hooks via ``register_comm_hook``,
+        you need to update the process groups for those hooks separately.
+        """
+        # Force a rebuild of buckets for a new process group. This ensures all ranks
+        # are synchronized in terms of when they will rebuild buckets and also
+        # re-evaluates previous assumptions of buckets given the world size might have
+        # changed.
+        self._has_rebuilt_buckets = False
+        self.reducer._reset_state()
+
+        if not _rank_not_in_group(new_process_group):
+            self.process_group = new_process_group
+            self.reducer._update_process_group(new_process_group)
+
+    def _set_ddp_sink_clone(self, val: bool):
+        """
+        Sets whether DDPSink should clone the output tensors.
+        The default is True because, if the loss is modified in place, we
+        would otherwise run into the "view is modified in-place" error.
+
+        However, cloning the tensors can add significant memory and
+        performance overhead if the number and size of tensors are large. As
+        a result, this can be set to False if you are not modifying the
+        loss in place.
+        """
+        self._ddp_sink_clone = val
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c26aaf5048e908ab72978b9d8562d4997c17928
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py
@@ -0,0 +1,135 @@
+import threading
+from collections.abc import Sequence
+from typing import Any, cast
+
+import torch
+from torch._utils import ExceptionWrapper
+from torch.cuda._utils import _get_device_index
+from torch.nn.modules import Module
+
+
+__all__ = ["get_a_var", "parallel_apply"]
+
+
+def get_a_var(
+    obj: torch.Tensor | list[Any] | tuple[Any, ...] | dict[Any, Any],
+) -> torch.Tensor | None:
+    if isinstance(obj, torch.Tensor):
+        return obj
+
+    if isinstance(obj, (list, tuple)):
+        for result in map(get_a_var, obj):
+            if isinstance(result, torch.Tensor):
+                return result
+    if isinstance(obj, dict):
+        for result in map(get_a_var, obj.items()):
+            if isinstance(result, torch.Tensor):
+                return result
+    return None
+
+
+def parallel_apply(
+    modules: Sequence[Module],
+    inputs: Sequence[Any],
+    kwargs_tup: Sequence[dict[str, Any]] | None = None,
+    devices: Sequence[int | torch.device | None] | None = None,
+) -> list[Any]:
+    r"""Apply each `module` in :attr:`modules` in parallel on each of :attr:`devices`.
+
+    Args:
+        modules (Module): modules to be parallelized
+        inputs (tensor): inputs to the modules
+        devices (list of int or torch.device): CUDA devices
+
+    :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
+    :attr:`devices` (if given) should all have same length. Moreover, each
+    element of :attr:`inputs` can either be a single object as the only argument
+    to a module, or a collection of positional arguments.
+    """
+    assert len(modules) == len(inputs), (
+        f"The number of modules {len(modules)} is not equal to the number of inputs {len(inputs)}"
+    )
+    if kwargs_tup is not None:
+        assert len(modules) == len(kwargs_tup)
+    else:
+        kwargs_tup = (cast(dict[str, Any], {}),) * len(modules)
+    if devices is not None:
+        assert len(modules) == len(devices)
+    else:
+        devices = [None] * len(modules)
+    devices = [_get_device_index(x, True) for x in devices]
+    streams = [torch.accelerator.current_stream(x) for x in devices]
+    assert torch.accelerator.is_available(), "No available accelerator found."
+    device_type = torch.accelerator.current_accelerator().type  # type: ignore[union-attr]
+    lock = threading.Lock()
+    results = {}
+    grad_enabled, autocast_enabled = (
+        torch.is_grad_enabled(),
+        torch.is_autocast_enabled(),
+    )
+
+    def _worker(
+        i: int,
+        module: Module,
+        input: Any,
+        kwargs: dict[str, Any],
+        device: int | torch.device | None = None,
+        stream: torch.Stream | None = None,
+    ) -> None:
+        torch.set_grad_enabled(grad_enabled)
+        if device is None:
+            t = get_a_var(input)
+            if t is None:
+                with lock:
+                    results[i] = ExceptionWrapper(
+                        where=f"in replica {i}, no device was provided and no tensor input was found; "
+                        "device cannot be resolved"
+                    )
+                return
+            device = t.get_device()
+        if isinstance(device, torch.device):
+            device = device.index
+        if stream is None:
+            stream = torch.accelerator.current_stream(device)
+        try:
+            with (
+                torch.accelerator.device_index(device),
+                stream,
+                torch.amp.autocast(device_type, enabled=autocast_enabled),
+            ):
+                # this also avoids accidental slicing of `input` if it is a Tensor
+                if not isinstance(input, (list, tuple)):
+                    input = (input,)
+                output = module(*input, **kwargs)
+            with lock:
+                results[i] = output
+        except Exception:
+            with lock:
+                results[i] = ExceptionWrapper(
+                    where=f"in replica {i} on device {device}"
+                )
+
+    if len(modules) > 1:
+        threads = [
+            threading.Thread(
+                target=_worker, args=(i, module, input, kwargs, device, stream)
+            )
+            for i, (module, input, kwargs, device, stream) in enumerate(
+                zip(modules, inputs, kwargs_tup, devices, streams, strict=True)
+            )
+        ]
+
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+    else:
+        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0], streams[0])
+
+    outputs = []
+    for i in range(len(inputs)):
+        output = results[i]
+        if isinstance(output, ExceptionWrapper):
+            output.reraise()
+        outputs.append(output)
+    return outputs
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e7844ab4aba222055f726492df33d2a61aba880
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/replicate.py
@@ -0,0 +1,203 @@
+from collections import OrderedDict
+from collections.abc import Iterator, Sequence
+from typing import cast, TYPE_CHECKING, TypeVar
+from typing_extensions import TypeIs
+
+import torch
+from torch._utils import _get_device_index
+from torch.nn.modules import Module
+from torch.nn.parallel import comm
+
+
+if TYPE_CHECKING:
+    from torch._C import ScriptMethod
+    from torch.jit import ScriptModule
+    from torch.jit._state import EnabledProxy
+
+
+__all__ = ["replicate"]
+
+
+def _is_script_module(module: Module) -> TypeIs["ScriptModule"]:
+    """Return whether ``module`` is an instance of ``torch.jit.ScriptModule``."""
+    from torch.jit import ScriptModule as script_module_type
+    return isinstance(module, script_module_type)
+
+
+def _is_script_method(module: object) -> TypeIs["ScriptMethod"]:
+    import torch.jit  # imported lazily, at call time
+    script_method_type = torch._C.ScriptMethod
+    return isinstance(module, script_method_type)
+
+
+def _init_script_module() -> "ScriptModule":
+    """Create and return a new ``torch.jit.ScriptModule`` instance."""
+    from torch.jit import ScriptModule as script_module_type
+    return script_module_type()
+
+
+def _is_jit_enabled() -> "EnabledProxy":
+    """Return the ``_enabled`` object defined in ``torch.jit._state``."""
+    from torch.jit import _state as jit_state
+    return jit_state._enabled
+
+
+# Check if we can safely replicate the module.
+# there are two types of module:
+# 1. python modules
+# 2. ScriptModule
+#
+# currently a module cannot be replicated properly if the descendants of
+# any ScriptModule contains python module (type 1 above)
+def _replicatable_module(module: Module, memo: set[Module] | None = None) -> bool:
+    """Return False if any ScriptModule in the tree has a non-script descendant.
+
+    Always True when the JIT is disabled; ``memo`` holds modules already visited.
+    """
+
+    def strict_descendants(root: Module) -> Iterator[Module]:
+        # ``modules()`` yields ``root`` itself first; drop it, keep the rest.
+        walker = root.modules()
+        next(walker)
+        return walker
+
+    if not _is_jit_enabled():
+        return True
+    visited = set() if memo is None else memo
+    # Mark this module as seen before looking at anything below it.
+    visited.add(module)
+
+    if _is_script_module(module):
+        visited.update(strict_descendants(module))
+        return all(map(_is_script_module, strict_descendants(module)))
+
+    # A child already in ``visited`` needs no second look: any unreplicatable
+    # module makes the whole check return False as soon as it is reached.
+    return all(
+        _replicatable_module(child, visited)
+        for child in module.children()
+        if child not in visited
+    )
+
+
+def _broadcast_coalesced_reshape(
+    tensors: Sequence[torch.Tensor],
+    devices: Sequence[int | torch.device],
+    detach: bool = False,
+) -> list[list[torch.Tensor]]:
+    """Broadcast ``tensors`` to ``devices``; copies come in runs of ``len(tensors)``."""
+    from torch.nn.parallel._functions import Broadcast
+
+    if detach:
+        return comm.broadcast_coalesced(tensors, devices)
+    # Not detaching: go through the autograd ``Broadcast`` function instead.
+    group_size = len(tensors)
+    if group_size == 0:
+        return []
+    flat_copies = Broadcast.apply(devices, *tensors)
+    return [
+        flat_copies[start : start + group_size]
+        for start in range(0, len(flat_copies), group_size)
+    ]
+
+
+T = TypeVar("T", bound=Module)
+
+
+def replicate(
+    network: T,
+    devices: Sequence[int | torch.device],
+    detach: bool = False,
+) -> list[T]:  # one replica of ``network`` per entry of ``devices``
+    if not _replicatable_module(network):
+        raise RuntimeError(
+            "Cannot replicate network where python modules are children of ScriptModule"
+        )
+
+    if not devices:
+        return []
+    # Normalize every entry of ``devices`` through ``_get_device_index``.
+    devices = [_get_device_index(x, True) for x in devices]
+    num_replicas = len(devices)
+    # Broadcast all parameters; ``param_indices`` maps each to its list position.
+    params = list(network.parameters())
+    param_indices = {param: idx for idx, param in enumerate(params)}
+    param_copies = _broadcast_coalesced_reshape(params, devices, detach)
+    # Split buffers: those requiring grad (when not detaching) vs. all the others.
+    buffers = list(network.buffers())
+    buffers_rg: list[torch.Tensor] = []
+    buffers_not_rg: list[torch.Tensor] = []
+    for buf in buffers:
+        if buf.requires_grad and not detach:
+            buffers_rg.append(buf)
+        else:
+            buffers_not_rg.append(buf)
+
+    buffer_indices_rg = {buf: idx for idx, buf in enumerate(buffers_rg)}
+    buffer_indices_not_rg = {buf: idx for idx, buf in enumerate(buffers_not_rg)}
+
+    buffer_copies_rg = _broadcast_coalesced_reshape(buffers_rg, devices, detach=detach)
+    buffer_copies_not_rg = _broadcast_coalesced_reshape(
+        buffers_not_rg, devices, detach=True
+    )
+    # ``module_copies[j][i]`` is replica ``j``'s copy of ``modules[i]``.
+    modules = list(network.modules())
+    module_copies: list[list[Module]] = [[] for _ in devices]
+    module_indices: dict[Module, int] = {}
+    # Pass 1: create a replica object of every module for every device.
+    for i, module in enumerate(modules):
+        module_indices[module] = i
+        for j in range(num_replicas):
+            replica = module._replicate_for_data_parallel()
+            # This is a temporary fix for DDP. DDP needs to access the
+            # replicated model parameters. It used to do so through
+            # `model.parameters()`. The fix added in #33907 for DP stops the
+            # `parameters()` API from exposing the replicated parameters.
+            # Hence, we add a `_former_parameters` dict here to support DDP.
+            replica._former_parameters = OrderedDict()
+
+            module_copies[j].append(replica)
+    # Pass 2: wire child modules, parameters and buffers into every replica.
+    for i, module in enumerate(modules):
+        for key, child in module._modules.items():
+            if child is None:
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._modules[key] = None
+            else:
+                module_idx = module_indices[child]
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    setattr(replica, key, module_copies[j][module_idx])
+        for key, param in module._parameters.items():
+            if param is None:
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._parameters[key] = None
+            else:
+                param_idx = param_indices[param]
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    param_copy = param_copies[j][param_idx]
+                    # parameters in replicas are no longer leaves,
+                    # so setattr them as non-parameter attributes
+                    setattr(replica, key, param_copy)
+                    # expose the parameter for DDP
+                    replica._former_parameters[key] = param_copy  # type: ignore[operator, index]
+        for key, buf in module._buffers.items():  # type: ignore[assignment]
+            if buf is None:
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._buffers[key] = None
+            else:
+                if buf.requires_grad and not detach:
+                    buffer_copies = buffer_copies_rg
+                    buffer_idx = buffer_indices_rg[buf]
+                else:
+                    buffer_copies = buffer_copies_not_rg
+                    buffer_idx = buffer_indices_not_rg[buf]
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    setattr(replica, key, buffer_copies[j][buffer_idx])
+    # ``modules()`` yields the network itself first, so index 0 is the root copy.
+    return [cast(T, module_copies[j][0]) for j in range(num_replicas)]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..27aeaf19944dcadab63b25d0c9789c31dff322da
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/parallel/scatter_gather.py
@@ -0,0 +1,154 @@
+# mypy: allow-untyped-defs
+from collections.abc import Sequence
+from typing import Any, overload, TypeVar
+from typing_extensions import deprecated
+
+import torch
+from torch.nn.parallel._functions import Gather, Scatter
+
+
+__all__ = ["scatter", "scatter_kwargs", "gather"]
+
+
+@deprecated(
+    "`is_namedtuple` is deprecated, please use the python checks instead",
+    category=FutureWarning,
+)
+def is_namedtuple(obj: Any) -> bool:
+    """Deprecated alias of ``_is_namedtuple``: detect namedtuple instances."""
+    return _is_namedtuple(obj)
+
+
+def _is_namedtuple(obj: Any) -> bool:
+    """Return True if ``obj`` is a tuple exposing ``_asdict`` and ``_fields``."""
+    if not isinstance(obj, tuple):
+        return False
+    return hasattr(obj, "_asdict") and hasattr(obj, "_fields")
+
+
+T = TypeVar("T", dict, list, tuple)
+
+
+# For some reason, 'scatter' returns a tuple when given a single Tensor input but a list otherwise.
+@overload
+def scatter(
+    inputs: torch.Tensor,
+    target_gpus: Sequence[int | torch.device],
+    dim: int = ...,
+) -> tuple[torch.Tensor, ...]: ...
+
+
+@overload
+def scatter(
+    inputs: T,
+    target_gpus: Sequence[int | torch.device],
+    dim: int = ...,
+) -> list[T]: ...
+
+
+def scatter(inputs, target_gpus, dim=0):
+    r"""Slice tensors into approximately equal chunks and distribute them across given GPUs.
+
+    Duplicates references to objects that are not tensors.
+    """
+    # Recursive worker: non-empty containers are rebuilt around scattered items.
+    def scatter_map(obj):
+        if isinstance(obj, torch.Tensor):
+            return Scatter.apply(target_gpus, None, dim, obj)
+        if _is_namedtuple(obj):
+            # pyrefly: ignore [no-matching-overload]
+            return [
+                # pyrefly: ignore [no-matching-overload]
+                type(obj)(*args)
+                # pyrefly: ignore  # no-matching-overload
+                for args in zip(*map(scatter_map, obj), strict=False)
+            ]
+        if isinstance(obj, tuple) and len(obj) > 0:
+            # pyrefly: ignore [no-matching-overload]
+            return list(zip(*map(scatter_map, obj), strict=False))
+        if isinstance(obj, list) and len(obj) > 0:
+            # pyrefly: ignore [no-matching-overload]
+            return [list(i) for i in zip(*map(scatter_map, obj), strict=False)]
+        if isinstance(obj, dict) and len(obj) > 0:
+            # pyrefly: ignore [no-matching-overload]
+            return [
+                # pyrefly: ignore [no-matching-overload]
+                type(obj)(i)
+                # pyrefly: ignore  # no-matching-overload
+                for i in zip(*map(scatter_map, obj.items()), strict=False)
+            ]
+        return [obj for _ in target_gpus]
+
+    # After scatter_map is called, a scatter_map cell will exist. This cell
+    # has a reference to the actual function scatter_map, which has references
+    # to a closure that has a reference to the scatter_map cell (because the
+    # fn is recursive). To avoid this reference cycle, we set the function to
+    # None, clearing the cell
+    try:
+        res = scatter_map(inputs)
+    finally:
+        scatter_map = None  # type: ignore[assignment]
+    return res
+
+
+def scatter_kwargs(
+    inputs: tuple[Any, ...],
+    kwargs: dict[str, Any] | None,
+    target_gpus: Sequence[int | torch.device],
+    dim: int = 0,
+) -> tuple[tuple[Any, ...], tuple[dict[str, Any], ...]]:
+    r"""Scatter with support for kwargs dictionary; returns two equal-length tuples."""
+    scattered_inputs = scatter(inputs, target_gpus, dim) if inputs else []
+    scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
+    # Pad the shorter result with empty ``()`` / ``{}`` entries so that both
+    # have the same length. Both sides are measured after scattering;
+    # ``len(inputs)`` would only be the number of positional arguments.
+    n_inputs, n_kwargs = len(scattered_inputs), len(scattered_kwargs)
+    if n_inputs < n_kwargs:
+        scattered_inputs.extend(() for _ in range(n_kwargs - n_inputs))
+    elif n_kwargs < n_inputs:
+        scattered_kwargs.extend({} for _ in range(n_inputs - n_kwargs))
+    return tuple(scattered_inputs), tuple(scattered_kwargs)
+
+
+def gather(outputs: Any, target_device: int | torch.device, dim: int = 0) -> Any:
+    r"""Gather tensors from different GPUs on a specified device.
+
+    This function is useful for gathering the results of a distributed computation.
+    It takes a sequence of objects, one for each GPU, and returns a single object
+    on the specified device.
+
+    Args:
+        outputs (Any): A sequence of objects (potentially tensors) to gather.
+        target_device (Union[int, torch.device]): The device to gather the tensors to.
+            Use 'cpu' for CPU to avoid a deprecation warning.
+        dim (int, optional): The dimension along which to gather. Default: 0.
+
+    Returns:
+        Any: A gathered object (potentially tensor) on the specified device.
+    """
+    # Combine ``outputs`` recursively, dispatching on the type of its first element.
+    def gather_map(outputs):
+        out = outputs[0]
+        if isinstance(out, torch.Tensor):
+            return Gather.apply(target_device, dim, *outputs)
+        if out is None:
+            return None
+        if isinstance(out, dict):
+            if not all(len(out) == len(d) for d in outputs):
+                raise ValueError("All dicts must have the same number of keys")
+            # pyrefly: ignore [not-callable]
+            return type(out)((k, gather_map([d[k] for d in outputs])) for k in out)
+        if _is_namedtuple(out):
+            # pyrefly: ignore [no-matching-overload]
+            return type(out)._make(map(gather_map, zip(*outputs, strict=True)))
+        # pyrefly: ignore [no-matching-overload]
+        return type(out)(map(gather_map, zip(*outputs, strict=True)))
+
+    # Recursive function calls like this create reference cycles.
+    # Setting the function to None clears the refcycle.
+    try:
+        res = gather_map(outputs)
+    finally:
+        gather_map = None  # type: ignore[assignment]
+    return res
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7628c5c15992efa600ea5520aed955ba42c6146
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantizable/__init__.py
@@ -0,0 +1 @@
+from torch.nn.quantizable.modules import *  # noqa: F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e2bbbc13202db1cbddaad4b05241a62190adc46
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/__init__.py
@@ -0,0 +1,39 @@
+from torch.nn.quantized import dynamic, functional, modules  # noqa: F403
+from torch.nn.quantized.modules import *  # noqa: F403
+from torch.nn.quantized.modules import MaxPool2d
+
+
+__all__ = [
+    "BatchNorm2d",
+    "BatchNorm3d",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "ConvTranspose1d",
+    "ConvTranspose2d",
+    "ConvTranspose3d",
+    "DeQuantize",
+    "Dropout",
+    "ELU",
+    "Embedding",
+    "EmbeddingBag",
+    "GroupNorm",
+    "Hardswish",
+    "InstanceNorm1d",
+    "InstanceNorm2d",
+    "InstanceNorm3d",
+    "LayerNorm",
+    "LeakyReLU",
+    "Linear",
+    "LSTM",
+    "MultiheadAttention",
+    "PReLU",
+    "Quantize",
+    "ReLU6",
+    "Sigmoid",
+    "Softmax",
+    # Wrapper modules
+    "FloatFunctional",
+    "FXFloatFunctional",
+    "QFunctional",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..d763e171fdb432c8ba2059cc2332e7ac6424854a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/quantized/functional.py
@@ -0,0 +1,10 @@
+r"""nn.quantized.functional.
+
+Quantized equivalents of the `nn.functional`.
+
+Note::
+    This location is in the process of being deprecated.
+    Please use `torch.ao.nn.quantized.functional` instead.
+"""
+
+from torch.ao.nn.quantized.functional import *  # noqa: F401,F403
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9253264d1e0eaf7fef1ee4ada06d2bf0be5cda7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/__init__.py
@@ -0,0 +1,48 @@
+from . import parametrizations, parametrize, rnn, stateless
+from .clip_grad import (  # pyrefly: ignore  # deprecated; pyrefly: ignore [deprecated]
+    _clip_grads_with_norm_ as clip_grads_with_norm_,
+    _get_total_norm as get_total_norm,
+    clip_grad_norm,
+    clip_grad_norm_,
+    clip_grad_value_,
+)
+from .convert_parameters import parameters_to_vector, vector_to_parameters
+from .fusion import (
+    fuse_conv_bn_eval,
+    fuse_conv_bn_weights,
+    fuse_linear_bn_eval,
+    fuse_linear_bn_weights,
+)
+from .init import skip_init
+from .memory_format import (
+    convert_conv2d_weight_memory_format,
+    convert_conv3d_weight_memory_format,
+)
+from .spectral_norm import remove_spectral_norm, spectral_norm
+from .weight_norm import remove_weight_norm, weight_norm
+
+
+__all__ = [
+    "clip_grad_norm",
+    "clip_grad_norm_",
+    "clip_grads_with_norm_",
+    "clip_grad_value_",
+    "convert_conv2d_weight_memory_format",
+    "convert_conv3d_weight_memory_format",
+    "fuse_conv_bn_eval",
+    "fuse_conv_bn_weights",
+    "fuse_linear_bn_eval",
+    "fuse_linear_bn_weights",
+    "get_total_norm",
+    "parameters_to_vector",
+    "parametrizations",
+    "parametrize",
+    "remove_spectral_norm",
+    "remove_weight_norm",
+    "rnn",
+    "skip_init",
+    "spectral_norm",
+    "stateless",
+    "vector_to_parameters",
+    "weight_norm",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25b647307900e42b11d1cdafc8d9f8785d1a620
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_deprecation_utils.py
@@ -0,0 +1,53 @@
+import importlib
+import warnings
+from collections.abc import Callable
+
+
+_MESSAGE_TEMPLATE = (
+    r"Usage of '{old_location}' is deprecated; please use '{new_location}' instead."
+)
+
+
+def lazy_deprecated_import(
+    all: list[str],
+    old_module: str,
+    new_module: str,
+) -> Callable[[str], object]:
+    r"""Import utility to lazily import deprecated packages / modules / functional.
+
+    The old_module and new_module are also used in the deprecation warning defined
+    by the `_MESSAGE_TEMPLATE`.
+
+    Args:
+        all: The list of the functions that are imported. Generally, the module's
+            __all__ list of the module.
+        old_module: Old module location
+        new_module: New module location / Migrated location
+
+    Returns:
+        Callable for `__getattr__`; raises AttributeError for names not in `all`.
+
+    Usage:
+
+        # In the `torch/nn/quantized/functional.py`
+        from torch.nn.utils._deprecation_utils import lazy_deprecated_import
+        _MIGRATED_TO = "torch.ao.nn.quantized.functional"
+        __getattr__ = lazy_deprecated_import(
+            all=__all__,
+            old_module=__name__,
+            new_module=_MIGRATED_TO)
+    """
+    warning_message = _MESSAGE_TEMPLATE.format(
+        old_location=old_module, new_location=new_module
+    )
+
+    def getattr_dunder(name: str) -> object:
+        if name in all:
+            # We are using the "RuntimeWarning" to make sure it is not
+            # ignored by default.
+            warnings.warn(warning_message, RuntimeWarning, stacklevel=2)
+            package = importlib.import_module(new_module)
+            return getattr(package, name)
+        raise AttributeError(f"Module {new_module!r} has no attribute {name!r}.")
+
+    return getattr_dunder
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0935490856aebf3503aa126e51d342c3bac0b529
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_named_member_accessor.py
@@ -0,0 +1,373 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Iterable
+
+import torch
+
+
+_MISSING: torch.Tensor = object()  # type: ignore[assignment]
+
+
+def set_tensor(module: "torch.nn.Module", name: str, tensor: torch.Tensor) -> None:
+    """Assign ``tensor`` to ``name``: existing parameter/buffer slot, else setattr."""
+    if not isinstance(module, torch.nn.Module):
+        raise TypeError(f"{module} is not an instance of torch.nn.Module")
+    if not (tensor is None or isinstance(tensor, torch.Tensor)):
+        raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+    if "." in name:
+        raise KeyError('tensor name can\'t contain "."')
+    if name == "":
+        raise KeyError('tensor name can\'t be empty string ""')
+    for registry in (module._parameters, module._buffers):
+        if name in registry:
+            registry[name] = tensor
+            return
+    setattr(module, name, tensor)
+
+
+def swap_tensor(
+    module: "torch.nn.Module",
+    name: str,
+    tensor: torch.Tensor,
+    allow_missing: bool = False,
+) -> torch.Tensor:
+    """Replace the tensor stored as ``name`` on ``module`` and return the old one.
+
+    Passing ``_MISSING`` as ``tensor`` deletes the entry instead of replacing it.
+    ``_MISSING`` is returned when the attribute does not exist and
+    ``allow_missing`` is true; otherwise an ``AttributeError`` is raised.
+    """
+    if not isinstance(module, torch.nn.Module):
+        raise TypeError(f"{module} is not an instance of torch.nn.Module")
+    tensor_ok = (
+        tensor is _MISSING or tensor is None or isinstance(tensor, torch.Tensor)
+    )
+    if not tensor_ok:
+        raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+    if "." in name:
+        raise KeyError('tensor name can\'t contain "."')
+    if name == "":
+        raise KeyError('tensor name can\'t be empty string ""')
+
+    # A registered parameter or buffer is swapped directly in its dict.
+    for registry in (module._parameters, module._buffers):
+        if name in registry:
+            previous = registry[name]
+            if tensor is _MISSING:
+                del registry[name]
+            else:
+                registry[name] = tensor
+            return previous  # type: ignore[return-value]
+
+    # Everything else is treated as a plain attribute of ``module``.
+    if hasattr(module, name):
+        previous = getattr(module, name)
+    elif allow_missing:
+        previous = _MISSING
+    else:
+        raise AttributeError(f"{module._get_name()} has no attribute `{name}`")
+    previous_ok = (
+        previous is _MISSING or previous is None or isinstance(previous, torch.Tensor)
+    )
+    if not previous_ok:
+        raise TypeError(
+            f"attribute `{name}`: {previous} is not an instance of torch.Tensor"
+        )
+    if tensor is not _MISSING:
+        setattr(module, name, tensor)
+    elif hasattr(module, name):
+        delattr(module, name)
+    return previous
+
+
+def swap_submodule(
+    module: "torch.nn.Module",
+    name: str,
+    submodule: "torch.nn.Module",
+) -> "torch.nn.Module":
+    """Install ``submodule`` as direct child ``name`` and return the old child."""
+    for candidate in (module, submodule):
+        if not isinstance(candidate, torch.nn.Module):
+            raise TypeError(f"{candidate} is not an instance of torch.nn.Module")
+    if "." in name:
+        raise KeyError('submodule name can\'t contain "."')
+    if name == "":
+        raise KeyError('submodule name can\'t be empty string ""')
+    children = module._modules
+    if name not in children:
+        raise KeyError(f"submodule {name} does not exist")
+    previous = children[name]
+    if not isinstance(previous, torch.nn.Module):
+        raise TypeError(f"{name} attribute is not an instance of torch.nn.Module")
+    children[name] = submodule
+    return previous
+
+
+class NamedMemberAccessor:
+    """
+    A class that provides a way to access the submodules and parameters/buffers of a module.
+
+    It provides a caching mechanism to speed up submodule lookups.
+    This is useful for functional programming to manipulate the module state.
+    """
+
+    def __init__(self, module: "torch.nn.Module") -> None:
+        self.module = module
+        self.memo: dict[str, torch.nn.Module] = {}
+
+    # Nested attribute access
+
+    def get_submodule(self, name: str) -> "torch.nn.Module":
+        """
+        Return the submodule specified by the given path.
+
+        For example, to get the submodule mod.layer1.conv1,
+        use accessor.get_submodule("layer1.conv1")
+
+        Compared to mod.get_submodule("layer1.conv1"), this method will cache the
+        intermediate submodule access to speed up future lookups.
+        """
+        if not name:
+            return self.module
+
+        if name in self.memo:
+            return self.memo[name]
+        else:
+            prefix, dot, attr = name.rpartition(".")
+            if dot:
+                module = self.get_submodule(prefix)
+            else:
+                module = self.module
+            try:
+                submodule = getattr(module, attr)
+            except AttributeError as ex:
+                raise AttributeError(
+                    f"{module._get_name()} has no attribute `{attr}`"
+                ) from ex
+            if not isinstance(submodule, torch.nn.Module):
+                raise TypeError(
+                    f"submodule `{name}`: {submodule} is not an instance of torch.nn.Module"
+                )
+            self.memo[name] = submodule
+            return submodule
+
+    def swap_submodule(self, path: str, value: "torch.nn.Module") -> "torch.nn.Module":
+        """
+        Swap the submodule specified by the given ``path`` to ``value``.
+
+        For example, to swap the attribute mod.layer1.conv1 use
+        ``accessor.swap_submodule("layer1.conv1", conv2)``.
+        """
+        prefix, _, attr = path.rpartition(".")
+        return swap_submodule(self.get_submodule(prefix), attr, value)
+
+    def get_tensor(self, name: str) -> torch.Tensor:
+        """
+        Get the tensor specified by the given path.
+
+        For example, to get the attribute mod.layer1.conv1.weight,
+        use accessor.get_tensor('layer1.conv1.weight')
+
+        Compared to mod.get_parameter("layer1.conv1.weight"), this method will
+        cache the intermediate submodule access to speed up future lookups.
+        """
+        prefix, _, attr = name.rpartition(".")
+        submodule = self.get_submodule(prefix)
+        try:
+            tensor = getattr(submodule, attr)
+        except AttributeError as ex:
+            raise AttributeError(
+                f"{submodule._get_name()} has no attribute `{name}`"
+            ) from ex
+        if not isinstance(tensor, torch.Tensor) and tensor is not None:
+            raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+        return tensor  # type: ignore[return-value]
+
+    def set_tensor(self, name: str, value: torch.Tensor) -> None:
+        """
+        Set the attribute specified by the given path to value.
+
+        For example, to set the attribute mod.layer1.conv1.weight,
+        use accessor.set_tensor("layer1.conv1.weight", value)
+        """
+        prefix, _, attr = name.rpartition(".")
+        set_tensor(self.get_submodule(prefix), attr, value)
+
+    def del_tensor(self, name: str) -> None:
+        """
+        Delete the attribute specified by the given path.
+
+        For example, to delete the attribute mod.layer1.conv1.weight,
+        use accessor.del_tensor("layer1.conv1.weight")
+        """
+        prefix, _, attr = name.rpartition(".")
+        submodule = self.get_submodule(prefix)
+        try:
+            delattr(submodule, attr)
+        except AttributeError as ex:
+            raise AttributeError(
+                f"{submodule._get_name()} has no attribute `{name}`"
+            ) from ex
+
+    def swap_tensor(
+        self, name: str, value: torch.Tensor, allow_missing: bool = False
+    ) -> torch.Tensor:
+        """
+        Swap the attribute specified by the given path to value.
+
+        For example, to swap the attribute mod.layer1.conv1.weight,
+        use accessor.swap_tensor("layer1.conv1.weight", value)
+        """
+        prefix, _, attr = name.rpartition(".")
+        return swap_tensor(
+            self.get_submodule(prefix), attr, value, allow_missing=allow_missing
+        )
+
+    # Batched operations
+
+    def get_tensors(self, names: Iterable[str]) -> list[torch.Tensor]:
+        """
+        Get the tensors specified by the given paths.
+
+        For example, to get the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.get_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"])
+        """
+        return [self.get_tensor(name) for name in names]
+
+    def set_tensors(self, names: Iterable[str], values: Iterable[torch.Tensor]) -> None:
+        """
+        Set the attributes specified by the given paths to values.
+
+        For example, to set the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.set_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"], [weight, bias])
+        """
+        if not isinstance(names, (list, tuple)):
+            names = list(names)
+        if not isinstance(values, (list, tuple)):
+            values = list(values)
+        assert len(names) == len(values), "names and values must have the same length"
+
+        for name, value in zip(names, values, strict=True):
+            self.set_tensor(name, value)
+
+    def set_tensors_dict(self, named_tensors: dict[str, torch.Tensor]) -> None:
+        """
+        Set the attributes specified by the given paths to values.
+
+        For example, to set the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.set_tensors_dict({
+            "layer1.conv1.weight": weight,
+            "layer1.conv1.bias": bias,
+        })
+        """
+        for name, value in named_tensors.items():
+            self.set_tensor(name, value)
+
+    def del_tensors(self, names: Iterable[str]) -> None:
+        """
+        Delete the attributes specified by the given paths.
+
+        For example, to delete the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.del_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"])
+        """
+        for name in names:
+            self.del_tensor(name)
+
+    def swap_tensors(
+        self,
+        names: Iterable[str],
+        values: Iterable[torch.Tensor],
+        allow_missing: bool = False,
+    ) -> list[torch.Tensor]:
+        """
+        Swap the attributes specified by the given paths to values.
+
+        For example, to swap the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.swap_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"], [weight, bias])
+        """
+        if not isinstance(names, (list, tuple)):
+            names = list(names)
+        if not isinstance(values, (list, tuple)):
+            values = list(values)
+        assert len(names) == len(values), "names and values must have the same length"
+
+        return [
+            self.swap_tensor(name, value, allow_missing=allow_missing)
+            for name, value in zip(names, values, strict=True)
+        ]
+
+    def swap_tensors_dict(
+        self, named_tensors: dict[str, torch.Tensor], allow_missing: bool = False
+    ) -> tuple[dict[str, torch.Tensor], list[str]]:
+        """
+        Swap the attributes specified by the given paths to values.
+
+        For example, to swap the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.swap_tensors_dict({
+            "layer1.conv1.weight": weight,
+            "layer1.conv1.bias": bias,
+        })
+        """
+        orig_named_tensors = {}
+        missing_keys = []
+        try:
+            for name, tensor in named_tensors.items():
+                orig_tensor = self.swap_tensor(name, tensor, allow_missing=True)
+                if orig_tensor is _MISSING:
+                    missing_keys.append(name)
+                orig_named_tensors[name] = orig_tensor
+        except Exception:
+            # Swap back if any exception occurs
+            for name, orig_tensor in orig_named_tensors.items():
+                self.swap_tensor(name, orig_tensor, allow_missing=True)
+            raise
+        if missing_keys and not allow_missing:
+            # Swap back if any key is missing when allow_missing is False
+            for name, orig_tensor in orig_named_tensors.items():
+                self.swap_tensor(name, orig_tensor, allow_missing=True)
+            raise RuntimeError(f"Missing key(s): {', '.join(map(repr, missing_keys))}.")
+        return orig_named_tensors, missing_keys
+
+    def check_keys(self, keys: Iterable[str]) -> tuple[list[str], list[str]]:
+        """Return sorted ``(missing_keys, unexpected_keys)`` vs. the module's tensor names."""
+        keys = set(keys)
+        valid_keys = {name for name, _ in self.named_tensors(remove_duplicate=False)}
+        missing_keys = valid_keys - keys
+        unexpected_keys = keys - valid_keys
+        return sorted(missing_keys), sorted(unexpected_keys)
+
+    # Shortcut methods
+
+    def named_parameters(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        """Iterate over all the parameters in the module."""
+        yield from self.module.named_parameters(remove_duplicate=remove_duplicate)
+
+    def named_buffers(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        """Iterate over all the buffers in the module."""
+        yield from self.module.named_buffers(remove_duplicate=remove_duplicate)
+
+    def named_tensors(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        """Iterate over all the tensors in the module."""
+        yield from self.module.named_parameters(remove_duplicate=remove_duplicate)
+        yield from self.module.named_buffers(remove_duplicate=remove_duplicate)
+
+    def named_modules(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[tuple[str, "torch.nn.Module"]]:
+        """Iterate over all the modules in the module."""
+        yield from self.module.named_modules(remove_duplicate=remove_duplicate)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eae0865845eec9c426c5cc3b7bff1b11b5b1230
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/_per_sample_grad.py
@@ -0,0 +1,126 @@
+# mypy: allow-untyped-defs
+import functools
+
+import torch
+from torch.nn.utils._expanded_weights.expanded_weights_impl import ExpandedWeight
+from torch.utils import _pytree as pytree
+
+
+# dependency on `functional_call` means that this can't be exposed in utils
+# without creating circular dependency
+def call_for_per_sample_grads(
+    module,
+    *,
+    batch_size=None,
+    loss_reduction="sum",
+    batch_first=True,
+):
+    r"""
+    Return a forward function for a module, populating grad_sample with per sample gradients on backward invocation.
+
+    Args:
+        module: The ``nn.Module`` to get per sample gradients with respect to. All trainable
+          parameters will compute per sample gradients, located in a ``grad_sample``
+          field when ``backward`` is invoked
+        batch_size: The batch size of the input (keyword-only). If None is passed, it is inferred on every
+          call of the returned function: all tensor arguments in args and kwargs must then have the same
+          batch size, read from the first dimension (or the second one when ``batch_first`` is False).
+          Otherwise, it must be passed manually as a positive integer. Default: None
+        loss_reduction: Indicates if the loss reduction (for aggregating the gradients) is a sum or a mean operation. If
+          "mean", per sample gradients will be scaled by the batch size to offset the crossbatch interaction from
+          running mean across a batch. Must be "mean" or "sum". Default: "sum"
+        batch_first: Indicates if the batch dimension is the first dimension. If True, the batch dimension is the first
+          dimension. If False, it's the second dimension. Default: True.
+
+    Returns:
+        A callable wrapping ``module.forward`` (metadata copied with ``functools.wraps``). Each call replaces
+        every parameter that requires grad with an ``ExpandedWeight`` and runs the module through
+        ``torch.func.functional_call`` with the given args and kwargs.
+
+    Raises:
+        RuntimeError: If ``loss_reduction`` is not "sum" or "mean", ``module`` is not an ``nn.Module``,
+          ``batch_size`` is neither None nor a positive integer, or any parameter of ``module`` already
+          carries a non-None ``grad_sample``. The returned function raises ``RuntimeError`` when the batch
+          size has to be inferred but the tensor inputs disagree on it or no tensor input is found.
+
+    Examples::
+        >>> # xdoctest: +SKIP
+        >>> model = nn.Linear(4, 3)
+        >>> batched_input = torch.randn(5, 4)  # batch size of 5
+        >>> res = call_for_per_sample_grads(model)(batched_input).sum()
+        >>> res.backward()
+        >>> assert model.weight.shape == (3, 4)
+        >>> assert model.weight.grad_sample.shape == (5, 3, 4)
+        >>> assert model.weight.grad is None
+        >>> assert model.bias.shape == (3,)
+        >>> assert model.bias.grad_sample.shape == (5, 3)
+        >>> assert model.bias.grad is None
+
+    An example using "mean" loss reduction. The grad_sample fields will be scaled by batch_size from what they would be
+    if we ran the same code with loss_reduction="sum". This is because the mean at the end will scale all
+    grad_outputs by 1 / batch_size from cross batch interaction.
+        >>> model = nn.Linear(4, 3)
+        >>> batched_input = torch.randn(5, 4)  # batch size of 5
+        >>> res = call_for_per_sample_grads(
+        ...     model, batch_size=5, loss_reduction="mean"
+        ... )(batched_input).mean()
+        >>> res.backward()
+
+    Note::
+        Does not work with any `nn.RNN`, including `nn.GRU` or `nn.LSTM`. Please use custom
+        rewrites that wrap an `nn.Linear` module. See Opacus for an example
+    """
+
+    # Wrap only tensors that require grad; everything else is passed through untouched.
+    def maybe_build_expanded_weight(og_tensor, batch_size):
+        if og_tensor.requires_grad:
+            return ExpandedWeight(og_tensor, batch_size, loss_reduction)
+        else:
+            return og_tensor
+
+    # Infer the batch size from the tensor leaves of the call arguments.
+    def compute_batch_size(*args, **kwargs):
+        args_and_kwargs = pytree.arg_tree_leaves(*args, **kwargs)
+        batch_size = None
+        for arg in args_and_kwargs:
+            # Non-tensor leaves carry no batch dimension and are ignored.
+            if not isinstance(arg, torch.Tensor):
+                continue
+
+            arg_batch_size = arg.shape[0] if batch_first else arg.shape[1]
+            # Every tensor leaf must agree with the batch size seen so far.
+            if batch_size is not None and batch_size != arg_batch_size:
+                raise RuntimeError(
+                    "When computing batch size, found at least one input with batch size "
+                    f"{batch_size} and one with batch size {arg_batch_size}. Please specify it "
+                    "explicitly using the batch size kwarg in call_for_per_sample_grads"
+                )
+            batch_size = arg_batch_size
+        if batch_size is None:
+            raise RuntimeError(
+                "Unable to find a tensor in the passed args and kwargs. They may not be pytree-able "
+                "and so ExpandedWeights cannot compute the batch size from the inputs. Please specify "
+                "it explicitly"
+            )
+        return batch_size
+
+    # Argument validation happens once, when the wrapper is built, not on every call.
+    if loss_reduction not in ["sum", "mean"]:
+        raise RuntimeError(
+            f"Expected loss_reduction argument to be sum or mean, got {loss_reduction}"
+        )
+
+    if not isinstance(module, torch.nn.Module):
+        raise RuntimeError(
+            f"Module passed must be nn.Module, got {type(module).__name__}"
+        )
+    if not (batch_size is None or isinstance(batch_size, int)):
+        raise RuntimeError(
+            f"Batch size passed must be None or an integer, got {type(batch_size).__name__}"
+        )
+    if batch_size is not None and batch_size < 1:
+        raise RuntimeError(f"Batch size must be positive, got {batch_size}")
+    # Refuse to start when a parameter still holds a grad_sample from an earlier run.
+    for weight in module.parameters():
+        if hasattr(weight, "grad_sample") and weight.grad_sample is not None:  # type: ignore[attr-defined]
+            raise RuntimeError(
+                "Current Expanded Weights accumulates the gradients, which will be incorrect for multiple "
+                f"calls without clearing gradients. Please clear out the grad_sample parameter of {weight} or "
+                "post an issue to pytorch/pytorch to prioritize correct behavior"
+            )
+
+    @functools.wraps(module.forward)
+    def wrapper(*args, **kwargs):
+        # An explicit batch_size wins; otherwise infer it from this call's inputs.
+        wrapper_batch_size = batch_size
+        if wrapper_batch_size is None:
+            wrapper_batch_size = compute_batch_size(*args, **kwargs)
+
+        # Substitute parameters by name for this call only via functional_call.
+        params = {
+            name: maybe_build_expanded_weight(value, wrapper_batch_size)
+            for (name, value) in module.named_parameters()
+        }
+        return torch.func.functional_call(module, params, args, kwargs)
+
+    return wrapper
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..30202708bfa38bb8437627152fb76061955e31f9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/clip_grad.py
@@ -0,0 +1,299 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import functools
+import types
+import typing
+import warnings
+from collections.abc import Callable
+from typing import cast, TypeAlias, TypeVar
+from typing_extensions import deprecated, ParamSpec
+
+import torch
+from torch import Tensor
+from torch.utils._foreach_utils import (
+    _device_has_foreach_support,
+    _group_tensors_by_device_and_dtype,
+    _has_foreach_support,
+)
+
+
+# Names exported by ``from <this module> import *``.
+__all__: list[str] = [
+    "clip_grad_norm",
+    "clip_grad_norm_",
+    "clip_grad_value_",
+]
+
+
+# Input accepted by the clipping helpers below: one tensor or an iterable of tensors.
+_tensor_or_tensors: TypeAlias = torch.Tensor | typing.Iterable[torch.Tensor]  # noqa: PYI042
+
+# Generic parameters used to annotate the ``_no_grad`` decorator: the wrapped
+# callable is declared to keep its parameter list (_P) and return type (_R).
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def _no_grad(func: Callable[_P, _R]) -> Callable[_P, _R]:
+    """
+    Return a version of ``func`` whose body executes under ``torch.no_grad()``.
+
+    A hand-written wrapper is used instead of decorating the exposed functions
+    ``clip_grad_norm_`` and ``clip_grad_value_`` with ``@torch.no_grad``
+    directly, because that would cause a circular import.
+    """
+
+    @functools.wraps(func)
+    def _no_grad_wrapper(*args, **kwargs):
+        with torch.no_grad():
+            # pyrefly: ignore [invalid-param-spec]
+            return func(*args, **kwargs)
+
+    # pyrefly: ignore [bad-return]
+    return _no_grad_wrapper
+
+
+@_no_grad
+def _get_total_norm(
+    tensors: _tensor_or_tensors,
+    norm_type: float = 2.0,
+    error_if_nonfinite: bool = False,
+    foreach: bool | None = None,
+) -> torch.Tensor:
+    r"""Compute a single norm over a tensor or an iterable of tensors.
+
+    Each tensor's own norm is taken first, and the norm of those per-tensor
+    norms is returned, as if the individual norms were concatenated into one
+    vector.
+
+    Args:
+        tensors (Iterable[Tensor] or Tensor): the tensor(s) to measure.
+        norm_type (float): order of the p-norm; converted with ``float()``, so
+            ``'inf'`` selects the infinity norm.
+        error_if_nonfinite (bool): if True, raise when the resulting total norm
+            is ``nan``, ``inf``, or ``-inf``. Default: ``False``
+        foreach (bool): selects the implementation per (device, dtype) group.
+            ``None`` uses the foreach kernels when ``_has_foreach_support``
+            reports them available and silently falls back to per-tensor
+            norms otherwise; ``True`` requires foreach support on the device;
+            ``False`` always uses per-tensor norms. Default: ``None``
+
+    Returns:
+        Total norm of the tensors (viewed as a single vector), on the device of
+        the first tensor. ``torch.tensor(0.0)`` when no tensors are given.
+
+    Raises:
+        RuntimeError: if ``foreach=True`` but the device lacks foreach support,
+            or if ``error_if_nonfinite`` is set and the total norm is
+            non-finite.
+    """
+    tensors = [tensors] if isinstance(tensors, torch.Tensor) else list(tensors)
+    norm_type = float(norm_type)
+    if not tensors:
+        return torch.tensor(0.0)
+    first_device = tensors[0].device
+    grouped_tensors: dict[
+        tuple[torch.device, torch.dtype], tuple[list[list[Tensor]], list[int]]
+    ] = _group_tensors_by_device_and_dtype(
+        [tensors]  # type: ignore[list-item]
+    )  # type: ignore[assignment]
+
+    norms: list[Tensor] = []
+    for (device, _), ([device_tensors], _) in grouped_tensors.items():
+        if foreach is None:
+            use_foreach = _has_foreach_support(device_tensors, device)
+        else:
+            use_foreach = foreach and _device_has_foreach_support(device)
+
+        if use_foreach:
+            norms.extend(torch._foreach_norm(device_tensors, norm_type))
+        elif foreach:
+            raise RuntimeError(
+                f"foreach=True was passed, but can't use the foreach API on {device.type} tensors"
+            )
+        else:
+            for g in device_tensors:
+                norms.append(torch.linalg.vector_norm(g, norm_type))
+
+    # Gather the per-tensor norms on one device before reducing them again.
+    stacked = torch.stack([norm.to(first_device) for norm in norms])
+    total_norm = torch.linalg.vector_norm(stacked, norm_type)
+
+    if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()):
+        raise RuntimeError(
+            f"The total norm of order {norm_type} for gradients from "
+            "`parameters` is non-finite, so it cannot be clipped. To disable "
+            "this error and scale the gradients by the non-finite norm anyway, "
+            "set `error_if_nonfinite=False`"
+        )
+    return total_norm
+
+
+@_no_grad
+def _clip_grads_with_norm_(
+    parameters: _tensor_or_tensors,
+    max_norm: float,
+    total_norm: torch.Tensor,
+    foreach: bool | None = None,
+) -> None:
+    r"""Rescale parameter gradients in place using an already-computed total norm.
+
+    Every gradient is multiplied by
+
+    .. math::
+        \min(\frac{max\_norm}{total\_norm + 1e-6}, 1)
+
+    The factor is capped at 1.0, so gradients are only ever scaled down and
+    never amplified.
+
+    This function is equivalent to :func:`torch.nn.utils.clip_grad_norm_` with a pre-calculated
+    total norm.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): tensor(s) whose ``.grad`` is
+            rescaled; entries without a gradient are skipped.
+        max_norm (float): max norm of the gradients
+        total_norm (Tensor): total norm of the gradients to use for clipping
+        foreach (bool): selects the implementation per (device, dtype) group.
+            ``None`` uses the foreach kernels when ``_has_foreach_support``
+            reports them available and silently falls back to a per-tensor
+            loop otherwise; ``True`` requires foreach support on the device;
+            ``False`` always uses the per-tensor loop. Default: ``None``
+
+    Returns:
+        None
+
+    Raises:
+        RuntimeError: if ``foreach=True`` but the device lacks foreach support.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    grads = [p.grad for p in parameters if p.grad is not None]
+    max_norm = float(max_norm)
+    if not grads:
+        return
+    grouped_grads: dict[
+        tuple[torch.device, torch.dtype], tuple[list[list[Tensor]], list[int]]
+    ] = _group_tensors_by_device_and_dtype([grads])  # type: ignore[assignment]
+
+    # Multiplying by a coefficient clamped to 1 is redundant when no clipping is needed,
+    # but it avoids an `if clip_coef < 1:` branch, which can force a CPU <=> device
+    # synchronization when the gradients do not live in CPU memory.
+    clip_coef_clamped = torch.clamp(max_norm / (total_norm + 1e-6), max=1.0)
+    for (device, _), ([device_grads], _) in grouped_grads.items():
+        if foreach is None:
+            use_foreach = _has_foreach_support(device_grads, device)
+        else:
+            use_foreach = foreach and _device_has_foreach_support(device)
+
+        if use_foreach:
+            torch._foreach_mul_(device_grads, clip_coef_clamped.to(device))
+        elif foreach:
+            raise RuntimeError(
+                f"foreach=True was passed, but can't use the foreach API on {device.type} tensors"
+            )
+        else:
+            coef_on_device = clip_coef_clamped.to(device)
+            for g in device_grads:
+                g.mul_(coef_on_device)
+
+
+@_no_grad
+def clip_grad_norm_(
+    parameters: _tensor_or_tensors,
+    max_norm: float,
+    norm_type: float = 2.0,
+    error_if_nonfinite: bool = False,
+    foreach: bool | None = None,
+) -> torch.Tensor:
+    r"""Clip the gradient norm of an iterable of parameters, in place.
+
+    The total norm is taken over the norms of the individual gradients, as if
+    those norms were concatenated into a single vector; the gradients are then
+    rescaled with that total norm.
+
+    This function is equivalent to :func:`torch.nn.utils.get_total_norm` followed by
+    :func:`torch.nn.utils.clip_grads_with_norm_` with the ``total_norm`` returned by ``get_total_norm``.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized. The iterable is
+            materialized into a list first, so a generator can be passed.
+        max_norm (float): max norm of the gradients
+        norm_type (float, optional): type of the used p-norm. Can be ``'inf'`` for
+            infinity norm. Default: 2.0
+        error_if_nonfinite (bool, optional): if True, an error is thrown if the total
+            norm of the gradients from :attr:`parameters` is ``nan``,
+            ``inf``, or ``-inf``. Default: False
+        foreach (bool, optional): forwarded to the norm and scaling helpers to
+            choose between their foreach-based and per-tensor implementations.
+            Default: ``None``
+
+    Returns:
+        Total norm of the parameter gradients (viewed as a single vector).
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    else:
+        # Remember the input kind before list() consumes it; an empty generator
+        # usually signals a caller mistake, so it is reported with a warning.
+        came_from_generator = isinstance(parameters, types.GeneratorType)
+        parameters = list(parameters)
+        if came_from_generator and not parameters:
+            warnings.warn(
+                "`parameters` is an empty generator, no gradient clipping will occur.",
+                stacklevel=3,
+            )
+    present_grads = [p.grad for p in parameters if p.grad is not None]
+    total_norm = _get_total_norm(
+        present_grads, norm_type, error_if_nonfinite, foreach
+    )
+    _clip_grads_with_norm_(parameters, max_norm, total_norm, foreach)
+    return total_norm
+
+
+@deprecated(
+    "`torch.nn.utils.clip_grad_norm` is now deprecated "
+    "in favor of `torch.nn.utils.clip_grad_norm_`.",
+    category=FutureWarning,
+)
+def clip_grad_norm(
+    parameters: _tensor_or_tensors,
+    max_norm: float,
+    norm_type: float = 2.0,
+    error_if_nonfinite: bool = False,
+    foreach: bool | None = None,
+) -> torch.Tensor:
+    r"""Deprecated alias that forwards all arguments to :func:`clip_grad_norm_`.
+
+    .. warning::
+        This method is now deprecated in favor of
+        :func:`torch.nn.utils.clip_grad_norm_`.
+    """
+    return clip_grad_norm_(
+        parameters,
+        max_norm,
+        norm_type=norm_type,
+        error_if_nonfinite=error_if_nonfinite,
+        foreach=foreach,
+    )
+
+
+@_no_grad
+def clip_grad_value_(
+    parameters: _tensor_or_tensors,
+    clip_value: float,
+    foreach: bool | None = None,
+) -> None:
+    r"""Clamp the gradients of an iterable of parameters to a symmetric range.
+
+    Gradients are modified in-place.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor whose gradients are clamped; entries without a
+            gradient are skipped.
+        clip_value (float): maximum allowed value of the gradients.
+            The gradients are clipped in the range
+            :math:`\left[\text{-clip\_value}, \text{clip\_value}\right]`
+        foreach (bool, optional): selects the implementation per
+            (device, dtype) group. ``None`` uses the foreach kernels when
+            ``_has_foreach_support`` reports them available and silently falls
+            back to a per-tensor loop otherwise; ``True`` requires foreach
+            support on the device; ``False`` always uses the per-tensor loop.
+            Default: ``None``
+
+    Raises:
+        RuntimeError: if ``foreach=True`` but the device lacks foreach support.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    clip_value = float(clip_value)
+
+    all_grads = [p.grad for p in parameters if p.grad is not None]
+    # pyrefly: ignore [bad-argument-type]
+    grouped_grads = _group_tensors_by_device_and_dtype([all_grads])
+
+    lower, upper = -clip_value, clip_value
+    for (device, _), ([bucket], _) in grouped_grads.items():
+        # cast() only informs the type checker; it returns its argument unchanged.
+        bucket = cast(list[Tensor], bucket)
+        if foreach is None:
+            use_foreach = _has_foreach_support(bucket, device=device)
+        else:
+            use_foreach = foreach and _device_has_foreach_support(device)
+
+        if use_foreach:
+            torch._foreach_clamp_min_(bucket, lower)
+            torch._foreach_clamp_max_(bucket, upper)
+        elif foreach:
+            raise RuntimeError(
+                f"foreach=True was passed, but can't use the foreach API on {device.type} tensors"
+            )
+        else:
+            for grad in bucket:
+                grad.clamp_(min=lower, max=upper)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a56da711ecda3c6e3d5770783f100a8890bbf55
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/convert_parameters.py
@@ -0,0 +1,90 @@
+from collections.abc import Iterable
+
+import torch
+
+
+def parameters_to_vector(parameters: Iterable[torch.Tensor]) -> torch.Tensor:
+    r"""Concatenate the flattened views of all parameters into one 1-D tensor.
+
+    Args:
+        parameters (Iterable[Tensor]): an iterable of Tensors that are the
+            parameters of a model.
+
+    Returns:
+        The parameters represented by a single vector, in iteration order.
+
+    Raises:
+        TypeError: if the parameters do not all live on the same device.
+    """
+    # Device of the first parameter; None until one has been seen.
+    seen_device = None
+
+    flat_views = []
+    for tensor in parameters:
+        # Raises when this parameter sits on a different device than the first one.
+        seen_device = _check_param_device(tensor, seen_device)
+        flat_views += [tensor.view(-1)]
+    return torch.cat(flat_views)
+
+
+def vector_to_parameters(vec: torch.Tensor, parameters: Iterable[torch.Tensor]) -> None:
+    r"""Distribute consecutive slices of a vector over an iterable of parameters.
+
+    Each parameter takes the next ``param.numel()`` elements of ``vec``,
+    reshaped like the parameter, as its new ``.data``.
+
+    Args:
+        vec (Tensor): a single vector representing the parameters of a model.
+        parameters (Iterable[Tensor]): an iterable of Tensors that are the
+            parameters of a model.
+
+    Raises:
+        TypeError: if ``vec`` is not a Tensor, or if the parameters do not all
+            live on the same device.
+    """
+    if not isinstance(vec, torch.Tensor):
+        raise TypeError(f"expected torch.Tensor, but got: {torch.typename(vec)}")
+
+    # Device of the first parameter; None until one has been seen.
+    seen_device = None
+    # Start of the slice of ``vec`` that belongs to the current parameter.
+    start = 0
+    for param in parameters:
+        seen_device = _check_param_device(param, seen_device)
+
+        stop = start + param.numel()
+        param.data = vec[start:stop].view_as(param).data
+        start = stop
+
+
+def _check_param_device(param: torch.Tensor, old_param_device: int | None) -> int:
+    r"""Verify that ``param`` is on the same device as the parameters seen so far.
+
+    Currently, the conversion between model parameters and single vector form is not supported
+    for multiple allocations, e.g. parameters in different GPUs/PrivateUse1s, or mixture of CPU/GPU/PrivateUse1.
+
+    Args:
+        param ([Tensor]): a Tensor of a parameter of a model
+        old_param_device (int): device marker returned for the first parameter,
+            or ``None`` when ``param`` is the first one.
+
+    Returns:
+        old_param_device (int): for the first parameter, ``param.get_device()``
+        when it is a CUDA/PrivateUse1 tensor and ``-1`` otherwise; afterwards
+        the marker that was passed in.
+
+    Raises:
+        TypeError: if ``param`` is on a different device than the marker says.
+    """
+    support_device_types = ["cuda", torch._C._get_privateuse1_backend_name()]
+    on_indexed_device = param.device.type in support_device_types
+
+    # First parameter: just record where it lives.
+    if old_param_device is None:
+        return param.get_device() if on_indexed_device else -1
+
+    if on_indexed_device:
+        # Must be the very same GPU/PrivateUse1 index.
+        mismatch = param.get_device() != old_param_device
+    else:
+        # A non-accelerator tensor only matches the -1 marker.
+        mismatch = old_param_device != -1
+    if mismatch:
+        raise TypeError(
+            "Found two parameters on different devices, "
+            "this is currently not supported."
+        )
+    return old_param_device
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..35406785305117f979479bc2baec0f65d6fdb7af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/fusion.py
@@ -0,0 +1,190 @@
+from __future__ import annotations
+
+import copy
+from typing import TypeVar
+
+import torch
+
+
+# Names exported by ``from <this module> import *``.
+__all__ = [
+    "fuse_conv_bn_eval",
+    "fuse_conv_bn_weights",
+    "fuse_linear_bn_eval",
+    "fuse_linear_bn_weights",
+]
+
+# Type variables that let the ``fuse_*_bn_eval`` helpers declare that they return
+# the same module subtype they were given.
+ConvT = TypeVar("ConvT", bound="torch.nn.modules.conv._ConvNd")
+LinearT = TypeVar("LinearT", bound="torch.nn.Linear")
+
+
+def fuse_conv_bn_eval(
+    conv: ConvT,
+    bn: torch.nn.modules.batchnorm._BatchNorm,
+    transpose: bool = False,
+) -> ConvT:
+    r"""Return a copy of ``conv`` with the BatchNorm ``bn`` folded into its weight and bias.
+
+    ``conv`` itself is left untouched; the result is a deep copy whose
+    parameters are replaced by the output of :func:`fuse_conv_bn_weights`.
+
+    Args:
+        conv (torch.nn.modules.conv._ConvNd): A convolutional module.
+        bn (torch.nn.modules.batchnorm._BatchNorm): A BatchNorm module.
+        transpose (bool, optional): Forwarded to :func:`fuse_conv_bn_weights`
+            to select the transposed-weight layout. Defaults to False.
+
+    Returns:
+        torch.nn.modules.conv._ConvNd: The fused convolutional module.
+
+    .. note::
+        Both ``conv`` and ``bn`` must be in eval mode, and ``bn`` must have its running buffers computed.
+    """
+    assert not conv.training and not bn.training, "Fusion only for eval!"
+    fused_conv = copy.deepcopy(conv)
+
+    assert bn.running_mean is not None and bn.running_var is not None
+    new_weight, new_bias = fuse_conv_bn_weights(
+        fused_conv.weight,
+        fused_conv.bias,
+        bn.running_mean,
+        bn.running_var,
+        bn.eps,
+        bn.weight,
+        bn.bias,
+        transpose,
+    )
+    fused_conv.weight = new_weight
+    fused_conv.bias = new_bias
+
+    return fused_conv
+
+
+def fuse_conv_bn_weights(
+    conv_w: torch.Tensor,
+    conv_b: torch.Tensor | None,
+    bn_rm: torch.Tensor,
+    bn_rv: torch.Tensor,
+    bn_eps: float,
+    bn_w: torch.Tensor | None,
+    bn_b: torch.Tensor | None,
+    transpose: bool = False,
+) -> tuple[torch.nn.Parameter, torch.nn.Parameter]:
+    r"""Fold BatchNorm statistics and affine terms into convolution weight and bias.
+
+    Args:
+        conv_w (torch.Tensor): Convolutional weight.
+        conv_b (Optional[torch.Tensor]): Convolutional bias; treated as zeros when None.
+        bn_rm (torch.Tensor): BatchNorm running mean.
+        bn_rv (torch.Tensor): BatchNorm running variance.
+        bn_eps (float): BatchNorm epsilon.
+        bn_w (Optional[torch.Tensor]): BatchNorm weight; treated as ones when None.
+        bn_b (Optional[torch.Tensor]): BatchNorm bias; treated as zeros when None.
+        transpose (bool, optional): If True, the per-channel scale is broadcast
+            along dim 1 of ``conv_w`` instead of dim 0. Defaults to False.
+
+    Returns:
+        Tuple[torch.nn.Parameter, torch.nn.Parameter]: Fused convolutional weight and bias.
+        The weight keeps the dtype and ``requires_grad`` of ``conv_w``; the bias
+        keeps the dtype of ``conv_b`` (or of ``conv_w`` when ``conv_b`` is None)
+        and takes ``requires_grad`` from ``conv_b`` or from the zero tensor
+        substituted for it.
+    """
+    # Record dtypes before conv_b may be replaced by a placeholder below.
+    conv_weight_dtype = conv_w.dtype
+    conv_bias_dtype = conv_weight_dtype if conv_b is None else conv_b.dtype
+
+    conv_b = torch.zeros_like(bn_rm) if conv_b is None else conv_b
+    bn_w = torch.ones_like(bn_rm) if bn_w is None else bn_w
+    bn_b = torch.zeros_like(bn_rm) if bn_b is None else bn_b
+    inv_std = torch.rsqrt(bn_rv + bn_eps)
+
+    # Shape that broadcasts the per-channel scale against conv_w: the channel
+    # axis is dim 1 for the transposed layout, dim 0 otherwise.
+    trailing_ones = [1] * (len(conv_w.shape) - 2)
+    shape = ([1, -1] if transpose else [-1, 1]) + trailing_ones
+
+    scaled_w = conv_w * (bn_w * inv_std).reshape(shape)
+    shifted_b = (conv_b - bn_rm) * inv_std * bn_w + bn_b
+
+    return (
+        torch.nn.Parameter(scaled_w.to(dtype=conv_weight_dtype), conv_w.requires_grad),
+        torch.nn.Parameter(shifted_b.to(dtype=conv_bias_dtype), conv_b.requires_grad),
+    )
+
+
+def fuse_linear_bn_eval(
+    linear: LinearT,
+    bn: torch.nn.modules.batchnorm._BatchNorm,
+) -> LinearT:
+    r"""Return a copy of ``linear`` with the BatchNorm ``bn`` folded into its weight and bias.
+
+    ``linear`` itself is left untouched; the result is a deep copy whose
+    parameters are replaced by the output of :func:`fuse_linear_bn_weights`.
+
+    Args:
+        linear (torch.nn.Linear): A Linear module.
+        bn (torch.nn.modules.batchnorm._BatchNorm): A BatchNorm module.
+
+    Returns:
+        torch.nn.Linear: The fused linear module.
+
+    .. note::
+        Both ``linear`` and ``bn`` must be in eval mode, and ``bn`` must have its running buffers computed.
+    """
+    assert not linear.training and not bn.training, "Fusion only for eval!"
+    fused_linear = copy.deepcopy(linear)
+
+    # Linear-BN fusion has to preserve the shapes of the linear weight/bias.
+    # bn operates over the channel dim, (N, C_in, H, W), while linear operates
+    # over the last dim, (*, H_in), so the bn channel dim must be broadcastable
+    # with the last dim of linear. That holds when the number of bn features
+    # 1. equals the number of output features of linear, or
+    # 2. is 1.
+    # Anything else cannot be folded.
+    assert linear.out_features == bn.num_features or bn.num_features == 1, (
+        "To fuse, linear.out_features == bn.num_features or bn.num_features == 1"
+    )
+
+    assert bn.running_mean is not None and bn.running_var is not None
+    new_weight, new_bias = fuse_linear_bn_weights(
+        fused_linear.weight,
+        fused_linear.bias,
+        bn.running_mean,
+        bn.running_var,
+        bn.eps,
+        bn.weight,
+        bn.bias,
+    )
+    fused_linear.weight = new_weight
+    fused_linear.bias = new_bias
+
+    return fused_linear
+
+
+def fuse_linear_bn_weights(
+    linear_w: torch.Tensor,
+    linear_b: torch.Tensor | None,
+    bn_rm: torch.Tensor,
+    bn_rv: torch.Tensor,
+    bn_eps: float,
+    bn_w: torch.Tensor | None,
+    bn_b: torch.Tensor | None,
+) -> tuple[torch.nn.Parameter, torch.nn.Parameter]:
+    r"""Fuse linear module parameters and BatchNorm module parameters into new linear module parameters.
+
+    Args:
+        linear_w (torch.Tensor): Linear weight.
+        linear_b (Optional[torch.Tensor]): Linear bias; treated as zeros when None.
+        bn_rm (torch.Tensor): BatchNorm running mean.
+        bn_rv (torch.Tensor): BatchNorm running variance.
+        bn_eps (float): BatchNorm epsilon.
+        bn_w (Optional[torch.Tensor]): BatchNorm weight; treated as ones when None
+            (a BatchNorm module without affine parameters).
+        bn_b (Optional[torch.Tensor]): BatchNorm bias; treated as zeros when None.
+
+    Returns:
+        Tuple[torch.nn.Parameter, torch.nn.Parameter]: Fused linear weight and bias.
+        The bias keeps the dtype of ``linear_b`` (or of ``linear_w`` when
+        ``linear_b`` is None).
+    """
+    # Record dtypes before linear_b may be replaced by a placeholder below.
+    linear_weight_dtype = linear_w.dtype
+    linear_bias_dtype = linear_b.dtype if linear_b is not None else linear_weight_dtype
+    if linear_b is None:
+        linear_b = torch.zeros_like(bn_rm)
+    # Same defaults as fuse_conv_bn_weights: a missing affine weight/bias acts
+    # as the identity instead of failing on ``None * Tensor``.
+    if bn_w is None:
+        bn_w = torch.ones_like(bn_rm)
+    if bn_b is None:
+        bn_b = torch.zeros_like(bn_rm)
+    bn_scale = bn_w * torch.rsqrt(bn_rv + bn_eps)
+
+    # unsqueeze(-1) turns the per-feature scale into a column so that it
+    # multiplies each row of the weight.
+    fused_w = linear_w * bn_scale.unsqueeze(-1).to(dtype=linear_weight_dtype)
+    fused_b = ((linear_b - bn_rm) * bn_scale + bn_b).to(dtype=linear_bias_dtype)
+
+    return torch.nn.Parameter(fused_w, linear_w.requires_grad), torch.nn.Parameter(
+        fused_b, linear_b.requires_grad
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..10fa03b7c01c2eac7e474ef55f433e4704e6c778
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/init.py
@@ -0,0 +1,55 @@
+# mypy: allow-untyped-defs
+import inspect
+
+import torch
+
+
+def skip_init(module_cls, *args, **kwargs):
+    r"""
+    Instantiate ``module_cls`` without running parameter / buffer initialization.
+
+    The module is first constructed with ``device="meta"`` and then moved with
+    ``to_empty`` to the requested device (``"cpu"`` when no ``device`` kwarg is
+    given). This is useful when initialization is slow or when custom
+    initialization will be performed anyway. Because of how it is implemented,
+    two caveats apply:
+
+    1. The module must accept a `device` arg in its constructor that is passed to any parameters
+    or buffers created during construction.
+
+    2. The module must not perform any computation on parameters in its constructor except
+    initialization (i.e. functions from :mod:`torch.nn.init`).
+
+    If these conditions are satisfied, the module can be instantiated with parameter / buffer values
+    uninitialized, as if having been created using :func:`torch.empty`.
+
+    Args:
+        module_cls: Class object; should be a subclass of :class:`torch.nn.Module`
+        args: args to pass to the module's constructor
+        kwargs: kwargs to pass to the module's constructor
+
+    Returns:
+        Instantiated module with uninitialized parameters / buffers
+
+    Raises:
+        RuntimeError: if ``module_cls`` is not a ``torch.nn.Module`` subclass or
+            its constructor signature has no ``device`` parameter.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> import torch
+        >>> m = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1)
+        >>> m.weight
+        Parameter containing:
+        tensor([[0.0000e+00, 1.5846e+29, 7.8307e+00, 2.5250e-29, 1.1210e-44]],
+               requires_grad=True)
+        >>> m2 = torch.nn.utils.skip_init(torch.nn.Linear, in_features=6, out_features=1)
+        >>> m2.weight
+        Parameter containing:
+        tensor([[-1.4677e+24,  4.5915e-41,  1.4013e-45,  0.0000e+00, -1.4677e+24,
+                  4.5915e-41]], requires_grad=True)
+
+    """
+    if not issubclass(module_cls, torch.nn.Module):
+        raise RuntimeError(f"Expected a Module; got {module_cls}")
+    constructor_params = inspect.signature(module_cls).parameters
+    if "device" not in constructor_params:
+        raise RuntimeError("Module must support a 'device' arg to skip initialization")
+
+    # Remember where the caller wants the module, then build it on "meta" instead.
+    target_device = kwargs.pop("device", "cpu")
+    kwargs["device"] = "meta"
+    meta_module = module_cls(*args, **kwargs)
+    return meta_module.to_empty(device=target_device)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..06eb55a02572d79b6f254624aaea90d86e5430a1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/memory_format.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+from typing import TypeVar
+
+import torch
+
+
+_M = TypeVar("_M", bound="torch.nn.Module")  # converters below return the same Module subtype they receive
+
+
+def convert_conv2d_weight_memory_format(
+    module: _M, memory_format: torch.memory_format
+) -> _M:
+    r"""Convert ``memory_format`` of ``nn.Conv2d.weight`` to ``memory_format``.
+
+    The conversion recursively applies to nested ``nn.Module``, including ``module``.
+    Note that it only changes the memory_format, but not the semantics of each dimension.
+    This function is used to facilitate the computation to adopt NHWC kernels, which
+    provide considerable speed up for fp16 data on CUDA devices with compute capability >= 7.0.
+
+    .. note::
+        Calling ``model.to(memory_format=torch.channels_last)`` is more aggressive
+        than the utility function ``convert_conv2d_weight_memory_format``. Any
+        layer with 4d weight will be affected by ``model.to``, which does not
+        necessarily benefit from conversion to specified ``memory_format``.
+        One place where we are confident is the NHWC (channels_last) conversion for
+        convolution in cuDNN, as it is beneficial to run convolution in NHWC,
+        even in cases where we have to apply permutation to input tensors.
+
+        Hence our strategy here is to convert only the weight of convolution to
+        channels_last. This ensures that:
+        1. Fast convolution kernels will be used, the benefit of which could
+        outweigh overhead of permutation (if input is not in the same format).
+        2. No unnecessary permutations are applied on layers that do not benefit
+        from memory_format conversion.
+
+        The optimal case is that layers between convolution layers are channels
+        last compatible. Input tensor would be permuted to channels last when it
+        encounters the first convolution layer and stay in that memory format.
+        Hence following convolutions will not need to permute their input tensors.
+
+        In case where a channels last incompatible layer is between convolution
+        layers, we need to permute the input tensor back to contiguous format
+        for that layer. The input tensor will go through the remaining layers in
+        contiguous format and be permuted to channels last when it encounters
+        another convolution layer. There's no point in propagating that
+        permutation to an earlier layer, as most layers are quite agnostic to
+        ``memory_format``.
+
+        This claim might change when PyTorch supports fusion of permutation, as
+        there might have been a better spot to fuse the permutation other than
+        immediately before a convolution.
+
+    Args:
+        module (nn.Module): ``nn.Conv2d`` & ``nn.ConvTranspose2d`` or container
+                            ``nn.Module``
+        memory_format: user specified ``memory_format``,
+            e.g. ``torch.channels_last`` or ``torch.contiguous_format``
+
+    Returns:
+        The original module (modified in place) with updated ``nn.Conv2d`` / ``nn.ConvTranspose2d`` weights
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> # xdoctest: +REQUIRES(env:CUBLAS_WORKSPACE_CONFIG)
+        >>> input = torch.randint(
+        ...     1, 10, (2, 8, 4, 4), dtype=torch.float16, device="cuda"
+        ... )
+        >>> model = nn.Sequential(
+        ...     nn.Conv2d(8, 4, 3)).cuda().half()
+        >>> # This is identical to:
+        >>> # nn.utils.convert_conv2d_weight_memory_format(model, torch.channels_last)
+        >>> model = nn.utils.convert_conv2d_weight_memory_format(
+        ...     model, torch.channels_last
+        ... )
+        >>> out = model(input)
+    """
+    # TODO: expand this to `_ConvNd` when channels_last support is extended
+    # beyond only 4d tensors.
+    if isinstance(module, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)):
+        weight_data = module.weight.detach().clone(memory_format=memory_format)  # detached copy in the target layout
+        module.weight.data = weight_data.resize_(
+            weight_data.size(), memory_format=memory_format
+        )
+    for child in module.children():  # recurse into directly nested submodules
+        convert_conv2d_weight_memory_format(child, memory_format)
+    # pyrefly: ignore [bad-return]
+    return module
+
+
+def convert_conv3d_weight_memory_format(
+    module: _M, memory_format: torch.memory_format
+) -> _M:
+    r"""Convert ``memory_format`` of ``nn.Conv3d.weight`` to ``memory_format``.
+    The conversion recursively applies to nested ``nn.Module``, including ``module``.
+    Note that it only changes the memory_format, but not the semantics of each dimension.
+    This function is used to facilitate the computation to adopt NDHWC kernels, which
+    provide considerable speed up for fp16 data on CUDA devices with compute capability >= 7.0.
+
+    .. note::
+        Calling ``model.to(memory_format=torch.channels_last_3d)`` is more aggressive
+        than the utility function ``convert_conv3d_weight_memory_format``. Any
+        layer with 5d weight will be affected by ``model.to``, which does not
+        necessarily benefit from conversion to specified ``memory_format``.
+        One place where we are confident is the NDHWC (channels_last_3d) conversion for
+        convolution in cuDNN, as it is beneficial to run convolution in NDHWC,
+        even in cases where we have to apply permutation to input tensors.
+
+        Hence our strategy here is to convert only the weight of convolution to
+        channels_last_3d. This ensures that:
+        1. Fast convolution kernels will be used, the benefit of which could
+        outweigh overhead of permutation (if input is not in the same format).
+        2. No unnecessary permutations are applied on layers that do not benefit
+        from memory_format conversion.
+
+        The optimal case is that layers between convolution layers are channels
+        last compatible. Input tensor would be permuted to channels last when it
+        encounters the first convolution layer and stay in that memory format.
+        Hence following convolutions will not need to permute their input tensors.
+
+        In case where a channels last incompatible layer is between convolution
+        layers, we need to permute the input tensor back to contiguous format
+        for that layer. The input tensor will go through the remaining layers in
+        contiguous format and be permuted to channels last when it encounters
+        another convolution layer. There's no point in propagating that
+        permutation to an earlier layer, as most layers are quite agnostic to
+        ``memory_format``.
+
+        This claim might change when PyTorch supports fusion of permutation, as
+        there might have been a better spot to fuse the permutation other than
+        immediately before a convolution.
+
+    Args:
+        module (nn.Module): ``nn.Conv3d`` & ``nn.ConvTranspose3d`` or container
+                            ``nn.Module``
+        memory_format: user specified ``memory_format``,
+            e.g. ``torch.channels_last_3d`` or ``torch.contiguous_format``
+
+    Returns:
+        The original module (modified in place) with updated ``nn.Conv3d`` / ``nn.ConvTranspose3d`` weights
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> # xdoctest: +REQUIRES(env:CUBLAS_WORKSPACE_CONFIG)
+        >>> input = torch.randint(
+        ...     1, 10, (2, 8, 4, 4, 4), dtype=torch.float16, device="cuda"
+        ... )
+        >>> model = nn.Sequential(
+        ...     nn.Conv3d(8, 4, 3)).cuda().half()
+        >>> # This is identical to:
+        >>> # nn.utils.convert_conv3d_weight_memory_format(model, torch.channels_last_3d)
+        >>> model = nn.utils.convert_conv3d_weight_memory_format(
+        ...     model, torch.channels_last_3d
+        ... )
+        >>> out = model(input)
+    """
+
+    # TODO: expand this to `_ConvNd` when channels_last support is extended
+    # beyond only 4d tensors.
+    if isinstance(module, (torch.nn.Conv3d, torch.nn.ConvTranspose3d)):
+        weight_data = module.weight.detach().clone(memory_format=memory_format)  # detached copy in the target layout
+        module.weight.data = weight_data.resize_(
+            weight_data.size(), memory_format=memory_format
+        )
+    for child in module.children():  # recurse into directly nested submodules
+        convert_conv3d_weight_memory_format(child, memory_format)
+    # pyrefly: ignore [bad-return]
+    return module
+
+
+__all__ = [
+    "convert_conv2d_weight_memory_format",
+    "convert_conv3d_weight_memory_format",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a51bbc15c5969bc742bf954243bd8b1b9333bbe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrizations.py
@@ -0,0 +1,630 @@
+# mypy: allow-untyped-defs
+from enum import auto, Enum
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.modules import Module
+from torch.nn.utils import parametrize
+
+
+__all__ = ["orthogonal", "spectral_norm", "weight_norm"]
+
+
+def _is_orthogonal(Q, eps=None):
+    n, k = Q.size(-2), Q.size(-1)
+    Id = torch.eye(k, dtype=Q.dtype, device=Q.device)
+    # A reasonable eps, but not too large
+    eps = 10.0 * n * torch.finfo(Q.dtype).eps
+    return torch.allclose(Q.mH @ Q, Id, atol=eps)
+
+
+def _make_orthogonal(A):
+    """Return the Q factor of a QR decomposition of ``A``; assumes that A is a tall matrix.
+
+    Compute the Q factor s.t. A = QR (A may be complex) and diag(R) is real and non-negative.
+    """
+    X, tau = torch.geqrf(A)  # packed QR output, consumed as-is by householder_product below
+    Q = torch.linalg.householder_product(X, tau)
+    # The diagonal of X is the diagonal of R (which is always real) so we normalise by its signs
+    Q *= X.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)  # in place: scales column j of Q by sgn(X_jj)
+    return Q
+
+
+class _OrthMaps(Enum):  # maps selectable by name through the ``orthogonal_map`` argument of ``orthogonal``
+    matrix_exp = auto()  # Q = matrix_exp(A) with A skew-symmetric (or skew-hermitian)
+    cayley = auto()  # Q = Cayley retraction of A
+    householder = auto()  # Q = product of Householder reflectors (real tensors only)
+
+
+class _Orthogonal(Module):  # parametrization mapping an unconstrained tensor to an orthogonal (or unitary) one
+    base: Tensor  # buffer registered only when use_trivialization=True; forward left-multiplies Q by it
+
+    def __init__(
+        self, weight, orthogonal_map: _OrthMaps, *, use_trivialization=True
+    ) -> None:
+        super().__init__()
+
+        # Note [Householder complex]
+        # For complex tensors, it is not possible to compute the tensor `tau` necessary for
+        # linalg.householder_product from the reflectors.
+        # To see this, note that the reflectors have a shape like:
+        # 0 0 0
+        # * 0 0
+        # * * 0
+        # which, for complex matrices, give n(n-1) (real) parameters. Now, you need n^2 parameters
+        # to parametrize the unitary matrices. Saving tau on its own does not work either, because
+        # not every combination of `(A, tau)` gives a unitary matrix, meaning that if we optimise
+        # them as independent tensors we would not maintain the constraint.
+        # An equivalent reasoning holds for rectangular matrices.
+        if weight.is_complex() and orthogonal_map == _OrthMaps.householder:
+            raise ValueError(
+                "The householder parametrization does not support complex tensors."
+            )
+
+        self.shape = weight.shape  # checked against the input of right_inverse
+        self.orthogonal_map = orthogonal_map
+        if use_trivialization:
+            self.register_buffer("base", None)  # filled in by right_inverse
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        n, k = X.size(-2), X.size(-1)
+        transposed = n < k
+        if transposed:
+            X = X.mT
+            n, k = k, n
+        # Here n >= k and X is a tall (or square) matrix
+        if (
+            self.orthogonal_map == _OrthMaps.matrix_exp
+            or self.orthogonal_map == _OrthMaps.cayley
+        ):
+            # We just need n x k - k(k-1)/2 parameters
+            X = X.tril()
+            if n != k:
+                # Embed into a square matrix
+                X = torch.cat(
+                    [X, X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1
+                )
+            A = X - X.mH
+            # A is skew-symmetric (or skew-hermitian)
+            if self.orthogonal_map == _OrthMaps.matrix_exp:
+                Q = torch.matrix_exp(A)
+            elif self.orthogonal_map == _OrthMaps.cayley:
+                # Computes the Cayley retraction (I+A/2)(I-A/2)^{-1} by solving (I - A/2) Q = I + A/2
+                Id = torch.eye(n, dtype=A.dtype, device=A.device)
+                Q = torch.linalg.solve(
+                    torch.add(Id, A, alpha=-0.5), torch.add(Id, A, alpha=0.5)
+                )
+            # Q is now orthogonal (or unitary) of size (..., n, n)
+            if n != k:
+                # pyrefly: ignore [unbound-name]
+                Q = Q[..., :k]
+            # Q is now the size of the X (albeit perhaps transposed)
+        else:
+            # X is real here, as we do not support householder with complex numbers
+            A = X.tril(diagonal=-1)
+            tau = 2.0 / (1.0 + (A * A).sum(dim=-2))
+            Q = torch.linalg.householder_product(A, tau)
+            # The diagonal of X is 1's and -1's
+            # We do not want to differentiate through this or update the diagonal of X hence the casting
+            Q = Q * X.diagonal(dim1=-2, dim2=-1).int().unsqueeze(-2)
+
+        if hasattr(self, "base"):
+            # pyrefly: ignore [unbound-name]
+            Q = self.base @ Q
+        if transposed:
+            # pyrefly: ignore [unbound-name]
+            Q = Q.mT
+        return Q  # type: ignore[possibly-undefined]
+
+    @torch.autograd.no_grad()
+    def right_inverse(self, Q: torch.Tensor) -> torch.Tensor:
+        if Q.shape != self.shape:
+            raise ValueError(
+                f"Expected a matrix or batch of matrices of shape {self.shape}. "
+                f"Got a tensor of shape {Q.shape}."
+            )
+
+        Q_init = Q
+        n, k = Q.size(-2), Q.size(-1)
+        transpose = n < k
+        if transpose:
+            Q = Q.mT
+            n, k = k, n
+
+        # We make sure to always copy Q in every path
+        if not hasattr(self, "base"):
+            # Note [right_inverse expm cayley]
+            # If we do not have use_trivialization=True, we just implement the inverse of the forward
+            # map for the Householder. To see why, think that for the Cayley map,
+            # we would need to find the matrix X \in R^{n x k} such that:
+            # Y = torch.cat([X.tril(), X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1)
+            # A = Y - Y.mH
+            # cayley(A)[:, :k]
+            # gives the original tensor. It is not clear how to do this.
+            # Perhaps via some algebraic manipulation involving the QR like that of
+            # Corollary 2.2 in Edelman, Arias and Smith?
+            if (
+                self.orthogonal_map == _OrthMaps.cayley
+                or self.orthogonal_map == _OrthMaps.matrix_exp
+            ):
+                raise NotImplementedError(
+                    "It is not possible to assign to the matrix exponential "
+                    "or the Cayley parametrizations when use_trivialization=False."
+                )
+
+            # If parametrization == _OrthMaps.householder, make Q orthogonal via the QR decomposition.
+            # Here Q is always real because we do not support householder and complex matrices.
+            # See note [Householder complex]
+            A, tau = torch.geqrf(Q)
+            # We want to have a decomposition X = QR with diag(R) > 0, as otherwise we could
+            # decompose an orthogonal matrix Q as Q = (-Q)@(-Id), which is a valid QR decomposition
+            # The diagonal of A (the geqrf output) is the diagonal of R from the qr decomposition
+            A.diagonal(dim1=-2, dim2=-1).sign_()
+            # Equality with zero is ok because LAPACK returns exactly zero when it does not want
+            # to use a particular reflection
+            A.diagonal(dim1=-2, dim2=-1)[tau == 0.0] *= -1
+            return A.mT if transpose else A
+        else:
+            if n == k:
+                # We check whether Q is orthogonal
+                if not _is_orthogonal(Q):
+                    Q = _make_orthogonal(Q)
+                else:  # Is orthogonal
+                    Q = Q.clone()
+            else:
+                # Complete Q into a full n x n orthogonal matrix
+                N = torch.randn(
+                    *(Q.size()[:-2] + (n, n - k)), dtype=Q.dtype, device=Q.device
+                )
+                Q = torch.cat([Q, N], dim=-1)
+                Q = _make_orthogonal(Q)
+            self.base = Q
+
+            # It is necessary to return the -Id, as we use the diagonal for the
+            # Householder parametrization. Using -Id makes:
+            # householder(torch.zeros(m,n)) == torch.eye(m,n)
+            # Poor man's version of eye_like
+            neg_Id = torch.zeros_like(Q_init)
+            neg_Id.diagonal(dim1=-2, dim2=-1).fill_(-1.0)
+            return neg_Id
+
+
+def orthogonal(
+    module: Module,
+    name: str = "weight",
+    orthogonal_map: str | None = None,
+    *,
+    use_trivialization: bool = True,
+) -> Module:
+    r"""Apply an orthogonal or unitary parametrization to a matrix or a batch of matrices.
+
+    Letting :math:`\mathbb{K}` be :math:`\mathbb{R}` or :math:`\mathbb{C}`, the parametrized
+    matrix :math:`Q \in \mathbb{K}^{m \times n}` is **orthogonal** as
+
+    .. math::
+
+        \begin{align*}
+            Q^{\text{H}}Q &= \mathrm{I}_n \mathrlap{\qquad \text{if }m \geq n}\\
+            QQ^{\text{H}} &= \mathrm{I}_m \mathrlap{\qquad \text{if }m < n}
+        \end{align*}
+
+    where :math:`Q^{\text{H}}` is the conjugate transpose when :math:`Q` is complex
+    and the transpose when :math:`Q` is real-valued, and
+    :math:`\mathrm{I}_n` is the `n`-dimensional identity matrix.
+    In plain words, :math:`Q` will have orthonormal columns whenever :math:`m \geq n`
+    and orthonormal rows otherwise.
+
+    If the tensor has more than two dimensions, we consider it as a batch of matrices of shape `(..., m, n)`.
+
+    The matrix :math:`Q` may be parametrized via three different ``orthogonal_map`` in terms of the original tensor:
+
+    - ``"matrix_exp"``/``"cayley"``:
+      the :func:`~torch.matrix_exp` :math:`Q = \exp(A)` and the `Cayley map`_
+      :math:`Q = (\mathrm{I}_n + A/2)(\mathrm{I}_n - A/2)^{-1}` are applied to a skew-symmetric
+      :math:`A` to give an orthogonal matrix.
+    - ``"householder"``: computes a product of Householder reflectors
+      (:func:`~torch.linalg.householder_product`).
+
+    ``"matrix_exp"``/``"cayley"`` often make the parametrized weight converge faster than
+    ``"householder"``, but they are slower to compute for very thin or very wide matrices.
+
+    If ``use_trivialization=True`` (default), the parametrization implements the "Dynamic Trivialization Framework",
+    where an extra matrix :math:`B \in \mathbb{K}^{n \times n}` is stored under
+    ``module.parametrizations.weight[0].base``. This helps the
+    convergence of the parametrized layer at the expense of some extra memory use.
+    See `Trivializations for Gradient-Based Optimization on Manifolds`_ .
+
+    Initial value of :math:`Q`:
+    If the original tensor is not parametrized and ``use_trivialization=True`` (default), the initial value
+    of :math:`Q` is that of the original tensor if it is orthogonal (or unitary in the complex case)
+    and it is orthogonalized via the QR decomposition otherwise (see :func:`torch.linalg.qr`).
+    The same happens when it is not parametrized and ``orthogonal_map="householder"`` even when ``use_trivialization=False``.
+    Otherwise, the initial value is the result of the composition of all the registered
+    parametrizations applied to the original tensor.
+
+    .. note::
+        This function is implemented using the parametrization functionality
+        in :func:`~torch.nn.utils.parametrize.register_parametrization`.
+
+    .. _`Cayley map`: https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map
+    .. _`Trivializations for Gradient-Based Optimization on Manifolds`: https://arxiv.org/abs/1909.09501
+
+    Args:
+        module (nn.Module): module on which to register the parametrization.
+        name (str, optional): name of the tensor to make orthogonal. Default: ``"weight"``.
+        orthogonal_map (str, optional): One of the following: ``"matrix_exp"``, ``"cayley"``, ``"householder"``.
+            Default: ``"matrix_exp"`` if the matrix is square or complex, ``"householder"`` otherwise.
+        use_trivialization (bool, optional): whether to use the dynamic trivialization framework. Default: ``True``.
+
+    Returns:
+        The original module with an orthogonal parametrization registered to the specified weight
+
+    Raises:
+        ValueError: on a missing/non-tensor ``name``, a tensor with < 2 dims, or an unsupported ``orthogonal_map``.
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> orth_linear = orthogonal(nn.Linear(20, 40))
+        >>> orth_linear
+        ParametrizedLinear(
+        in_features=20, out_features=40, bias=True
+        (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+            (0): _Orthogonal()
+            )
+        )
+        )
+        >>> # xdoctest: +IGNORE_WANT
+        >>> Q = orth_linear.weight
+        >>> torch.dist(Q.T @ Q, torch.eye(20))
+        tensor(4.9332e-07)
+    """
+    weight = getattr(module, name, None)
+    if not isinstance(weight, Tensor):
+        raise ValueError(
+            f"Module '{module}' has no parameter or buffer with name '{name}'"
+        )
+
+    # We could implement this for 1-dim tensors as the maps on the sphere
+    # but I believe it'd bite more people than it'd help
+    if weight.ndim < 2:
+        raise ValueError(
+            "Expected a matrix or batch of matrices. "
+            f"Got a tensor of {weight.ndim} dimensions."
+        )
+
+    if orthogonal_map is None:  # pick the documented default from the tensor's shape / dtype
+        orthogonal_map = (
+            "matrix_exp"
+            if weight.size(-2) == weight.size(-1) or weight.is_complex()
+            else "householder"
+        )
+
+    orth_enum = getattr(_OrthMaps, orthogonal_map, None)  # look the map up by name on the enum
+    if orth_enum is None:
+        raise ValueError(
+            'orthogonal_map has to be one of "matrix_exp", "cayley", "householder". '
+            f"Got: {orthogonal_map}"
+        )
+    orth = _Orthogonal(weight, orth_enum, use_trivialization=use_trivialization)
+    parametrize.register_parametrization(module, name, orth, unsafe=True)
+    return module
+
+
+class _WeightNorm(Module):  # parametrization that rebuilds a weight from a magnitude (g) and a direction (v)
+    def __init__(
+        self,
+        dim: int | None = 0,
+    ) -> None:
+        super().__init__()
+        if dim is None:  # ``None`` is stored as -1 before being handed to the torch weight-norm ops
+            dim = -1
+        self.dim = dim
+
+    def forward(self, weight_g, weight_v):  # note the swapped argument order in the call below
+        return torch._weight_norm(weight_v, weight_g, self.dim)
+
+    def right_inverse(self, weight):
+        weight_g = torch.norm_except_dim(weight, 2, self.dim)
+        weight_v = weight
+        # Returned in the (g, v) order that forward() takes them
+        return weight_g, weight_v
+
+
+def weight_norm(module: Module, name: str = "weight", dim: int | None = 0) -> Module:
+    r"""Apply weight normalization to a parameter in the given module.
+
+    .. math::
+         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
+
+    Weight normalization is a reparameterization that decouples the magnitude
+    of a weight tensor from its direction. This replaces the parameter specified
+    by :attr:`name` with two parameters: one specifying the magnitude
+    and one specifying the direction.
+
+    By default, with ``dim=0``, the norm is computed independently per output
+    channel/plane. To compute a norm over the entire weight tensor, use
+    ``dim=None``.
+
+    See https://arxiv.org/abs/1602.07868
+
+    Args:
+        module (Module): containing module
+        name (str, optional): name of weight parameter
+        dim (int or None, optional): dimension over which to compute the norm
+
+    Returns:
+        The original module with the weight norm parametrization and a state-dict compatibility hook registered
+
+    Example::
+
+        >>> m = weight_norm(nn.Linear(20, 40), name='weight')
+        >>> m
+        ParametrizedLinear(
+          in_features=20, out_features=40, bias=True
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        >>> m.parametrizations.weight.original0.size()
+        torch.Size([40, 1])
+        >>> m.parametrizations.weight.original1.size()
+        torch.Size([40, 20])
+
+    """
+    _weight_norm = _WeightNorm(dim)
+    parametrize.register_parametrization(module, name, _weight_norm, unsafe=True)
+    # Pre-load hook: renames ``<name>_g`` / ``<name>_v`` state-dict entries to the parametrization keys
+    def _weight_norm_compat_hook(
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        g_key = f"{prefix}{name}_g"
+        v_key = f"{prefix}{name}_v"
+        if g_key in state_dict and v_key in state_dict:  # only rewrite when both entries are present
+            original0 = state_dict.pop(g_key)
+            original1 = state_dict.pop(v_key)
+            state_dict[f"{prefix}parametrizations.{name}.original0"] = original0
+            state_dict[f"{prefix}parametrizations.{name}.original1"] = original1
+
+    module._register_load_state_dict_pre_hook(_weight_norm_compat_hook)
+    return module
+
+
+class _SpectralNorm(Module):  # parametrization dividing a weight by a power-iteration estimate of its spectral norm
+    def __init__(
+        self,
+        weight: torch.Tensor,
+        n_power_iterations: int = 1,
+        dim: int = 0,
+        eps: float = 1e-12,
+    ) -> None:
+        super().__init__()
+        ndim = weight.ndim
+        if dim >= ndim or dim < -ndim:
+            raise IndexError(
+                "Dimension out of range (expected to be in range of "
+                f"[-{ndim}, {ndim - 1}] but got {dim})"
+            )
+
+        if n_power_iterations <= 0:
+            raise ValueError(
+                "Expected n_power_iterations to be positive, but "
+                f"got n_power_iterations={n_power_iterations}"
+            )
+        self.dim = dim if dim >= 0 else dim + ndim  # stored as a non-negative index
+        self.eps = eps
+        if ndim > 1:
+            # For ndim == 1 we do not need to approximate anything (see _SpectralNorm.forward)
+            self.n_power_iterations = n_power_iterations
+            weight_mat = self._reshape_weight_to_matrix(weight)
+            h, w = weight_mat.size()
+
+            u = weight_mat.new_empty(h).normal_(0, 1)
+            v = weight_mat.new_empty(w).normal_(0, 1)
+            self.register_buffer("_u", F.normalize(u, dim=0, eps=self.eps))
+            self.register_buffer("_v", F.normalize(v, dim=0, eps=self.eps))
+
+            # Start with u, v initialized to some reasonable values by performing a number
+            # of iterations of the power method
+            self._power_method(weight_mat, 15)
+
+    def _reshape_weight_to_matrix(self, weight: torch.Tensor) -> torch.Tensor:
+        # Precondition
+        assert weight.ndim > 1
+
+        if self.dim != 0:
+            # permute dim to front
+            weight = weight.permute(
+                self.dim, *(d for d in range(weight.dim()) if d != self.dim)
+            )
+
+        return weight.flatten(1)  # 2D: (size of ``dim``, product of the remaining sizes)
+
+    @torch.autograd.no_grad()
+    def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> None:
+        # See original note at torch/nn/utils/spectral_norm.py
+        # NB: When power iteration runs, the `u` and `v` vectors are
+        #     updated in power iteration **in-place**. This is very important
+        #     because in `DataParallel` forward, the vectors (being buffers) are
+        #     broadcast from the parallelized module to each module replica,
+        #     which is a new module object created on the fly. And each replica
+        #     runs its own spectral norm power iteration. So simply assigning
+        #     the updated vectors to the module this function runs on will cause
+        #     the update to be lost forever. And the next time the parallelized
+        #     module is replicated, the same randomly initialized vectors are
+        #     broadcast and used!
+        #
+        #     Therefore, to make the change propagate back, we rely on two
+        #     important behaviors (also enforced via tests):
+        #       1. `DataParallel` doesn't clone storage if the broadcast tensor
+        #          is already on correct device; and it makes sure that the
+        #          parallelized module is already on `device[0]`.
+        #       2. If the out tensor in `out=` kwarg has correct shape, it will
+        #          just fill in the values.
+        #     Therefore, since the same power iteration is performed on all
+        #     devices, simply updating the tensors in-place will make sure that
+        #     the module replica on `device[0]` will update the _u vector on the
+        #     parallelized module (by shared storage).
+        #
+        #    However, after we update `u` and `v` in-place, we need to **clone**
+        #    them before using them to normalize the weight. This is to support
+        #    backpropagating through two forward passes, e.g., the common pattern in
+        #    GAN training: loss = D(real) - D(fake). Otherwise, the autograd engine will
+        #    complain that variables needed to do backward for the first forward
+        #    (i.e., the `u` and `v` vectors) are changed in the second forward.
+
+        # Precondition
+        assert weight_mat.ndim > 1
+
+        for _ in range(n_power_iterations):
+            # Spectral norm of weight equals `u^T W v`, where `u` and `v`
+            # are the first left and right singular vectors.
+            # This power iteration produces approximations of `u` and `v`.
+            self._u = F.normalize(
+                torch.mv(weight_mat, self._v),  # type: ignore[has-type]
+                dim=0,
+                eps=self.eps,
+                out=self._u,  # type: ignore[has-type]
+            )
+            self._v = F.normalize(
+                torch.mv(weight_mat.H, self._u),  # type: ignore[has-type]
+                dim=0,
+                eps=self.eps,
+                out=self._v,  # type: ignore[has-type]
+            )
+
+    def forward(self, weight: torch.Tensor) -> torch.Tensor:
+        if weight.ndim == 1:
+            # Faster and more exact path, no need to approximate anything
+            return F.normalize(weight, dim=0, eps=self.eps)
+        else:
+            weight_mat = self._reshape_weight_to_matrix(weight)
+            if self.training:  # u and v are only refreshed in training mode
+                self._power_method(weight_mat, self.n_power_iterations)
+            # See above on why we need to clone
+            u = self._u.clone(memory_format=torch.contiguous_format)
+            v = self._v.clone(memory_format=torch.contiguous_format)
+            # The proper way of computing this should be through F.bilinear, but
+            # it seems to have some efficiency issues:
+            # https://github.com/pytorch/pytorch/issues/58093
+            sigma = torch.vdot(u, torch.mv(weight_mat, v))
+            return weight / sigma
+
+    def right_inverse(self, value: torch.Tensor) -> torch.Tensor:
+        # we may want to assert here that the passed value already
+        # satisfies constraints
+        return value
+
+
+def spectral_norm(
+    module: Module,
+    name: str = "weight",
+    n_power_iterations: int = 1,
+    eps: float = 1e-12,
+    dim: int | None = None,
+) -> Module:
+    r"""Apply spectral normalization to a parameter in the given module.
+
+    .. math::
+        \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})},
+        \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}
+
+    When applied on a vector, it simplifies to
+
+    .. math::
+        \mathbf{x}_{SN} = \dfrac{\mathbf{x}}{\|\mathbf{x}\|_2}
+
+    Spectral normalization stabilizes the training of discriminators (critics)
+    in Generative Adversarial Networks (GANs) by reducing the Lipschitz constant
+    of the model. :math:`\sigma` is approximated by performing power iterations (see
+    `power method`_) every time the weight is accessed in training mode. If the
+    weight tensor has more than 2 dimensions, it is reshaped to 2D for the power
+    iteration used to estimate the spectral norm.
+
+
+    See `Spectral Normalization for Generative Adversarial Networks`_ .
+
+    .. _`power method`: https://en.wikipedia.org/wiki/Power_iteration
+    .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957
+
+    .. note::
+        This function is implemented using the parametrization functionality
+        in :func:`~torch.nn.utils.parametrize.register_parametrization`. It is a
+        reimplementation of :func:`torch.nn.utils.spectral_norm`.
+
+    .. note::
+        When this constraint is registered, the singular vectors associated with the largest
+        singular value are estimated rather than sampled at random. These are then updated
+        performing :attr:`n_power_iterations` of the `power method`_ whenever the tensor
+        is accessed with the module in `training` mode.
+
+    .. note::
+        If the `_SpectralNorm` module, i.e., `module.parametrizations.weight[idx]`,
+        is in training mode on removal, it will perform another power iteration.
+        If you'd like to avoid this iteration, set the module to eval mode
+        before its removal.
+
+    Args:
+        module (nn.Module): containing module
+        name (str, optional): name of weight parameter. Default: ``"weight"``.
+        n_power_iterations (int, optional): number of power iterations to
+            calculate spectral norm. Default: ``1``.
+        eps (float, optional): epsilon for numerical stability in
+            calculating norms. Default: ``1e-12``.
+        dim (int, optional): dimension corresponding to number of outputs.
+            Default: ``0``, except for modules that are instances of
+            ConvTranspose{1,2,3}d, when it is ``1``
+
+    Returns:
+        The original module with a new parametrization registered to the specified
+        weight
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> snm = spectral_norm(nn.Linear(20, 40))
+        >>> snm
+        ParametrizedLinear(
+          in_features=20, out_features=40, bias=True
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _SpectralNorm()
+            )
+          )
+        )
+        >>> torch.linalg.matrix_norm(snm.weight, 2)
+        tensor(1.0081, grad_fn=<AmaxBackward0>)
+    """
+    weight = getattr(module, name, None)
+    if not isinstance(weight, Tensor):
+        raise ValueError(
+            f"Module '{module}' has no parameter or buffer with name '{name}'"
+        )
+
+    if dim is None:  # pick the documented default: 1 for ConvTranspose{1,2,3}d, 0 otherwise
+        if isinstance(
+            module,
+            (
+                torch.nn.ConvTranspose1d,
+                torch.nn.ConvTranspose2d,
+                torch.nn.ConvTranspose3d,
+            ),
+        ):
+            dim = 1
+        else:
+            dim = 0
+    parametrize.register_parametrization(
+        module, name, _SpectralNorm(weight, n_power_iterations, dim, eps)
+    )
+    return module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py
new file mode 100644
index 0000000000000000000000000000000000000000..28599db7bdf116f7e3af1bcd7d8576fc2fe51f9b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/parametrize.py
@@ -0,0 +1,838 @@
+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import collections
+import copyreg
+from collections.abc import Sequence
+from contextlib import contextmanager
+from copy import deepcopy
+
+import torch
+from torch import Tensor
+from torch.__future__ import get_swap_module_params_on_conversion
+from torch.nn.modules.container import Module, ModuleDict, ModuleList
+from torch.nn.parameter import Parameter
+from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+
+
+__all__ = [
+    "cached",
+    "ParametrizationList",
+    "register_parametrization",
+    "is_parametrized",
+    "remove_parametrizations",
+    "type_before_parametrizations",
+    "transfer_parametrizations_and_params",
+]
+
+_cache_enabled = 0  # number of currently active `cached()` contexts; caching is on while > 0
+_cache: dict[tuple[int, str], Tensor | None] = {}  # (object id, tensor name) -> cached parametrized tensor
+
+
+@contextmanager
+def cached():
+    r"""Enable caching of parametrized tensors registered with :func:`register_parametrization`.
+
+    While this context manager is active, each parametrized tensor is evaluated
+    the first time it is accessed and the result is reused on later accesses.
+    All cached values are dropped when the outermost ``cached()`` block exits.
+
+    This avoids recomputation when a parametrized tensor is used more than once
+    in a forward pass, e.g. the recurrent kernel of an RNN or a weight that is
+    shared between layers.
+
+    The simplest way to activate the cache is to wrap the forward pass of the network
+
+    .. code-block:: python
+
+        import torch.nn.utils.parametrize as P
+
+        ...
+        with P.cached():
+            output = model(inputs)
+
+    both in training and in evaluation. Alternatively, wrap only the part of a module
+    that uses the parametrized tensors several times, for example the loop of an RNN
+    with a parametrized recurrent kernel:
+
+    .. code-block:: python
+
+        with P.cached():
+            for x in xs:
+                out_rnn = self.rnn_cell(x, out_rnn)
+    """
+    global _cache, _cache_enabled
+    # A counter (not a flag) so that ``cached()`` blocks can be nested.
+    _cache_enabled += 1
+    try:
+        yield
+    finally:
+        _cache_enabled -= 1
+        if _cache_enabled == 0:
+            _cache = {}
+
+
+def _register_parameter_or_buffer(module, name, X) -> None:
+    # Parameters go through register_parameter; anything else is stored as a buffer.
+    is_param = isinstance(X, Parameter)
+    register = module.register_parameter if is_param else module.register_buffer
+    register(name, X)
+
+
+def _maybe_set(dest: Tensor, src: Tensor) -> None:
+    # Make ``dest`` take the value of ``src`` in place (callers rely on ``dest`` keeping its id).
+    if not (
+        get_swap_module_params_on_conversion() or is_traceable_wrapper_subclass(dest)
+    ):
+        dest.set_(src)  # type: ignore[call-overload]
+        return
+    if isinstance(dest, Parameter) and not isinstance(src, Parameter):
+        src = Parameter(src, requires_grad=dest.requires_grad)
+    torch.utils.swap_tensors(dest, src)
+
+
+class ParametrizationList(ModuleList):
+    r"""A sequential container that holds and manages the original parameters or buffers of a parametrized :class:`torch.nn.Module`.
+
+    It is the type of ``module.parametrizations[tensor_name]`` when ``module[tensor_name]``
+    has been parametrized with :func:`register_parametrization`.
+
+    If the first registered parametrization has a ``right_inverse`` that returns one tensor or
+    does not have a ``right_inverse`` (in which case we assume that ``right_inverse`` is the identity),
+    it will hold the tensor under the name ``original``.
+    If it has a ``right_inverse`` that returns more than one tensor, these will be registered as
+    ``original0``, ``original1``, ...
+
+    .. warning::
+        This class is used internally by :func:`register_parametrization`. It is documented
+        here for completeness. It shall not be instantiated by the user.
+
+    Args:
+        modules (sequence): sequence of modules representing the parametrizations
+        original (Parameter or Tensor): parameter or buffer that is parametrized
+        unsafe (bool): a boolean flag that denotes whether the parametrization
+            may change the dtype and shape of the tensor. Default: `False`
+            Warning: when enabled, the parametrization is not checked for consistency
+            upon registration. Enable this flag at your own risk.
+    """
+
+    original: Tensor
+    unsafe: bool
+
+    def __init__(
+        self,
+        modules: Sequence[Module],
+        original: Tensor | Parameter,
+        unsafe: bool = False,
+    ) -> None:
+        # We require this because we need to treat differently the first parametrization
+        # This should never throw, unless this class is used from the outside
+        if len(modules) == 0:
+            raise ValueError("ParametrizationList requires one or more modules.")
+
+        super().__init__(modules)
+        self.unsafe = unsafe
+
+        # In plain words:
+        # module.weight must keep its dtype and shape.
+        # Furthermore, if there is no right_inverse or the right_inverse returns a tensor,
+        # this should be of the same dtype as the original tensor
+        #
+        # We check that the following invariants hold:
+        #    X = module.weight
+        #    Y = param.right_inverse(X)
+        #    assert isinstance(Y, Tensor) or
+        #           (isinstance(Y, collections.abc.Sequence) and all(isinstance(t, Tensor) for t in Y))
+        #    Z = param(Y) if isinstance(Y, Tensor) else param(*Y)
+        #    # Consistency checks
+        #    assert X.dtype == Z.dtype and X.shape == Z.shape
+        #    # If it has one input, this allows to be able to use set_ to be able to
+        #    # move data to/from the original tensor without changing its id (which is what the
+        #    # optimizer uses to track parameters)
+        #    if isinstance(Y, Tensor)
+        #      assert X.dtype == Y.dtype
+        # Below we use original = X, new = Y
+
+        original_shape = original.shape
+        original_dtype = original.dtype
+
+        # Compute new
+        with torch.no_grad():
+            new = original
+            for module in reversed(self):  # type: ignore[call-overload]
+                if hasattr(module, "right_inverse"):
+                    try:
+                        new = module.right_inverse(new)  # type: ignore[operator]
+                    except NotImplementedError:
+                        pass
+                # else, or if it raises NotImplementedError, we assume that right_inverse is the identity
+
+        if not isinstance(new, Tensor) and not isinstance(new, Sequence):
+            raise ValueError(
+                "'right_inverse' must return a Tensor or a Sequence of tensors (list, tuple...). "
+                f"Got {type(new).__name__}"
+            )
+
+        # Set the number of original tensors
+        self.is_tensor = isinstance(new, Tensor)
+        self.ntensors = 1 if self.is_tensor else len(new)
+
+        # Register the tensor(s)
+        if self.is_tensor:
+            # pyrefly: ignore [missing-attribute]
+            if original.dtype != new.dtype:
+                raise ValueError(
+                    "When `right_inverse` outputs one tensor, it may not change the dtype.\n"
+                    f"original.dtype: {original.dtype}\n"
+                    # pyrefly: ignore [missing-attribute]
+                    f"right_inverse(original).dtype: {new.dtype}"
+                )
+
+            # pyrefly: ignore [missing-attribute]
+            if original.device != new.device:
+                raise ValueError(
+                    "When `right_inverse` outputs one tensor, it may not change the device.\n"
+                    f"original.device: {original.device}\n"
+                    # pyrefly: ignore [missing-attribute]
+                    f"right_inverse(original).device: {new.device}"
+                )
+
+            # Move the new value into `original` in place so that the user does not need to
+            # re-register the parameter manually in the optimiser
+            with torch.no_grad():
+                # pyrefly: ignore [bad-argument-type]
+                _maybe_set(original, new)
+            _register_parameter_or_buffer(self, "original", original)
+        else:
+            for i, originali in enumerate(new):
+                if not isinstance(originali, Tensor):
+                    raise ValueError(
+                        "'right_inverse' must return a Tensor or a Sequence of tensors "
+                        "(list, tuple...). "
+                        f"Got element {i} of the sequence with type {type(originali).__name__}."
+                    )
+
+                # If the original tensor was a Parameter that required grad, we expect the user to
+                # add the new parameters to the optimizer after registering the parametrization
+                # (this is documented)
+                if isinstance(original, Parameter):
+                    originali = Parameter(originali, original.requires_grad)
+                originali.requires_grad_(original.requires_grad)
+                _register_parameter_or_buffer(self, f"original{i}", originali)
+
+        if not self.unsafe:
+            # Consistency checks:
+            # Since f : A -> B, right_inverse : B -> A, Z and original should live in B
+            # Z = forward(right_inverse(original))
+            Z = self()  # evaluates forward() on the original tensor(s) registered above
+            if not isinstance(Z, Tensor):
+                raise ValueError(
+                    f"A parametrization must return a tensor. Got {type(Z).__name__}."
+                )
+            if Z.dtype != original_dtype:
+                raise ValueError(
+                    "Registering a parametrization may not change the dtype of the tensor, unless `unsafe` flag is enabled.\n"
+                    f"unparametrized dtype: {original_dtype}\n"
+                    f"parametrized dtype: {Z.dtype}"
+                )
+            if Z.shape != original_shape:
+                raise ValueError(
+                    "Registering a parametrization may not change the shape of the tensor, unless `unsafe` flag is enabled.\n"
+                    f"unparametrized shape: {original_shape}\n"
+                    f"parametrized shape: {Z.shape}"
+                )
+
+    def right_inverse(self, value: Tensor) -> None:
+        r"""Call the ``right_inverse`` methods of the parametrizations in the inverse registration order.
+
+        Then, it stores the result in ``self.original`` if ``right_inverse`` outputs one tensor
+        or in ``self.original0``, ``self.original1``, ... if it outputs several.
+
+        Args:
+            value (Tensor): Value to which the parametrized tensor is initialized
+        """
+        # All the exceptions in this function should almost never throw.
+        # They could throw if, for example, right_inverse function returns a different
+        # dtype when given a different input, which should most likely be caused by a
+        # bug in the user's code
+
+        with torch.no_grad():
+            # See https://github.com/pytorch/pytorch/issues/53103
+            for module in reversed(self):  # type: ignore[call-overload]
+                if hasattr(module, "right_inverse"):
+                    value = module.right_inverse(value)  # type: ignore[operator]
+                else:
+                    raise RuntimeError(
+                        f"parametrization {type(module).__name__} does not implement "
+                        "right_inverse."
+                    )
+            if self.is_tensor:
+                # These exceptions should only throw when a right_inverse function does not
+                # return the same dtype for every input, which should most likely be caused by a bug
+                if not isinstance(value, Tensor):
+                    raise ValueError(
+                        f"`right_inverse` should return a tensor. Got {type(value).__name__}"
+                    )
+                if value.dtype != self.original.dtype:
+                    raise ValueError(
+                        f"The tensor returned by `right_inverse` has dtype {value.dtype} "
+                        f"while `original` has dtype {self.original.dtype}"
+                    )
+                # We know that the result is going to have the same dtype
+                _maybe_set(self.original, value)
+            else:
+                if not isinstance(value, collections.abc.Sequence):
+                    raise ValueError(
+                        "'right_inverse' must return a sequence of tensors. "
+                        f"Got {type(value).__name__}."
+                    )
+                if len(value) != self.ntensors:
+                    raise ValueError(
+                        "'right_inverse' must return a sequence of tensors of length "
+                        f"{self.ntensors}. Got a sequence of length {len(value)}."
+                    )
+                for i, tensor in enumerate(value):
+                    original_i = getattr(self, f"original{i}")
+                    if not isinstance(tensor, Tensor):
+                        raise ValueError(
+                            f"`right_inverse` must return a sequence of tensors. "
+                            f"Got element {i} of type {type(tensor).__name__}"
+                        )
+                    if original_i.dtype != tensor.dtype:
+                        raise ValueError(
+                            f"Tensor {i} returned by `right_inverse` has dtype {tensor.dtype} "
+                            f"while `original{i}` has dtype {original_i.dtype}"
+                        )
+                    _maybe_set(original_i, tensor)
+
+    def forward(self) -> Tensor:
+        if torch.jit.is_scripting():
+            raise RuntimeError("Parametrization is not working with scripting.")
+        # Unpack the originals for the first parametrization
+        if self.is_tensor:
+            x = self[0](self.original)
+        else:
+            originals = (getattr(self, f"original{i}") for i in range(self.ntensors))
+            x = self[0](*originals)
+        # It's not possible to call self[1:] here, so we have to be a bit more cryptic
+        # Also we want to skip all non-integer keys
+        curr_idx = 1
+        while hasattr(self, str(curr_idx)):
+            x = self[curr_idx](x)
+            curr_idx += 1
+        return x
+
+
+def _inject_new_class(module: Module) -> None:
+    r"""Set up a module to be parametrized.
+
+    This works by substituting the class of the module by a class
+    that extends it to be able to inject a property
+
+    Args:
+        module (nn.Module): module into which to inject the property
+    """
+    cls = module.__class__
+
+    def default_deepcopy(self, memo):
+        # Just emulate a standard deepcopy procedure when __deepcopy__ doesn't exist in the current class.
+        obj = memo.get(id(self), None)
+        if obj is not None:
+            return obj
+        replica = self.__new__(self.__class__)
+        memo[id(self)] = replica  # recorded before copying __dict__ so nested references to self reuse the replica
+        replica.__dict__ = deepcopy(self.__dict__, memo)
+        # Also save all slots if they exist.
+        slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
+        for slot in slots_to_save:
+            if hasattr(self, slot):
+                setattr(replica, slot, deepcopy(getattr(self, slot), memo))
+        return replica
+
+    def getstate(self):
+        raise RuntimeError(
+            "Serialization of parametrized modules is only "
+            "supported through state_dict(). See:\n"
+            "https://pytorch.org/tutorials/beginner/saving_loading_models.html"
+            "#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training"
+        )
+
+    dct = {"__getstate__": getstate}
+    # We don't allow serialization of parametrized modules but should still allow deepcopying.
+    # Default 'deepcopy' function invokes __deepcopy__ method instead of __getstate__ when it exists.
+    if not hasattr(cls, "__deepcopy__"):
+        dct["__deepcopy__"] = default_deepcopy  # type: ignore[assignment]
+
+    param_cls = type(
+        f"Parametrized{cls.__name__}",
+        (cls,),
+        dct,
+    )
+
+    module.__class__ = param_cls  # re-class this instance only; `cls` itself is left untouched
+
+
+def _inject_property(module: Module, tensor_name: str) -> None:
+    r"""Injects a property into module[tensor_name].
+
+    It assumes that the class in the module has already been modified from its
+    original one using _inject_new_class and that the tensor under :attr:`tensor_name`
+    has already been moved out
+
+    Args:
+        module (nn.Module): module into which to inject the property
+        tensor_name (str): name of the property to create
+    """
+    # We check the precondition.
+    # This should never fire if register_parametrization is correctly implemented
+    assert not hasattr(module, tensor_name)
+
+    @torch.jit.unused
+    def get_cached_parametrization(parametrization) -> Tensor:
+        # Key on the accessed ParametrizationList: deep copies of `module` share this class-level property
+        key = (id(parametrization), tensor_name)
+        tensor = _cache.get(key)
+        if tensor is None:
+            tensor = parametrization()
+            _cache[key] = tensor
+        return tensor
+
+    def get_parametrized(self) -> Tensor:
+        if torch.jit.is_scripting():
+            raise RuntimeError("Parametrization is not working with scripting.")
+        parametrization = self.parametrizations[tensor_name]
+        # pyrefly: ignore [redundant-condition]
+        if _cache_enabled:
+            if torch.jit.is_scripting():
+                # Scripting
+                raise RuntimeError(
+                    "Caching is not implemented for scripting. "
+                    "Either disable caching or avoid scripting."
+                )
+            elif torch._C._get_tracing_state() is not None:
+                # Tracing
+                raise RuntimeError(
+                    "Cannot trace a model while caching parametrizations."
+                )
+            else:
+                return get_cached_parametrization(parametrization)
+        else:
+            # If caching is not active, this function just evaluates the parametrization
+            return parametrization()
+
+    def set_original(self, value: Tensor) -> None:
+        if torch.jit.is_scripting():
+            raise RuntimeError("Parametrization is not working with scripting.")
+        self.parametrizations[tensor_name].right_inverse(value)
+
+    setattr(module.__class__, tensor_name, property(get_parametrized, set_original))  # lives on the class, not the instance
+
+
+def register_parametrization(
+    module: Module,
+    tensor_name: str,
+    parametrization: Module,
+    *,
+    unsafe: bool = False,
+) -> Module:
+    r"""Register a parametrization to a tensor in a module.
+
+    Assume that ``tensor_name="weight"`` for simplicity. When accessing ``module.weight``,
+    the module will return the parametrized version ``parametrization(module.weight)``.
+    If the original tensor requires a gradient, the backward pass will differentiate
+    through :attr:`parametrization`, and the optimizer will update the tensor accordingly.
+
+    The first time that a module registers a parametrization, this function will add an attribute
+    ``parametrizations`` to the module of type :class:`~ParametrizationList`.
+
+    The list of parametrizations on the tensor ``weight`` will be accessible under
+    ``module.parametrizations.weight``.
+
+    The original tensor will be accessible under
+    ``module.parametrizations.weight.original``.
+
+    Parametrizations may be concatenated by registering several parametrizations
+    on the same attribute.
+
+    The training mode of a registered parametrization is updated on registration
+    to match the training mode of the host module
+
+    Parametrized parameters and buffers have an inbuilt caching system that can be activated
+    using the context manager :func:`cached`.
+
+    A :attr:`parametrization` may optionally implement a method with signature
+
+    .. code-block:: python
+
+        def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]]
+
+    This method is called on the unparametrized tensor when the first parametrization
+    is registered to compute the initial value of the original tensor.
+    If this method is not implemented, the original tensor will be just the unparametrized tensor.
+
+    If all the parametrizations registered on a tensor implement `right_inverse` it is possible
+    to initialize a parametrized tensor by assigning to it, as shown in the example below.
+
+    It is possible for the first parametrization to depend on several inputs.
+    This may be implemented returning a tuple of tensors from ``right_inverse``
+    (see the example implementation of a ``RankOne`` parametrization below).
+
+    In this case, the unconstrained tensors are also located under ``module.parametrizations.weight``
+    with names ``original0``, ``original1``,...
+
+    .. note::
+
+        If unsafe=False (default) both the forward and right_inverse methods will be called
+        once to perform a number of consistency checks.
+        If unsafe=True, then right_inverse will be called if the tensor is not parametrized,
+        and nothing will be called otherwise.
+
+    .. note::
+
+        In most situations, ``right_inverse`` will be a function such that
+        ``forward(right_inverse(X)) == X`` (see
+        `right inverse <https://en.wikipedia.org/wiki/Inverse_function#Right_inverses>`_).
+        Sometimes, when the parametrization is not surjective, it may be reasonable
+        to relax this.
+
+    .. warning::
+
+        If a parametrization depends on several inputs, :func:`~register_parametrization`
+        will register a number of new parameters. If such parametrization is registered
+        after the optimizer is created, these new parameters will need to be added manually
+        to the optimizer. See :meth:`torch.Optimizer.add_param_group`.
+
+    Args:
+        module (nn.Module): module on which to register the parametrization
+        tensor_name (str): name of the parameter or buffer on which to register
+            the parametrization
+        parametrization (nn.Module): the parametrization to register
+    Keyword args:
+        unsafe (bool): a boolean flag that denotes whether the parametrization
+            may change the dtype and shape of the tensor. Default: `False`
+            Warning: when enabled, the parametrization is not checked for consistency
+            upon registration. Enable this flag at your own risk.
+
+    Raises:
+        ValueError: if the module does not have a parameter or a buffer named :attr:`tensor_name`
+
+    Examples:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> import torch.nn.utils.parametrize as P
+        >>>
+        >>> class Symmetric(nn.Module):
+        >>>     def forward(self, X):
+        >>>         return X.triu() + X.triu(1).T  # Return a symmetric matrix
+        >>>
+        >>>     def right_inverse(self, A):
+        >>>         return A.triu()
+        >>>
+        >>> m = nn.Linear(5, 5)
+        >>> P.register_parametrization(m, "weight", Symmetric())
+        >>> print(torch.allclose(m.weight, m.weight.T))  # m.weight is now symmetric
+        True
+        >>> A = torch.rand(5, 5)
+        >>> A = A + A.T  # A is now symmetric
+        >>> m.weight = A  # Initialize the weight to be the symmetric matrix A
+        >>> print(torch.allclose(m.weight, A))
+        True
+
+        >>> class RankOne(nn.Module):
+        >>>     def forward(self, x, y):
+        >>>         # Form a rank 1 matrix multiplying two vectors
+        >>>         return x.unsqueeze(-1) @ y.unsqueeze(-2)
+        >>>
+        >>>     def right_inverse(self, Z):
+        >>>         # Project Z onto the rank 1 matrices
+        >>>         U, S, Vh = torch.linalg.svd(Z, full_matrices=False)
+        >>>         # Return rescaled singular vectors
+        >>>         s0_sqrt = S[0].sqrt().unsqueeze(-1)
+        >>>         return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt
+        >>>
+        >>> linear_rank_one = P.register_parametrization(
+        ...     nn.Linear(4, 4), "weight", RankOne()
+        ... )
+        >>> print(torch.linalg.matrix_rank(linear_rank_one.weight).item())
+        1
+
+    """
+    parametrization.train(module.training)
+    if is_parametrized(module, tensor_name):
+        # Correctness checks.
+        # If A is the space of tensors with shape and dtype equal to module.weight
+        # we check that parametrization.forward and parametrization.right_inverse are
+        # functions from A to A
+        if not unsafe:
+            Y = getattr(module, tensor_name)
+            X = parametrization(Y)
+            if not isinstance(X, Tensor):
+                raise ValueError(
+                    f"A parametrization must return a tensor. Got {type(X).__name__}."
+                )
+            if X.dtype != Y.dtype:
+                raise ValueError(
+                    "Registering a parametrization may not change the dtype of the tensor, unless the `unsafe` flag is enabled.\n"
+                    f"module.{tensor_name}.dtype: {Y.dtype}\n"
+                    f"parametrization(module.{tensor_name}).dtype: {X.dtype}"
+                )
+            if X.shape != Y.shape:
+                raise ValueError(
+                    "Registering a parametrization may not change the shape of the tensor, unless the `unsafe` flag is enabled.\n"
+                    f"module.{tensor_name}.shape: {Y.shape}\n"
+                    f"parametrization(module.{tensor_name}).shape: {X.shape}"
+                )
+            if hasattr(parametrization, "right_inverse"):
+                try:
+                    Z = parametrization.right_inverse(X)  # type: ignore[operator]
+                except NotImplementedError:
+                    pass
+                else:
+                    if not isinstance(Z, Tensor):
+                        raise ValueError(
+                            f"parametrization.right_inverse must return a tensor. Got: {type(Z).__name__}"
+                        )
+                    if Z.dtype != Y.dtype:
+                        raise ValueError(
+                            "The tensor returned by parametrization.right_inverse must have the same dtype "
+                            f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
+                            f"module.{tensor_name}.dtype: {Y.dtype}\n"
+                            f"returned dtype: {Z.dtype}"
+                        )
+                    if Z.shape != Y.shape:
+                        raise ValueError(
+                            "The tensor returned by parametrization.right_inverse must have the same shape "
+                            f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
+                            f"module.{tensor_name}.shape: {Y.shape}\n"
+                            f"returned shape: {Z.shape}"
+                        )
+            # else right_inverse is assumed to be the identity
+
+        # add the new parametrization to the parametrization list
+        assert isinstance(module.parametrizations, ModuleDict)  # Make mypy happy
+        module.parametrizations[tensor_name].append(parametrization)  # type: ignore[operator]
+        # If unsafe was True in previous parametrization, keep it enabled
+        module.parametrizations[tensor_name].unsafe |= unsafe  # type: ignore[index, union-attr, operator]
+    elif tensor_name in module._buffers or tensor_name in module._parameters:
+        # Set the parametrization mechanism
+        # Fetch the original buffer or parameter
+        original = getattr(module, tensor_name)
+        # We create this early (before the module is modified) to check for possible errors
+        parametrizations = ParametrizationList(
+            [parametrization], original, unsafe=unsafe
+        )
+        # Delete the previous parameter or buffer
+        delattr(module, tensor_name)
+        # If this is the first parametrization registered on the module,
+        # we prepare the module to inject the property
+        if not is_parametrized(module):
+            # Change the class
+            _inject_new_class(module)
+            # Inject a ``ModuleDict`` into the instance under module.parametrizations
+            module.parametrizations = ModuleDict()
+        # Add a property into the class
+        _inject_property(module, tensor_name)
+        # Add a ParametrizationList
+        assert isinstance(module.parametrizations, ModuleDict)  # Make mypy happy
+        module.parametrizations[tensor_name] = parametrizations
+    else:
+        raise ValueError(
+            f"Module '{module}' does not have a parameter, a buffer, or a "
+            f"parametrized element with name '{tensor_name}'"
+        )
+    return module
+
+
+def is_parametrized(module: Module, tensor_name: str | None = None) -> bool:
+    r"""Check whether a module, or one specific tensor in it, is parametrized.
+
+    Args:
+        module (nn.Module): module to query
+        tensor_name (str, optional): name of the parameter or buffer to look up.
+            Default: ``None``
+    Returns:
+        ``True`` if :attr:`module` has a parametrization registered for :attr:`tensor_name`,
+        or, when :attr:`tensor_name` is ``None``, if it has at least one parametrization;
+        ``False`` otherwise
+    """
+    registry = getattr(module, "parametrizations", None)
+    # A missing attribute (None) also fails the isinstance test below
+    if not isinstance(registry, ModuleDict):
+        return False
+    # With no name given, any registered parametrization counts
+    if tensor_name is None:
+        return len(registry) > 0
+    return tensor_name in registry
+
+
+def remove_parametrizations(
+    module: Module,
+    tensor_name: str,
+    leave_parametrized: bool = True,
+) -> Module:
+    r"""Remove the parametrizations on a tensor in a module.
+
+    - If ``leave_parametrized=True``, ``module[tensor_name]`` will be set to
+      its current output. In this case, the parametrization shall not change the ``dtype``
+      of the tensor.
+    - If ``leave_parametrized=False``, ``module[tensor_name]`` will be set to
+      the unparametrised tensor in ``module.parametrizations[tensor_name].original``.
+      This is only possible when the parametrization depends on just one tensor.
+
+    Args:
+        module (nn.Module): module from which remove the parametrization
+        tensor_name (str): name of the parametrization to be removed
+        leave_parametrized (bool, optional): leave the attribute :attr:`tensor_name` parametrized.
+            Default: ``True``
+
+    Returns:
+        Module: module
+
+    Raises:
+        ValueError: if ``module[tensor_name]`` is not parametrized
+        ValueError: if ``leave_parametrized=False`` and the parametrization depends on several tensors
+    """
+    if not is_parametrized(module, tensor_name):
+        raise ValueError(
+            f"Module {module} does not have a parametrization on {tensor_name}"
+        )
+
+    # Fetch the original tensor
+    assert isinstance(module.parametrizations, ModuleDict)  # Make mypy happy
+    parametrizations = module.parametrizations[tensor_name]
+    # pyrefly: ignore [invalid-argument]
+    if parametrizations.is_tensor:
+        original = parametrizations.original
+        assert isinstance(original, torch.Tensor), "is_tensor promised us a Tensor"
+        if leave_parametrized:
+            with torch.no_grad():
+                t = getattr(module, tensor_name)
+            # We know they have the same dtype because we have checked this when registering the
+            # parametrizations. As such, we can use set_
+            # We do this so that the parameter does not to change the id()
+            # This way the user does not need to update the optimizer
+            with torch.no_grad():
+                if type(original) is torch.Tensor:
+                    _maybe_set(original, t)
+                else:
+                    try:
+                        _maybe_set(original, t)
+                    except RuntimeError as e:
+                        # TODO: Fix this for tensor subclasses that are parameters:
+                        # RuntimeError: set_storage is not allowed on a Tensor created from .data or .detach().
+                        raise RuntimeError(
+                            "Calling remove_parametrizations() with leave_parametrized=True "
+                            "for a parameter that is an instance of a tensor subclass requires "
+                            "set_() to be implemented correctly for the tensor subclass."
+                            "Alternatively, one can opt into the swap_tensors path"
+                            "Either set leave_parametrized=False or provide a working implementation"
+                            "for set_() in the tensor subclass or set "
+                            "torch.__future__.set_swap_module_params_on_conversion(True)."
+                        ) from e
+    else:
+        if leave_parametrized:
+            # We cannot use no_grad because we need to know whether one or more
+            # original tensors required grad
+            t = getattr(module, tensor_name)
+            # We'll have to trust the user to add it to the optimizer
+            original = Parameter(t) if t.requires_grad else t
+        else:
+            raise ValueError(
+                "Cannot leave unparametrized (`leave_parametrized=False`) a tensor "
+                "that is parametrized in terms of a sequence of tensors."
+            )
+
+    # Delete the property that manages the parametrization
+    delattr(module.__class__, tensor_name)
+    # Delete the ParametrizationList
+    del module.parametrizations[tensor_name]
+
+    # Restore the parameter / buffer into the main class
+    _register_parameter_or_buffer(module, tensor_name, original)
+
+    # Roll back the parametrized class if no other buffer or parameter
+    # is currently parametrized in this class
+    if not is_parametrized(module):
+        delattr(module, "parametrizations")
+        # Restore class
+        orig_cls = module.__class__.__bases__[0]
+        module.__class__ = orig_cls
+    return module
+
+
+def type_before_parametrizations(module: Module) -> type:
+    r"""Return the type ``module`` had before parametrizations were applied, or ``type(module)`` if it has none.
+
+    Args:
+        module (nn.Module): module whose pre-parametrization type is wanted
+    """
+    if is_parametrized(module):
+        return module.__class__.__bases__[0]
+    else:
+        return type(module)
+
+
+def transfer_parametrizations_and_params(
+    from_module: Module,
+    to_module: Module,
+    tensor_name: str | None = None,
+) -> Module:
+    r"""Transfer parametrizations and the parameters they parametrize from :attr:`from_module` to :attr:`to_module`.
+
+    If :attr:`tensor_name` is specified, only transfers the specified parameter, otherwise
+    transfers all parametrized parameters. If those parameters do not exist in to_module, it will create them.
+    Does nothing if from_module is not parametrized.
+
+    Args:
+        from_module (nn.Module): module to transfer from
+        to_module (nn.Module): module to transfer to
+        tensor_name (str, optional): name of the only parameter to transfer. Default: ``None`` (transfer all)
+
+    Returns:
+        Module: to_module
+    """
+    if is_parametrized(from_module):
+        assert isinstance(from_module.parametrizations, ModuleDict)  # for mypy
+
+        # names to transfer: every parametrized tensor, or only the requested one
+        parameters_to_transfer: list | ModuleDict = (
+            from_module.parametrizations if tensor_name is None else [tensor_name]
+        )
+
+        assert hasattr(parameters_to_transfer, "__iter__")  # for mypy
+        for parameter_name in parameters_to_transfer:
+            # create the to-be-transferred param in to_module if it is missing
+            if not hasattr(to_module, parameter_name):
+                setattr(
+                    to_module,
+                    parameter_name,
+                    Parameter(getattr(from_module, parameter_name)),
+                )
+
+            # re-register each of the param's parametrizations on to_module
+            for param_func in from_module.parametrizations[  # type: ignore[attr-defined]
+                parameter_name
+            ]:
+                register_parametrization(to_module, parameter_name, param_func)
+            assert isinstance(to_module.parametrizations, ModuleDict)  # for mypy
+
+            # make values match: the originals live either in `original` or in
+            # `original0`, `original1`, ...; they are assigned as-is, not copied
+            if hasattr(from_module.parametrizations[parameter_name], "original"):
+                to_module.parametrizations[
+                    parameter_name
+                ].original = from_module.parametrizations[parameter_name].original
+            else:
+                num = 0
+                orig_num = "original" + str(num)
+                # walk original0, original1, ... until the first missing index
+                while hasattr(from_module.parametrizations[parameter_name], orig_num):
+                    setattr(
+                        to_module.parametrizations[parameter_name],
+                        orig_num,
+                        getattr(from_module.parametrizations[parameter_name], orig_num),
+                    )
+                    num = num + 1
+                    orig_num = "original" + str(num)
+
+    return to_module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py
new file mode 100644
index 0000000000000000000000000000000000000000..827bf19ed4bea00723e38d2ca60dcf14cc3abbc2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/prune.py
@@ -0,0 +1,1385 @@
+# mypy: allow-untyped-defs
+r"""Pruning methods."""
+
+import numbers
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+
+import torch
+
+
+class BasePruningMethod(ABC):
+    r"""Abstract base class for creation of new pruning techniques.
+
+    Provides a skeleton for customization requiring the overriding of methods
+    such as :meth:`compute_mask` and :meth:`apply`.
+    """
+
+    _tensor_name: str
+
+    def __call__(self, module, inputs):
+        r"""Multiply the mask into original tensor and store the result.
+
+        Multiplies the mask (stored in ``module[name + '_mask']``)
+        into the original tensor (stored in ``module[name + '_orig']``)
+        and stores the result into ``module[name]`` by using :meth:`apply_mask`.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            inputs: not used.
+        """
+        setattr(module, self._tensor_name, self.apply_mask(module))
+
+    @abstractmethod
+    def compute_mask(self, t, default_mask):
+        r"""Compute and return a mask for the input tensor ``t``.
+
+        Starting from a base ``default_mask`` (which should be a mask of ones
+        if the tensor has not been pruned yet), generate a new mask to
+        apply on top of the ``default_mask`` according to the specific pruning
+        method recipe.
+
+        Args:
+            t (torch.Tensor): tensor representing the importance scores of the
+                parameter to prune.
+            default_mask (torch.Tensor): Base mask from previous pruning
+                iterations, that need to be respected after the new mask is
+                applied. Same dims as ``t``.
+
+        Returns:
+            mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t``
+        """
+
+    def apply_mask(self, module):
+        r"""Simply handles the multiplication between the parameter being pruned and the generated mask.
+
+        Fetches the mask and the original tensor from the module
+        and returns the pruned version of the tensor.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+
+        Returns:
+            pruned_tensor (torch.Tensor): pruned version of the input tensor
+        """
+        # to carry out the multiplication, the mask needs to have been computed,
+        # so the pruning method must know what tensor it's operating on
+        assert self._tensor_name is not None, (
+            f"Module {module} has to be pruned"
+        )  # this gets set in apply()
+        mask = getattr(module, self._tensor_name + "_mask")
+        orig = getattr(module, self._tensor_name + "_orig")
+        pruned_tensor = mask.to(dtype=orig.dtype) * orig
+        return pruned_tensor
+
+    @classmethod
+    def apply(cls, module, name, *args, importance_scores=None, **kwargs):
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+            args: arguments passed on to a subclass of
+                :class:`BasePruningMethod`
+            importance_scores (torch.Tensor): tensor of importance scores (of
+                same shape as module parameter) used to compute mask for pruning.
+                The values in this tensor indicate the importance of the
+                corresponding elements in the parameter being pruned.
+                If unspecified or None, the parameter will be used in its place.
+            kwargs: keyword arguments passed on to a subclass of a
+                :class:`BasePruningMethod`
+        """
+
+        def _get_composite_method(cls, module, name, *args, **kwargs):
+            # Check if a pruning method has already been applied to
+            # `module[name]`. If so, store that in `old_method`.
+            old_method = None
+            found = 0
+            # there should technically be only 1 hook with hook.name == name
+            # assert this using `found`
+            hooks_to_remove = []
+            for k, hook in module._forward_pre_hooks.items():
+                # if it exists, take existing thing, remove hook, then
+                # go through normal thing
+                if isinstance(hook, BasePruningMethod) and hook._tensor_name == name:
+                    old_method = hook
+                    hooks_to_remove.append(k)
+                    found += 1
+            assert found <= 1, (
+                "Avoid adding multiple pruning hooks to the "
+                f"same tensor {name} of module {module}. Use a PruningContainer."
+            )
+
+            for k in hooks_to_remove:
+                del module._forward_pre_hooks[k]
+
+            # Apply the new pruning method, either from scratch or on top of
+            # the previous one.
+            method = cls(*args, **kwargs)  # new pruning
+            # Have the pruning method remember what tensor it's been applied to
+            method._tensor_name = name
+
+            # combine `methods` with `old_method`, if `old_method` exists
+            if old_method is not None:  # meaning that there was a hook
+                # if the hook is already a pruning container, just add the
+                # new pruning method to the container
+                if isinstance(old_method, PruningContainer):
+                    old_method.add_pruning_method(method)
+                    method = old_method  # rename old_method --> method
+
+                # if the hook is simply a single pruning method, create a
+                # container, add the old pruning method and the new one
+                elif isinstance(old_method, BasePruningMethod):
+                    container = PruningContainer(old_method)
+                    # Have the pruning method remember the name of its tensor
+                    # setattr(container, '_tensor_name', name)
+                    container.add_pruning_method(method)
+                    method = container  # rename container --> method
+            return method
+
+        method = _get_composite_method(cls, module, name, *args, **kwargs)
+        # at this point we have no forward_pre_hooks but we could have an
+        # active reparameterization of the tensor if another pruning method
+        # had been applied (in which case `method` would be a PruningContainer
+        # and not a simple pruning method).
+
+        # Pruning is to be applied to the module's tensor named `name`,
+        # starting from the state it is found in prior to this iteration of
+        # pruning. The pruning mask is calculated based on importance scores.
+
+        orig = getattr(module, name)
+        if importance_scores is not None:
+            assert importance_scores.shape == orig.shape, (
+                f"importance_scores should have the same shape as parameter {name} of {module}"
+            )
+        else:
+            importance_scores = orig
+
+        # If this is the first time pruning is applied, take care of moving
+        # the original tensor to a new parameter called name + '_orig'
+        # and deleting the original parameter
+        if not isinstance(method, PruningContainer):
+            # copy `module[name]` to `module[name + '_orig']`
+            module.register_parameter(name + "_orig", orig)
+            # temporarily delete `module[name]`
+            del module._parameters[name]
+            default_mask = torch.ones_like(orig)  # temp
+        # If this is not the first time pruning is applied, all of the above
+        # has been done before in a previous pruning iteration, so we're good
+        # to go
+        else:
+            default_mask = (
+                getattr(module, name + "_mask")
+                .detach()
+                .clone(memory_format=torch.contiguous_format)
+            )
+
+        # Use try/except because if anything goes wrong with the mask
+        # computation etc., you'd want to roll back.
+        try:
+            # get the final mask, computed according to the specific method
+            mask = method.compute_mask(importance_scores, default_mask=default_mask)
+            # reparameterize by saving mask to `module[name + '_mask']`...
+            module.register_buffer(name + "_mask", mask)
+            # ... and the new pruned tensor to `module[name]`
+            setattr(module, name, method.apply_mask(module))
+            # associate the pruning method to the module via a hook to
+            # compute the function before every forward() (compile by run)
+            module.register_forward_pre_hook(method)
+
+        except Exception as e:
+            if not isinstance(method, PruningContainer):
+                orig = getattr(module, name + "_orig")
+                module.register_parameter(name, orig)
+                del module._parameters[name + "_orig"]
+            raise e
+
+        return method
+
+    def prune(self, t, default_mask=None, importance_scores=None):
+        r"""Compute and return a pruned version of input tensor ``t``.
+
+        According to the pruning rule specified in :meth:`compute_mask`.
+
+        Args:
+            t (torch.Tensor): tensor to prune (of same dimensions as
+                ``default_mask``).
+            importance_scores (torch.Tensor): tensor of importance scores (of
+                same shape as ``t``) used to compute mask for pruning ``t``.
+                The values in this tensor indicate the importance of the
+                corresponding elements in the ``t`` that is being pruned.
+                If unspecified or None, the tensor ``t`` will be used in its place.
+            default_mask (torch.Tensor, optional): mask from previous pruning
+                iteration, if any. To be considered when determining what
+                portion of the tensor that pruning should act on. If None,
+                default to a mask of ones.
+
+        Returns:
+            pruned version of tensor ``t``.
+        """
+        if importance_scores is not None:
+            assert importance_scores.shape == t.shape, (
+                "importance_scores should have the same shape as tensor t"
+            )
+        else:
+            importance_scores = t
+        default_mask = default_mask if default_mask is not None else torch.ones_like(t)
+        return t * self.compute_mask(importance_scores, default_mask=default_mask)
+
+    def remove(self, module) -> None:
+        r"""Remove the pruning reparameterization from a module.
+
+        The pruned parameter named ``name`` remains permanently pruned,
+        and the parameter named ``name+'_orig'`` is removed from the parameter list.
+        Similarly, the buffer named ``name+'_mask'`` is removed from the buffers.
+
+        Note:
+            Pruning itself is NOT undone or reversed!
+        """
+        # before removing pruning from a tensor, it has to have been applied
+        assert self._tensor_name is not None, (
+            f"Module {module} has to be pruned before pruning can be removed"
+        )  # this gets set in apply()
+
+        # to update module[name] to latest trained weights
+        weight = self.apply_mask(module)  # masked weights
+
+        # delete and reset
+        if hasattr(module, self._tensor_name):
+            delattr(module, self._tensor_name)
+        orig = module._parameters[self._tensor_name + "_orig"]
+        orig.data = weight.data
+        del module._parameters[self._tensor_name + "_orig"]
+        del module._buffers[self._tensor_name + "_mask"]
+        setattr(module, self._tensor_name, orig)
+
+
+class PruningContainer(BasePruningMethod):
+    """Container holding a sequence of pruning methods for iterative pruning.
+
+    Keeps track of the order in which pruning methods are applied and handles
+    combining successive pruning calls.
+
+    Accepts as argument an instance of a BasePruningMethod or an iterable of
+    them.
+    """
+
+    def __init__(self, *args) -> None:
+        self._pruning_methods: tuple[BasePruningMethod, ...] = ()
+        # ``*args`` is always a tuple, so a lone iterable of methods (as the
+        # class docstring allows) has to be unpacked explicitly
+        if len(args) == 1 and not isinstance(args[0], BasePruningMethod):
+            if isinstance(args[0], Iterable):
+                args = tuple(args[0])
+        if len(args) == 1:  # a single method fixes the tensor name
+            # pyrefly: ignore [index-error]
+            self._tensor_name = args[0]._tensor_name
+        for method in args:  # one method, several, or none at all
+            if method is not None and not hasattr(self, "_tensor_name"):
+                self._tensor_name = method._tensor_name  # adopt the first name
+            self.add_pruning_method(method)
+
+    def add_pruning_method(self, method) -> None:
+        r"""Add a child pruning ``method`` to the container.
+
+        Args:
+            method (subclass of BasePruningMethod): child pruning method
+                to be added to the container.
+        """
+        # check that we're adding a pruning method to the container
+        if not isinstance(method, BasePruningMethod) and method is not None:
+            raise TypeError(f"{type(method)} is not a BasePruningMethod subclass")
+        elif method is not None and self._tensor_name != method._tensor_name:
+            raise ValueError(
+                "Can only add pruning methods acting on "
+                f"the parameter named '{self._tensor_name}' to PruningContainer {self}."
+                + f" Found '{method._tensor_name}'"
+            )
+        # if all checks passed, add to _pruning_methods tuple
+        self._pruning_methods += (method,)  # type: ignore[operator]
+
+    def __len__(self) -> int:
+        return len(self._pruning_methods)
+
+    def __iter__(self):
+        return iter(self._pruning_methods)
+
+    def __getitem__(self, idx):
+        return self._pruning_methods[idx]
+
+    def compute_mask(self, t, default_mask):
+        r"""Apply the latest ``method`` by computing the new partial masks and returning its combination with the ``default_mask``.
+
+        The new partial mask should be computed on the entries or channels
+        that were not zeroed out by the ``default_mask``.
+        Which portions of the tensor ``t`` the new mask will be calculated from
+        depends on the ``PRUNING_TYPE`` (handled by the type handler):
+
+        * for 'unstructured', the mask will be computed from the raveled
+          list of nonmasked entries;
+
+        * for 'structured', the mask will be computed from the nonmasked
+          channels in the tensor;
+
+        * for 'global', the mask will be computed across all entries.
+
+        Args:
+            t (torch.Tensor): tensor representing the parameter to prune
+                (of same dimensions as ``default_mask``).
+            default_mask (torch.Tensor): mask from previous pruning iteration.
+
+        Returns:
+            mask (torch.Tensor): new mask that combines the effects
+            of the ``default_mask`` and the new mask from the current
+            pruning ``method`` (of same dimensions as ``default_mask`` and
+            ``t``).
+        """
+
+        def _combine_masks(method, t, mask):
+            r"""Combine the masks from all pruning methods and return a new mask.
+
+            Args:
+                method (a BasePruningMethod subclass): pruning method
+                    currently being applied.
+                t (torch.Tensor): tensor representing the parameter to prune
+                    (of same dimensions as mask).
+                mask (torch.Tensor): mask from previous pruning iteration
+
+            Returns:
+                new_mask (torch.Tensor): new mask that combines the effects
+                    of the old mask and the new mask from the current
+                    pruning method (of same dimensions as mask and t).
+            """
+            new_mask = mask  # start off from existing mask
+            new_mask = new_mask.to(dtype=t.dtype)
+
+            # compute a slice of t onto which the new pruning method will operate
+            if method.PRUNING_TYPE == "unstructured":
+                # prune entries of t where the mask is 1
+                slc = mask == 1
+
+            # for struct pruning, exclude channels that have already been
+            # entirely pruned
+            elif method.PRUNING_TYPE == "structured":
+                if not hasattr(method, "dim"):
+                    raise AttributeError(
+                        "Pruning methods of PRUNING_TYPE "
+                        '"structured" need to have the attribute `dim` defined.'
+                    )
+
+                # find the channels to keep by removing the ones that have been
+                # zeroed out already (i.e. where sum(entries) == 0)
+                n_dims = t.dim()  # "is this a 2D tensor? 3D? ..."
+                dim = method.dim
+                # convert negative indexing
+                if dim < 0:
+                    dim = n_dims + dim
+                # if dim is still negative after adding n_dims to it
+                if dim < 0:
+                    raise IndexError(
+                        f"Index is out of bounds for tensor with dimensions {n_dims}"
+                    )
+                # find channels along dim = dim that aren't already fully zeroed out
+                keep_channel = mask.sum(dim=[d for d in range(n_dims) if d != dim]) != 0
+                # create slice to identify what to prune
+                slc = [slice(None)] * n_dims
+                slc[dim] = keep_channel
+
+            elif method.PRUNING_TYPE == "global":
+                n_dims = len(t.shape)  # "is this a 2D tensor? 3D? ..."
+                slc = [slice(None)] * n_dims
+
+            else:
+                raise ValueError(f"Unrecognized PRUNING_TYPE {method.PRUNING_TYPE}")
+
+            # compute the new mask on the unpruned slice of the tensor t
+            if isinstance(slc, list):
+                slc = tuple(slc)
+            partial_mask = method.compute_mask(t[slc], default_mask=mask[slc])
+            new_mask[slc] = partial_mask.to(dtype=new_mask.dtype)
+
+            return new_mask
+
+        method = self._pruning_methods[-1]
+        mask = _combine_masks(method, t, default_mask)
+        return mask
+
+
+class Identity(BasePruningMethod):
+    r"""No-op pruning method: sets up the pruning reparametrization but leaves every unit unpruned (mask of ones)."""
+
+    PRUNING_TYPE = "unstructured"
+
+    def compute_mask(self, t, default_mask):
+        # nothing is pruned: hand the incoming mask back untouched
+        return default_mask
+
+    @classmethod
+    def apply(cls, module, name):  # type: ignore[override]
+        r"""Attach the identity pruning reparametrization to ``module[name]``.
+
+        Delegates to :meth:`BasePruningMethod.apply`, which registers the
+        forward pre-hook and rewrites the tensor in terms of its original
+        value and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+        """
+        return super().apply(module, name)
+
+
+class RandomUnstructured(BasePruningMethod):
+    r"""Prune (currently unpruned) units in a tensor at random.
+
+    The constructor only takes ``amount``; the tensor is named in :meth:`apply`.
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+    """
+
+    PRUNING_TYPE = "unstructured"
+
+    def __init__(self, amount) -> None:
+        # Check range of validity of pruning amount
+        _validate_pruning_amount_init(amount)
+        self.amount = amount
+
+    def compute_mask(self, t, default_mask):
+        # Number of entries in t; the amount to prune is validated against
+        # it below
+        tensor_size = t.nelement()
+        # Compute number of units to prune: amount if int,
+        # else amount * tensor_size
+        nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size)
+        # This should raise an error if the number of units to prune is larger
+        # than the number of units in the tensor
+        _validate_pruning_amount(nparams_toprune, tensor_size)
+
+        mask = default_mask.clone(memory_format=torch.contiguous_format)
+
+        if nparams_toprune != 0:  # skip the selection when nothing is pruned
+            prob = torch.rand_like(t)
+            topk = torch.topk(prob.view(-1), k=nparams_toprune)
+            mask.view(-1)[topk.indices] = 0
+
+        return mask
+
+    @classmethod
+    def apply(cls, module, name, amount):  # type: ignore[override]
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+            amount (int or float): quantity of parameters to prune.
+                If ``float``, should be between 0.0 and 1.0 and represent the
+                fraction of parameters to prune. If ``int``, it represents the
+                absolute number of parameters to prune.
+        """
+        return super().apply(module, name, amount=amount)
+
+
+class L1Unstructured(BasePruningMethod):
+    r"""Prune (currently unpruned) units in a tensor by zeroing out the ones with the lowest L1-norm.
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+    """
+
+    PRUNING_TYPE = "unstructured"
+
+    def __init__(self, amount) -> None:
+        # Check range of validity of pruning amount
+        _validate_pruning_amount_init(amount)
+        self.amount = amount
+
+    def compute_mask(self, t, default_mask):
+        # Number of entries in t; the amount to prune is validated against
+        # it below
+        tensor_size = t.nelement()
+        # Compute number of units to prune: amount if int,
+        # else amount * tensor_size
+        nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size)
+        # This should raise an error if the number of units to prune is larger
+        # than the number of units in the tensor
+        _validate_pruning_amount(nparams_toprune, tensor_size)
+
+        mask = default_mask.clone(memory_format=torch.contiguous_format)
+
+        if nparams_toprune != 0:  # skip the selection when nothing is pruned
+            # largest=True --> top k; largest=False --> bottom k
+            # Prune the k entries with the smallest absolute value
+            topk = torch.topk(torch.abs(t).view(-1), k=nparams_toprune, largest=False)
+            # topk will have .indices and .values
+            mask.view(-1)[topk.indices] = 0
+
+        return mask
+
+    @classmethod
+    def apply(cls, module, name, amount, importance_scores=None):  # type: ignore[override]
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+            amount (int or float): quantity of parameters to prune.
+                If ``float``, should be between 0.0 and 1.0 and represent the
+                fraction of parameters to prune. If ``int``, it represents the
+                absolute number of parameters to prune.
+            importance_scores (torch.Tensor): tensor of importance scores (of same
+                shape as module parameter) used to compute mask for pruning.
+                The values in this tensor indicate the importance of the corresponding
+                elements in the parameter being pruned.
+                If unspecified or None, the module parameter will be used in its place.
+        """
+        return super().apply(
+            module, name, amount=amount, importance_scores=importance_scores
+        )
+
+
+class RandomStructured(BasePruningMethod):
+    r"""Prune entire (currently unpruned) channels in a tensor at random.
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+        dim (int, optional): index of the dim along which we define
+            channels to prune. Default: -1.
+    """
+
+    PRUNING_TYPE = "structured"
+
+    def __init__(self, amount, dim=-1) -> None:
+        # Check range of validity of amount
+        _validate_pruning_amount_init(amount)
+        self.amount = amount
+        self.dim = dim
+
+    def compute_mask(self, t, default_mask):
+        r"""Compute and returns a mask for the input tensor ``t``.
+
+        Starting from a base ``default_mask`` (which should be a mask of ones
+        if the tensor has not been pruned yet), generate a random mask to
+        apply on top of the ``default_mask`` by randomly zeroing out channels
+        along the specified dim of the tensor.
+
+        Args:
+            t (torch.Tensor): tensor representing the parameter to prune
+            default_mask (torch.Tensor): Base mask from previous pruning
+                iterations, which needs to be respected after the new mask is
+                applied. Same dims as ``t``.
+
+        Returns:
+            mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t``
+
+        Raises:
+            IndexError: if ``self.dim >= len(t.shape)``
+        """
+        # Check that tensor has structure (i.e. more than 1 dimension) such
+        # that the concept of "channels" makes sense
+        _validate_structured_pruning(t)
+
+        # Check that self.dim is a valid dim to index t, else raise IndexError
+        _validate_pruning_dim(t, self.dim)
+
+        # Check that the amount of channels to prune is not greater than the number of
+        # channels in t along the dim to prune
+        tensor_size = t.shape[self.dim]
+        # Compute number of units to prune: amount if int,
+        # else amount * tensor_size
+        nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size)
+        # This should raise an error if the number of units to prune is larger
+        # than the number of units in the tensor
+        _validate_pruning_amount(nparams_toprune, tensor_size)
+
+        # Compute binary mask by initializing it to all 0s and then filling in
+        # 1s for the randomly selected channels to keep, along self.dim.
+        # mask has the same shape as tensor t
+        def make_mask(t, dim, nchannels, nchannels_toprune):
+            # generate a random number in [0, 1) to associate to each channel
+            prob = torch.rand(nchannels)
+            # generate mask for each channel by 0ing out the channels that
+            # got assigned the k = nchannels_toprune lowest values in prob
+            threshold = torch.kthvalue(prob, k=nchannels_toprune).values
+            channel_mask = prob > threshold
+
+            mask = torch.zeros_like(t)
+            slc = [slice(None)] * len(t.shape)
+            slc[dim] = channel_mask
+            slc = tuple(slc)
+            mask[slc] = 1
+            return mask
+
+        if nparams_toprune == 0:  # k=0 not supported by torch.kthvalue
+            mask = default_mask
+        else:
+            # apply the new structured mask on top of prior (potentially
+            # unstructured) mask
+            mask = make_mask(t, self.dim, tensor_size, nparams_toprune)
+            mask *= default_mask.to(dtype=mask.dtype)
+        return mask
+
+    @classmethod
+    def apply(cls, module, name, amount, dim=-1):  # type: ignore[override]
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+            amount (int or float): quantity of parameters to prune.
+                If ``float``, should be between 0.0 and 1.0 and represent the
+                fraction of parameters to prune. If ``int``, it represents the
+                absolute number of parameters to prune.
+            dim (int, optional): index of the dim along which we define
+                channels to prune. Default: -1.
+        """
+        return super().apply(module, name, amount=amount, dim=dim)
+
+
+class LnStructured(BasePruningMethod):
+    r"""Prune entire (currently unpruned) channels in a tensor based on their L\ ``n``-norm.
+
+    Args:
+        amount (int or float): quantity of channels to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+        n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid
+            entries for argument ``p`` in :func:`torch.norm`.
+        dim (int, optional): index of the dim along which we define
+            channels to prune. Default: -1.
+    """
+
+    PRUNING_TYPE = "structured"
+
+    def __init__(self, amount, n, dim=-1) -> None:
+        # Check range of validity of amount
+        _validate_pruning_amount_init(amount)
+        self.amount = amount
+        self.n = n
+        self.dim = dim
+
+    def compute_mask(self, t, default_mask):
+        r"""Compute and return a mask for the input tensor ``t``.
+
+        Starting from a base ``default_mask`` (which should be a mask of ones
+        if the tensor has not been pruned yet), generate a mask to apply on
+        top of the ``default_mask`` by zeroing out the channels along the
+        specified dim with the lowest L\ ``n``-norm.
+
+        Args:
+            t (torch.Tensor): tensor representing the parameter to prune
+            default_mask (torch.Tensor): Base mask from previous pruning
+                iterations, which needs to be respected after the new mask is
+                applied.  Same dims as ``t``.
+
+        Returns:
+            mask (torch.Tensor): mask to apply to ``t``, of same dims as ``t``
+
+        Raises:
+            IndexError: if ``self.dim >= len(t.shape)``
+        """
+        # Check that tensor has structure (i.e. more than 1 dimension) such
+        # that the concept of "channels" makes sense
+        _validate_structured_pruning(t)
+        # Check that self.dim is a valid dim to index t, else raise IndexError
+        _validate_pruning_dim(t, self.dim)
+
+        # Check that the amount of channels to prune is not greater than the number of
+        # channels in t along the dim to prune
+        tensor_size = t.shape[self.dim]
+        # Compute number of units to prune: amount if int,
+        # else amount * tensor_size
+        nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size)
+        nparams_tokeep = tensor_size - nparams_toprune
+        # This should raise an error if the number of units to prune is larger
+        # than the number of units in the tensor
+        _validate_pruning_amount(nparams_toprune, tensor_size)
+
+        # Structured pruning prunes entire channels so we need to know the
+        # L_n norm along each channel to then find the topk based on this
+        # metric
+        norm = _compute_norm(t, self.n, self.dim)
+        # largest=True --> top k; largest=False --> bottom k
+        # Keep the largest k channels along dim=self.dim
+        topk = torch.topk(norm, k=nparams_tokeep, largest=True)
+        # topk will have .indices and .values
+
+        # Compute binary mask by initializing it to all 0s and then filling in
+        # 1s wherever topk.indices indicates, along self.dim.
+        # mask has the same shape as tensor t
+        def make_mask(t, dim, indices):
+            # init mask to 0
+            mask = torch.zeros_like(t)
+            # e.g.: slc = [:, :, :], if len(t.shape) = 3
+            slc = [slice(None)] * len(t.shape)
+            # replace the full slice at position=dim with indices
+            # e.g.: slc = [:, :, [0, 2, 3]] if dim=2 & indices=[0,2,3]
+            slc[dim] = indices
+            slc = tuple(slc)
+            # use slc to slice mask and replace all its entries with 1s
+            # e.g.: mask[:, :, [0, 2, 3]] = 1
+            mask[slc] = 1
+            return mask
+
+        if nparams_toprune == 0:  # nothing to prune: reuse default_mask unchanged
+            mask = default_mask
+        else:
+            mask = make_mask(t, self.dim, topk.indices)
+            mask *= default_mask.to(dtype=mask.dtype)
+
+        return mask
+
+    @classmethod
+    def apply(cls, module, name, amount, n, dim, importance_scores=None):  # type: ignore[override]
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+            amount (int or float): quantity of parameters to prune.
+                If ``float``, should be between 0.0 and 1.0 and represent the
+                fraction of parameters to prune. If ``int``, it represents the
+                absolute number of parameters to prune.
+            n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid
+                entries for argument ``p`` in :func:`torch.norm`.
+            dim (int): index of the dim along which we define channels to
+                prune.
+            importance_scores (torch.Tensor): tensor of importance scores (of same
+                shape as module parameter) used to compute mask for pruning.
+                The values in this tensor indicate the importance of the corresponding
+                elements in the parameter being pruned.
+                If unspecified or None, the module parameter will be used in its place.
+        """
+        return super().apply(
+            module,
+            name,
+            amount=amount,
+            n=n,
+            dim=dim,
+            importance_scores=importance_scores,
+        )
+
+
+class CustomFromMask(BasePruningMethod):
+    PRUNING_TYPE = "global"
+
+    def __init__(self, mask) -> None:
+        self.mask = mask
+
+    def compute_mask(self, t, default_mask):
+        assert default_mask.shape == self.mask.shape
+        mask = default_mask * self.mask.to(dtype=default_mask.dtype)
+        return mask
+
+    @classmethod
+    def apply(cls, module, name, mask):  # type: ignore[override]
+        r"""Add pruning on the fly and reparametrization of a tensor.
+
+        Adds the forward pre-hook that enables pruning on the fly and
+        the reparametrization of a tensor in terms of the original tensor
+        and the pruning mask.
+
+        Args:
+            module (nn.Module): module containing the tensor to prune
+            name (str): parameter name within ``module`` on which pruning
+                will act.
+        """
+        return super().apply(module, name, mask=mask)
+
+
+def identity(module, name):
+    r"""Apply pruning reparametrization without pruning any units.
+
+    Applies pruning reparametrization to the tensor corresponding to the
+    parameter called ``name`` in ``module`` without actually pruning any
+    units. Modifies module in place (and also returns the modified module)
+    by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Note:
+        The mask is a tensor of ones.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune.
+        name (str): parameter name within ``module`` on which pruning
+                will act.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> # xdoctest: +SKIP
+        >>> m = prune.identity(nn.Linear(2, 3), "bias")
+        >>> print(m.bias_mask)
+        tensor([1., 1., 1.])
+    """
+    Identity.apply(module, name)
+    return module
+
+
+def random_unstructured(module, name, amount):
+    r"""Prune tensor by removing random (currently unpruned) units.
+
+    Prunes tensor corresponding to parameter called ``name`` in ``module``
+    by removing the specified ``amount`` of (currently unpruned) units
+    selected at random.
+    Modifies module in place (and also returns the modified module) by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+                will act.
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> # xdoctest: +SKIP
+        >>> m = prune.random_unstructured(nn.Linear(2, 3), "weight", amount=1)
+        >>> torch.sum(m.weight_mask == 0)
+        tensor(1)
+
+    """
+    RandomUnstructured.apply(module, name, amount)
+    return module
+
+
+def l1_unstructured(module, name, amount, importance_scores=None):
+    r"""Prune tensor by removing units with the lowest L1-norm.
+
+    Prunes tensor corresponding to parameter called ``name`` in ``module``
+    by removing the specified `amount` of (currently unpruned) units with the
+    lowest L1-norm.
+    Modifies module in place (and also returns the modified module)
+    by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+                will act.
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+        importance_scores (torch.Tensor): tensor of importance scores (of same
+            shape as module parameter) used to compute mask for pruning.
+            The values in this tensor indicate the importance of the corresponding
+            elements in the parameter being pruned.
+            If unspecified or None, the module parameter will be used in its place.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> # xdoctest: +SKIP
+        >>> m = prune.l1_unstructured(nn.Linear(2, 3), "weight", amount=0.2)
+        >>> m.state_dict().keys()
+        odict_keys(['bias', 'weight_orig', 'weight_mask'])
+    """
+    L1Unstructured.apply(
+        module, name, amount=amount, importance_scores=importance_scores
+    )
+    return module
+
+
+def random_structured(module, name, amount, dim):
+    r"""Prune tensor by removing random channels along the specified dimension.
+
+    Prunes tensor corresponding to parameter called ``name`` in ``module``
+    by removing the specified ``amount`` of (currently unpruned) channels
+    along the specified ``dim`` selected at random.
+    Modifies module in place (and also returns the modified module)
+    by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+                will act.
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+        dim (int): index of the dim along which we define channels to prune.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> # xdoctest: +SKIP
+        >>> m = prune.random_structured(nn.Linear(5, 3), "weight", amount=3, dim=1)
+        >>> columns_pruned = int(sum(torch.sum(m.weight, dim=0) == 0))
+        >>> print(columns_pruned)
+        3
+    """
+    RandomStructured.apply(module, name, amount, dim)
+    return module
+
+
+def ln_structured(module, name, amount, n, dim, importance_scores=None):
+    r"""Prune tensor by removing channels with the lowest L\ ``n``-norm along the specified dimension.
+
+    Prunes tensor corresponding to parameter called ``name`` in ``module``
+    by removing the specified ``amount`` of (currently unpruned) channels
+    along the specified ``dim`` with the lowest L\ ``n``-norm.
+    Modifies module in place (and also returns the modified module)
+    by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+                will act.
+        amount (int or float): quantity of parameters to prune.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+        n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid
+            entries for argument ``p`` in :func:`torch.norm`.
+        dim (int): index of the dim along which we define channels to prune.
+        importance_scores (torch.Tensor): tensor of importance scores (of same
+            shape as module parameter) used to compute mask for pruning.
+            The values in this tensor indicate the importance of the corresponding
+            elements in the parameter being pruned.
+            If unspecified or None, the module parameter will be used in its place.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> from torch.nn.utils import prune
+        >>> m = prune.ln_structured(
+        ...     nn.Conv2d(5, 3, 2), "weight", amount=0.3, dim=1, n=float("-inf")
+        ... )
+    """
+    LnStructured.apply(
+        module, name, amount, n, dim, importance_scores=importance_scores
+    )
+    return module
+
+
+def global_unstructured(
+    parameters, pruning_method, importance_scores=None, **kwargs
+) -> None:
+    r"""
+    Globally prunes tensors corresponding to all parameters in ``parameters`` by applying the specified ``pruning_method``.
+
+    Modifies modules in place by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        parameters (Iterable of (module, name) tuples): parameters of
+            the model to prune in a global fashion, i.e. by aggregating all
+            weights prior to deciding which ones to prune. module must be of
+            type :class:`nn.Module`, and name must be a string.
+        pruning_method (class): a valid pruning method class from this module,
+            or a custom one implemented by the user that satisfies the
+            implementation guidelines and has ``PRUNING_TYPE='unstructured'``.
+        importance_scores (dict): a dictionary mapping (module, name) tuples to
+            the corresponding parameter's importance scores tensor. The tensor
+            should be the same shape as the parameter, and is used for computing
+            mask for pruning.
+            If unspecified or None, the parameter will be used in place of its
+            importance scores.
+        kwargs: other keyword arguments such as:
+            amount (int or float): quantity of parameters to prune across the
+            specified parameters.
+            If ``float``, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If ``int``, it represents the
+            absolute number of parameters to prune.
+
+    Raises:
+        TypeError: if ``PRUNING_TYPE != 'unstructured'``
+
+    Note:
+        Since global structured pruning doesn't make much sense unless the
+        norm is normalized by the size of the parameter, we now limit the
+        scope of global pruning to unstructured methods.
+
+    Examples:
+        >>> from torch.nn.utils import prune
+        >>> from collections import OrderedDict
+        >>> net = nn.Sequential(
+        ...     OrderedDict(
+        ...         [
+        ...             ("first", nn.Linear(10, 4)),
+        ...             ("second", nn.Linear(4, 1)),
+        ...         ]
+        ...     )
+        ... )
+        >>> parameters_to_prune = (
+        ...     (net.first, "weight"),
+        ...     (net.second, "weight"),
+        ... )
+        >>> prune.global_unstructured(
+        ...     parameters_to_prune,
+        ...     pruning_method=prune.L1Unstructured,
+        ...     amount=10,
+        ... )
+        >>> print(sum(torch.nn.utils.parameters_to_vector(net.buffers()) == 0))
+        tensor(10)
+
+    """
+    # ensure parameters is an Iterable (it is iterated more than once below)
+    if not isinstance(parameters, Iterable):
+        raise TypeError("global_unstructured(): parameters is not an Iterable")
+
+    importance_scores = importance_scores if importance_scores is not None else {}
+    if not isinstance(importance_scores, dict):
+        raise TypeError("global_unstructured(): importance_scores must be of type dict")
+
+    # flatten importance scores to consider them all at once in global pruning
+    relevant_importance_scores = torch.nn.utils.parameters_to_vector(
+        # pyrefly: ignore [bad-argument-type]
+        [
+            importance_scores.get((module, name), getattr(module, name))
+            for (module, name) in parameters
+        ]
+    )
+    # similarly, flatten the masks (if they exist), or use a flattened vector
+    # of 1s of the same dimensions as t
+    default_mask = torch.nn.utils.parameters_to_vector(
+        [
+            getattr(module, name + "_mask", torch.ones_like(getattr(module, name)))
+            for (module, name) in parameters
+        ]
+    )
+
+    # use the canonical pruning methods to compute the new mask, even if the
+    # parameter is now a flattened out version of `parameters`
+    container = PruningContainer()
+    container._tensor_name = "temp"  # to make it match that of `method`
+    method = pruning_method(**kwargs)
+    method._tensor_name = "temp"  # to make it match that of `container`
+    if method.PRUNING_TYPE != "unstructured":
+        raise TypeError(
+            'Only "unstructured" PRUNING_TYPE supported for '
+            f"the `pruning_method`. Found method {pruning_method} of type {method.PRUNING_TYPE}"
+        )
+
+    container.add_pruning_method(method)
+
+    # use the `compute_mask` method from `PruningContainer` to combine the
+    # mask computed by the new method with the pre-existing mask
+    final_mask = container.compute_mask(relevant_importance_scores, default_mask)
+
+    # Pointer for slicing the mask to match the shape of each parameter
+    pointer = 0
+    for module, name in parameters:
+        param = getattr(module, name)
+        # The length of the parameter
+        num_param = param.numel()
+        # Slice the mask, reshape it
+        param_mask = final_mask[pointer : pointer + num_param].view_as(param)
+        # Assign the correct pre-computed mask to each parameter and add it
+        # to the forward_pre_hooks like any other pruning method
+        custom_from_mask(module, name, mask=param_mask)
+
+        # Increment the pointer to continue slicing the final_mask
+        pointer += num_param
+
+
+def custom_from_mask(module, name, mask):
+    r"""Prune tensor corresponding to parameter called ``name`` in ``module`` by applying the pre-computed mask in ``mask``.
+
+    Modifies module in place (and also returns the modified module) by:
+
+    1) adding a named buffer called ``name+'_mask'`` corresponding to the
+       binary mask applied to the parameter ``name`` by the pruning method.
+    2) replacing the parameter ``name`` by its pruned version, while the
+       original (unpruned) parameter is stored in a new parameter named
+       ``name+'_orig'``.
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+            will act.
+        mask (Tensor): binary mask to be applied to the parameter.
+
+    Returns:
+        module (nn.Module): modified (i.e. pruned) version of the input module
+
+    Examples:
+        >>> from torch.nn.utils import prune
+        >>> m = prune.custom_from_mask(
+        ...     nn.Linear(5, 3), name="bias", mask=torch.tensor([0, 1, 0])
+        ... )
+        >>> print(m.bias_mask)
+        tensor([0., 1., 0.])
+
+    """
+    CustomFromMask.apply(module, name, mask)
+    return module
+
+
+def remove(module, name):
+    r"""Remove the pruning reparameterization from a module and the pruning method from the forward hook.
+
+    The pruned parameter named ``name`` remains permanently pruned, and the parameter
+    named ``name+'_orig'`` is removed from the parameter list. Similarly,
+    the buffer named ``name+'_mask'`` is removed from the buffers.
+
+    Note:
+        Pruning itself is NOT undone or reversed!
+
+    Args:
+        module (nn.Module): module containing the tensor to prune
+        name (str): parameter name within ``module`` on which pruning
+            will act.
+
+    Examples:
+        >>> m = random_unstructured(nn.Linear(5, 7), name="weight", amount=0.2)
+        >>> m = remove(m, name="weight")
+    """
+    for k, hook in module._forward_pre_hooks.items():
+        if isinstance(hook, BasePruningMethod) and hook._tensor_name == name:
+            hook.remove(module)
+            del module._forward_pre_hooks[k]
+            return module
+
+    raise ValueError(
+        f"Parameter '{name}' of module {module} has to be pruned before pruning can be removed"
+    )
+
+
+def is_pruned(module) -> bool:
+    r"""Check if a module is pruned by looking for pruning pre-hooks.
+
+    Check whether ``module`` is pruned by looking for
+    ``forward_pre_hooks`` in its modules that inherit from the
+    :class:`BasePruningMethod`.
+
+    Args:
+        module (nn.Module): object that is either pruned or unpruned
+
+    Returns:
+        binary answer to whether ``module`` is pruned.
+
+    Examples:
+        >>> from torch.nn.utils import prune
+        >>> m = nn.Linear(5, 7)
+        >>> print(prune.is_pruned(m))
+        False
+        >>> prune.random_unstructured(m, name="weight", amount=0.2)
+        >>> print(prune.is_pruned(m))
+        True
+    """
+    for _, submodule in module.named_modules():
+        for hook in submodule._forward_pre_hooks.values():
+            if isinstance(hook, BasePruningMethod):
+                return True
+    return False
+
+
+def _validate_pruning_amount_init(amount) -> None:
+    r"""Validate helper to check the range of amount at init.
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If float, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If int, it represents the
+            absolute number of parameters to prune.
+
+    Raises:
+        ValueError: if amount is a float not in [0, 1], or if it's a negative
+            integer.
+        TypeError: if amount is neither a float nor an integer.
+
+    Note:
+        This does not take into account the number of parameters in the
+        tensor to be pruned, which is known only at pruning time.
+    """
+    if not isinstance(amount, numbers.Real):
+        raise TypeError(f"Invalid type for amount: {amount}. Must be int or float.")
+
+    if (isinstance(amount, numbers.Integral) and amount < 0) or (
+        not isinstance(amount, numbers.Integral)  # so it's a float
+        and (float(amount) > 1.0 or float(amount) < 0.0)
+    ):
+        raise ValueError(
+            f"amount={amount} should either be a float in the range [0, 1] or a non-negative integer"
+        )
+
+
+def _validate_pruning_amount(amount, tensor_size) -> None:
+    r"""Validate that the pruning amount is meaningful with respect to the size of the data.
+
+    Validation helper to check that the amount of parameters to prune
+    is meaningful with respect to the size of the data (`tensor_size`).
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If float, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If int, it represents the
+            absolute number of parameters to prune.
+        tensor_size (int): absolute number of parameters in the tensor
+            to prune.
+    """
+    # TODO: consider removing this check and allowing users to specify
+    # a number of units to prune that is greater than the number of units
+    # left to prune. In this case, the tensor will just be fully pruned.
+
+    if isinstance(amount, numbers.Integral) and amount > tensor_size:
+        raise ValueError(
+            f"amount={amount} should be smaller than the number of parameters to prune={tensor_size}"
+        )
+
+
+def _validate_structured_pruning(t) -> None:
+    r"""Validate that the tensor to be pruned is at least 2-Dimensional.
+
+    Validation helper to check that the tensor to be pruned is multi-
+    dimensional, such that the concept of "channels" is well-defined.
+
+    Args:
+        t (torch.Tensor): tensor representing the parameter to prune
+
+    Raises:
+        ValueError: if the tensor `t` is not at least 2D.
+    """
+    shape = t.shape
+    if len(shape) <= 1:
+        raise ValueError(
+            "Structured pruning can only be applied to "
+            "multidimensional tensors. Found tensor of shape "
+            f"{shape} with {len(shape)} dims"
+        )
+
+
+def _compute_nparams_toprune(amount, tensor_size):
+    r"""Convert the pruning amount from a percentage to absolute value.
+
+    Since amount can be expressed either in absolute value or as a
+    percentage of the number of units/channels in a tensor, this utility
+    function converts the percentage to absolute value to standardize
+    the handling of pruning.
+
+    Args:
+        amount (int or float): quantity of parameters to prune.
+            If float, should be between 0.0 and 1.0 and represent the
+            fraction of parameters to prune. If int, it represents the
+            absolute number of parameters to prune.
+        tensor_size (int): absolute number of parameters in the tensor
+            to prune.
+
+    Returns:
+        int: the number of units to prune in the tensor
+    """
+    # incorrect type already checked in _validate_pruning_amount_init
+    if isinstance(amount, numbers.Integral):
+        return amount
+    else:
+        return round(amount * tensor_size)
+
+
+def _validate_pruning_dim(t, dim) -> None:
+    r"""Validate that the pruning dimension is within the bounds of the tensor dimension.
+
+    Args:
+        t (torch.Tensor): tensor representing the parameter to prune
+        dim (int): index of the dim along which we define channels to prune
+    """
+    if dim >= t.dim():
+        raise IndexError(f"Invalid index {dim} for tensor of size {t.shape}")
+
+
+def _compute_norm(t, n, dim):
+    r"""Compute the L_n-norm of a tensor along all dimensions except for the specified dimension.
+
+    The L_n-norm will be computed across all entries in tensor `t` along all dimensions
+    except for the one identified by dim.
+    Example: if `t` is of shape, say, 3x2x4 and dim=2 (the last dim),
+    then norm will have Size [4], and each entry will represent the
+    `L_n`-norm computed using the 3x2=6 entries for each of the 4 channels.
+
+    Args:
+        t (torch.Tensor): tensor representing the parameter to prune
+        n (int, float, inf, -inf, 'fro', 'nuc'): See documentation of valid
+            entries for argument p in torch.norm
+        dim (int): dim identifying the channels to prune
+
+    Returns:
+        norm (torch.Tensor): L_n norm computed across all dimensions except
+            for `dim`. By construction, `norm.shape = [t.shape[dim]]`.
+    """
+    # dims = all axes, except for the one identified by `dim`
+    dims = list(range(t.dim()))
+    # convert negative indexing
+    if dim < 0:
+        dim = dims[dim]
+    dims.remove(dim)
+
+    norm = torch.norm(t, p=n, dim=dims)
+    return norm
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0530d99f94e0a0aa5fc5821ebefd85513e44c9f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/rnn.py
@@ -0,0 +1,606 @@
+import warnings
+from collections.abc import Callable, Iterable
+from typing import Any, NamedTuple, overload, TypeVar
+from typing_extensions import Self
+
+import torch
+from torch import _VF, Tensor
+
+
+__all__ = [
+    "PackedSequence",
+    "invert_permutation",
+    "pack_padded_sequence",
+    "pad_packed_sequence",
+    "pad_sequence",
+    "unpad_sequence",
+    "pack_sequence",
+    "unpack_sequence",
+]
+
+_T = TypeVar("_T")
+_R = TypeVar("_R")
+
+
+class PackedSequence_(NamedTuple):  # plain NamedTuple base that PackedSequence subclasses
+    data: torch.Tensor
+    batch_sizes: torch.Tensor
+    sorted_indices: torch.Tensor | None
+    unsorted_indices: torch.Tensor | None
+
+
+def bind(optional: _T | None, fn: Callable[[_T], _R]) -> _R | None:
+    if optional is None:  # propagate None without calling fn
+        return None
+    return fn(optional)  # otherwise return fn applied to the value
+
+
+class PackedSequence(PackedSequence_):
+    r"""Holds the data and list of :attr:`batch_sizes` of a packed sequence.
+
+    All RNN modules accept packed sequences as inputs.
+
+    Note:
+        Instances of this class should never be created manually. They are meant
+        to be instantiated by functions like :func:`pack_padded_sequence`.
+
+        Batch sizes represent the number of elements at each sequence step in
+        the batch, not the varying sequence lengths passed to
+        :func:`pack_padded_sequence`.  For instance, given data ``abc`` and ``x``
+        the :class:`PackedSequence` would contain data ``axbc`` with
+        ``batch_sizes=[2,1,1]``.
+
+    Attributes:
+        data (Tensor): Tensor containing packed sequence
+        batch_sizes (Tensor): Tensor of integers holding
+            information about the batch size at each sequence step
+        sorted_indices (Tensor, optional): Tensor of integers holding how this
+            :class:`PackedSequence` is constructed from sequences.
+        unsorted_indices (Tensor, optional): Tensor of integers holding how
+            to recover the original sequences with correct order.
+
+    .. note::
+        :attr:`data` can be on arbitrary device and of arbitrary dtype.
+        :attr:`sorted_indices` and :attr:`unsorted_indices` must be ``torch.int64``
+        tensors on the same device as :attr:`data`.
+
+        However, :attr:`batch_sizes` should always be a CPU ``torch.int64`` tensor.
+
+        This invariant is maintained throughout :class:`PackedSequence` class,
+        and all functions that construct a :class:`PackedSequence` in PyTorch
+        (i.e., they only pass in tensors conforming to this constraint).
+    """
+
+    def __new__(
+        cls,
+        data: Tensor,
+        batch_sizes: Tensor | None = None,
+        sorted_indices: Tensor | None = None,
+        unsorted_indices: Tensor | None = None,
+    ) -> Self:
+        return super().__new__(
+            cls,
+            *_packed_sequence_init_args(
+                data, batch_sizes, sorted_indices, unsorted_indices
+            ),
+        )
+
+    # NOTE [ device and dtype of a PackedSequence ]
+    #
+    # See the note above in doc string (starting with ":attr:`data` can be on
+    # arbitrary device...").
+    def pin_memory(self) -> Self:
+        # Why not convert `batch_sizes`?
+        # See NOTE [ device and dtype of a PackedSequence ]
+        return type(self)(
+            self.data.pin_memory(),
+            self.batch_sizes,
+            bind(self.sorted_indices, lambda t: t.pin_memory()),
+            bind(self.unsorted_indices, lambda t: t.pin_memory()),
+        )
+
+    @overload
+    def to(
+        self,
+        dtype: torch.dtype,
+        non_blocking: bool = ...,
+        copy: bool = ...,
+    ) -> Self: ...
+
+    @overload
+    def to(
+        self,
+        device: str | torch.device | int | None = ...,
+        dtype: torch.dtype | None = ...,
+        non_blocking: bool = ...,
+        copy: bool = ...,
+    ) -> Self: ...
+
+    @overload
+    def to(
+        self,
+        other: Tensor,
+        non_blocking: bool = ...,
+        copy: bool = ...,
+    ) -> Self: ...
+
+    def to(self, *args: Any, **kwargs: Any) -> Self:
+        r"""Perform dtype and/or device conversion on `self.data`.
+
+        It has similar signature as :meth:`torch.Tensor.to`, except optional
+        arguments like `non_blocking` and `copy` should be passed as kwargs,
+        not args, or they will not apply to the index tensors.
+
+        .. note::
+
+            If the ``self.data`` Tensor already has the correct :class:`torch.dtype`
+            and :class:`torch.device`, then ``self`` is returned.
+            Otherwise, returns a copy with the desired configuration.
+        """
+        # Why not convert `batch_sizes`?
+        # See NOTE [ device and dtype of a PackedSequence ]
+        data = self.data.to(*args, **kwargs)
+        if data is self.data:
+            return self
+        else:
+            # Does not forward device or dtype arg/kwargs, device is set from data.device
+            kwargs = dict(
+                filter(lambda t: t[0] != "device" and t[0] != "dtype", kwargs.items())
+            )
+            sorted_indices = bind(
+                self.sorted_indices, lambda t: t.to(data.device, **kwargs)
+            )
+            unsorted_indices = bind(
+                self.unsorted_indices, lambda t: t.to(data.device, **kwargs)
+            )
+            return type(self)(data, self.batch_sizes, sorted_indices, unsorted_indices)
+
+    def cuda(self, *args: Any, **kwargs: Any) -> Self:
+        # Tests to see if 'cuda' should be added to kwargs
+        ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to(
+            *args, **kwargs
+        )
+        if ex.is_cuda:
+            return self.to(*args, **kwargs)
+        kwargs["device"] = "cuda"
+        return self.to(*args, **kwargs)
+
+    def cpu(self, *args: Any, **kwargs: Any) -> Self:
+        ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to(
+            *args, **kwargs
+        )
+        if ex.device.type == "cpu":
+            return self.to(*args, **kwargs)
+        kwargs["device"] = "cpu"  # the given args did not land on CPU: force it
+        return self.to(*args, **kwargs)
+
+    def double(self) -> Self:
+        return self.to(dtype=torch.double)
+
+    def float(self) -> Self:
+        return self.to(dtype=torch.float)
+
+    def half(self) -> Self:
+        return self.to(dtype=torch.half)
+
+    def long(self) -> Self:
+        return self.to(dtype=torch.long)
+
+    def int(self) -> Self:
+        return self.to(dtype=torch.int)
+
+    def short(self) -> Self:
+        return self.to(dtype=torch.short)
+
+    def char(self) -> Self:
+        return self.to(dtype=torch.int8)
+
+    def byte(self) -> Self:
+        return self.to(dtype=torch.uint8)
+
+    @property
+    def is_cuda(self) -> bool:
+        r"""Return true if `self.data` is stored on a CUDA device."""
+        return self.data.is_cuda
+
+    def is_pinned(self) -> bool:
+        r"""Return true if `self.data` is stored in pinned memory."""
+        return self.data.is_pinned()
+
+
+# TorchScript doesn't support constructors on named tuples, so we use this helper
+# method to construct PackedSequence
+def _packed_sequence_init_args(
+    data: Tensor,
+    batch_sizes: Tensor | None = None,
+    sorted_indices: Tensor | None = None,
+    unsorted_indices: Tensor | None = None,
+) -> tuple[Tensor, Tensor, Tensor | None, Tensor | None]:
+    # NB: if unsorted_indices is provided, it should be the inverse permutation
+    # to sorted_indices. Don't assert it here because the PackedSequence ctor
+    # should only be used internally.
+
+    if unsorted_indices is None:
+        unsorted_indices = invert_permutation(sorted_indices)  # stays None when sorted_indices is None
+
+    # support being called as `PackedSequence(data, batch_sizes, sorted_indices)`
+    if batch_sizes is not None:
+        # TODO: Re-enable this check (.type isn't supported in TorchScript)  [NOTE(review): check is live below; TODO may be stale]
+        if batch_sizes.device.type != "cpu":
+            raise ValueError(
+                "batch_sizes should always be on CPU. "
+                "Instances of PackedSequence should never be created manually. "
+                "They should be instantiated by functions like pack_sequence "
+                "and pack_padded_sequences in nn.utils.rnn. "
+                "https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_sequence"
+            )
+        return data, batch_sizes, sorted_indices, unsorted_indices
+
+    # support being called as `PackedSequence((data, batch_sizes), *, sorted_indices)`
+    else:
+        assert isinstance(data, (list, tuple)) and len(data) == 2  # data is a (data, batch_sizes) pair here
+        return data[0], data[1], sorted_indices, unsorted_indices
+
+
+def _packed_sequence_init(
+    data: Tensor,
+    batch_sizes: Tensor | None = None,
+    sorted_indices: Tensor | None = None,
+    unsorted_indices: Tensor | None = None,
+) -> PackedSequence:  # normalize the arguments, then build the PackedSequence from them
+    data, batch_sizes, sorted_indices, unsorted_indices = _packed_sequence_init_args(
+        data, batch_sizes, sorted_indices, unsorted_indices
+    )
+    return PackedSequence(data, batch_sizes, sorted_indices, unsorted_indices)
+
+
+def invert_permutation(permutation: Tensor | None) -> Tensor | None:
+    """Return the inverse of ``permutation``, or ``None`` when it is ``None``.
+
+    This is useful for converting between sorted and unsorted indices in
+    a :class:`~nn.utils.rnn.PackedSequence`.
+
+    Args:
+        permutation (Tensor, optional): a 1-D tensor of indices to invert
+    """
+    if permutation is None:
+        return None
+    output = torch.empty_like(permutation, memory_format=torch.legacy_contiguous_format)
+    output.scatter_(  # writes output[permutation[i]] = i along dim 0
+        0, permutation, torch.arange(0, permutation.numel(), device=permutation.device)
+    )
+    return output
+
+
+def pack_padded_sequence(
+    input: Tensor,
+    lengths: Tensor | list[int],
+    batch_first: bool = False,
+    enforce_sorted: bool = True,
+) -> PackedSequence:
+    r"""Packs a Tensor containing padded sequences of variable length.
+
+    :attr:`input` can be of size ``T x B x *`` (if :attr:`batch_first` is ``False``)
+    or ``B x T x *`` (if :attr:`batch_first` is ``True``) where ``T`` is the length
+    of the longest sequence, ``B`` is the batch size, and ``*`` is any number of dimensions
+    (including 0).
+
+    For unsorted sequences, use `enforce_sorted = False`. If :attr:`enforce_sorted` is
+    ``True``, the sequences should be sorted by length in a decreasing order, i.e.
+    ``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the shortest
+    one. `enforce_sorted = True` is only necessary for ONNX export.
+
+    It is an inverse operation to :func:`pad_packed_sequence`, and hence :func:`pad_packed_sequence`
+    can be used to recover the underlying tensor packed in :class:`PackedSequence`.
+
+    Note:
+        This function accepts any input that has at least two dimensions. You
+        can apply it to pack the labels, and use the output of the RNN with
+        them to compute the loss directly. A Tensor can be retrieved from
+        a :class:`PackedSequence` object by accessing its ``.data`` attribute.
+
+    Args:
+        input (Tensor): padded batch of variable length sequences.
+        lengths (Tensor or list(int)): list of sequence lengths of each batch
+            element (must be on the CPU if provided as a tensor).
+        batch_first (bool, optional): if ``True``, the input is expected in ``B x T x *``
+            format, ``T x B x *`` otherwise. Default: ``False``.
+        enforce_sorted (bool, optional): if ``True``, the input is expected to
+            contain sequences sorted by length in a decreasing order. If
+            ``False``, the input will get sorted unconditionally. Default: ``True``.
+
+    .. warning::
+        The dim of ``input`` tensor will be truncated if its length is larger than
+        the corresponding value in ``lengths``.
+
+    Returns:
+        a :class:`PackedSequence` object
+    """
+    if not isinstance(lengths, torch.Tensor):
+        if torch._C._get_tracing_state():
+            warnings.warn(
+                "pack_padded_sequence has been called with a Python list of "
+                "sequence lengths. The tracer cannot track the data flow of Python "
+                "values, and it will treat them as constants, likely rendering "
+                "the trace incorrect for any other combination of lengths.",
+                stacklevel=2,
+            )
+        lengths = torch.as_tensor(lengths, dtype=torch.int64, device="cpu")
+    else:
+        lengths = lengths.to(dtype=torch.int64)
+
+    if enforce_sorted:
+        sorted_indices = None
+    else:
+        lengths, sorted_indices = torch.sort(lengths, descending=True)  # longest sequence first
+        sorted_indices = sorted_indices.to(input.device)
+        batch_dim = 0 if batch_first else 1
+        input = input.index_select(batch_dim, sorted_indices)  # reorder the batch to match the sorted lengths
+
+    data, batch_sizes = _VF._pack_padded_sequence(input, lengths, batch_first)
+    return _packed_sequence_init(data, batch_sizes, sorted_indices, None)
+
+
+def pad_packed_sequence(
+    sequence: PackedSequence,
+    batch_first: bool = False,
+    padding_value: float = 0.0,
+    total_length: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    r"""Pad a packed batch of variable length sequences.
+
+    It is an inverse operation to :func:`pack_padded_sequence`.
+
+    The returned Tensor's data will be of size ``T x B x *`` (if :attr:`batch_first` is ``False``)
+    or ``B x T x *`` (if :attr:`batch_first` is ``True``), where ``T`` is the length of the longest
+    sequence and ``B`` is the batch size.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+        >>> seq = torch.tensor([[1, 2, 0], [3, 0, 0], [4, 5, 6]])
+        >>> lens = [2, 1, 3]
+        >>> packed = pack_padded_sequence(
+        ...     seq, lens, batch_first=True, enforce_sorted=False
+        ... )
+        >>> packed
+        PackedSequence(data=tensor([4, 1, 3, 5, 2, 6]), batch_sizes=tensor([3, 2, 1]),
+                       sorted_indices=tensor([2, 0, 1]), unsorted_indices=tensor([1, 2, 0]))
+        >>> seq_unpacked, lens_unpacked = pad_packed_sequence(packed, batch_first=True)
+        >>> seq_unpacked
+        tensor([[1, 2, 0],
+                [3, 0, 0],
+                [4, 5, 6]])
+        >>> lens_unpacked
+        tensor([2, 1, 3])
+
+    .. note::
+        :attr:`total_length` is useful to implement the
+        ``pack sequence -> recurrent network -> unpack sequence`` pattern in a
+        :class:`~torch.nn.Module` wrapped in :class:`~torch.nn.DataParallel`.
+        See :ref:`this FAQ section <pack-rnn-unpack-with-data-parallelism>` for
+        details.
+
+    Args:
+        sequence (PackedSequence): batch to pad
+        batch_first (bool, optional): if ``True``, the output will be in ``B x T x *``
+            format, ``T x B x *`` otherwise.
+        padding_value (float, optional): values for padded elements.
+        total_length (int, optional): if not ``None``, the output will be padded to
+            have length :attr:`total_length`. This method will throw :class:`ValueError`
+            if :attr:`total_length` is less than the max sequence length in
+            :attr:`sequence`.
+
+    Returns:
+        Tuple of Tensor containing the padded sequence, and a Tensor
+        containing the list of lengths of each sequence in the batch.
+        Batch elements will be re-ordered as they were ordered originally when
+        the batch was passed to ``pack_padded_sequence`` or ``pack_sequence``.
+    """
+    max_seq_length = sequence.batch_sizes.size(0)  # one batch_sizes entry per sequence step
+    if total_length is not None:
+        if total_length < max_seq_length:
+            raise ValueError(
+                "Expected total_length to be at least the length "
+                "of the longest sequence in input, but got "
+                f"total_length={total_length} and max sequence length being {max_seq_length}"
+            )
+        max_seq_length = total_length
+    padded_output, lengths = _VF._pad_packed_sequence(
+        sequence.data, sequence.batch_sizes, batch_first, padding_value, max_seq_length
+    )
+    unsorted_indices = sequence.unsorted_indices
+    if unsorted_indices is not None:  # restore the original batch order
+        batch_dim = 0 if batch_first else 1
+        return (
+            padded_output.index_select(batch_dim, unsorted_indices),
+            lengths[unsorted_indices.cpu()],
+        )
+    return padded_output, lengths
+
+
+# NOTE: for JIT-compatibility, we need to be more restrictive here and use specific types instead of Iterable.
+def pad_sequence(
+    sequences: Tensor | list[Tensor],
+    batch_first: bool = False,
+    padding_value: float = 0.0,
+    padding_side: str = "right",
+) -> Tensor:
+    r"""Pad a list of variable length Tensors with :attr:`padding_value`.
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension, and pads them
+    to equal length. :attr:`sequences` can be list of sequences with size ``L x *``,
+    where `L` is length of the sequence and ``*`` is any number of dimensions
+    (including ``0``). If :attr:`batch_first` is ``False``, the output is of size
+    ``T x B x *``, and ``B x T x *`` otherwise, where ``B`` is the batch size
+    (the number of elements in :attr:`sequences`), ``T`` is the length of the longest
+    sequence.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pad_sequence
+        >>> a = torch.ones(25, 300)
+        >>> b = torch.ones(22, 300)
+        >>> c = torch.ones(15, 300)
+        >>> pad_sequence([a, b, c]).size()
+        torch.Size([25, 3, 300])
+
+    Note:
+        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are the same.
+
+    Args:
+        sequences (Tensor or list[Tensor]): list of variable length sequences.
+        batch_first (bool, optional): if ``True``, the output will be in ``B x T x *``
+            format, ``T x B x *`` otherwise.
+        padding_value (float, optional): value for padded elements. Default: ``0``.
+        padding_side (str, optional): the side to pad the sequences on.
+            Default: ``'right'``.
+
+    Returns:
+        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+        Tensor of size ``B x T x *`` otherwise
+    """
+    if not (torch.jit.is_tracing() or torch.jit.is_scripting()):
+        # JIT doesn't support `Iterable`
+        if not isinstance(sequences, Iterable):
+            msg = (
+                "pad_sequence: Expected iterable for input sequences, but got arg of type: "
+                f"{type(sequences)}"
+            )
+            raise RuntimeError(msg)
+
+        # In JIT context this leads to,
+        # RuntimeError: cannot statically infer the expected size of a list in this context
+        sequences = tuple(sequences)  # type: ignore[assignment]
+    else:
+        # For JIT, we only support Union[Tensor, Tuple[Tensor]]
+        if isinstance(sequences, torch.Tensor):
+            sequences = sequences.unbind(0)  # type: ignore[assignment]
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are the same; the padding itself is done by torch._C._nn.pad_sequence
+    return torch._C._nn.pad_sequence(
+        sequences,  # type: ignore[arg-type]
+        batch_first,
+        padding_value,
+        padding_side,  # type: ignore[arg-type]
+    )
+
+
+def unpad_sequence(
+    padded_sequences: Tensor,
+    lengths: Tensor,
+    batch_first: bool = False,
+) -> list[Tensor]:
+    r"""Unpad padded Tensor into a list of variable length Tensors.
+
+    ``unpad_sequence`` unstacks padded Tensor into a list of variable length Tensors.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pad_sequence, unpad_sequence
+        >>> a = torch.ones(25, 300)
+        >>> b = torch.ones(22, 300)
+        >>> c = torch.ones(15, 300)
+        >>> sequences = [a, b, c]
+        >>> padded_sequences = pad_sequence(sequences)
+        >>> lengths = torch.as_tensor([v.size(0) for v in sequences])
+        >>> unpadded_sequences = unpad_sequence(padded_sequences, lengths)
+        >>> torch.allclose(sequences[0], unpadded_sequences[0])
+        True
+        >>> torch.allclose(sequences[1], unpadded_sequences[1])
+        True
+        >>> torch.allclose(sequences[2], unpadded_sequences[2])
+        True
+
+    Args:
+        padded_sequences (Tensor): padded sequences; left unmodified by this call.
+        lengths (Tensor): length of original (unpadded) sequences.
+        batch_first (bool, optional): whether batch dimension first or not. Default: ``False``.
+
+    Returns:
+        a list of :class:`Tensor` objects
+    """
+    unpadded_sequences = []
+
+    if not batch_first:
+        padded_sequences = padded_sequences.transpose(0, 1)  # out-of-place view: never mutate the caller's tensor
+
+    max_length = padded_sequences.shape[1]
+    idx = torch.arange(max_length, device=lengths.device)
+
+    for seq, length in zip(padded_sequences, lengths, strict=True):
+        mask = idx < length  # True for the first `length` steps of this sequence
+        unpacked_seq = seq[mask]
+        unpadded_sequences.append(unpacked_seq)
+
+    return unpadded_sequences
+
+
+def pack_sequence(
+    sequences: list[Tensor],
+    enforce_sorted: bool = True,
+) -> PackedSequence:
+    r"""Packs a list of variable length Tensors.
+
+    Equivalent to calling ``pad_sequence`` and then ``pack_padded_sequence``.
+
+    ``sequences`` should be a list of Tensors of size ``L x *``, where `L` is
+    the length of a sequence and `*` is any number of trailing dimensions,
+    including ``0``.
+
+    For unsorted sequences, use `enforce_sorted = False`. If ``enforce_sorted``
+    is ``True``, the sequences should be sorted in the order of decreasing length.
+    ``enforce_sorted = True`` is only necessary for ONNX export.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pack_sequence
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5])
+        >>> c = torch.tensor([6])
+        >>> pack_sequence([a, b, c])
+        PackedSequence(data=tensor([1, 4, 6, 2, 5, 3]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)
+
+    Args:
+        sequences (list[Tensor]): A list of sequences of decreasing length.
+        enforce_sorted (bool, optional): if ``True``, checks that the input
+            contains sequences sorted by length in a decreasing order. If
+            ``False``, this condition is not checked. Default: ``True``.
+
+    Returns:
+        a :class:`PackedSequence` object
+    """
+    lengths = torch.as_tensor([v.size(0) for v in sequences])  # first-dim size of each sequence
+    return pack_padded_sequence(
+        pad_sequence(sequences), lengths, enforce_sorted=enforce_sorted
+    )
+
+
+def unpack_sequence(packed_sequences: PackedSequence) -> list[Tensor]:
+    r"""Unpack PackedSequence into a list of variable length Tensors.
+
+    ``packed_sequences`` should be a PackedSequence object.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pack_sequence, unpack_sequence
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5])
+        >>> c = torch.tensor([6])
+        >>> sequences = [a, b, c]
+        >>> print(sequences)
+        [tensor([1, 2, 3]), tensor([4, 5]), tensor([6])]
+        >>> packed_sequences = pack_sequence(sequences)
+        >>> print(packed_sequences)
+        PackedSequence(data=tensor([1, 4, 6, 2, 5, 3]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)
+        >>> unpacked_sequences = unpack_sequence(packed_sequences)
+        >>> print(unpacked_sequences)
+        [tensor([1, 2, 3]), tensor([4, 5]), tensor([6])]
+
+    Args:
+        packed_sequences (PackedSequence): A PackedSequence object.
+
+    Returns:
+        a list of :class:`Tensor` objects
+    """
+    padded_sequences, lengths = pad_packed_sequence(packed_sequences, batch_first=True)  # pad back to a batch-first tensor
+    unpacked_sequences = unpad_sequence(padded_sequences, lengths, batch_first=True)  # then strip the padding per sequence
+    return unpacked_sequences
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a11613a51dac49d5a52d2c55f51734de37bd9e47
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/spectral_norm.py
@@ -0,0 +1,368 @@
+# mypy: allow-untyped-defs
+"""Spectral Normalization from https://arxiv.org/abs/1802.05957."""
+
+from typing import Any, TypeVar
+
+import torch
+import torch.nn.functional as F
+from torch.nn.modules import Module
+
+
+__all__ = [
+    "SpectralNorm",
+    "SpectralNormLoadStateDictPreHook",
+    "SpectralNormStateDictHook",
+    "spectral_norm",
+    "remove_spectral_norm",
+]
+
+
+class SpectralNorm:
+    # Invariant before and after each forward call:
+    #   u = F.normalize(W @ v)
+    # NB: At initialization, this invariant is not enforced
+
+    _version: int = 1
+    # At version 1:
+    #   made  `W` not a buffer,
+    #   added `v` as a buffer, and
+    #   made eval mode use `W = u @ W_orig @ v` rather than the stored `W`.
+    name: str
+    dim: int
+    n_power_iterations: int
+    eps: float
+
+    def __init__(
+        self,
+        name: str = "weight",
+        n_power_iterations: int = 1,
+        dim: int = 0,
+        eps: float = 1e-12,
+    ) -> None:
+        self.name = name
+        self.dim = dim
+        if n_power_iterations <= 0:
+            raise ValueError(
+                "Expected n_power_iterations to be positive, but "
+                f"got n_power_iterations={n_power_iterations}"
+            )
+        self.n_power_iterations = n_power_iterations
+        self.eps = eps
+
+    def reshape_weight_to_matrix(self, weight: torch.Tensor) -> torch.Tensor:
+        weight_mat = weight
+        if self.dim != 0:
+            # permute dim to front
+            weight_mat = weight_mat.permute(
+                self.dim, *[d for d in range(weight_mat.dim()) if d != self.dim]
+            )
+        height = weight_mat.size(0)
+        return weight_mat.reshape(height, -1)  # flatten every remaining dim into the columns
+
+    def compute_weight(self, module: Module, do_power_iteration: bool) -> torch.Tensor:
+        # NB: If `do_power_iteration` is set, the `u` and `v` vectors are
+        #     updated in power iteration **in-place**. This is very important
+        #     because in `DataParallel` forward, the vectors (being buffers) are
+        #     broadcast from the parallelized module to each module replica,
+        #     which is a new module object created on the fly. And each replica
+        #     runs its own spectral norm power iteration. So simply assigning
+        #     the updated vectors to the module this function runs on will cause
+        #     the update to be lost forever. And the next time the parallelized
+        #     module is replicated, the same randomly initialized vectors are
+        #     broadcast and used!
+        #
+        #     Therefore, to make the change propagate back, we rely on two
+        #     important behaviors (also enforced via tests):
+        #       1. `DataParallel` doesn't clone storage if the broadcast tensor
+        #          is already on correct device; and it makes sure that the
+        #          parallelized module is already on `device[0]`.
+        #       2. If the out tensor in `out=` kwarg has correct shape, it will
+        #          just fill in the values.
+        #     Therefore, since the same power iteration is performed on all
+        #     devices, simply updating the tensors in-place will make sure that
+        #     the module replica on `device[0]` will update the _u vector on the
+        #     parallelized module (by shared storage).
+        #
+        #    However, after we update `u` and `v` in-place, we need to **clone**
+        #    them before using them to normalize the weight. This is to support
+        #    backproping through two forward passes, e.g., the common pattern in
+        #    GAN training: loss = D(real) - D(fake). Otherwise, engine will
+        #    complain that variables needed to do backward for the first forward
+        #    (i.e., the `u` and `v` vectors) are changed in the second forward.
+        weight = getattr(module, self.name + "_orig")
+        u = getattr(module, self.name + "_u")
+        v = getattr(module, self.name + "_v")
+        weight_mat = self.reshape_weight_to_matrix(weight)
+
+        if do_power_iteration:
+            with torch.no_grad():
+                for _ in range(self.n_power_iterations):
+                    # Spectral norm of weight equals `u^T W v`, where `u` and `v`
+                    # are the first left and right singular vectors.
+                    # This power iteration produces approximations of `u` and `v`.
+                    v = F.normalize(
+                        torch.mv(weight_mat.t(), u), dim=0, eps=self.eps, out=v
+                    )
+                    u = F.normalize(torch.mv(weight_mat, v), dim=0, eps=self.eps, out=u)
+                if self.n_power_iterations > 0:
+                    # See above on why we need to clone
+                    u = u.clone(memory_format=torch.contiguous_format)
+                    v = v.clone(memory_format=torch.contiguous_format)
+
+        sigma = torch.dot(u, torch.mv(weight_mat, v))  # u^T W v, the spectral-norm estimate
+        weight = weight / sigma
+        return weight
+
+    def remove(self, module: Module) -> None:
+        with torch.no_grad():
+            weight = self.compute_weight(module, do_power_iteration=False)
+        delattr(module, self.name)
+        delattr(module, self.name + "_u")
+        delattr(module, self.name + "_v")
+        delattr(module, self.name + "_orig")
+        module.register_parameter(self.name, torch.nn.Parameter(weight.detach()))
+
+    def __call__(self, module: Module, inputs: Any) -> None:
+        setattr(
+            module,
+            self.name,
+            self.compute_weight(module, do_power_iteration=module.training),
+        )
+
+    def _solve_v_and_rescale(self, weight_mat, u, target_sigma):
+        # Tries to return a vector `v` s.t. `u = F.normalize(W @ v)`
+        # (the invariant at top of this class) and `u @ W @ v = sigma`.
+        # This uses pinverse in case W^T W is not invertible.
+        v = torch.linalg.multi_dot(
+            [weight_mat.t().mm(weight_mat).pinverse(), weight_mat.t(), u.unsqueeze(1)]
+        ).squeeze(1)
+        return v.mul_(target_sigma / torch.dot(u, torch.mv(weight_mat, v)))
+
+    @staticmethod
+    def apply(
+        module: Module, name: str, n_power_iterations: int, dim: int, eps: float
+    ) -> "SpectralNorm":
+        for hook in module._forward_pre_hooks.values():
+            if isinstance(hook, SpectralNorm) and hook.name == name:
+                raise RuntimeError(
+                    f"Cannot register two spectral_norm hooks on the same parameter {name}"
+                )
+
+        fn = SpectralNorm(name, n_power_iterations, dim, eps)
+        weight = module._parameters[name]
+        if weight is None:
+            raise ValueError(
+                f"`SpectralNorm` cannot be applied as parameter `{name}` is None"
+            )
+        if isinstance(weight, torch.nn.parameter.UninitializedParameter):
+            raise ValueError(
+                "The module passed to `SpectralNorm` can't have uninitialized parameters. "
+                "Make sure to run the dummy forward before applying spectral normalization"
+            )
+
+        with torch.no_grad():
+            weight_mat = fn.reshape_weight_to_matrix(weight)
+
+            h, w = weight_mat.size()
+            # randomly initialize `u` and `v`
+            u = F.normalize(weight.new_empty(h).normal_(0, 1), dim=0, eps=fn.eps)
+            v = F.normalize(weight.new_empty(w).normal_(0, 1), dim=0, eps=fn.eps)
+
+        delattr(module, fn.name)
+        module.register_parameter(fn.name + "_orig", weight)
+        # We still need to assign weight back as fn.name because all sorts of
+        # things may assume that it exists, e.g., when initializing weights.
+        # However, we can't directly assign as it could be an nn.Parameter and
+        # gets added as a parameter. Instead, we register weight.data as a plain
+        # attribute.
+        setattr(module, fn.name, weight.data)
+        module.register_buffer(fn.name + "_u", u)
+        module.register_buffer(fn.name + "_v", v)
+
+        module.register_forward_pre_hook(fn)  # recompute the normalized weight before every forward
+        module._register_state_dict_hook(SpectralNormStateDictHook(fn))
+        module._register_load_state_dict_pre_hook(SpectralNormLoadStateDictPreHook(fn))
+        return fn
+
+
+# This is a top-level class because Py2 pickle can't handle an inner class or
+# an instancemethod.
+class SpectralNormLoadStateDictPreHook:
+    # See docstring of SpectralNorm._version on the changes to spectral_norm.
+    def __init__(self, fn) -> None:
+        self.fn = fn
+
+    # For state_dict with version None, (assuming that it has gone through at
+    # least one training forward), we have
+    #
+    #    u = F.normalize(W_orig @ v)
+    #    W = W_orig / sigma, where sigma = u @ W_orig @ v
+    #
+    # To compute `v`, we solve `W_orig @ x = u`, and let
+    #    v = x * sigma / (u @ W_orig @ x), with sigma recovered as W_orig / W.
+    def __call__(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        fn = self.fn
+        version = local_metadata.get("spectral_norm", {}).get(
+            fn.name + ".version", None
+        )
+        if version is None or version < 1:
+            weight_key = prefix + fn.name
+            if (
+                version is None
+                and all(weight_key + s in state_dict for s in ("_orig", "_u", "_v"))
+                and weight_key not in state_dict
+            ):
+                # Detect if it is the updated state dict and just missing metadata.
+                # This could happen if the users are crafting a state dict themselves,
+                # so we just pretend that this is the newest.
+                return
+            has_missing_keys = False
+            for suffix in ("_orig", "", "_u"):
+                key = weight_key + suffix
+                if key not in state_dict:
+                    has_missing_keys = True
+                    if strict:
+                        missing_keys.append(key)
+            if has_missing_keys:
+                return
+            with torch.no_grad():
+                weight_orig = state_dict[weight_key + "_orig"]
+                weight = state_dict.pop(weight_key)
+                sigma = (weight_orig / weight).mean()
+                weight_mat = fn.reshape_weight_to_matrix(weight_orig)
+                u = state_dict[weight_key + "_u"]
+                v = fn._solve_v_and_rescale(weight_mat, u, sigma)
+                state_dict[weight_key + "_v"] = v
+
+
+# This is a top-level class because Py2 pickle can't handle an inner class or
+# an instancemethod.
+class SpectralNormStateDictHook:
+    # See docstring of SpectralNorm._version on the changes to spectral_norm.
+    def __init__(self, fn) -> None:
+        self.fn = fn
+
+    def __call__(self, module, state_dict, prefix, local_metadata) -> None:
+        if "spectral_norm" not in local_metadata:
+            local_metadata["spectral_norm"] = {}
+        key = self.fn.name + ".version"
+        if key in local_metadata["spectral_norm"]:
+            raise RuntimeError(f"Unexpected key in metadata['spectral_norm']: {key}")
+        local_metadata["spectral_norm"][key] = self.fn._version
+
+
+T_module = TypeVar("T_module", bound=Module)
+
+
+def spectral_norm(
+    module: T_module,
+    name: str = "weight",
+    n_power_iterations: int = 1,
+    eps: float = 1e-12,
+    dim: int | None = None,
+) -> T_module:
+    r"""Apply spectral normalization to a parameter in the given module.
+
+    .. math::
+        \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})},
+        \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}
+
+    Spectral normalization stabilizes the training of discriminators (critics)
+    in Generative Adversarial Networks (GANs) by rescaling the weight tensor
+    with spectral norm :math:`\sigma` of the weight matrix calculated using
+    power iteration method. If the dimension of the weight tensor is greater
+    than 2, it is reshaped to 2D in power iteration method to get spectral
+    norm. This is implemented via a hook that calculates spectral norm and
+    rescales weight before every :meth:`~Module.forward` call.
+
+    See `Spectral Normalization for Generative Adversarial Networks`_ .
+
+    .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957
+
+    Args:
+        module (nn.Module): containing module
+        name (str, optional): name of weight parameter
+        n_power_iterations (int, optional): number of power iterations to
+            calculate spectral norm
+        eps (float, optional): epsilon for numerical stability in
+            calculating norms
+        dim (int, optional): dimension corresponding to number of outputs,
+            the default is ``0``, except for modules that are instances of
+            ConvTranspose{1,2,3}d, when it is ``1``
+
+    Returns:
+        The original module with the spectral norm hook
+
+    .. note::
+        This function has been reimplemented as
+        :func:`torch.nn.utils.parametrizations.spectral_norm` using the new
+        parametrization functionality in
+        :func:`torch.nn.utils.parametrize.register_parametrization`. Please use
+        the newer version. This function will be deprecated in a future version
+        of PyTorch.
+
+    Example::
+
+        >>> m = spectral_norm(nn.Linear(20, 40))
+        >>> m
+        Linear(in_features=20, out_features=40, bias=True)
+        >>> m.weight_u.size()
+        torch.Size([40])
+
+    """
+    if dim is None:
+        if isinstance(
+            module,
+            (
+                torch.nn.ConvTranspose1d,
+                torch.nn.ConvTranspose2d,
+                torch.nn.ConvTranspose3d,
+            ),
+        ):
+            dim = 1
+        else:
+            dim = 0
+    SpectralNorm.apply(module, name, n_power_iterations, dim, eps)
+    # pyrefly: ignore [bad-return]
+    return module
+
+
+def remove_spectral_norm(module: T_module, name: str = "weight") -> T_module:
+    r"""Remove the spectral normalization reparameterization from a module.
+
+    Args:
+        module (Module): containing module
+        name (str, optional): name of weight parameter
+
+    Example:
+        >>> m = spectral_norm(nn.Linear(40, 10))
+        >>> remove_spectral_norm(m)
+    """
+    for k, hook in module._forward_pre_hooks.items():
+        if isinstance(hook, SpectralNorm) and hook.name == name:
+            hook.remove(module)
+            del module._forward_pre_hooks[k]
+            break
+    else:
+        raise ValueError(f"spectral_norm of '{name}' not found in {module}")
+
+    for k, hook in module._state_dict_hooks.items():
+        if isinstance(hook, SpectralNormStateDictHook) and hook.fn.name == name:
+            del module._state_dict_hooks[k]
+            break
+
+    for k, hook in module._load_state_dict_pre_hooks.items():
+        if isinstance(hook, SpectralNormLoadStateDictPreHook) and hook.fn.name == name:
+            del module._load_state_dict_pre_hooks[k]
+            break
+
+    return module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f0afdeb52923a029a1843e1f2cfc702ab7473b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/stateless.py
@@ -0,0 +1,279 @@
+# mypy: allow-untyped-defs
+import contextlib
+from typing import Any
+from typing_extensions import deprecated
+
+import torch
+from torch import Tensor
+from torch.nn.utils._named_member_accessor import NamedMemberAccessor
+
+
+__all__ = ["functional_call"]
+
+
+def _untie_named_tensors_map(
+    module: "torch.nn.Module",
+    parameters_and_buffers: dict[str, Tensor],
+) -> dict[str, Tensor]:
+    """
+    Unties all tied tensors in the module to parameters_and_buffers.
+
+    This function returns a new untied_parameters_and_buffers dictionary and leaves the original
+    parameters_and_buffers dictionary unchanged. It adds new (missing) keys for tied tensors
+    in the module to untied_parameters_and_buffers. The value of the new key is the user-given value
+    in the original parameters_and_buffers dictionary.
+
+    If there is more than one distinct user-given value for the same tied tensor, it will raise an error.
+
+    For example, if the module has two tied weights self.foo and self.tied_foo and the user passes
+    {'foo': foo_value, ...}, this will return {'foo': foo_value, 'tied_foo': foo_value, ...}. If the
+    user passes {'foo': foo_value, 'tied_foo': tied_foo_value, ...}, it will raise an error. If the
+    user passes {'foo': foo_value, 'tied_foo': foo_value, ...}, it will not raise an error.
+
+    Args:
+        module (torch.nn.Module): the module to determine which tensors are tied.
+        parameters_and_buffers (Dict[str, Tensor]): a map of {name: tensor} for reparametrizing the module.
+
+    Returns:
+        A new untied version of the parameters_and_buffers dictionary.
+
+    Raises:
+        ValueError: if there is more than one distinct user-given value for the same tied tensor.
+    """
+    # A map of {name: tensor} for all tensors (including tied ones) in the module.
+    all_named_tensors: dict[str, Tensor] = {}
+    all_named_tensors.update(module.named_parameters(remove_duplicate=False))
+    all_named_tensors.update(module.named_buffers(remove_duplicate=False))
+
+    # A map of {tensor: set(all_tied_names)} for all tensor names in the module.
+    tensor_to_tied_names_map: dict[Tensor, set[str]] = {}
+    for name, tensor in all_named_tensors.items():
+        if tensor not in tensor_to_tied_names_map:
+            tensor_to_tied_names_map[tensor] = set()
+        tensor_to_tied_names_map[tensor].add(name)
+
+    # A map of {tied_name: set(all_tied_names)} for all tensor names in the module.
+    # If a name is not tied, it will not be in this map.
+    tied_names_map: dict[str, set[str]] = {}
+    for tied_names in tensor_to_tied_names_map.values():
+        if len(tied_names) > 1:
+            for tied_name in tied_names:
+                tied_names_map[tied_name] = tied_names
+
+    # Make sure the user didn't pass multiple values for the same tied tensor.
+    given_names = set(parameters_and_buffers.keys())
+    # same as given_names.intersection(tied_names_map.keys()) but dynamo can't
+    # handle that
+    given_names_for_tied_tensors: set[str] = set()
+    for name in given_names:
+        if name in tied_names_map:
+            given_names_for_tied_tensors.add(name)
+
+    for given_name in given_names_for_tied_tensors:
+        tied_names = tied_names_map[given_name]
+        if (
+            # Detect if there are multiple keys present for the same tied tensor.
+            len(tied_names.intersection(given_names_for_tied_tensors)) > 1
+            # Only raise an error if the user passed multiple values for the same tied tensor.
+            # If all given values are the same, don't raise.
+            and len({parameters_and_buffers[tied_name] for tied_name in tied_names})
+            != 1
+        ):
+            raise ValueError(
+                f"functional_call got multiple values for keys {sorted(tied_names)}, "
+                f"which are tied. Consider using tie_weights=False"
+            )
+
+    # Untie the given named tensor map
+    # Make a copy so that the original dict is not modified
+    untied_parameters_and_buffers = parameters_and_buffers.copy()
+    for given_name in given_names_for_tied_tensors:
+        for tied_name in tied_names_map[given_name]:
+            untied_parameters_and_buffers[tied_name] = parameters_and_buffers[
+                given_name
+            ]
+    return untied_parameters_and_buffers
+
+
+@contextlib.contextmanager
+def _reparametrize_module(
+    module: "torch.nn.Module",
+    parameters_and_buffers: dict[str, Tensor],
+    tie_weights: bool = False,
+    strict: bool = False,
+    stack_weights: bool = False,
+):
+    if tie_weights:
+        untied_parameters_and_buffers = _untie_named_tensors_map(
+            module, parameters_and_buffers
+        )
+    else:
+        untied_parameters_and_buffers = parameters_and_buffers
+
+    accessor = NamedMemberAccessor(module)
+    if strict:
+        missing_keys, unexpected_keys = accessor.check_keys(
+            untied_parameters_and_buffers
+        )
+        error_msgs = []
+        if len(unexpected_keys) > 0:
+            error_msgs.append(
+                f"Unexpected key(s): {', '.join(map(repr, unexpected_keys))}."
+            )
+        if len(missing_keys) > 0:
+            error_msgs.append(f"Missing key(s): {', '.join(map(repr, missing_keys))}.")
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in reparametrizing for {}:\n\t{}".format(
+                    module._get_name(), "\n\t".join(error_msgs)
+                )
+            )
+
+    orig_parameters_and_buffers: dict[str, Tensor] = {}
+    try:
+        orig_parameters_and_buffers, _ = accessor.swap_tensors_dict(
+            untied_parameters_and_buffers, allow_missing=True
+        )
+        yield
+    finally:
+        if stack_weights:
+            # When stacking is enabled, we will restore the weights in LIFO order.
+            orig_parameters_and_buffers = dict(
+                reversed(orig_parameters_and_buffers.items())
+            )
+        new_parameters_and_buffers, _ = accessor.swap_tensors_dict(
+            orig_parameters_and_buffers, allow_missing=True
+        )
+        # Sometimes the module is not completely stateless and has some in-place modifications on
+        # the _parameters and _buffers dictionaries.
+        # Write the changed parameters and buffers back to the original dict.
+        parameters_and_buffers.update(
+            {
+                k: new_parameters_and_buffers[k]
+                for k in parameters_and_buffers
+                if k in new_parameters_and_buffers
+            }
+        )
+
+
+@deprecated(
+    "`torch.nn.utils.stateless.functional_call` is deprecated as of PyTorch 2.0 "
+    "and will be removed in a future version of PyTorch. "
+    "Please use `torch.func.functional_call` instead which is a drop-in replacement.",
+    category=FutureWarning,
+)
+def functional_call(
+    module: "torch.nn.Module",
+    parameters_and_buffers: dict[str, Tensor],
+    args: Any | tuple | None = None,
+    kwargs: dict[str, Any] | None = None,
+    *,
+    tie_weights: bool = True,
+    strict: bool = False,
+):
+    r"""Perform a functional call on the module by replacing the module parameters and buffers with the provided ones.
+
+    .. warning::
+
+        This API is deprecated as of PyTorch 2.0 and will be removed in a future
+        version of PyTorch. Please use :func:`torch.func.functional_call` instead,
+        which is a drop-in replacement for this API.
+
+    .. note:: If the module has active parametrizations, passing a value in the
+        :attr:`parameters_and_buffers` argument with the name set to the regular parameter
+        name will completely disable the parametrization.
+        If you want to apply the parametrization function to the value passed
+        please set the key as ``{submodule_name}.parametrizations.{parameter_name}.original``.
+
+    .. note:: If the module performs in-place operations on parameters/buffers, these will be reflected
+        in the `parameters_and_buffers` input.
+
+        Example::
+
+            >>> a = {'foo': torch.zeros(())}
+            >>> # xdoctest: +SKIP
+            >>> mod = Foo()  # does self.foo = self.foo + 1
+            >>> print(mod.foo)  # tensor(0.)
+            >>> functional_call(mod, a, torch.ones(()))
+            >>> print(mod.foo)  # tensor(0.)
+            >>> print(a['foo'])  # tensor(1.)
+
+    .. note:: If the module has tied weights, whether or not functional_call respects the tying is determined by the
+        tie_weights flag.
+
+        Example::
+
+            >>> a = {'foo': torch.zeros(())}
+            >>> # xdoctest: +SKIP
+            >>> mod = Foo()  # has both self.foo and self.foo_tied which are tied. Returns x + self.foo + self.foo_tied
+            >>> print(mod.foo)  # tensor(1.)
+            >>> mod(torch.zeros(()))  # tensor(2.)
+            >>> functional_call(mod, a, torch.zeros(()))  # tensor(0.) since it will change self.foo_tied too
+            >>> functional_call(mod, a, torch.zeros(()), tie_weights=False)  # tensor(1.)--self.foo_tied is not updated
+            >>> new_a = {'foo': torch.zeros(()), 'foo_tied': torch.zeros(())}
+            >>> functional_call(mod, new_a, torch.zeros(())) # tensor(0.)
+
+    Args:
+        module (torch.nn.Module): the module to call
+        parameters_and_buffers (dict of str and Tensor): the parameters and buffers that will be used in
+            the module call.
+        args (Any or tuple): arguments to be passed to the module call. If not a tuple, considered a single argument.
+        kwargs (dict): keyword arguments to be passed to the module call
+        tie_weights (bool, optional): If True, then parameters and buffers tied in the original model will be treated as
+            tied in the reparametrized version. Therefore, if True and different values are passed for the tied
+            parameters and buffers, it will error. If False, it will not respect the originally tied parameters and
+            buffers unless the values passed for both weights are the same. Default: True.
+        strict (bool, optional): If True, then the parameters and buffers passed in must match the parameters and
+            buffers in the original module. Therefore, if True and there are any missing or unexpected keys, it will
+            error. Default: False.
+
+    Returns:
+        Any: the result of calling ``module``.
+    """
+    return _functional_call(
+        module,
+        parameters_and_buffers,
+        args,
+        kwargs,
+        tie_weights=tie_weights,
+        strict=strict,
+    )
+
+
+def _functional_call(
+    module: "torch.nn.Module",
+    parameters_and_buffers: dict[str, Tensor],
+    args: Any | tuple | None = None,
+    kwargs: dict[str, Any] | None = None,
+    *,
+    tie_weights: bool = True,
+    strict: bool = False,
+):
+    # TODO allow kwargs such as unsafe and others for parametrization
+    if (
+        torch.jit.is_tracing()
+        or torch.jit.is_scripting()
+        or isinstance(
+            module,
+            (
+                torch.jit.RecursiveScriptModule,
+                torch.jit.ScriptModule,
+                torch.jit.ScriptFunction,
+            ),
+        )
+    ):
+        raise RuntimeError("The stateless API can't be used with Jitted modules")
+    if isinstance(module, torch.nn.DataParallel):
+        raise RuntimeError(
+            "The stateless API can't be used with nn.DataParallel module"
+        )
+    if kwargs is None:
+        kwargs = {}
+    if args is None:
+        args = ()
+    elif not isinstance(args, tuple):
+        args = (args,)
+    with _reparametrize_module(
+        module, parameters_and_buffers, tie_weights=tie_weights, strict=strict
+    ):
+        return module(*args, **kwargs)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b336e8b8c08e59b2ee3d12ab481bacb4b6aa33d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py
@@ -0,0 +1,165 @@
+# mypy: allow-untyped-defs
+r"""Weight Normalization from https://arxiv.org/abs/1602.07868."""
+
+from typing import Any, TypeVar
+from typing_extensions import deprecated
+
+from torch import _weight_norm, norm_except_dim
+from torch.nn.modules import Module
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+
+__all__ = ["WeightNorm", "weight_norm", "remove_weight_norm"]
+
+
+class WeightNorm:
+    name: str
+    dim: int
+
+    def __init__(self, name: str, dim: int) -> None:
+        if dim is None:
+            dim = -1
+        self.name = name
+        self.dim = dim
+
+    # TODO Make return type more specific
+    def compute_weight(self, module: Module) -> Any:
+        g = getattr(module, self.name + "_g")
+        v = getattr(module, self.name + "_v")
+        return _weight_norm(v, g, self.dim)
+
+    @staticmethod
+    @deprecated(
+        "`torch.nn.utils.weight_norm` is deprecated "
+        "in favor of `torch.nn.utils.parametrizations.weight_norm`.",
+        category=FutureWarning,
+    )
+    def apply(module, name: str, dim: int) -> "WeightNorm":
+        for hook in module._forward_pre_hooks.values():
+            if isinstance(hook, WeightNorm) and hook.name == name:
+                raise RuntimeError(
+                    f"Cannot register two weight_norm hooks on the same parameter {name}"
+                )
+
+        if dim is None:
+            dim = -1
+
+        fn = WeightNorm(name, dim)
+
+        weight = getattr(module, name)
+        if isinstance(weight, UninitializedParameter):
+            raise ValueError(
+                "The module passed to `WeightNorm` can't have uninitialized parameters. "
+                "Make sure to run the dummy forward before applying weight normalization"
+            )
+        # remove w from parameter list
+        del module._parameters[name]
+
+        # add g and v as new parameters and express w as g/||v|| * v
+        module.register_parameter(
+            name + "_g", Parameter(norm_except_dim(weight, 2, dim).data)
+        )
+        module.register_parameter(name + "_v", Parameter(weight.data))
+        setattr(module, name, fn.compute_weight(module))
+
+        # recompute weight before every forward()
+        module.register_forward_pre_hook(fn)
+
+        return fn
+
+    def remove(self, module: Module) -> None:
+        weight = self.compute_weight(module)
+        delattr(module, self.name)
+        del module._parameters[self.name + "_g"]
+        del module._parameters[self.name + "_v"]
+        setattr(module, self.name, Parameter(weight.data))
+
+    def __call__(self, module: Module, inputs: Any) -> None:
+        setattr(module, self.name, self.compute_weight(module))
+
+
+T_module = TypeVar("T_module", bound=Module)
+
+
+def weight_norm(module: T_module, name: str = "weight", dim: int = 0) -> T_module:
+    r"""Apply weight normalization to a parameter in the given module.
+
+    .. math::
+         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
+
+    Weight normalization is a reparameterization that decouples the magnitude
+    of a weight tensor from its direction. This replaces the parameter specified
+    by :attr:`name` (e.g. ``'weight'``) with two parameters: one specifying the magnitude
+    (e.g. ``'weight_g'``) and one specifying the direction (e.g. ``'weight_v'``).
+    Weight normalization is implemented via a hook that recomputes the weight
+    tensor from the magnitude and direction before every :meth:`~Module.forward`
+    call.
+
+    By default, with ``dim=0``, the norm is computed independently per output
+    channel/plane. To compute a norm over the entire weight tensor, use
+    ``dim=None``.
+
+    See https://arxiv.org/abs/1602.07868
+
+    .. warning::
+
+        This function is deprecated.  Use :func:`torch.nn.utils.parametrizations.weight_norm`
+        which uses the modern parametrization API.  The new ``weight_norm`` is compatible
+        with ``state_dict`` generated from old ``weight_norm``.
+
+        Migration guide:
+
+        * The magnitude (``weight_g``) and direction (``weight_v``) are now expressed
+          as ``parametrizations.weight.original0`` and ``parametrizations.weight.original1``
+          respectively.  If this is bothering you, please comment on
+          https://github.com/pytorch/pytorch/issues/102999
+
+        * To remove the weight normalization reparametrization, use
+          :func:`torch.nn.utils.parametrize.remove_parametrizations`.
+
+        * The weight is no longer recomputed once at module forward; instead, it will
+          be recomputed on every access.  To restore the old behavior, use
+          :func:`torch.nn.utils.parametrize.cached` before invoking the module
+          in question.
+
+    Args:
+        module (Module): containing module
+        name (str, optional): name of weight parameter
+        dim (int, optional): dimension over which to compute the norm
+
+    Returns:
+        The original module with the weight norm hook
+
+    Example::
+
+        >>> m = weight_norm(nn.Linear(20, 40), name='weight')
+        >>> m
+        Linear(in_features=20, out_features=40, bias=True)
+        >>> m.weight_g.size()
+        torch.Size([40, 1])
+        >>> m.weight_v.size()
+        torch.Size([40, 20])
+
+    """
+    WeightNorm.apply(module, name, dim)
+    return module
+
+
+def remove_weight_norm(module: T_module, name: str = "weight") -> T_module:
+    r"""Remove the weight normalization reparameterization from a module.
+
+    Args:
+        module (Module): containing module
+        name (str, optional): name of weight parameter
+
+    Example:
+        >>> m = weight_norm(nn.Linear(20, 40))
+        >>> remove_weight_norm(m)
+    """
+    for k, hook in module._forward_pre_hooks.items():
+        if isinstance(hook, WeightNorm) and hook.name == name:
+            hook.remove(module)
+            del module._forward_pre_hooks[k]
+            return module
+
+    raise ValueError(f"weight_norm of '{name}' not found in {module}")